<a href="https://colab.research.google.com/github/Yanju10299/CLEAR-VISIONs/blob/main/Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [110]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score



In [111]:
train = pd.read_csv('Train_Data.csv')
test = pd.read_csv('Test_Data.csv')


In [112]:
train_clean = train.dropna(subset=['age_group'])

In [113]:
# Feature Engineering on Training Data
X = train_clean.drop(['SEQN', 'age_group'], axis=1)
X['GLU_ratio'] = X['LBXGLU'] / (X['LBXGLT'] + 1e-5)
X['BMI_INS'] = X['BMXBMI'] * X['LBXIN']
X['active_diabetic'] = ((X['PAQ605'] == 1) & (X['DIQ010'] == 1)).astype(int)

# Feature Engineering on Test Data
X_test_final = test.drop(['SEQN'], axis=1)
X_test_final['GLU_ratio'] = X_test_final['LBXGLU'] / (X_test_final['LBXGLT'] + 1e-5)
X_test_final['BMI_INS'] = X_test_final['BMXBMI'] * X_test_final['LBXIN']
X_test_final['active_diabetic'] = ((X_test_final['PAQ605'] == 1) & (X_test_final['DIQ010'] == 1)).astype(int)

# Fill missing values
X = X.fillna(X.median())
X_test_final = X_test_final.fillna(X_test_final.median())

# Match test set column order to training set
X_test_final = X_test_final[X.columns]


In [115]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

In [116]:
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [117]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_scaled, y)


In [118]:
preds = clf.predict(X_test_scaled)

submission = pd.DataFrame({'age_group': preds})
submission.to_csv('my_submission.csv', index=False)

In [119]:
val_preds = clf.predict(X_val)


In [120]:
accuracy = accuracy_score(y_val, val_preds)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9974424552429667


In [121]:
scores = cross_val_score(clf, X_scaled, y, cv=5, scoring='accuracy')

In [122]:
print("Cross-validation scores:", scores)

Cross-validation scores: [0.8286445  0.84398977 0.84358974 0.84102564 0.82820513]


In [123]:
print("Mean accuracy:", scores.mean())

Mean accuracy: 0.8370909567840515


In [124]:
print("Standard deviation:", scores.std())

Standard deviation: 0.007149967153517573


In [125]:
#using xgboost
!pip install xgboost




In [126]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [127]:
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [128]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)


In [129]:
val_preds = model.predict(X_val)
accuracy = accuracy_score(y_val, val_preds)
print("Validation Accuracy (XGBoost):", accuracy)


Validation Accuracy (XGBoost): 0.8491048593350383


In [131]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# ✅ Load Data
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")

# 🧹 Drop missing target values
train_clean = train.dropna(subset=['age_group'])

# 🎯 Define X and y
X = train_clean.drop(['SEQN', 'age_group'], axis=1)
y = train_clean['age_group'].map({'Adult': 0, 'Senior': 1})

# 🧪 Feature Engineering (Train)
X['GLU_ratio'] = X['LBXGLU'] / (X['LBXGLT'] + 1e-5)
X['BMI_INS'] = X['BMXBMI'] * X['LBXIN']
X['active_diabetic'] = ((X['PAQ605'] == 1) & (X['DIQ010'] == 1)).astype(int)

# 🧪 Feature Engineering (Test)
X_test_final = test.drop(['SEQN'], axis=1)
X_test_final['GLU_ratio'] = X_test_final['LBXGLU'] / (X_test_final['LBXGLT'] + 1e-5)
X_test_final['BMI_INS'] = X_test_final['BMXBMI'] * X_test_final['LBXIN']
X_test_final['active_diabetic'] = ((X_test_final['PAQ605'] == 1) & (X_test_final['DIQ010'] == 1)).astype(int)

# 🧼 Fill missing values
X = X.fillna(X.median())
X_test_final = X_test_final.fillna(X_test_final.median())

# 📐 Match column order
X_test_final = X_test_final[X.columns]

# 🔃 Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# 🎯 Grid Search CV with XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

grid = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_scaled, y)

print("✅ Best Parameters:", grid.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid.best_score_)

# 🧪 Optional: Train-test split evaluation
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)
val_preds = best_model.predict(X_val)
print("✅ Validation Accuracy:", accuracy_score(y_val, val_preds))

# 📤 Predict on test set
test_preds = best_model.predict(X_test_scaled)

# 💾 Save submission
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("submission.csv", index=False)
print("📁 submission.csv saved!")


✅ Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
✅ Best Cross-Validation Accuracy: 0.8411804052724768
✅ Validation Accuracy: 0.8567774936061381
📁 submission.csv saved!


In [133]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# ✅ Load Data
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")

# 🧹 Drop rows with missing target
df = train.dropna(subset=['age_group'])
X = df.drop(['SEQN', 'age_group'], axis=1)
y = df['age_group'].map({'Adult': 0, 'Senior': 1})

# 🧪 Feature Engineering (Train)
X['GLU_ratio'] = X['LBXGLU'] / (X['LBXGLT'] + 1e-5)
X['BMI_INS'] = X['BMXBMI'] * X['LBXIN']
X['active_diabetic'] = ((X['PAQ605'] == 1) & (X['DIQ010'] == 1)).astype(int)
X['high_glucose'] = (X['LBXGLU'] > 140).astype(int)
X['high_bmi'] = (X['BMXBMI'] > 30).astype(int)
X['insulin_resistant'] = (X['LBXIN'] > 100).astype(int)

# 🧪 Feature Engineering (Test)
X_test_final = test.drop(['SEQN'], axis=1)
X_test_final['GLU_ratio'] = X_test_final['LBXGLU'] / (X_test_final['LBXGLT'] + 1e-5)
X_test_final['BMI_INS'] = X_test_final['BMXBMI'] * X_test_final['LBXIN']
X_test_final['active_diabetic'] = ((X_test_final['PAQ605'] == 1) & (X_test_final['DIQ010'] == 1)).astype(int)
X_test_final['high_glucose'] = (X_test_final['LBXGLU'] > 140).astype(int)
X_test_final['high_bmi'] = (X_test_final['BMXBMI'] > 30).astype(int)
X_test_final['insulin_resistant'] = (X_test_final['LBXIN'] > 100).astype(int)

# 🧼 Fill missing values
X = X.fillna(X.median())
X_test_final = X_test_final.fillna(X_test_final.median())
X_test_final = X_test_final[X.columns]  # Ensure same order

# 🔃 Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# ⚖️ SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 🔍 XGBoost Grid Search
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'max_depth': [6, 8],
    'n_estimators': [200, 300],
    'learning_rate': [0.01, 0.05],
    'subsample': [0.9],
    'colsample_bytree': [0.8],
    'gamma': [0, 1],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [1, 2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(xgb_model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid.fit(X_resampled, y_resampled)

print("\n✅ Best Parameters:", grid.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid.best_score_)

# 🧪 Final train-test evaluation
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
final_model = grid.best_estimator_
final_model.fit(X_train, y_train)
val_preds = final_model.predict(X_val)
print("\n✅ Final Validation Accuracy:", accuracy_score(y_val, val_preds))

# 📤 Predict on Test Set
test_preds = final_model.predict(X_test_scaled)
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("submission.csv", index=False)
print("\n📁 submission.csv saved!")



✅ Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.9}
✅ Best Cross-Validation Accuracy: 0.8498180040960716

✅ Final Validation Accuracy: 0.8689024390243902

📁 submission.csv saved!
