In [None]:
#using xgboost
!pip install xgboost




In [2]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import f1_score

# ✅ Load Data
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")

# 🧹 Drop rows with missing target
df = train.dropna(subset=['age_group'])
X = df.drop(['SEQN', 'age_group'], axis=1)
y = df['age_group'].map({'Adult': 0, 'Senior': 1})

# 🧪 Feature Engineering (Train)
X['GLU_ratio'] = X['LBXGLU'] / (X['LBXGLT'] + 1e-5)
X['BMI_INS'] = X['BMXBMI'] * X['LBXIN']
X['active_diabetic'] = ((X['PAQ605'] == 1) & (X['DIQ010'] == 1)).astype(int)
X['high_glucose'] = (X['LBXGLU'] > 140).astype(int)
X['high_bmi'] = (X['BMXBMI'] > 30).astype(int)
X['insulin_resistant'] = (X['LBXIN'] > 100).astype(int)

# 🧪 Feature Engineering (Test)
X_test_final = test.drop(['SEQN'], axis=1)
X_test_final['GLU_ratio'] = X_test_final['LBXGLU'] / (X_test_final['LBXGLT'] + 1e-5)
X_test_final['BMI_INS'] = X_test_final['BMXBMI'] * X_test_final['LBXIN']
X_test_final['active_diabetic'] = ((X_test_final['PAQ605'] == 1) & (X_test_final['DIQ010'] == 1)).astype(int)
X_test_final['high_glucose'] = (X_test_final['LBXGLU'] > 140).astype(int)
X_test_final['high_bmi'] = (X_test_final['BMXBMI'] > 30).astype(int)
X_test_final['insulin_resistant'] = (X_test_final['LBXIN'] > 100).astype(int)

# 🧼 Fill missing values
X = X.fillna(X.median())
X_test_final = X_test_final.fillna(X_test_final.median())
X_test_final = X_test_final[X.columns]  # Ensure same order

# 🔃 Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# ⚖️ SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 🔍 XGBoost Grid Search
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'max_depth': [6, 8],
    'n_estimators': [200, 300],
    'learning_rate': [0.01, 0.05],
    'subsample': [0.9],
    'colsample_bytree': [0.8],
    'gamma': [0, 1],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [1, 2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(xgb_model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid.fit(X_resampled, y_resampled)

print("\n✅ Best Parameters:", grid.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid.best_score_)

# 🧪 Final train-test evaluation
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
final_model = grid.best_estimator_
final_model.fit(X_train, y_train)
val_preds = final_model.predict(X_val)
print("\n✅ Final Validation Accuracy:", accuracy_score(y_val, val_preds))

# 📤 Predict on Test Set
test_preds = final_model.predict(X_test_scaled)
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("submission.csv", index=False)
print("\n📁 submission.csv saved!")


Parameters: { "use_label_encoder" } are not used.




✅ Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.9}
✅ Best Cross-Validation Accuracy: 0.8498180040960716


Parameters: { "use_label_encoder" } are not used.




✅ Final Validation Accuracy: 0.8689024390243902

📁 submission.csv saved!


In [5]:
score = f1_score(y_val, val_preds)
print("F1 Score:", score)

F1 Score: 0.8680981595092024
