In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
# Validation
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import optuna

In [59]:
train_df = pd.read_csv('Dataset/SmokerStatusPrediction/train.csv', encoding='utf-8')
test_df = pd.read_csv('Dataset/SmokerStatusPrediction/test.csv', encoding='utf-8')

In [36]:
# EDA session will be done in another notebook
# Training models
seed = np.random.seed(42)
X = train_df.drop(["id", "smoking"],axis=1)
y = train_df["smoking"]

In [40]:
test_cv = test_df.drop(columns = ['id'], axis = 1)

In [39]:
# Baseline xgboost model
xgbmodel = XGBClassifier(random_state=seed, tree_method='exact')
print("CV score of XGB is ",cross_val_score(xgbmodel, X, y, cv=4, scoring = 'roc_auc').mean())

CV score of LGBM is  0.8645777542659799


In [50]:
# Baseline xgboost model
xgbmodel_md = XGBClassifier(random_state=seed, tree_method='exact').fit(X, y)
print("CV score of XGB is ",cross_val_score(xgbmodel_md, X, y, cv=4, scoring = 'roc_auc').mean())

CV score of XGB is  0.8645777542659799


In [52]:
# Prediction
XGB_pred_test = xgbmodel_md.predict_proba(test_cv)[:, 1]
XGB_pred_test

array([0.65870994, 0.10153673, 0.6664567 , ..., 0.49006492, 0.0941251 ,
       0.03032043], dtype=float32)

In [53]:
# Generate submission file
submission_df = pd.read_csv(r"Dataset/SmokerStatusPrediction/sample_submission.csv", encoding="utf8")
submission_df['smoking'] = XGB_pred_test
submission_df

Unnamed: 0,id,smoking
0,159256,0.658710
1,159257,0.101537
2,159258,0.666457
3,159259,0.014916
4,159260,0.631548
...,...,...
106166,265422,0.514796
106167,265423,0.571328
106168,265424,0.490065
106169,265425,0.094125


In [54]:
# 0.86840
submission_df.to_csv('Dataset/SmokerStatusPrediction/submissionBaselineXGB.csv', index=False)

In [60]:
# Training models
seed = np.random.seed(42)
X = train_df.drop(["id", "smoking"],axis=1)
y = train_df["smoking"]

In [61]:
# Optuna finetuning XGBoost
def objective(trial):
    params = {
        'n_estimators' : trial.suggest_int('n_estimators',500,750),
        'max_depth':  trial.suggest_int('max_depth',3,50),
        'min_child_weight': trial.suggest_float('min_child_weight', 2,50),
        "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.2,log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
        'gamma': trial.suggest_float("gamma", 1e-4, 1.0),
        "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
        "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
    }
    xgbmodel_optuna = XGBClassifier(**params, random_state=seed, tree_method = "exact", eval_metric= "auc")
    cv = cross_val_score(xgbmodel_optuna, X, y, cv = 4,scoring='roc_auc').mean()
    return cv

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100,timeout=5000)

[I 2023-10-29 22:54:14,212] A new study created in memory with name: no-name-56aae036-0632-441c-993d-8ac38bf3b7ba
[I 2023-10-29 22:57:20,487] Trial 0 finished with value: 0.8527717945783985 and parameters: {'n_estimators': 505, 'max_depth': 22, 'min_child_weight': 45.03590733733973, 'learning_rate': 0.00042618605850378965, 'subsample': 0.37744087558820755, 'gamma': 0.32329841312787294, 'colsample_bytree': 0.8781314599968304, 'colsample_bylevel': 0.5962752960158647, 'colsample_bynode': 0.7961141384180592}. Best is trial 0 with value: 0.8527717945783985.
[I 2023-10-29 23:00:34,830] Trial 1 finished with value: 0.8635866511980441 and parameters: {'n_estimators': 649, 'max_depth': 11, 'min_child_weight': 6.578852380488485, 'learning_rate': 0.08672802923020059, 'subsample': 0.48357297501931706, 'gamma': 0.8843420026918414, 'colsample_bytree': 0.5816684448668059, 'colsample_bylevel': 0.4833370938341621, 'colsample_bynode': 0.8630403160153373}. Best is trial 1 with value: 0.8635866511980441.


In [62]:
study.best_params

{'n_estimators': 636,
 'max_depth': 43,
 'min_child_weight': 30.644998538115118,
 'learning_rate': 0.018116523135118634,
 'subsample': 0.9026427111624884,
 'gamma': 0.6384381222977351,
 'colsample_bytree': 0.48192772659999944,
 'colsample_bylevel': 0.8068169345075475,
 'colsample_bynode': 0.45375884437473546}

In [64]:
# Use the hyperparameter found by optuna to train the model
xgbmodel_bestParams = study.best_params
xgb_opt = XGBClassifier(**xgbmodel_bestParams, random_state=seed, tree_method = "exact",eval_metric= "auc").fit(X, y)
print("CV score of XGB is ",cross_val_score(xgb_opt, X, y, cv=4, scoring = 'roc_auc').mean())

CV score of XGB is  0.8700532756990391


In [65]:
# Prediction
XGB_pred_test = xgb_opt.predict_proba(test_cv)[:, 1]
XGB_pred_test

array([0.67422473, 0.2562991 , 0.45875537, ..., 0.45729458, 0.07917994,
       0.02491962], dtype=float32)

In [66]:
# Generate submission file
submission_df = pd.read_csv(r"Dataset/SmokerStatusPrediction/sample_submission.csv", encoding="utf8")
submission_df['smoking'] = XGB_pred_test
submission_df

Unnamed: 0,id,smoking
0,159256,0.674225
1,159257,0.256299
2,159258,0.458755
3,159259,0.020538
4,159260,0.605822
...,...,...
106166,265422,0.609186
106167,265423,0.518093
106168,265424,0.457295
106169,265425,0.079180


In [67]:
# 0.87000 (0.87223 in Kaggle)
submission_df.to_csv('Dataset/SmokerStatusPrediction/submissionBaselineXGB_Finetuned.csv', index=False)