In [37]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from tqdm import tqdm

import optuna

In [38]:
train_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/train.csv')
test_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/test.csv')
original_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/ObesityDataSet.csv')

In [39]:
train_df = pd.concat([train_df, original_df], axis=0)
train_df.drop_duplicates(inplace=True)

In [40]:
train_df.shape

(22845, 18)

In [41]:
train_df.head(5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0.0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1.0,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2.0,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3.0,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4.0,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [42]:
train_df.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,22845.0,22845.0,22845.0,22845.0,22845.0,22845.0,22845.0,22845.0
mean,10378.5,23.888513,1.700467,87.793761,2.443675,2.755837,2.027165,0.984585,0.620984
std,5992.46278,5.755338,0.087865,26.363367,0.533392,0.711185,0.608479,0.839728,0.602802
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.755907,0.01586,0.0
50%,10378.5,22.815416,1.7,84.0,2.393837,3.0,2.0,1.0,0.58284
75%,15567.75,26.0,1.763029,111.531208,3.0,3.0,2.531984,1.600431,1.0
max,20757.0,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [43]:
train_df.describe(include=[object])

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
count,22845,22845,22845,22845,22845,22845,22845,22845,22845
unique,2,2,2,4,2,2,4,5,7
top,Female,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Obesity_Type_III
freq,11457,18736,20826,19290,22556,22062,16446,18245,4370


In [44]:
train_df.isnull().sum()

id                                2087
Gender                               0
Age                                  0
Height                               0
Weight                               0
family_history_with_overweight       0
FAVC                                 0
FCVC                                 0
NCP                                  0
CAEC                                 0
SMOKE                                0
CH2O                                 0
SCC                                  0
FAF                                  0
TUE                                  0
CALC                                 0
MTRANS                               0
NObeyesdad                           0
dtype: int64

In [45]:
train_df.isna().sum()

id                                2087
Gender                               0
Age                                  0
Height                               0
Weight                               0
family_history_with_overweight       0
FAVC                                 0
FCVC                                 0
NCP                                  0
CAEC                                 0
SMOKE                                0
CH2O                                 0
SCC                                  0
FAF                                  0
TUE                                  0
CALC                                 0
MTRANS                               0
NObeyesdad                           0
dtype: int64

In [46]:
# Data Preprocessing
# Standard Scaler -> Numerical features
# One hot encoder -> Categorical features
# Label encoder -> Target feature
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
        ('cat', OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object))
    ])

X_train, y_train = train_df.drop('NObeyesdad', axis=1), train_df['NObeyesdad']

preprocessor.fit(pd.concat([X_train, test_df]))
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(test_df)

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

In [ ]:
# Optuna Hyperparameter Tuning
from sklearn.model_selection import cross_validate
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 4, 10)
    n_estimators = trial.suggest_int('n_estimators', 500, 2000)
    gamma = trial.suggest_float('gamma', 0, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 1)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 0, 10)
    subsample = trial.suggest_float('subsample', 0, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
    learning_rate = trial.suggest_float('learning_rate', 0, 1)

    print('Training the model with', X_train.shape[1], 'features')

    params = {'n_estimators': n_estimators,
              'learning_rate': learning_rate,
              'gamma': gamma,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_depth': max_depth,
              'min_child_weight': min_child_weight,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'eval_metric':'mlogloss'}

    clf = XGBClassifier(**params)

    cv_results = cross_validate(clf, X_train, y_train, cv=5, scoring='accuracy')

    validation_score = np.mean(cv_results['test_score'])

    return validation_score

In [47]:
params = {
    'n_estimators': 1312,
    'learning_rate': 0.018279520260162645,
    'gamma': 0.0024196354156454324,
    'reg_alpha': 0.9025931173755949,
    'reg_lambda': 0.06835667255875388,
    'max_depth': 5,
    'min_child_weight': 5,
    'subsample': 0.883274050086088,
    'colsample_bytree': 0.6579828557036317
}

xgb = XGBClassifier(random_state=42, **params)

# Wrap the training loop with tqdm for progress bar
for i in tqdm(range(5), desc="Training XGBoost"):
    score = cross_val_score(xgb, np.array(X_train), y_train, scoring='accuracy', cv=5, n_jobs=-1).mean()
    print("Accuracy: ", score)

    # Optionally fit the model at each iteration
    xgb.fit(np.array(X_train), y_train)

Training XGBoost:   0%|          | 0/5 [00:00<?, ?it/s]

Accuracy:  0.8949441891004597


Training XGBoost:  20%|██        | 1/5 [00:23<01:33, 23.39s/it]

Accuracy:  0.8949441891004597


Training XGBoost:  40%|████      | 2/5 [00:47<01:10, 23.65s/it]

Accuracy:  0.8949441891004597


Training XGBoost:  60%|██████    | 3/5 [01:11<00:47, 23.88s/it]

Accuracy:  0.8949441891004597


Training XGBoost:  80%|████████  | 4/5 [01:36<00:24, 24.47s/it]

Accuracy:  0.8949441891004597


Training XGBoost: 100%|██████████| 5/5 [02:05<00:00, 25.15s/it]


In [48]:
y_pred = xgb.predict(np.array(X_test))
y_pred = label_encoder.inverse_transform(y_pred)
y_pred

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [52]:
test_df.index

RangeIndex(start=0, stop=13840, step=1)

In [54]:
submission_df = pd.DataFrame({'id': test_df.id, "NObeyesdad": y_pred})
submission_df

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Overweight_Level_I
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [55]:
submission_df.to_csv('Dataset/MultiClassPredictionObesityRisk/Prediction/20240205_FineTuned_XGBoost.csv', index=False)