In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from lightgbm import LGBMClassifier

from optuna.samplers import TPESampler
import optuna

import warnings
warnings.filterwarnings("ignore")

In [12]:
# Loading the dataset   
train_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/train.csv')
test_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/test.csv')
submission_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/sample_submission.csv')
original_df = pd.read_csv('Dataset/MultiClassPredictionObesityRisk/ObesityDataSet.csv')

In [13]:
def get_variable_types(dataframe):
    continuous_vars = []
    categorical_vars = []

    for column in dataframe.columns:
        if dataframe[column].dtype == 'object':
            categorical_vars.append(column)
        else:
            continuous_vars.append(column)

    return continuous_vars, categorical_vars

continuous_vars, categorical_vars = get_variable_types(train_df)
continuous_vars.remove('id'), categorical_vars.remove('NObeyesdad')

print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)

Continuous Variables: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
Categorical Variables: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']


In [14]:
train = pd.concat([train_df, original_df]).drop(['id'], axis=1).drop_duplicates()
test = test_df.drop(['id'], axis=1)
train

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Automobile,Normal_Weight
2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.000000,3.000000,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.000000,3.000000,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.000000,3.000000,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.000000,3.000000,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [15]:
# One-Hot encoding the categorical variables
train = pd.get_dummies(train, columns=categorical_vars)
test = pd.get_dummies(test, columns=categorical_vars)

In [16]:
train

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Female,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473,Overweight_Level_II,False,...,False,False,False,True,False,False,False,False,True,False
1,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000,Normal_Weight,True,...,False,False,False,False,True,True,False,False,False,False
2,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,Insufficient_Weight,True,...,False,False,False,False,True,False,False,False,True,False
3,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199,Obesity_Type_III,True,...,False,False,False,True,False,False,False,False,True,False
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,Overweight_Level_II,False,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.000000,3.000000,1.728139,1.676269,0.906247,Obesity_Type_III,True,...,False,False,False,True,False,False,False,False,True,False
2107,21.982942,1.748584,133.742943,3.000000,3.000000,2.005130,1.341390,0.599270,Obesity_Type_III,True,...,False,False,False,True,False,False,False,False,True,False
2108,22.524036,1.752206,133.689352,3.000000,3.000000,2.054193,1.414209,0.646288,Obesity_Type_III,True,...,False,False,False,True,False,False,False,False,True,False
2109,24.361936,1.739450,133.346641,3.000000,3.000000,2.852339,1.139107,0.586035,Obesity_Type_III,True,...,False,False,False,True,False,False,False,False,True,False


In [17]:
test

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,26.899886,1.848294,120.644178,2.938616,3.000000,2.825629,0.855400,0.000000,False,True,...,False,False,False,True,False,False,False,False,True,False
1,21.000000,1.600000,66.000000,2.000000,1.000000,3.000000,1.000000,0.000000,True,False,...,False,False,False,True,False,False,False,False,True,False
2,26.000000,1.643355,111.600553,3.000000,3.000000,2.621877,0.000000,0.250502,True,False,...,False,False,False,True,False,False,False,False,True,False
3,20.979254,1.553127,103.669116,2.000000,2.977909,2.786417,0.094851,0.000000,False,True,...,False,False,False,True,False,False,False,False,True,False
4,26.000000,1.627396,104.835346,3.000000,3.000000,2.653531,0.000000,0.741069,True,False,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13835,23.327836,1.721384,78.030383,2.813234,3.000000,1.000000,0.807076,0.778632,False,True,...,False,False,False,True,False,False,False,False,True,False
13836,29.000000,1.590000,62.000000,3.000000,3.000000,2.000000,0.000000,0.000000,True,False,...,False,False,False,True,False,False,False,False,True,False
13837,22.935612,1.585547,44.376637,3.000000,2.273740,2.000000,1.949840,1.000000,True,False,...,False,False,False,True,False,False,False,False,True,False
13838,21.000000,1.620000,53.000000,2.000000,3.000000,2.000000,3.000000,2.000000,False,True,...,False,False,False,False,True,False,False,False,True,False


In [18]:
X = train.drop(['NObeyesdad'], axis=1)
y = train['NObeyesdad']
X

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473,False,True,...,False,False,False,True,False,False,False,False,True,False
1,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000,True,False,...,False,False,False,False,True,True,False,False,False,False
2,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,True,False,...,False,False,False,False,True,False,False,False,True,False
3,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199,True,False,...,False,False,False,True,False,False,False,False,True,False
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,False,True,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.000000,3.000000,1.728139,1.676269,0.906247,True,False,...,False,False,False,True,False,False,False,False,True,False
2107,21.982942,1.748584,133.742943,3.000000,3.000000,2.005130,1.341390,0.599270,True,False,...,False,False,False,True,False,False,False,False,True,False
2108,22.524036,1.752206,133.689352,3.000000,3.000000,2.054193,1.414209,0.646288,True,False,...,False,False,False,True,False,False,False,False,True,False
2109,24.361936,1.739450,133.346641,3.000000,3.000000,2.852339,1.139107,0.586035,True,False,...,False,False,False,True,False,False,False,False,True,False


In [19]:
y

0       Overweight_Level_II
1             Normal_Weight
2       Insufficient_Weight
3          Obesity_Type_III
4       Overweight_Level_II
               ...         
2106       Obesity_Type_III
2107       Obesity_Type_III
2108       Obesity_Type_III
2109       Obesity_Type_III
2110       Obesity_Type_III
Name: NObeyesdad, Length: 22845, dtype: object

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
def objective(trial):
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "seed": 42,
        "num_class": 7,
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    # LightGBM Classifier with the suggested parameters
    model = LGBMClassifier(**param)
    # Fit the classifier on the training data
    model.fit(X_train, y_train)
    # Evaluate the classifier on the testing data
    score = model.score(X_test, y_test)
    return score

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=100)

[I 2024-02-06 20:09:37,741] A new study created in memory with name: no-name-d6e74f84-98d6-4fe3-9033-fb307f7ca219
[W 2024-02-06 20:09:37,743] Trial 0 failed with parameters: {} because of the following error: TypeError('objective() takes 1 positional argument but 5 were given').
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ml_Kaggle/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/yr/jdlh4xrn67n28zlnljlxr1940000gn/T/ipykernel_13685/3182193668.py", line 28, in <lambda>
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=100)
TypeError: objective() takes 1 positional argument but 5 were given
[W 2024-02-06 20:09:37,745] Trial 0 failed with value None.


TypeError: objective() takes 1 positional argument but 5 were given

In [22]:
#The tuning process has been commented out due to its time-consuming nature.

# Define the objective function for Optuna optimization
def objective(trial, X_train, y_train, X_test, y_test):
     # Define parameters to be optimized for the LGBMClassifier
     param = {
         "objective": "multiclass",
         "metric": "multi_logloss",
         "verbosity": -1,
         "boosting_type": "gbdt",
         "random_state": 42,
         "num_class": 7,
         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
         "n_estimators": trial.suggest_int("n_estimators", 400, 600),
         "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
         "lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
         "max_depth": trial.suggest_int("max_depth", 6, 14),
         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
         "subsample": trial.suggest_float("subsample", 0.8, 1.0),
         "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
     }

     # Create an instance of LGBMClassifier with the suggested parameters
     lgbm_classifier = LGBMClassifier(**param)

     # Fit the classifier on the training data
     lgbm_classifier.fit(X_train, y_train)

     # Evaluate the classifier on the test data
     score = lgbm_classifier.score(X_test, y_test)

     return score

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust the test_size as needed

# Set up the sampler for Optuna optimization
sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object for Optuna optimization
study = optuna.create_study(direction="maximize", sampler=sampler)

# Run the optimization process
study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=100)

# Get the best parameters after optimization
best_params = study.best_params

print('='*50)
print(best_params)

[I 2024-02-06 20:10:59,130] A new study created in memory with name: no-name-f3b791be-1a05-4f3d-b0ef-5c173c7e5e7f
[I 2024-02-06 20:11:31,726] Trial 0 finished with value: 0.9146421536441235 and parameters: {'learning_rate': 0.0249816047538945, 'n_estimators': 591, 'lambda_l1': 0.01231993941811405, 'lambda_l2': 0.04394633936788146, 'max_depth': 7, 'colsample_bytree': 0.3935967122017216, 'subsample': 0.8116167224336399, 'min_child_samples': 45}. Best is trial 0 with value: 0.9146421536441235.
[I 2024-02-06 20:12:05,684] Trial 1 finished with value: 0.9146421536441235 and parameters: {'learning_rate': 0.034044600469728355, 'n_estimators': 542, 'lambda_l1': 0.005205844942958024, 'lambda_l2': 0.05879639408647977, 'max_depth': 13, 'colsample_bytree': 0.4274034664069657, 'subsample': 0.8363649934414201, 'min_child_samples': 17}. Best is trial 0 with value: 0.9146421536441235.
[I 2024-02-06 20:12:33,622] Trial 2 finished with value: 0.9150798861895382 and parameters: {'learning_rate': 0.022169

{'learning_rate': 0.012054059212586784, 'n_estimators': 460, 'lambda_l1': 0.009811018111847463, 'lambda_l2': 0.0577394472782044, 'max_depth': 9, 'colsample_bytree': 0.5132533308618275, 'subsample': 0.9532335850528303, 'min_child_samples': 23}


In [23]:
best_params = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": 7,
    "learning_rate": 0.012054059212586784,
    "n_estimators": 460,
    "lambda_l1": 0.009811018111847463,
    "lambda_l2": 0.0577394472782044,
    "max_depth": 9,
    "colsample_bytree": 0.5132533308618275,
    "subsample": 0.9532335850528303,
    "min_child_samples": 23,
}

In [24]:
lgbm_classifier = LGBMClassifier(**best_params)

lgbm_classifier.fit(X_train, y_train)

y_pred = lgbm_classifier.predict(X_test)
accuracy_score(y_test, y_pred) 

0.9179251477347341

In [25]:
submission_df['NObeyesdad'] = lgbm_classifier.predict(test)
submission_df

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Overweight_Level_I
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [26]:
submission_df.to_csv('Dataset/MultiClassPredictionObesityRisk/Prediction/20240206_FineTuned_LGB_submission.csv', index=False)