# LightGBM Baseline and fine-tuned model

In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Single place to change when the dataset folder moves; the three CSV paths
# below resolve to exactly the same strings as before.
DATA_DIR = "Dataset/MultiClassPredictionObesityRisk"

# Labelled training data, unlabelled test data and the submission template.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# Quick sanity check on the number of rows/columns that were loaded.
train_df.shape, test_df.shape

((20758, 18), (13840, 17))

In [3]:
# Target column to predict.
y = train_df['NObeyesdad']
# Feature matrix: everything except the row identifier and the target.
X = train_df.drop(columns=['id', 'NObeyesdad'])

In [4]:
# One-hot encode the nominal categorical features; columns that are not
# listed here are carried over to X_encoded as they are.
onehot_columns = [
    'MTRANS',
    'SCC',
    'SMOKE',
    'CAEC',
    'FAVC',
    'family_history_with_overweight',
    'Gender',
]
X_encoded = pd.get_dummies(X, columns=onehot_columns)

X_encoded.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,CALC,MTRANS_Automobile,...,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,FAVC_no,FAVC_yes,family_history_with_overweight_no,family_history_with_overweight_yes,Gender_Female,Gender_Male
0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,Sometimes,False,...,False,False,True,False,False,True,False,True,False,True
1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,no,True,...,False,True,False,False,False,True,False,True,True,False
2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,no,False,...,False,False,True,False,False,True,False,True,True,False
3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,Sometimes,False,...,False,False,True,False,False,True,False,True,True,False
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,Sometimes,False,...,False,False,True,False,False,True,False,True,False,True


In [5]:
# Drop the row identifier from the test features. errors='ignore' keeps this
# cell safe to re-run after 'id' has already been removed from test_df.
test_df = test_df.drop(columns=['id'], errors='ignore')

# One-hot encode the same categorical columns that were encoded for training.
X_test_encoded = pd.get_dummies(test_df, columns=['MTRANS',
                                                  'SCC',
                                                  'SMOKE',
                                                  'CAEC',
                                                  'FAVC',
                                                  'family_history_with_overweight',
                                                  'Gender'])

# get_dummies only creates indicator columns for categories present in the
# frame it is given, so train and test can end up with different columns.
# Align the test frame to the training layout: same columns, same order,
# indicators missing from test filled with False. Indicators that exist only
# in test are dropped because the model is never trained on them.
X_test_encoded = X_test_encoded.reindex(columns=X_encoded.columns, fill_value=False)

X_test_encoded.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,CALC,MTRANS_Automobile,...,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,FAVC_no,FAVC_yes,family_history_with_overweight_no,family_history_with_overweight_yes,Gender_Female,Gender_Male
0,26.899886,1.848294,120.644178,2.938616,3.0,2.825629,0.8554,0.0,Sometimes,False,...,False,False,True,False,False,True,False,True,False,True
1,21.0,1.6,66.0,2.0,1.0,3.0,1.0,0.0,Sometimes,False,...,False,False,True,False,False,True,False,True,True,False
2,26.0,1.643355,111.600553,3.0,3.0,2.621877,0.0,0.250502,Sometimes,False,...,False,False,True,False,False,True,False,True,True,False
3,20.979254,1.553127,103.669116,2.0,2.977909,2.786417,0.094851,0.0,Sometimes,False,...,False,False,True,False,False,True,False,True,False,True
4,26.0,1.627396,104.835346,3.0,3.0,2.653531,0.0,0.741069,Sometimes,False,...,False,False,True,False,False,True,False,True,True,False


In [6]:
# Integer-encode CALC with ONE mapping shared by the train and test frames.
# Calling fit_transform separately on each frame derives the integer codes
# from that frame's own sorted label set, so the same label can receive
# different codes whenever the two frames do not contain exactly the same
# CALC categories. Fitting once on the combined labels avoids that, and also
# avoids an "unseen label" error if test holds a category that train lacks.
le = LabelEncoder()
le.fit(pd.concat([X_encoded.CALC, X_test_encoded.CALC], ignore_index=True))

X_encoded.CALC = le.transform(X_encoded.CALC)
X_test_encoded.CALC = le.transform(X_test_encoded.CALC)

In [7]:
# Hold out part of the labelled data for validation. The split is stratified
# on the target and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, stratify=y, random_state=42
)

In [8]:
# Baseline: LightGBM with default hyperparameters, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
base_model = LGBMClassifier().fit(X_train, y_train)

# Predictions on the held-out validation split (scored in the next cell).
y_pred = base_model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2063
[LightGBM] [Info] Number of data points in the train set: 15568, number of used features: 28
[LightGBM] [Info] Start training from score -2.107583
[LightGBM] [Info] Start training from score -1.907537
[LightGBM] [Info] Start training from score -1.964517
[LightGBM] [Info] Start training from score -1.854860
[LightGBM] [Info] Start training from score -1.635336
[LightGBM] [Info] Start training from score -2.146381
[LightGBM] [Info] Start training from score -2.107583


In [9]:
# Baseline accuracy on the held-out validation split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9090558766859345

In [10]:
# Baseline predictions for the competition test set.
# NOTE: this rebinds y_pred, which previously held the validation predictions.
y_pred = base_model.predict(X=X_test_encoded)
y_pred

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [11]:
# Put the predicted classes into the submission template's target column.
sample_submission_df = sample_submission_df.assign(NObeyesdad=y_pred)
sample_submission_df

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


# Hyperparameter tuning

In [11]:
def objective(trial):
    """
    Optuna objective: validation accuracy of one LightGBM configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``param``.

    Returns
    -------
    float
        Accuracy of the fitted classifier on ``(X_test, y_test)``. Higher is
        better, so the study running this objective must be created with
        ``direction="maximize"``.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
    notebook's global scope.
    """
    param = {
        # Fixed settings shared by every trial.
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # Derived from the training labels instead of being hard-coded to 7.
        "num_class": y_train.nunique(),
        # Search space: regularisation strengths are sampled on a log scale.
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = LGBMClassifier(**param)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

In [None]:
import optuna
from optuna.samplers import TPESampler

# Seeded TPE sampler so the sequence of suggested hyperparameters is repeatable.
sampler = TPESampler(seed=1)

# The objective returns accuracy, hence direction="maximize".
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    study_name="lightgbm",
)
study.optimize(objective, n_trials=200)

In [None]:
# Get the best parameters
print('Best parameters: ', study.best_params)
'''
Best parameters:  {'lambda_l1': 1.2783107769804225e-07, 'lambda_l2': 4.68831195101335, 'num_leaves': 23, 'feature_fraction': 0.9494806730670128, 'bagging_fraction': 0.8552888263673606, 'bagging_freq': 7, 'min_child_samples': 10}
'''

In [None]:
# Best objective value (validation accuracy) found by the study.
print('Best value: ', study.best_value, sep=' ')
# Recorded from an earlier run -> Best value:  0.9140655105973025

In [None]:
# Rebuild a classifier from the best hyperparameters found by the study and
# fit it on the training split.
tuned_params = study.best_params
fine_tuned_model = LGBMClassifier(**tuned_params)
fine_tuned_model.fit(X_train, y_train)

In [None]:
# Tuned-model predictions for the competition test set (rebinds y_pred).
y_pred = fine_tuned_model.predict(X=X_test_encoded)

In [None]:
# Write the tuned model's predictions to a submission CSV.
SUBMISSION_PATH = 'Dataset/MultiClassPredictionObesityRisk/Prediction/20240204_FineTuned_LGB_submission.csv'

sample_submission_df['NObeyesdad'] = y_pred

# to_csv does not create missing parent directories, so make sure the output
# folder exists; exist_ok=True makes this a no-op when it is already there.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
sample_submission_df.to_csv(SUBMISSION_PATH, index=False)