In [2]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error,make_scorer


import lightgbm as lgb

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [3]:
zf = zipfile.ZipFile("playground-series-s4e2.zip")
df_train = pd.read_csv(zf.open("train.csv"))
df_test = pd.read_csv(zf.open("test.csv"))

In [4]:
df_train['BMI'] = df_train.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_train['BMI'] = df_train['BMI'].round(2)
df_test['BMI'] = df_test.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_test['BMI'] = df_test['BMI'].round(2)
df_train = df_train.drop(['MTRANS'], axis=1)
df_test = df_test.drop(['MTRANS'], axis=1)

In [5]:
df_dummy_train = df_train.drop(['NObeyesdad'], axis = 1)
df_dummy_train = pd.get_dummies(df_dummy_train, drop_first = True, dtype = float)

y = df_train['NObeyesdad']
dummy_y = pd.get_dummies(y, dtype=float)
x = df_dummy_train.drop(['id'], axis = 1)

x_test = pd.get_dummies(df_test, drop_first = True, dtype = float)
x_test = x_test.drop(['id'], axis=1)

In [6]:
dummy_y_one_column = np.argmax(dummy_y.values, axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, dummy_y_one_column, test_size = 0.2, random_state= 25 )

In [7]:
train_data = lgb.Dataset(X_train,label=y_train)
valid_data = lgb.Dataset(X_test, label=y_train, reference=train_data)

In [8]:
# Define hyperparameters
params = {
    "objective": "multiclass",
    "boosting_type": "rf",
    "num_classes": 7,
    "num_leaves": 5,
    "force_row_wise": True,
    "learning_rate": 0.5,
    "metric": "multi_logloss",
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8
}
# Train the LightGBM model
num_round = 500
bst = lgb.train(params, train_data, num_round, valid_sets=[valid_data])

[LightGBM] [Info] Total Bins 2297
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 19
[LightGBM] [Info] Start training from score -2.124649
[LightGBM] [Info] Start training from score -1.899491
[LightGBM] [Info] Start training from score -1.953223
[LightGBM] [Info] Start training from score -1.860425
[LightGBM] [Info] Start training from score -1.638521
[LightGBM] [Info] Start training from score -2.146046
[LightGBM] [Info] Start training from score -2.101728


LightGBMError: Length of labels differs from the length of #data

In [None]:
x_test = pd.get_dummies(df_test, drop_first = True, dtype = float)
x_test = x_test.drop(['id'], axis=1)

In [99]:
dummy_y

Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
20753,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20754,1.0,0.0,0.0,0.0,0.0,0.0,0.0
20755,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20756,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [100]:
predictions

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [101]:
result_labels = dummy_y.columns[np.argmax(predictions, axis=1)]
result_column = pd.Series(result_labels, name='Result')
result_df = result_column
result_df

0            Obesity_Type_II
1         Overweight_Level_I
2           Obesity_Type_III
3             Obesity_Type_I
4           Obesity_Type_III
                ...         
13835    Overweight_Level_II
13836          Normal_Weight
13837    Insufficient_Weight
13838          Normal_Weight
13839        Obesity_Type_II
Name: Result, Length: 13840, dtype: object

In [102]:
df_submission = pd.DataFrame({
    'id': df_test.id,
    'NObeyesdad': result_df
})
df_submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [103]:
df_submission.to_csv("4thSubmission_RF_noMTRANS_CALC_SMOKE_CAEC.csv", index= False)