In [None]:
import zipfile
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [62]:
# Open the competition archive once; `zf` is kept open because a later cell
# reads from it again.
zf = zipfile.ZipFile("playground-series-s4e2.zip")

# Read each member through a context manager so the per-member file handle
# is closed as soon as pandas has finished parsing it.
with zf.open("train.csv") as train_csv:
    df_train = pd.read_csv(train_csv)
with zf.open("test.csv") as test_csv:
    df_test = pd.read_csv(test_csv)

In [63]:
def add_bmi(df):
    """Return a new frame with a ``BMI`` column replacing ``Height``/``Weight``.

    BMI is computed as ``Weight / Height**2`` and rounded to 2 decimals.
    (Presumably Weight is in kg and Height in m -- confirm against the data
    description.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Height`` and ``Weight`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``BMI`` appended as the last column and the
        ``Height`` and ``Weight`` columns removed. ``df`` is not modified.
    """
    # Column-wise arithmetic gives the same values as a row-wise apply,
    # without a Python-level call per row.
    bmi = (df['Weight'] / np.square(df['Height'])).round(2)
    return df.assign(BMI=bmi).drop(columns=['Height', 'Weight'])


# Apply the identical transformation to both data sets.
df_train = add_bmi(df_train)
df_test = add_bmi(df_test)

In [64]:
# Preview the first five rows of the training frame after the BMI step.
df_train.head()

Unnamed: 0,id,Gender,Age,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,0,Male,24.443011,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II,28.26
1,1,Female,18.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight,23.42
2,2,Female,18.0,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight,17.13
3,3,Female,20.952737,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III,44.86
4,4,Male,31.641081,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II,25.6


In [65]:
# One-hot encode the categorical predictors (first level of each category is
# dropped); the target column is removed first and encoded separately.
df_dummy_train = pd.get_dummies(
    df_train.drop(columns='NObeyesdad'),
    drop_first=True,
    dtype=float,
)

# Target: keep the raw labels, a one-hot version, and the feature matrix
# without the row identifier.
y = df_train['NObeyesdad']
dummy_y = pd.get_dummies(y, dtype=float)
x = df_dummy_train.drop(columns='id')

print(dummy_y)

# Encode the competition test set with the same options.
x_test = pd.get_dummies(df_test, drop_first=True, dtype=float).drop(columns='id')

# Collapse the one-hot target into a single integer column: the position of
# the hot column in `dummy_y` becomes the class code.
dummy_y_one_column = pd.DataFrame({'NObeyesdad': dummy_y.values.argmax(axis=1)})

       Insufficient_Weight  Normal_Weight  Obesity_Type_I  Obesity_Type_II  \
0                      0.0            0.0             0.0              0.0   
1                      0.0            1.0             0.0              0.0   
2                      1.0            0.0             0.0              0.0   
3                      0.0            0.0             0.0              0.0   
4                      0.0            0.0             0.0              0.0   
...                    ...            ...             ...              ...   
20753                  0.0            0.0             0.0              1.0   
20754                  1.0            0.0             0.0              0.0   
20755                  0.0            0.0             0.0              1.0   
20756                  0.0            0.0             0.0              0.0   
20757                  0.0            0.0             0.0              1.0   

       Obesity_Type_III  Overweight_Level_I  Overweight_Level_I

In [66]:
# Model inputs: id-free encoded features and the integer-coded target.
x = df_dummy_train.drop(columns='id')
y = dummy_y_one_column

In [67]:
# Stratified split with a fixed seed; the `*_test` halves act as the
# held-out set for the cells that follow.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    stratify=y,
)

In [68]:
# Hyperopt search space: six sampled hyperparameters followed by two fixed
# entries that are passed through unchanged.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

In [69]:
def objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it.

    Reads the module-level ``x_train``/``y_train`` (fit) and
    ``x_test``/``y_test`` (early stopping and scoring).

    Parameters
    ----------
    space : dict
        One sample from the search space. Expected keys: ``n_estimators``,
        ``max_depth``, ``gamma``, ``reg_alpha``, ``reg_lambda``,
        ``min_child_weight``, ``colsample_bytree`` and optionally ``seed``.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``; the accuracy is negated
        because hyperopt minimises the loss.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform samples arrive as floats; these parameters are cast to int.
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        reg_alpha=int(space['reg_alpha']),
        gamma=space['gamma'],
        # Previously sampled but never forwarded to the model.
        reg_lambda=space['reg_lambda'],
        # Must stay a float: int() truncated every sample from [0.5, 1) to 0,
        # which is outside the valid range for this parameter.
        colsample_bytree=space['colsample_bytree'],
        random_state=space.get('seed', 0),
        # Configured on the estimator (not in fit()) to match how the final
        # model in this notebook sets early stopping.
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    evaluation = [(x_train, y_train), (x_test, y_test)]
    clf.fit(x_train, y_train, eval_set=evaluation, verbose=False)

    # predict() already returns class labels; thresholding them at 0.5 would
    # collapse the multiclass labels to booleans and corrupt the score.
    pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Run 10 TPE evaluations of `objective` over `space`, recording every trial.
trials = Trials()
best_hyperparams = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=trials)

In [71]:
# Show the best sampled values returned by the search.
header = "The best hyperparameters are : "
print(header, "\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.87112473776474, 'gamma': 1.4864831414116386, 'max_depth': 13.0, 'min_child_weight': 9.0, 'reg_alpha': 128.0, 'reg_lambda': 0.7243653181703552}


In [72]:
# Final classifier configuration.
# NOTE(review): these values are hard-coded; the `best_hyperparams` returned
# by the hyperopt search are not used here -- confirm that is intentional.
xgb_params = dict(
    objective='multi:softprob',
    learning_rate=0.1,
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.65,
    colsample_bytree=0.6,
    reg_alpha=0.01,
    nthread=-1,
    random_state=42,
)
model = xgb.XGBClassifier(**xgb_params)


In [73]:
# Fit on the training split; the held-out split is the evaluation set that
# is logged each round and monitored for early stopping.
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=True)

[0]	validation_0-mlogloss:1.73637
[1]	validation_0-mlogloss:1.59157
[2]	validation_0-mlogloss:1.47407
[3]	validation_0-mlogloss:1.35169
[4]	validation_0-mlogloss:1.27477
[5]	validation_0-mlogloss:1.18408
[6]	validation_0-mlogloss:1.11356
[7]	validation_0-mlogloss:1.04130
[8]	validation_0-mlogloss:0.97116
[9]	validation_0-mlogloss:0.91488
[10]	validation_0-mlogloss:0.86380
[11]	validation_0-mlogloss:0.82625
[12]	validation_0-mlogloss:0.78934
[13]	validation_0-mlogloss:0.75232
[14]	validation_0-mlogloss:0.71855
[15]	validation_0-mlogloss:0.69129
[16]	validation_0-mlogloss:0.66780
[17]	validation_0-mlogloss:0.64836
[18]	validation_0-mlogloss:0.62909
[19]	validation_0-mlogloss:0.61184
[20]	validation_0-mlogloss:0.59552
[21]	validation_0-mlogloss:0.57832
[22]	validation_0-mlogloss:0.56319
[23]	validation_0-mlogloss:0.54809
[24]	validation_0-mlogloss:0.53231
[25]	validation_0-mlogloss:0.51852
[26]	validation_0-mlogloss:0.50529
[27]	validation_0-mlogloss:0.49401
[28]	validation_0-mlogloss:0.4

In [74]:
# Report each feature's importance score from the fitted model as a percentage.
importances = model.feature_importances_
columns = x.columns

for column_name, score in zip(columns, importances):
    print(f" The importance of feature '{column_name}' is {round(score*100, 2)}%.")

 The importance of feature 'Age' is 3.65%.
 The importance of feature 'FCVC' is 9.73%.
 The importance of feature 'NCP' is 2.91%.
 The importance of feature 'CH2O' is 3.6%.
 The importance of feature 'FAF' is 1.84%.
 The importance of feature 'TUE' is 3.52%.
 The importance of feature 'BMI' is 21.05%.
 The importance of feature 'Gender_Male' is 14.22%.
 The importance of feature 'family_history_with_overweight_yes' is 4.81%.
 The importance of feature 'FAVC_yes' is 3.06%.
 The importance of feature 'CAEC_Frequently' is 5.18%.
 The importance of feature 'CAEC_Sometimes' is 4.61%.
 The importance of feature 'CAEC_no' is 5.83%.
 The importance of feature 'SMOKE_yes' is 1.18%.
 The importance of feature 'SCC_yes' is 3.1%.
 The importance of feature 'CALC_Sometimes' is 3.87%.
 The importance of feature 'CALC_no' is 3.12%.
 The importance of feature 'MTRANS_Bike' is 0.0%.
 The importance of feature 'MTRANS_Motorbike' is 0.0%.
 The importance of feature 'MTRANS_Public_Transportation' is 3.11%

In [75]:
# Predict class labels for the held-out split produced by train_test_split.
y_pred = model.predict(x_test)

In [76]:
# Overall accuracy followed by the per-class precision/recall/F1 table for
# the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}', '\nClassification Report:', sep='\n')
print(classification_report(y_test, y_pred))

Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       631
           1       0.86      0.88      0.87       771
           2       0.86      0.86      0.86       727
           3       0.94      0.97      0.95       812
           4       1.00      1.00      1.00      1012
           5       0.79      0.75      0.77       607
           6       0.79      0.77      0.78       630

    accuracy                           0.89      5190
   macro avg       0.88      0.88      0.88      5190
weighted avg       0.89      0.89      0.89      5190



In [77]:
# Encode the competition test set with the same one-hot options used for the
# training features.
# NOTE: this rebinds `x_test`, which previously held the held-out split.
x_test = pd.get_dummies(df_test, drop_first=True, dtype=float)

# Align to exactly the columns (and column order) the model was trained on.
# Dummy columns that exist only in the test set are discarded, and any
# training column absent from the test set is added as all zeros. This
# replaces the hand-maintained list of columns to drop.
x_test = x_test.reindex(columns=x.columns, fill_value=0.0)

predictions = model.predict(x_test)

# Class names in the same order that was used to integer-encode the target
# (the argmax position over `dummy_y`'s columns), so code -> name stays in
# sync with the encoding instead of relying on a hand-typed list.
columns = list(dummy_y.columns)

# Mapping dictionary: integer class code -> class name
label_mapping = dict(enumerate(columns))

# Map the numeric labels back to their string class names
predictions_mapped = np.array([label_mapping[label] for label in predictions])

predictions_mapped


array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype='<U19')

In [78]:
# Re-read the raw test file (its `id` column is used to build the submission);
# the context manager closes the archive member handle after parsing.
with zf.open("test.csv") as test_csv:
    index = pd.read_csv(test_csv)

In [79]:
# Pair every test id with its predicted class name.
df_submission = index[['id']].assign(NObeyesdad=predictions_mapped)
df_submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [80]:
# Write the submission file without the DataFrame index column.
submission_path = 'XGBoost_Submission_#5Hypertuned_taken_NOHEIGHTWEIGHT.csv'
df_submission.to_csv(submission_path, index=False)