In [36]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error,make_scorer

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [37]:
zf = zipfile.ZipFile("playground-series-s4e2.zip")
zf.namelist()

['sample_submission.csv', 'test.csv', 'train.csv']

In [38]:
df_train = pd.read_csv(zf.open("train.csv"))
df_test = pd.read_csv(zf.open("test.csv"))

In [39]:
#Checking to see if any rows/columns are missing any data
df_train

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Automobile,Normal_Weight
2,2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


In [40]:
df_train['BMI'] = df_train.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
df_train['BMI'] = df_train['BMI'].round(2)

In [41]:
df_train = df_train.drop(['MTRANS'], axis=1)
df_test = df_test.drop(['MTRANS'], axis=1)

In [42]:
df_train

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,NObeyesdad,BMI
0,0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Overweight_Level_II,28.26
1,1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Normal_Weight,23.42
2,2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Insufficient_Weight,17.13
3,3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Obesity_Type_III,44.86
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Overweight_Level_II,25.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Obesity_Type_II,36.59
20754,20754,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Insufficient_Weight,17.10
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Obesity_Type_II,31.89
20756,20756,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Overweight_Level_II,28.90


In [43]:

df_dummy_train = df_train.drop(['NObeyesdad'], axis = 1)

df_dummy_train = pd.get_dummies(df_dummy_train, drop_first = True, dtype = float)

df_dummy_train

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,BMI,Gender_Male,family_history_with_overweight_yes,FAVC_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_yes,SCC_yes,CALC_Sometimes,CALC_no
0,0,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473,28.26,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000,23.42,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,17.13,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,3,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199,44.86,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,25.60,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,25.137087,1.766626,114.187096,2.919584,3.000000,2.151809,1.330519,0.196680,36.59,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
20754,20754,18.000000,1.710000,50.000000,3.000000,4.000000,1.000000,2.000000,1.000000,17.10,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
20755,20755,20.101026,1.819557,105.580491,2.407817,3.000000,2.000000,1.158040,1.198439,31.89,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
20756,20756,33.852953,1.700000,83.520113,2.671238,1.971472,2.144838,0.000000,0.973834,28.90,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [52]:
y = df_train['NObeyesdad']

dummy_y = pd.get_dummies(y, dtype=float)


x = df_dummy_train.drop(['id'], axis = 1)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(x, dummy_y, test_size = 0.2, random_state= 25 )

In [46]:
seed=2
def objective(params):
    est=int(params['n_estimators'])
    md=int(params['max_depth'])
    msl=int(params['min_samples_leaf'])
    mss=int(params['min_samples_split'])
    model=RandomForestClassifier(n_estimators=est,max_depth=md,min_samples_leaf=msl,min_samples_split=mss)
    model.fit(X_train,y_train)
    pred=model.predict(X_test)
    score=mean_squared_error(y_test,pred)
    return score

def optimize(trial):
    params={'n_estimators':hp.uniform('n_estimators',100,500),
           'max_depth':hp.uniform('max_depth',5,20),
           'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
           'min_samples_split':hp.uniform('min_samples_split',2,6)}
    best=fmin(fn=objective,space=params,algo=tpe.suggest,trials=trial,max_evals=10)
    return best

In [47]:
trials = Trials()

best_hyperparams = optimize(trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

100%|██████████| 10/10 [01:53<00:00, 11.31s/trial, best loss: 0.028488852188274152]
The best hyperparameters are :  

{'max_depth': 15.655406198532715, 'min_samples_leaf': 1.128275612795782, 'min_samples_split': 4.662425552904596, 'n_estimators': 311.20376424775566}


In [48]:
rf_model = RandomForestClassifier(max_depth = 14 ,
                               min_samples_leaf = 1,
                               min_samples_split = 3,
                               n_estimators =235,
                                random_state=25 )


rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)



print(mean_squared_error(y_test, y_pred,squared=False))
print(rf_model.score(X_train,y_train))

0.15618327411370245
0.9531494640491389


In [49]:
#Check feature Importance
importances = rf_model.feature_importances_
columns = x.columns
i = 0

while i< len(columns):
    print (f" The importance of feature '{columns[i]}' is {round(importances[i]*100, 2)}%.")
    i += 1

 The importance of feature 'Age' is 7.07%.
 The importance of feature 'Height' is 6.07%.
 The importance of feature 'Weight' is 24.89%.
 The importance of feature 'FCVC' is 7.03%.
 The importance of feature 'NCP' is 1.99%.
 The importance of feature 'CH2O' is 2.81%.
 The importance of feature 'FAF' is 2.16%.
 The importance of feature 'TUE' is 2.51%.
 The importance of feature 'BMI' is 32.65%.
 The importance of feature 'Gender_Male' is 5.45%.
 The importance of feature 'family_history_with_overweight_yes' is 2.02%.
 The importance of feature 'FAVC_yes' is 0.75%.
 The importance of feature 'CAEC_Frequently' is 0.97%.
 The importance of feature 'CAEC_Sometimes' is 0.93%.
 The importance of feature 'CAEC_no' is 0.25%.
 The importance of feature 'SMOKE_yes' is 0.07%.
 The importance of feature 'SCC_yes' is 0.29%.
 The importance of feature 'CALC_Sometimes' is 1.1%.
 The importance of feature 'CALC_no' is 0.99%.


In [50]:

df_test['BMI'] = df_test.apply(lambda x: x['Weight']/ np.square(x['Height']), axis = 1)
x_test = pd.get_dummies(df_test, drop_first = True, dtype = float)

x_test = x_test.drop(['id','CALC_Frequently'], axis=1)

predictions = rf_model.predict(x_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- CALC_Frequently


In [None]:
dummy_y

Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
20753,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20754,1.0,0.0,0.0,0.0,0.0,0.0,0.0
20755,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20756,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
predictions

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
result_labels = dummy_y.columns[np.argmax(predictions, axis=1)]
result_column = pd.Series(result_labels, name='Result')
result_df = result_column
result_df

0            Obesity_Type_II
1         Overweight_Level_I
2           Obesity_Type_III
3             Obesity_Type_I
4           Obesity_Type_III
                ...         
13835    Overweight_Level_II
13836          Normal_Weight
13837    Insufficient_Weight
13838          Normal_Weight
13839        Obesity_Type_II
Name: Result, Length: 13840, dtype: object

In [None]:
df_submission = pd.DataFrame({
    'id': df_test.id,
    'NObeyesdad': result_df
})
df_submission

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Normal_Weight
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight


In [35]:
df_submission.to_csv("6thSubmission_RF_hypertuned_noMTRANS.csv", index= False)