In [174]:
# Importing Necessary Libaries
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score

In [175]:
# Reading data in
x = pd.read_csv('train.csv')
xtest = pd.read_csv('test.csv')

In [176]:
# Setting index to be id
x.set_index('id', inplace=True)
xtest.set_index('id', inplace=True)

In [177]:
# Splitting data into tables of x and y variables
y = x['NObeyesdad']
x = x.drop('NObeyesdad', axis=1)

In [178]:
y

id
0        Overweight_Level_II
1              Normal_Weight
2        Insufficient_Weight
3           Obesity_Type_III
4        Overweight_Level_II
                ...         
20753        Obesity_Type_II
20754    Insufficient_Weight
20755        Obesity_Type_II
20756    Overweight_Level_II
20757        Obesity_Type_II
Name: NObeyesdad, Length: 20758, dtype: object

In [179]:
# NOTE - 7 categories
print(pd.unique(y))

['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']


In [180]:
# one hot encoding
# changing categorical variables to binary
x['Female'] = np.where(x['Gender'] == 'Female', 1, 0)
x['Male'] = np.where(x['Gender'] == 'Male', 1, 0)
x = x.drop(['Gender'], axis=1)

# ensuring that test data will be the same as the training data
xtest['Female'] = np.where(xtest['Gender'] == 'Female', 1, 0)
xtest['Male'] = np.where(xtest['Gender'] == 'Male', 1, 0)
xtest = xtest.drop(['Gender'], axis=1)

In [181]:
# changing categorical variables to binary
x['Family_History_Overweight'] = np.where(x['family_history_with_overweight'] == 'yes', 1, 0)
x['No_Family_History_with_Overweight'] = np.where(x['family_history_with_overweight'] == 'no', 1, 0)
x = x.drop(['family_history_with_overweight'], axis=1)

# ensuring that test data will be the same as the training data
xtest['Family_History_Overweight'] = np.where(xtest['family_history_with_overweight'] == 'yes', 1, 0)
xtest['No_Family_History_with_Overweight'] = np.where(xtest['family_history_with_overweight'] == 'no', 1, 0)
xtest = xtest.drop(['family_history_with_overweight'], axis=1)

In [182]:
# FAVC - Frequent Consumption of High-Caloric Food
x['YesFAVC'] = np.where(x['FAVC'] == 'yes', 1, 0)
x['NoFAVC'] = np.where(x['FAVC'] == 'no', 1, 0)
x = x.drop(['FAVC'], axis=1)

# ensuring that test data will be the same as the training data
xtest['YesFAVC'] = np.where(xtest['FAVC'] == 'yes', 1, 0)
xtest['NoFAVC'] = np.where(xtest['FAVC'] == 'no', 1, 0)
xtest = xtest.drop(['FAVC'], axis=1)

In [183]:
# SMOKE - changing categorical variables to binary
x['SMOKE'] = np.where(x['SMOKE'] == 'yes', 1, 0)

# ensuring that test data will be the same as the training data
xtest['SMOKE'] = np.where(xtest['SMOKE'] == 'yes', 1, 0)

In [184]:
# SCC - Squamous Cell Carcinoma
x['SCC'] = np.where(x['SCC'] == 'yes', 1, 0)

# ensuring that test data will be the same as the training data
xtest['SCC'] = np.where(xtest['SCC'] == 'yes', 1, 0)

In [185]:
# CAEC - Cardiovascular Adiposity and Endocrine Complications
# CALC - refers to the Body Mass Index (BMI) Calculator

Response_code = {
    'no': 0,
    'Sometimes': 1,
    'Frequently': 2,
    'Always': 3,
}

x['CAEC'] = x['CAEC'].map(Response_code)
x['CALC'] = x['CALC'].map(Response_code)
xtest['CAEC'] = xtest['CAEC'].map(Response_code)
xtest['CALC'] = xtest['CALC'].map(Response_code)
# print(pd.unique(x['CAEC']))
# print(pd.unique(x['CALC']))

In [186]:
# MTRANS - mode of transportation
# categorized transportation to make it usable

Transportation_code = {
    'Public_Transportation': 1,
    'Automobile': 2,
    'Walking': 3,
    'Motorbike': 4,
    'Bike': 5
}

x['MTRANS'] = x['MTRANS'].map(Transportation_code)
xtest['MTRANS'] = xtest['MTRANS'].map(Transportation_code)
# print(pd.unique(x['MTRANS']))

In [187]:
x

Unnamed: 0_level_0,Age,Height,Weight,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Female,Male,Family_History_Overweight,No_Family_History_with_Overweight,YesFAVC,NoFAVC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,24.443011,1.699998,81.669950,2.000000,2.983297,1,0,2.763573,0,0.000000,0.976473,1,1,0,1,1,0,1,0
1,18.000000,1.560000,57.000000,2.000000,3.000000,2,0,2.000000,0,1.000000,1.000000,0,2,1,0,1,0,1,0
2,18.000000,1.711460,50.165754,1.880534,1.411685,1,0,1.910378,0,0.866045,1.673584,0,1,1,0,1,0,1,0
3,20.952737,1.710730,131.274851,3.000000,3.000000,1,0,1.674061,0,1.467863,0.780199,1,1,1,0,1,0,1,0
4,31.641081,1.914186,93.798055,2.679664,1.971472,1,0,1.979848,0,1.967973,0.931721,1,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,25.137087,1.766626,114.187096,2.919584,3.000000,1,0,2.151809,0,1.330519,0.196680,1,1,0,1,1,0,1,0
20754,18.000000,1.710000,50.000000,3.000000,4.000000,2,0,1.000000,0,2.000000,1.000000,1,1,0,1,0,1,1,0
20755,20.101026,1.819557,105.580491,2.407817,3.000000,1,0,2.000000,0,1.158040,1.198439,0,1,0,1,1,0,1,0
20756,33.852953,1.700000,83.520113,2.671238,1.971472,1,0,2.144838,0,0.000000,0.973834,0,2,0,1,1,0,1,0


In [188]:
# NOTE - 7 categories
print(pd.unique(y))

['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']


In [189]:
# mapping it so it can be used 
Weight_code = {
    'Insufficient_Weight': 1,
    'Normal_Weight': 2,
    'Overweight_Level_I': 3,
    'Overweight_Level_II': 4,
    'Obesity_Type_I': 5,
    'Obesity_Type_II': 6,
    'Obesity_Type_III': 7
}

y = y.map(Weight_code)

In [190]:
y

id
0        4
1        2
2        1
3        7
4        4
        ..
20753    6
20754    1
20755    6
20756    4
20757    6
Name: NObeyesdad, Length: 20758, dtype: int64

In [191]:
x

Unnamed: 0_level_0,Age,Height,Weight,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Female,Male,Family_History_Overweight,No_Family_History_with_Overweight,YesFAVC,NoFAVC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,24.443011,1.699998,81.669950,2.000000,2.983297,1,0,2.763573,0,0.000000,0.976473,1,1,0,1,1,0,1,0
1,18.000000,1.560000,57.000000,2.000000,3.000000,2,0,2.000000,0,1.000000,1.000000,0,2,1,0,1,0,1,0
2,18.000000,1.711460,50.165754,1.880534,1.411685,1,0,1.910378,0,0.866045,1.673584,0,1,1,0,1,0,1,0
3,20.952737,1.710730,131.274851,3.000000,3.000000,1,0,1.674061,0,1.467863,0.780199,1,1,1,0,1,0,1,0
4,31.641081,1.914186,93.798055,2.679664,1.971472,1,0,1.979848,0,1.967973,0.931721,1,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,25.137087,1.766626,114.187096,2.919584,3.000000,1,0,2.151809,0,1.330519,0.196680,1,1,0,1,1,0,1,0
20754,18.000000,1.710000,50.000000,3.000000,4.000000,2,0,1.000000,0,2.000000,1.000000,1,1,0,1,0,1,1,0
20755,20.101026,1.819557,105.580491,2.407817,3.000000,1,0,2.000000,0,1.158040,1.198439,0,1,0,1,1,0,1,0
20756,33.852953,1.700000,83.520113,2.671238,1.971472,1,0,2.144838,0,0.000000,0.973834,0,2,0,1,1,0,1,0


In [192]:
# Checking correlation of sex to obesity risk
Female_matrix = np.corrcoef(x['Female'], y)
print("Female_matrix\n", Female_matrix) #NOTE: strong correlation to survival rate
Male_matrix = np.corrcoef(x['Male'], y)
print("\nMale_matrix\n", Male_matrix) #NOTE: negative correlation to surival rate - shouldn't have both variables in model also due to multicollinearity
x = x.drop('Male', axis=1)
xtest = xtest.drop('Male', axis=1)

Female_matrix
 [[1.         0.06769902]
 [0.06769902 1.        ]]

Male_matrix
 [[ 1.         -0.06769902]
 [-0.06769902  1.        ]]


In [193]:
# Checking correlation of family history to obesity risk
FamilyHistory_matrix = np.corrcoef(x['Family_History_Overweight'], y)
print("Family_History_Overweight_matrix\n", FamilyHistory_matrix) #NOTE: Super strong correlation
NoFamilyHistory_matrix = np.corrcoef(x['No_Family_History_with_Overweight'], y)
print("\nNo_Family_History_with_Overweight\n", NoFamilyHistory_matrix) #NOTE: Negative correlation
x = x.drop('No_Family_History_with_Overweight', axis=1) #dropping variable so there won't be multicollinearity
xtest = xtest.drop('No_Family_History_with_Overweight', axis=1)

Family_History_Overweight_matrix
 [[1.         0.52202562]
 [0.52202562 1.        ]]

No_Family_History_with_Overweight
 [[ 1.         -0.52202562]
 [-0.52202562  1.        ]]


In [194]:
# Checking correlation of FAVC to obesity risk
YesFAVC_matrix = np.corrcoef(x['YesFAVC'], y)
print("YesFAVC\n", YesFAVC_matrix) #NOTE: Strong correlation
NoFAVC_matrix = np.corrcoef(x['NoFAVC'], y)
print("\nNoFAVC\n", NoFAVC_matrix) #NOTE: Negative correlation
x = x.drop('NoFAVC', axis=1)
xtest = xtest.drop('NoFAVC', axis=1)

YesFAVC
 [[1.         0.21540067]
 [0.21540067 1.        ]]

NoFAVC
 [[ 1.         -0.21540067]
 [-0.21540067  1.        ]]


In [195]:
# Checking correlation of binary/categorical variables to obesity risk
SCC_matrix = np.corrcoef(x['SCC'], y)
print("SCC\n", SCC_matrix) #NOTE: negative correlation to obesity risk
SMOKE_matrix = np.corrcoef(x['SMOKE'], y)
print("\nSMOKE\n", SMOKE_matrix) #NOTE: little correlation
CAEC_matrix = np.corrcoef(x['CAEC'], y)
print("\nCAEC\n", CAEC_matrix) #NOTE: strong negative correlation
CALC_matrix = np.corrcoef(x['CALC'], y)
print("\nCALC\n", CALC_matrix) #NOTE: decent correlation
MTRANS_matrix = np.corrcoef(x['MTRANS'], y)
print("\nMTRANS\n", MTRANS_matrix) #NOTE: decent negative correlation

SCC
 [[ 1.         -0.18495934]
 [-0.18495934  1.        ]]

SMOKE
 [[1.         0.01350135]
 [0.01350135 1.        ]]

CAEC
 [[ 1.         -0.36405174]
 [-0.36405174  1.        ]]

CALC
 [[1.         0.18936104]
 [0.18936104 1.        ]]

MTRANS
 [[ 1.         -0.09693909]
 [-0.09693909  1.        ]]


In [196]:
# Checking correlation of continuous variables to obesity risk
Age_matrix = np.corrcoef(x['Age'], y)
print("Age\n", Age_matrix) #NOTE: Strong correlation
Height_matrix = np.corrcoef(x['Height'], y)
print("\nHeight\n", Height_matrix) #NOTE: decent correlation
Weight_matrix = np.corrcoef(x['Weight'], y)
print("\nWeight\n", Weight_matrix) #NOTE: Super strong correlation
FCVC_matrix = np.corrcoef(x['FCVC'], y)
print("\nFCVC\n", FCVC_matrix) #NOTE: strong correlation
NCP_matrix = np.corrcoef(x['NCP'], y)
print("\nNCP\n", NCP_matrix) #NOTE: little correlation
CH2O_matrix = np.corrcoef(x['CH2O'], y)
print("\nCH2O\n", CH2O_matrix) #NOTE: strong correlation
FAF_matrix = np.corrcoef(x['FAF'], y)
print("\nFAF\n", FAF_matrix) #NOTE: strong negative correlation
TUE_matrix = np.corrcoef(x['TUE'], y)
print("\nTUE\n", TUE_matrix) #NOTE: negative correlation

Age
 [[1.         0.35621105]
 [0.35621105 1.        ]]

Height
 [[1.         0.15014108]
 [0.15014108 1.        ]]

Weight
 [[1.        0.9212503]
 [0.9212503 1.       ]]

FCVC
 [[1.         0.27293289]
 [0.27293289 1.        ]]

NCP
 [[1.         0.02722691]
 [0.02722691 1.        ]]

CH2O
 [[1.         0.27315371]
 [0.27315371 1.        ]]

FAF
 [[ 1.         -0.23474454]
 [-0.23474454  1.        ]]

TUE
 [[ 1.         -0.12100862]
 [-0.12100862  1.        ]]


In [197]:
# split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=2)

In [198]:
# Create Model
# Decision trees - good for numerical and categorical data
model = DecisionTreeClassifier(max_depth=5)

In [199]:
model.fit(x_train, y_train)

In [200]:
# from sklearn.neighbors import KNeighborsClassifier 
# knn = KNeighborsClassifier(n_neighbors = 7)

In [201]:
# R2 - shows I have a good model and since validation set also scores highly model is performing well
y_val_pred = model.predict(x_val)
r2 = r2_score(y_val, y_val_pred)
print(f"R² score on the validation set: {r2:.2f}")

R² score on the validation set: 0.95


In [202]:
# Cross-Validating data
cv = StratifiedKFold(n_splits=5) # ensures data is distributed evenly
val_score = cross_val_score(model, x_val, y_val, cv=cv, scoring='accuracy') # estimates how well model will perform on the test data
print(f'Validation Accuracy: {val_score.mean()}')

Validation Accuracy: 0.8258695431545677


In [203]:
# officially predicting ypred
ypred = model.predict(xtest)

In [204]:
# mapping it so it can be used 
ypred_series = pd.Series(ypred)

Weight_code_reverse = {
    1: 'Insufficient_Weight',
    2: 'Normal_Weight',
    3: 'Overweight_Level_I',
    4: 'Overweight_Level_II',
    5: 'Obesity_Type_I',
    6: 'Obesity_Type_II',
    7: 'Obesity_Type_III'
}

ypred_mapped = ypred_series.map(Weight_code_reverse)

In [205]:
# back to a numpy array so it can be submitted
ypred2 = np.array(ypred_mapped)

In [207]:
# writing new df for submission
submission = pd.DataFrame({'NObeyesdad': ypred2}, index=xtest.index)

In [209]:
# writing to csv to submit
submission.to_csv('submission.csv', index=True)