In [33]:
import random

import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC



In [2]:
# Download latest version
path = kagglehub.dataset_download("ruchikakumbhar/obesity-prediction")

print("Path to dataset files:", path)
df = pd.read_csv(path + "/Obesity Prediction.csv")
df.head()
original_df = df.copy()

Path to dataset files: /Users/hendrata/.cache/kagglehub/datasets/ruchikakumbhar/obesity-prediction/versions/1


In [3]:
df = original_df.copy()
print("df shape: ", df.shape)

# Data massaging:
# Gender: 0 = Female, 1 = Male
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})

# Family history, FAVC, SMOKE, SCC: 0 = no, 1 = yes
yes_no_columns = ['family_history', 'FAVC', 'SMOKE', 'SCC']
for col in yes_no_columns:
    df[col] = df[col].map({'no': 0, 'yes': 1})

# CAEC, CALC: 0 = no, 1 = Sometimes, 2 = Frequently, 3 = Always
frequency_columns = ['CAEC', 'CALC']
for col in frequency_columns:
    df[col] = df[col].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})

# MTRANS: 0 = Public_Transportation, 1 = Walking, 2 = Automobile, 3 = Motorbike, 4 = Bike
df['MTRANS'] = df['MTRANS'].map({'Public_Transportation': 0, 'Walking': 1, 'Automobile': 2, 'Motorbike': 3, 'Bike': 4})

# Obesity
df['Obesity'] = df['Obesity'].map({'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2,
                                   'Overweight_Level_II': 3, 'Obesity_Type_I': 4, 'Obesity_Type_II': 5,
                                   'Obesity_Type_III': 6})

# Now df should be all numeric
df.head()
#print(df['Obesity'].unique())

df shape:  (2111, 17)


Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,0,21.0,1.62,64.0,1,0,2.0,3.0,1,0,2.0,0,0.0,1.0,0,0,1
1,0,21.0,1.52,56.0,1,0,3.0,3.0,1,1,3.0,1,3.0,0.0,1,0,1
2,1,23.0,1.8,77.0,1,0,2.0,3.0,1,0,2.0,0,2.0,1.0,2,0,1
3,1,27.0,1.8,87.0,0,0,3.0,3.0,1,0,2.0,0,2.0,0.0,2,1,2
4,1,22.0,1.78,89.8,0,0,2.0,1.0,1,0,2.0,0,0.0,0.0,1,0,3


In [4]:
# Describe the data sets to get an insight
print(df.describe())

            Gender          Age       Height       Weight  family_history  \
count  2111.000000  2111.000000  2111.000000  2111.000000     2111.000000   
mean      0.505921    24.312600     1.701677    86.586058        0.817622   
std       0.500083     6.345968     0.093305    26.191172        0.386247   
min       0.000000    14.000000     1.450000    39.000000        0.000000   
25%       0.000000    19.947192     1.630000    65.473343        1.000000   
50%       1.000000    22.777890     1.700499    83.000000        1.000000   
75%       1.000000    26.000000     1.768464   107.430682        1.000000   
max       1.000000    61.000000     1.980000   173.000000        1.000000   

              FAVC         FCVC          NCP         CAEC        SMOKE  \
count  2111.000000  2111.000000  2111.000000  2111.000000  2111.000000   
mean      0.883941     2.419043     2.685628     1.140692     0.020843   
std       0.320371     0.533927     0.778039     0.468543     0.142893   
min       

In [5]:
print("df shape: ", df.shape)
random.seed(12345)

target_column = "Obesity"
x_data = df[df.columns[:-1]]
y_data = df[target_column]

num_of_samples = df.shape[0]
# training samples: 80% of the samples
num_of_training_samples = (int)(num_of_samples * 0.8)
num_of_test_samples = num_of_samples - num_of_training_samples
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=num_of_test_samples)

print("x_train shape", x_train.shape)
print("x_test shape", x_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)


df shape:  (2111, 17)
x_train shape (1688, 16)
x_test shape (423, 16)
y_train shape (1688,)
y_test shape (423,)


Defining the models
(eventually we'll add our own models here. These ones are just the ones I took off the internet)

In [37]:
# This is where you add your own models with fine tuning

models = []

# Samples from the internet, feel free to use these as starting points
# models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
# models.append(('SVM', SVC(gamma='auto')))

gammas = ['scale', 'auto']
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
regularization_params = [0.01, 0.1, 1, 10, 100]
for gamma in gammas:
    for kernel in kernels:
        for regularization_param in regularization_params:
            name = "SVC_" + gamma + "_" + kernel + "_" + str(regularization_param)
            models.append((name, SVC(gamma=gamma, kernel=kernel, C=regularization_param)))

print(models)


[('SVC_scale_linear_0.01', SVC(C=0.01, kernel='linear')), ('SVC_scale_linear_0.1', SVC(C=0.1, kernel='linear')), ('SVC_scale_linear_1', SVC(C=1, kernel='linear')), ('SVC_scale_linear_10', SVC(C=10, kernel='linear')), ('SVC_scale_linear_100', SVC(C=100, kernel='linear')), ('SVC_scale_poly_0.01', SVC(C=0.01, kernel='poly')), ('SVC_scale_poly_0.1', SVC(C=0.1, kernel='poly')), ('SVC_scale_poly_1', SVC(C=1, kernel='poly')), ('SVC_scale_poly_10', SVC(C=10, kernel='poly')), ('SVC_scale_poly_100', SVC(C=100, kernel='poly')), ('SVC_scale_rbf_0.01', SVC(C=0.01)), ('SVC_scale_rbf_0.1', SVC(C=0.1)), ('SVC_scale_rbf_1', SVC(C=1)), ('SVC_scale_rbf_10', SVC(C=10)), ('SVC_scale_rbf_100', SVC(C=100)), ('SVC_scale_sigmoid_0.01', SVC(C=0.01, kernel='sigmoid')), ('SVC_scale_sigmoid_0.1', SVC(C=0.1, kernel='sigmoid')), ('SVC_scale_sigmoid_1', SVC(C=1, kernel='sigmoid')), ('SVC_scale_sigmoid_10', SVC(C=10, kernel='sigmoid')), ('SVC_scale_sigmoid_100', SVC(C=100, kernel='sigmoid')), ('SVC_auto_linear_0.01', 

In [None]:
# Brendan to add his own models here

In [None]:
# Joseph to add his own models here

In [38]:

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    #cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
    scoring = ['accuracy', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
    cv_results = cross_validate(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    accuracy = cv_results['test_accuracy'].mean()
    neg_RMS = cv_results['test_neg_root_mean_squared_error'].mean()
    neg_MAE = cv_results['test_neg_mean_absolute_error'].mean()
    print(name + ": Accuracy " + str(accuracy) + " --- RMS: " + str(-neg_RMS) + " --- MAE: " + str(-neg_MAE))

SVC_scale_linear_0.01: Accuracy 0.7428958861651169 --- RMS: 0.5735735822132544 --- MAE: 0.28139616793462946
SVC_scale_linear_0.1: Accuracy 0.8246407438715131 --- RMS: 0.4729267465293708 --- MAE: 0.19195195829811212
SVC_scale_linear_1: Accuracy 0.8767645815722738 --- RMS: 0.3805372508405709 --- MAE: 0.13153000845308535
SVC_scale_linear_10: Accuracy 0.9502359819667513 --- RMS: 0.23675155694005795 --- MAE: 0.05272612003381234
SVC_scale_linear_100: Accuracy 0.9603057199211046 --- RMS: 0.20201051365349693 --- MAE: 0.04028599605522683
SVC_scale_poly_0.01: Accuracy 0.4046315863623556 --- RMS: 0.9665240695995674 --- MAE: 0.7067624683009298
SVC_scale_poly_0.1: Accuracy 0.49942589461820236 --- RMS: 0.9003389410034313 --- MAE: 0.6012996618765849
SVC_scale_poly_1: Accuracy 0.6019230769230769 --- RMS: 0.743521481688994 --- MAE: 0.45021837137221754
SVC_scale_poly_10: Accuracy 0.7002430262045647 --- RMS: 0.6223437359680817 --- MAE: 0.32939560439560445
SVC_scale_poly_100: Accuracy 0.789687235841082 --

In [24]:
# Print more analytics here
cv_results

{'fit_time': array([0.13671112, 0.16144514, 0.13024282, 0.12184668, 0.12318325,
        0.1341939 , 0.14638805, 0.12457585, 0.12372303, 0.13277793]),
 'score_time': array([0.00882101, 0.00950003, 0.00777507, 0.00817108, 0.007447  ,
        0.00819492, 0.00977206, 0.00785518, 0.00803709, 0.00770402]),
 'test_accuracy': array([0.92899408, 0.88757396, 0.85207101, 0.85207101, 0.90532544,
        0.85207101, 0.86390533, 0.89349112, 0.86904762, 0.86309524]),
 'test_neg_root_mean_squared_error': array([-0.26646936, -0.38461538, -0.40703866, -0.44853476, -0.30769231,
        -0.42828957, -0.43514263, -0.32635698, -0.38575837, -0.41547448])}

In [41]:
# Choose the best model:
best_model_idx = names.index('SVC_auto_poly_0.1')
model = models[best_model_idx][1]
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(accuracy_score(y_test, predictions))
print(root_mean_squared_error(y_test, predictions))
print(mean_absolute_error(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))


0.9479905437352246
0.2280558183094117
0.05200945626477541
[[50  0  0  0  0  0  0]
 [ 6 54  3  0  0  0  0]
 [ 0  3 59  3  0  0  0]
 [ 0  0  2 55  2  0  0]
 [ 0  0  0  2 67  0  0]
 [ 0  0  0  0  1 56  0]
 [ 0  0  0  0  0  0 60]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        50
           1       0.95      0.86      0.90        63
           2       0.92      0.91      0.91        65
           3       0.92      0.93      0.92        59
           4       0.96      0.97      0.96        69
           5       1.00      0.98      0.99        57
           6       1.00      1.00      1.00        60

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.95      0.95      0.95       423

