In [19]:
import random

import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC



In [29]:
# Download latest version
path = kagglehub.dataset_download("ruchikakumbhar/obesity-prediction")

print("Path to dataset files:", path)
df = pd.read_csv(path + "/Obesity Prediction.csv")
df.head()
original_df = df.copy()

Path to dataset files: /Users/hendrata/.cache/kagglehub/datasets/ruchikakumbhar/obesity-prediction/versions/1


In [45]:
df = original_df.copy()
print("df shape: ", df.shape)

# Data massaging:
# Gender: 0 = Female, 1 = Male
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})

# Family history, FAVC, SMOKE, SCC: 0 = no, 1 = yes
yes_no_columns = ['family_history', 'FAVC', 'SMOKE', 'SCC']
for col in yes_no_columns:
    df[col] = df[col].map({'no': 0, 'yes': 1})

# CAEC, CALC: 0 = no, 1 = Sometimes, 2 = Frequently, 3 = Always
frequency_columns = ['CAEC', 'CALC']
for col in frequency_columns:
    df[col] = df[col].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})

# MTRANS: 0 = Public_Transportation, 1 = Walking, 2 = Automobile, 3 = Motorbike, 4 = Bike
df['MTRANS'] = df['MTRANS'].map({'Public_Transportation': 0, 'Walking': 1, 'Automobile': 2, 'Motorbike': 3, 'Bike': 4})

# Obesity
df['Obesity'] = df['Obesity'].map({'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2,
                                   'Overweight_Level_II': 3, 'Obesity_Type_I': 4, 'Obesity_Type_II': 5,
                                   'Obesity_Type_III': 6})

# Now df should be all numeric
df.head()
#print(df['Obesity'].unique())

df shape:  (2111, 17)


Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,0,21.0,1.62,64.0,1,0,2.0,3.0,1,0,2.0,0,0.0,1.0,0,0,1
1,0,21.0,1.52,56.0,1,0,3.0,3.0,1,1,3.0,1,3.0,0.0,1,0,1
2,1,23.0,1.8,77.0,1,0,2.0,3.0,1,0,2.0,0,2.0,1.0,2,0,1
3,1,27.0,1.8,87.0,0,0,3.0,3.0,1,0,2.0,0,2.0,0.0,2,1,2
4,1,22.0,1.78,89.8,0,0,2.0,1.0,1,0,2.0,0,0.0,0.0,1,0,3


In [46]:
# Describe the data sets to get an insight
print(df.describe())

            Gender          Age       Height       Weight  family_history  \
count  2111.000000  2111.000000  2111.000000  2111.000000     2111.000000   
mean      0.505921    24.312600     1.701677    86.586058        0.817622   
std       0.500083     6.345968     0.093305    26.191172        0.386247   
min       0.000000    14.000000     1.450000    39.000000        0.000000   
25%       0.000000    19.947192     1.630000    65.473343        1.000000   
50%       1.000000    22.777890     1.700499    83.000000        1.000000   
75%       1.000000    26.000000     1.768464   107.430682        1.000000   
max       1.000000    61.000000     1.980000   173.000000        1.000000   

              FAVC         FCVC          NCP         CAEC        SMOKE  \
count  2111.000000  2111.000000  2111.000000  2111.000000  2111.000000   
mean      0.883941     2.419043     2.685628     1.140692     0.020843   
std       0.320371     0.533927     0.778039     0.468543     0.142893   
min       

In [47]:
print("df shape: ", df.shape)
random.seed(12345)

target_column = "Obesity"
x_data = df[df.columns[:-1]]
y_data = df[target_column]

num_of_samples = df.shape[0]
# training samples: 80% of the samples
num_of_training_samples = (int)(num_of_samples * 0.8)
num_of_test_samples = num_of_samples - num_of_training_samples
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=num_of_test_samples)

print("x_train shape", x_train.shape)
print("x_test shape", x_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)


df shape:  (2111, 17)
x_train shape (1688, 16)
x_test shape (423, 16)
y_train shape (1688,)
y_test shape (423,)


Defining the models
(eventually we'll add our own models here. These ones are just the ones I took off the internet)

In [51]:
# This is where you add your own models with fine tuning

models = []

# Samples from the internet, feel free to use these as starting points
# models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
# models.append(('SVM', SVC(gamma='auto')))

models.append(('Hendrata_SVM_degree2', SVC(gamma='auto', degree=2)))
models.append(('Hendrata_SVM_degree3', SVC(gamma='scale', degree=3)))

print(models)


[('Hendrata_SVM_degree2', SVC(degree=2, gamma='auto')), ('Hendrata_SVM_degree3', SVC())]


In [None]:
# Brendan to add his own models here

In [None]:
# Joseph to add his own models here

In [52]:

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

Hendrata_SVM_degree2: 0.885084 (0.015837)
Hendrata_SVM_degree3: 0.537905 (0.036313)


In [None]:
# Print more analytics here