In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time

from imblearn.over_sampling import SMOTE

In [5]:
applySmote = False

In [6]:
df = pd.read_csv('body_level_classification_train.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


In [7]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'H_Cal_Consump', 'Veg_Consump',
       'Water_Consump', 'Alcohol_Consump', 'Smoking', 'Meal_Count',
       'Food_Between_Meals', 'Fam_Hist', 'H_Cal_Burn', 'Phys_Act',
       'Time_E_Dev', 'Transport', 'Body_Level'],
      dtype='object')

In [8]:
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
categorical_columns.remove('Body_Level')
categorical_columns

['Gender',
 'H_Cal_Consump',
 'Alcohol_Consump',
 'Smoking',
 'Food_Between_Meals',
 'Fam_Hist',
 'H_Cal_Burn',
 'Transport']

In [9]:
# one hot for categorical columns
df_h = pd.get_dummies(df, columns=categorical_columns)
# make body_level as the last column
df_h.drop('Body_Level', axis=1, inplace=True)
df_h['Body_Level'] = df['Body_Level']
df_h.head()

Unnamed: 0,Age,Height,Weight,Veg_Consump,Water_Consump,Meal_Count,Phys_Act,Time_E_Dev,Gender_Female,Gender_Male,...,Fam_Hist_no,Fam_Hist_yes,H_Cal_Burn_no,H_Cal_Burn_yes,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking,Body_Level
0,22.547298,1.722461,51.881263,2.663421,1.04111,3.0,0.794402,1.391948,1,0,...,0,1,1,0,0,0,0,1,0,Body Level 1
1,19.799054,1.743702,54.927529,2.0,2.847264,3.28926,1.680844,2.0,0,1,...,0,1,1,0,0,0,0,1,0,Body Level 1
2,17.823438,1.708406,50.0,1.642241,1.099231,3.45259,0.418875,1.0,1,0,...,1,0,1,0,0,0,0,1,0,Body Level 1
3,19.007177,1.690727,49.895716,1.212908,1.029703,3.207071,2.0,1.0,1,0,...,1,0,1,0,0,0,0,1,0,Body Level 1
4,19.72925,1.793315,58.19515,2.508835,2.076933,3.435905,2.026668,1.443328,0,1,...,0,1,1,0,1,0,0,0,0,Body Level 1


In [19]:
df_h['Body_Level'] = pd.factorize(df_h.Body_Level)[0] 
df_h.head()


Unnamed: 0,Age,Height,Weight,Veg_Consump,Water_Consump,Meal_Count,Phys_Act,Time_E_Dev,Gender_Female,Gender_Male,...,Fam_Hist_no,Fam_Hist_yes,H_Cal_Burn_no,H_Cal_Burn_yes,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking,Body_Level
0,22.547298,1.722461,51.881263,2.663421,1.04111,3.0,0.794402,1.391948,1,0,...,0,1,1,0,0,0,0,1,0,0
1,19.799054,1.743702,54.927529,2.0,2.847264,3.28926,1.680844,2.0,0,1,...,0,1,1,0,0,0,0,1,0,0
2,17.823438,1.708406,50.0,1.642241,1.099231,3.45259,0.418875,1.0,1,0,...,1,0,1,0,0,0,0,1,0,0
3,19.007177,1.690727,49.895716,1.212908,1.029703,3.207071,2.0,1.0,1,0,...,1,0,1,0,0,0,0,1,0,0
4,19.72925,1.793315,58.19515,2.508835,2.076933,3.435905,2.026668,1.443328,0,1,...,0,1,1,0,1,0,0,0,0,0


# Looks like SVM Overfits, specially with high C

In [11]:
# SVM
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote

# SVM
# svm = SVC(kernel='linear', C=1.0, random_state=0)
svm = SVC(kernel='linear', C= 100, random_state=0)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, svm.predict(X_train)))
print('train precision: ', precision_score(y_train, svm.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, svm.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, svm.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))



train accuracy:  0.9940728196443692
train precision:  0.9906629894236427
train recall:  0.990515935214211
train f1:  0.9905681222113505

accuracy:  0.9797297297297297
precision:  0.9748976513682396
recall:  0.9730866274179983
f1:  0.9736505010398941


# Linear Regression

In [12]:
# linear regression
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# applySmote = True
if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_pred = np.round(y_pred)

# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


accuracy:  0.793918918918919
precision:  0.6256887046412667
recall:  0.616535604149145
f1:  0.6130446543166778


  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression

In [13]:
# logistic regression, we have 4 classes
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote


# logistic regression
lr = LogisticRegression( solver='lbfgs', multi_class='multinomial', max_iter=5000, random_state=0, penalty='l2')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, lr.predict(X_train)))
print('train precision: ', precision_score(y_train, lr.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, lr.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, lr.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


train accuracy:  0.903471634208298
train precision:  0.8765800886852466
train recall:  0.8727109314660851
train f1:  0.8727319355070285

accuracy:  0.8716216216216216
precision:  0.8462122372401117
recall:  0.8528796958228203
f1:  0.8459289610438119


# SVM with RBF Kernel

In [14]:
# SVM
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote
    
# SVM
svm = SVC(kernel='rbf', C=1.0, random_state=0)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, svm.predict(X_train)))
print('train precision: ', precision_score(y_train, svm.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, svm.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, svm.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))



train accuracy:  0.7781541066892464
train precision:  0.7235967914826364
train recall:  0.7257154093407842
train f1:  0.7156824112652663

accuracy:  0.7195945945945946
precision:  0.665854135129511
recall:  0.6724395281398934
f1:  0.6618753146078727


# Naive Bayes

In [15]:
# Naive Bayes
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote
    
# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
nb = MultinomialNB()
# nb = GaussianNB( )
# nb = BernoulliNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, nb.predict(X_train)))
print('train precision: ', precision_score(y_train, nb.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, nb.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, nb.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


train accuracy:  0.7281964436917866
train precision:  0.7068533943772393
train recall:  0.6663080817545772
train f1:  0.6799915833855088

accuracy:  0.6587837837837838
precision:  0.6126143009576293
recall:  0.5860350741169049
f1:  0.595458936829122


# SVM linear with SMOTE

In [16]:
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# apply smote
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


# check the number of samples per class
print('Before SMOTE: ', np.bincount(y_train))
print('After SMOTE: ', np.bincount(y_train_smote))

print()

# SVM, linear, C=10
svm = SVC(kernel='linear', C=100, random_state=0)
svm.fit(X_train_smote, y_train_smote)
y_pred = svm.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train_smote, svm.predict(X_train_smote)))
print('train precision: ', precision_score(y_train_smote, svm.predict(X_train_smote), average='macro'))
print('train recall: ', recall_score(y_train_smote, svm.predict(X_train_smote), average='macro'))
print('train f1: ', f1_score(y_train_smote, svm.predict(X_train_smote), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))



Before SMOTE:  [150 160 319 552]
After SMOTE:  [552 552 552 552]

train accuracy:  0.9977355072463768
train precision:  0.9977395843657325
train recall:  0.9977355072463768
train f1:  0.9977355009293448

accuracy:  0.9662162162162162
precision:  0.9579890308607981
recall:  0.9569852554667788
f1:  0.9563375292092965


# Catboost 

In [39]:
from catboost import CatBoostClassifier

X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create the CatBoost model
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42
)

# Fit the model on the training data
model.fit(X_train, y_train)



0:	learn: 1.2449449	total: 23.8ms	remaining: 2.35s
1:	learn: 1.1339118	total: 31.9ms	remaining: 1.56s
2:	learn: 1.0272223	total: 40.6ms	remaining: 1.31s
3:	learn: 0.9358980	total: 48.6ms	remaining: 1.17s
4:	learn: 0.8759126	total: 56.5ms	remaining: 1.07s
5:	learn: 0.8011394	total: 64ms	remaining: 1s
6:	learn: 0.7337919	total: 72.2ms	remaining: 960ms
7:	learn: 0.6913820	total: 79.9ms	remaining: 919ms
8:	learn: 0.6496348	total: 88.4ms	remaining: 893ms
9:	learn: 0.6096378	total: 95.8ms	remaining: 862ms
10:	learn: 0.5811812	total: 104ms	remaining: 841ms
11:	learn: 0.5584318	total: 111ms	remaining: 817ms
12:	learn: 0.5282805	total: 120ms	remaining: 800ms
13:	learn: 0.4960504	total: 127ms	remaining: 779ms
14:	learn: 0.4725373	total: 135ms	remaining: 763ms
15:	learn: 0.4550466	total: 142ms	remaining: 746ms
16:	learn: 0.4335506	total: 150ms	remaining: 732ms
17:	learn: 0.4142997	total: 160ms	remaining: 727ms
18:	learn: 0.4026071	total: 167ms	remaining: 714ms
19:	learn: 0.3910405	total: 175ms	re

<catboost.core.CatBoostClassifier at 0x20d0c349040>

In [40]:
# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('train precision: ', precision_score(y_train, model.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, model.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, nb.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


train accuracy:  0.9966130397967824
train precision:  0.9972114718358656
train recall:  0.9945288009404388
train f1:  0.6799915833855088

accuracy:  0.9662162162162162
precision:  0.9579890308607981
recall:  0.9569852554667788
f1:  0.9563375292092965


In [37]:
# Create a dictionary with sample data
new_data_dict = {
    'Age': [32, 45, 21],
    'Height': [1.65, 1.75, 1.68],
    'Weight': [72, 89, 55],
    'Veg_Consump': [3, 2, 4],
    'Water_Consump': [4, 3, 2],
    'Meal_Count': [3, 4, 2],
    'Phys_Act': [2, 3, 4],
    'Time_E_Dev': [3, 2, 1],
    'Gender_Female': [0, 1, 0],
    'Gender_Male': [1, 0, 1],
    'H_Cal_Consump_no': [1, 0, 0],
    'H_Cal_Consump_yes': [0, 1, 1],
    'Alcohol_Consump_Always': [0, 0, 1],
    'Alcohol_Consump_Frequently': [1, 0, 0],
    'Alcohol_Consump_Sometimes': [0, 1, 0],
    'Alcohol_Consump_no': [0, 0, 0],
    'Smoking_no': [1, 1, 0],
    'Smoking_yes': [0, 0, 1],
    'Food_Between_Meals_Always': [0, 1, 0],
    'Food_Between_Meals_Frequently': [0, 0, 1],
    'Food_Between_Meals_Sometimes': [1, 0, 0],
    'Food_Between_Meals_no': [0, 0, 0],
    'Fam_Hist_no': [1, 0, 1],
    'Fam_Hist_yes': [0, 1, 0],
    'H_Cal_Burn_no': [1, 0, 0],
    'H_Cal_Burn_yes': [0, 1, 1],
    'Transport_Automobile': [1, 0, 0],
    'Transport_Bike': [0, 1, 0],
    'Transport_Motorbike': [0, 0, 1],
    'Transport_Public_Transportation': [0, 0, 0],
    'Transport_Walking': [0, 0, 0]
}

# Convert the dictionary to a Pandas DataFrame
new_data = pd.DataFrame(new_data_dict)

print(len(df_h.columns))
print("---------------------------")
print(len(new_data.columns))
print("")

# Make prediction using the model
predictions = model.predict(new_data)

# Print the predicted class labels
print("predictions are : \n", predictions)


32
---------------------------
31

predictions are : 
 [[2]
 [2]
 [1]]
