In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time

from imblearn.over_sampling import SMOTE

In [15]:
applySmote = False

In [16]:
df = pd.read_csv('body_level_classification_train.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


In [17]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'H_Cal_Consump', 'Veg_Consump',
       'Water_Consump', 'Alcohol_Consump', 'Smoking', 'Meal_Count',
       'Food_Between_Meals', 'Fam_Hist', 'H_Cal_Burn', 'Phys_Act',
       'Time_E_Dev', 'Transport', 'Body_Level'],
      dtype='object')

In [18]:
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
categorical_columns.remove('Body_Level')
categorical_columns

['Gender',
 'H_Cal_Consump',
 'Alcohol_Consump',
 'Smoking',
 'Food_Between_Meals',
 'Fam_Hist',
 'H_Cal_Burn',
 'Transport']

In [19]:
# one hot for categorical columns
df_h = pd.get_dummies(df, columns=categorical_columns)
# make body_level as the last column
df_h.drop('Body_Level', axis=1, inplace=True)
df_h['Body_Level'] = df['Body_Level']
df_h.head()

Unnamed: 0,Age,Height,Weight,Veg_Consump,Water_Consump,Meal_Count,Phys_Act,Time_E_Dev,Gender_Female,Gender_Male,...,Fam_Hist_no,Fam_Hist_yes,H_Cal_Burn_no,H_Cal_Burn_yes,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking,Body_Level
0,22.547298,1.722461,51.881263,2.663421,1.04111,3.0,0.794402,1.391948,True,False,...,False,True,True,False,False,False,False,True,False,Body Level 1
1,19.799054,1.743702,54.927529,2.0,2.847264,3.28926,1.680844,2.0,False,True,...,False,True,True,False,False,False,False,True,False,Body Level 1
2,17.823438,1.708406,50.0,1.642241,1.099231,3.45259,0.418875,1.0,True,False,...,True,False,True,False,False,False,False,True,False,Body Level 1
3,19.007177,1.690727,49.895716,1.212908,1.029703,3.207071,2.0,1.0,True,False,...,True,False,True,False,False,False,False,True,False,Body Level 1
4,19.72925,1.793315,58.19515,2.508835,2.076933,3.435905,2.026668,1.443328,False,True,...,False,True,True,False,True,False,False,False,False,Body Level 1


In [20]:
df_h['Body_Level'] = pd.factorize(df_h.Body_Level)[0] 
df_h.head()

Unnamed: 0,Age,Height,Weight,Veg_Consump,Water_Consump,Meal_Count,Phys_Act,Time_E_Dev,Gender_Female,Gender_Male,...,Fam_Hist_no,Fam_Hist_yes,H_Cal_Burn_no,H_Cal_Burn_yes,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking,Body_Level
0,22.547298,1.722461,51.881263,2.663421,1.04111,3.0,0.794402,1.391948,True,False,...,False,True,True,False,False,False,False,True,False,0
1,19.799054,1.743702,54.927529,2.0,2.847264,3.28926,1.680844,2.0,False,True,...,False,True,True,False,False,False,False,True,False,0
2,17.823438,1.708406,50.0,1.642241,1.099231,3.45259,0.418875,1.0,True,False,...,True,False,True,False,False,False,False,True,False,0
3,19.007177,1.690727,49.895716,1.212908,1.029703,3.207071,2.0,1.0,True,False,...,True,False,True,False,False,False,False,True,False,0
4,19.72925,1.793315,58.19515,2.508835,2.076933,3.435905,2.026668,1.443328,False,True,...,False,True,True,False,True,False,False,False,False,0


# Looks like SVM Overfits, specially with high C

In [21]:
# SVM
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote

# SVM
# svm = SVC(kernel='linear', C=1.0, random_state=0)
svm = SVC(kernel='linear', C= 100, random_state=0)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, svm.predict(X_train)))
print('train precision: ', precision_score(y_train, svm.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, svm.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, svm.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))



train accuracy:  0.9940728196443692
train precision:  0.9906629894236427
train recall:  0.990515935214211
train f1:  0.9905681222113505

accuracy:  0.9797297297297297
precision:  0.9748976513682396
recall:  0.9730866274179983
f1:  0.9736505010398941


# Linear Regression

In [22]:
# linear regression
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# applySmote = True
if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_pred = np.round(y_pred)

# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


accuracy:  0.793918918918919
precision:  0.6256887046412667
recall:  0.616535604149145
f1:  0.6130446543166778


  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression

In [23]:
# logistic regression, we have 4 classes
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote


# logistic regression
lr = LogisticRegression( solver='lbfgs', multi_class='multinomial', max_iter=5000, random_state=0, penalty='l2')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, lr.predict(X_train)))
print('train precision: ', precision_score(y_train, lr.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, lr.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, lr.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


train accuracy:  0.9043183742591024
train precision:  0.8775490945444013
train recall:  0.874273431466085
train f1:  0.874120080494378

accuracy:  0.8716216216216216
precision:  0.8462122372401117
recall:  0.8528796958228203
f1:  0.8459289610438119


# SVM with RBF Kernel

In [24]:
# SVM
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote
    
# SVM
svm = SVC(kernel='rbf', C=1.0, random_state=0)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, svm.predict(X_train)))
print('train precision: ', precision_score(y_train, svm.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, svm.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, svm.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))



train accuracy:  0.7781541066892464
train precision:  0.7235967914826364
train recall:  0.7257154093407842
train f1:  0.7156824112652663

accuracy:  0.7195945945945946
precision:  0.665854135129511
recall:  0.6724395281398934
f1:  0.6618753146078727


# Naive Bayes

In [25]:
# Naive Bayes
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

if applySmote:
    print('Before SMOTE: ', np.bincount(y_train))
    smote = SMOTE(random_state=0)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    # check the number of samples per class
    print('After SMOTE: ', np.bincount(y_train_smote))
    print()
    X_train = X_train_smote
    y_train = y_train_smote
    
# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
nb = MultinomialNB()
# nb = GaussianNB( )
# nb = BernoulliNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train, nb.predict(X_train)))
print('train precision: ', precision_score(y_train, nb.predict(X_train), average='macro'))
print('train recall: ', recall_score(y_train, nb.predict(X_train), average='macro'))
print('train f1: ', f1_score(y_train, nb.predict(X_train), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))


train accuracy:  0.7281964436917866
train precision:  0.7068533943772393
train recall:  0.6663080817545772
train f1:  0.6799915833855088

accuracy:  0.6587837837837838
precision:  0.6126143009576293
recall:  0.5860350741169049
f1:  0.595458936829122


# SVM linear with SMOTE

In [26]:
X = df_h.iloc[:, :-1].values
y = df_h.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# apply smote
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


# check the number of samples per class
print('Before SMOTE: ', np.bincount(y_train))
print('After SMOTE: ', np.bincount(y_train_smote))

print()

# SVM, linear, C=10
svm = SVC(kernel='linear', C=100, random_state=0)
svm.fit(X_train_smote, y_train_smote)
y_pred = svm.predict(X_test)

# train accuracy, precision, recall, f1
print('train accuracy: ', accuracy_score(y_train_smote, svm.predict(X_train_smote)))
print('train precision: ', precision_score(y_train_smote, svm.predict(X_train_smote), average='macro'))
print('train recall: ', recall_score(y_train_smote, svm.predict(X_train_smote), average='macro'))
print('train f1: ', f1_score(y_train_smote, svm.predict(X_train_smote), average='macro'))

print()
# accuracy, precision, recall, f1
print('accuracy: ', accuracy_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred, average='macro'))
print('recall: ', recall_score(y_test, y_pred, average='macro'))
print('f1: ', f1_score(y_test, y_pred, average='macro'))



Before SMOTE:  [150 160 319 552]
After SMOTE:  [552 552 552 552]

train accuracy:  0.9977355072463768
train precision:  0.9977395843657325
train recall:  0.9977355072463768
train f1:  0.9977355009293448

accuracy:  0.9662162162162162
precision:  0.9579890308607981
recall:  0.9569852554667788
f1:  0.9563375292092965
