In [55]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import itertools
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE, RandomOverSampler, SMOTENC, SMOTEN 
from sklearn.utils import resample
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [56]:
df = pd.read_csv('body_level_classification_train.csv')

In [57]:
df_p = df.copy()

In [58]:
df_p['Body_Level'] = df_p['Body_Level'].map({
    'Body Level 1': 1, 
    'Body Level 2': 2, 
    'Body Level 3': 3, 
    'Body Level 4': 4})

In [59]:
resample_ = True

In [60]:
# resample
if resample_:
    df_p1 = df_p[df_p['Body_Level'] == 1]
    df_p2 = df_p[df_p['Body_Level'] == 2]
    df_p3 = df_p[df_p['Body_Level'] == 3]
    df_p4 = df_p[df_p['Body_Level'] == 4]

    max_class_size = max(len(df_p1), len(df_p2), len(df_p3), len(df_p4))

    df_p1 = resample(df_p1, replace=True, n_samples=max_class_size, random_state=0)
    df_p2 = resample(df_p2, replace=True, n_samples=max_class_size, random_state=0)
    df_p3 = resample(df_p3, replace=True, n_samples=max_class_size, random_state=0)
    df_p4 = resample(df_p4, replace=True, n_samples=max_class_size, random_state=0)

    df_p = pd.concat([df_p1, df_p2, df_p3, df_p4])

In [61]:
# LabelEncoder for categorical columns except Body_Level
le = LabelEncoder()
categorical_columns = [col for col in df_p.columns if df_p[col].dtype == 'object']
for col in categorical_columns:
    df_p[col] = le.fit_transform(df_p[col])


In [62]:
# scaling
scaler = StandardScaler()
df_p.iloc[:, :-1] = scaler.fit_transform(df_p.iloc[:, :-1])

In [63]:
X_train, X_test, y_train, y_test = train_test_split(df_p.iloc[:, :-1], df_p.iloc[:, -1], test_size=0.2, random_state=0)

In [64]:
# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# SVM
param_grid = {'C': [0.1, 1, 3, 5, 7, 10],
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5)
grid.fit(X_train, y_train)
# output the cv result to csv
pd.DataFrame(grid.cv_results_).to_csv('svm_grid_search_cv.csv')


In [65]:

print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))      
# print f1 score
print(f1_score(y_test, grid_predictions, average='macro'))        


{'C': 10, 'gamma': 1, 'kernel': 'linear'}
SVC(C=10, gamma=1, kernel='linear')
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       132
           2       0.98      0.97      0.97       138
           3       0.99      0.98      0.98       125
           4       1.00      1.00      1.00       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  3 134   1   0]
 [  0   3 122   0]
 [  0   0   0 149]]
0.9867951168078025


In [66]:
# Random Forest
param_grid = {'n_estimators': [10, 50, 100, 200, 500],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [4,5,6,7,8],
                'criterion' :['gini', 'entropy']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, cv=5)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv('random_forest_grid_search_results.csv')

In [67]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))
# print f1 score
print(f1_score(y_test, grid_predictions, average='macro'))


{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 500}
RandomForestClassifier(criterion='entropy', max_depth=8, max_features='log2',
                       n_estimators=500)
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       132
           2       0.99      0.99      0.99       138
           3       0.98      0.97      0.97       125
           4       0.99      0.99      0.99       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  0 136   2   0]
 [  0   2 121   2]
 [  0   0   1 148]]
0.9868403379403903
