In [57]:
import sys; sys.path.append('../Preprocess/');
from preprocess import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [75]:
df = pd.read_csv('../Dataset/body_level_classification_train.csv')

In [76]:
df_p = preprocess(df, resample_=True)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(df_p.iloc[:, :-1], df_p.iloc[:, -1], test_size=0.2, random_state=0)

In [78]:
scorer = make_scorer(f1_score, average='weighted')

In [62]:
!mkdir Results/

mkdir: cannot create directory ‘Results/’: File exists


In [63]:
# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# SVM
param_grid = {'C': [0.1, 1, 3, 5, 7, 10, 12, 14, 15],
                'gamma': [4, 2, 1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output the cv result to csv
pd.DataFrame(grid.cv_results_).to_csv('Results/svm_grid_search_cv.csv')


In [64]:

print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))      
# print f1 score
print(f1_score(y_test, grid_predictions, average='weighted'))        


{'C': 15, 'gamma': 4, 'kernel': 'linear'}
SVC(C=15, gamma=4, kernel='linear')
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       132
           2       0.99      0.97      0.98       138
           3       0.99      0.98      0.99       125
           4       0.99      1.00      1.00       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  3 134   1   0]
 [  0   1 123   1]
 [  0   0   0 149]]
0.9889430690248693


In [65]:
# Random Forest
param_grid = {'n_estimators': [10, 50, 100, 200, 500],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [4,5,6,7,8, 12, 16],
                'criterion' :['gini', 'entropy']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv( 'Results/random_forest_grid_search_results.csv')

In [66]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))
# print f1 score
print(f1_score(y_test, grid_predictions, average='weighted'))


{'criterion': 'entropy', 'max_depth': 12, 'max_features': 'log2', 'n_estimators': 200}
RandomForestClassifier(criterion='entropy', max_depth=12, max_features='log2',
                       n_estimators=200)
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       132
           2       0.99      0.99      0.99       138
           3       0.98      0.98      0.98       125
           4       0.99      0.99      0.99       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  0 136   2   0]
 [  0   2 122   1]
 [  0   0   1 148]]
0.9889705882352942


In [80]:
{'criterion': 'entropy', 'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 100}
clf = RandomForestClassifier(criterion='entropy', max_depth=12, max_features='sqrt', n_estimators=100)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
# print f1 score
print(f1_score(y_test, predictions, average='weighted'))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       132
           2       1.00      0.99      0.99       138
           3       0.98      0.98      0.98       125
           4       0.99      0.99      0.99       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  0 136   2   0]
 [  0   0 123   2]
 [  0   0   1 148]]
0.9908229367862087


# No Resampling

In [67]:
df_p = preprocess(df, resample_=False)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(df_p.iloc[:, :-1], df_p.iloc[:, -1], test_size=0.2, random_state=0)

In [69]:
scorer = make_scorer(f1_score, average='weighted')

In [70]:
!mkdir Results/

mkdir: cannot create directory ‘Results/’: File exists


In [71]:
# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# SVM
param_grid = {'C': [0.1, 1, 3, 5, 7, 10, 12, 14, 15],
                'gamma': [4, 2, 1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output the cv result to csv
pd.DataFrame(grid.cv_results_).to_csv('Results/svm_grid_search_cv.csv')


In [72]:

print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))      
# print f1 score
print(f1_score(y_test, grid_predictions, average='weighted'))        


{'C': 10, 'gamma': 4, 'kernel': 'linear'}
SVC(C=10, gamma=4, kernel='linear')
              precision    recall  f1-score   support

           1       0.98      1.00      0.99        40
           2       1.00      0.98      0.99        41
           3       0.98      1.00      0.99        87
           4       1.00      0.98      0.99       128

    accuracy                           0.99       296
   macro avg       0.99      0.99      0.99       296
weighted avg       0.99      0.99      0.99       296

[[ 40   0   0   0]
 [  1  40   0   0]
 [  0   0  87   0]
 [  0   0   2 126]]
0.9898766541237013


In [73]:
# Random Forest
param_grid = {'n_estimators': [10, 50, 100, 200, 500],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [4,5,6,7,8, 12, 16],
                'criterion' :['gini', 'entropy']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv( 'Results/random_forest_grid_search_results.csv')

In [74]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))
# print f1 score
print(f1_score(y_test, grid_predictions, average='weighted'))


{'criterion': 'entropy', 'max_depth': 12, 'max_features': 'log2', 'n_estimators': 200}
RandomForestClassifier(criterion='entropy', max_depth=12, max_features='log2',
                       n_estimators=200)
              precision    recall  f1-score   support

           1       1.00      0.97      0.99        40
           2       0.90      0.90      0.90        41
           3       0.93      0.94      0.94        87
           4       0.98      0.98      0.98       128

    accuracy                           0.96       296
   macro avg       0.95      0.95      0.95       296
weighted avg       0.96      0.96      0.96       296

[[ 39   1   0   0]
 [  0  37   4   0]
 [  0   2  82   3]
 [  0   1   2 125]]
0.9561658765456235
