In [1]:
import sys; sys.path.append('../Preprocess/');
from preprocess import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../Dataset/body_level_classification_train.csv')

In [3]:
df_p = preprocess(df, resample_=True)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df_p.iloc[:, :-1], df_p.iloc[:, -1], test_size=0.2, random_state=0)

In [6]:
# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# SVM
param_grid = {'C': [0.1, 1, 3, 5, 7, 10],
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5)
grid.fit(X_train, y_train)
# output the cv result to csv
pd.DataFrame(grid.cv_results_).to_csv('Results/svm_grid_search_cv.csv')


In [7]:

print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))      
# print f1 score
print(f1_score(y_test, grid_predictions, average='macro'))        


{'C': 10, 'gamma': 1, 'kernel': 'linear'}
SVC(C=10, gamma=1, kernel='linear')
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       132
           2       0.98      0.97      0.97       138
           3       0.99      0.98      0.98       125
           4       1.00      1.00      1.00       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  3 134   1   0]
 [  0   3 122   0]
 [  0   0   0 149]]
0.9867951168078025


In [8]:
# Random Forest
param_grid = {'n_estimators': [10, 50, 100, 200, 500],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [4,5,6,7,8],
                'criterion' :['gini', 'entropy']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, cv=5)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv('Results/random_forest_grid_search_results.csv')

In [9]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))
# print f1 score
print(f1_score(y_test, grid_predictions, average='macro'))


{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500}
RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=500)
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       132
           2       1.00      0.99      0.99       138
           3       0.97      0.98      0.98       125
           4       0.99      0.99      0.99       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  0 136   2   0]
 [  0   0 123   2]
 [  0   0   2 147]]
0.9888670968313842
