In [8]:
import sys; sys.path.append('../Preprocess/');
from preprocess import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [9]:
df = pd.read_csv('../Dataset/body_level_classification_train.csv')

In [10]:
df_p = preprocess(df, resample_=True)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_p.iloc[:, :-1], df_p.iloc[:, -1], test_size=0.2, random_state=0)

In [12]:
scorer = make_scorer(f1_score, average='weighted')

In [16]:
!mkdir Results/

In [21]:
# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# SVM
param_grid = {'C': [0.1, 1, 3, 5, 7, 10, 12, 14, 15],
                'gamma': [4, 2, 1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output the cv result to csv
pd.DataFrame(grid.cv_results_).to_csv('Results/svm_grid_search_cv.csv')


In [23]:

print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))      
# print f1 score
print(f1_score(y_test, grid_predictions, average='weighted'))        


{'C': 15, 'gamma': 4, 'kernel': 'linear'}
SVC(C=15, gamma=4, kernel='linear')
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       132
           2       0.99      0.97      0.98       138
           3       0.99      0.98      0.99       125
           4       0.99      1.00      1.00       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  3 134   1   0]
 [  0   1 123   1]
 [  0   0   0 149]]
0.9889430690248693


In [19]:
# Random Forest
param_grid = {'n_estimators': [10, 50, 100, 200, 500],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [4,6,8,12, 16],
                'criterion' :['gini', 'entropy']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv( 'Results/random_forest_grid_search_results.csv')

In [20]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
print(confusion_matrix(y_test, grid_predictions))
# print f1 score
print(f1_score(y_test, grid_predictions, average='weighted'))


{'criterion': 'entropy', 'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 100}
RandomForestClassifier(criterion='entropy', max_depth=12)
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       132
           2       1.00      0.99      0.99       138
           3       0.98      0.98      0.98       125
           4       0.99      1.00      0.99       149

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

[[132   0   0   0]
 [  0 136   2   0]
 [  0   0 123   2]
 [  0   0   0 149]]
0.9926458959496207
