In [1]:
import os
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Directory holding the normalized feature CSVs. It defaults to the original
# external-drive location; set the TILES_PROD_DIR environment variable to run
# the notebook against a copy stored elsewhere without editing any path.
DATA_DIR = os.environ.get('TILES_PROD_DIR',
                          '/Volumes/Extreme SSD/cmpt400/tiles_prod')

# Both files are read as plain comma-separated numeric matrices.
# Note: np.genfromtxt turns unparseable fields into NaN instead of raising.
lbp_norm_path = os.path.join(DATA_DIR, 'lbp-prod-sorted-data-norm.csv')
lbp_norm = np.genfromtxt(lbp_norm_path, delimiter=',')
hoc_norm_path = os.path.join(DATA_DIR, 'hoc-prod-sorted-data-norm.csv')
hoc_norm = np.genfromtxt(hoc_norm_path, delimiter=',')

In [3]:
# Sanity check: (rows, columns) of the LBP feature matrix.
np.shape(lbp_norm)

(754, 36)

In [4]:
# Sanity check: (rows, columns) of the HOC feature matrix.
np.shape(hoc_norm)

(754, 30)

In [5]:
# Join the two feature sets column-wise so each row carries the LBP features
# followed by the HOC features.
lbp_hoc_norm = np.concatenate([lbp_norm, hoc_norm], axis=1)

In [6]:
# The column count should equal the sum of the two input widths.
np.shape(lbp_hoc_norm)

(754, 66)

In [7]:
# Binary class labels used as the target vector: CE = 0, LAA = 1.
train_y_path = (
    '/Volumes/Extreme SSD/cmpt400/tiles_prod/train-sorted-labels-data-bin.csv'
)
train_y = np.genfromtxt(train_y_path, delimiter=',')

In [8]:
# The number of labels should match the number of feature rows.
np.shape(train_y)

(754,)

In [9]:
# Peek at the first ten labels.
train_y[:10]

array([0., 0., 1., 1., 0., 1., 0., 0., 0., 0.])

In [23]:
# Baseline RBF-kernel SVM with hand-picked (untuned) hyperparameters.
rbf_svc = svm.SVC(C=1000, kernel='rbf', gamma=0.1)

In [24]:
# 10-fold cross-validation of the baseline model on the combined features.
scores = cross_val_score(estimator=rbf_svc, X=lbp_hoc_norm, y=train_y, cv=10)

In [25]:
# Per-fold scores of the baseline RBF SVM (one value per CV fold).
print(scores)

[0.60526316 0.63157895 0.63157895 0.69736842 0.58666667 0.64
 0.74666667 0.73333333 0.64       0.73333333]


In [26]:
# Mean and standard deviation of the per-fold scores, one per line.
print(scores.mean(), scores.std(), sep='\n')

0.664578947368421
0.0549934633346673


### Search for the best hyperparameters for each classifier

#### SVM with RBF (radial basis function) kernel.

In [27]:
# Exhaustive 5-fold grid search over gamma and C for an RBF-kernel SVM.
param_grid = [{
    'kernel': ['rbf'],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5],
    'C': [0.01, 0.1, 1, 10, 100],
}]
rbf_svc = svm.SVC()
grid_search = GridSearchCV(estimator=rbf_svc, param_grid=param_grid, cv=5)
grid_search.fit(lbp_hoc_norm, train_y)

In [28]:
# Parameter combination selected by the RBF grid search.
grid_search.best_params_

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

In [29]:
# Rebuild the SVM from the selected parameters (the grid supplies exactly
# 'kernel', 'gamma' and 'C') and score it with 5-fold cross-validation.
# This reads `grid_search`, so it must run right after the RBF search.
rbf_svc_best = svm.SVC(**grid_search.best_params_)
scores = cross_val_score(estimator=rbf_svc_best, X=lbp_hoc_norm, y=train_y, cv=5)

In [30]:
# Per-fold scores, then their mean and standard deviation, one per line.
print(scores, scores.mean(), scores.std(), sep='\n')

[0.7218543  0.73509934 0.7218543  0.73509934 0.72666667]
0.7281147902869757
0.005967448863258644


#### Linear kernel

In [105]:
# 5-fold grid search over C for a linear-kernel SVM.
param_grid = [{
    'kernel': ['linear'],
    'C': [0.001, 0.01, 0.1],
}]
linear_svc = svm.SVC()
grid_search = GridSearchCV(estimator=linear_svc, param_grid=param_grid, cv=5)
grid_search.fit(lbp_hoc_norm, train_y)

In [106]:
# Parameter combination selected by the linear-kernel grid search.
grid_search.best_params_

{'C': 0.001, 'kernel': 'linear'}

In [15]:
# Rebuild the linear SVM from the selected parameters (the grid supplies
# exactly 'kernel' and 'C') and score it with 5-fold cross-validation.
# NOTE(review): `grid_search` is reassigned by every search in this notebook,
# so this cell must run right after the linear-kernel search. The saved
# execution counts are out of order; re-run the notebook top to bottom before
# trusting the printed scores.
lin_svc_best = svm.SVC(**grid_search.best_params_)
scores = cross_val_score(estimator=lin_svc_best, X=lbp_hoc_norm, y=train_y, cv=5)

In [16]:
# Per-fold scores, then their mean and standard deviation, one per line.
print(scores, scores.mean(), scores.std(), sep='\n')

[0.66225166 0.66887417 0.66887417 0.68874172 0.72666667]
0.6830816777041943
0.023534160738886506


#### Polynomial features (degree 2) with a linear-kernel SVM

In [17]:
# Expand the inputs into degree-2 polynomial features, then fit a
# linear-kernel SVM on the expanded features; scored with 5-fold CV.
polynomial2_svc = Pipeline(steps=[
    ('poly_features', PolynomialFeatures(degree=2)),
    ('svc', svm.SVC(kernel='linear', C=0.01)),
])
scores = cross_val_score(estimator=polynomial2_svc, X=lbp_hoc_norm, y=train_y, cv=5)

In [18]:
# Per-fold scores, then their mean and standard deviation, one per line.
print(scores, scores.mean(), scores.std(), sep='\n')

[0.68874172 0.71523179 0.69536424 0.69536424 0.69333333]
0.6976070640176599
0.009138386042380533


#### Random forest classifier

In [31]:
# 5-fold grid search over forest size and tree size for a random forest.
# 'n_jobs' has a single candidate value, so it does not widen the search; it
# only makes every candidate forest train on all available cores.
param_grid = [{'n_estimators': [10, 100, 250, 500], 'max_leaf_nodes': [4, 8, 16, 32], 
               'n_jobs': [-1]}]
# Random forests are stochastic (bootstrap samples, random feature subsets).
# Fix the seed so the selected hyperparameters are the same on every run;
# GridSearchCV clones this estimator, so each candidate inherits the seed.
rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid, cv=5)
grid_search.fit(lbp_hoc_norm, train_y)

In [32]:
# Parameter combination selected by the random-forest grid search.
grid_search.best_params_

{'max_leaf_nodes': 4, 'n_estimators': 100, 'n_jobs': -1}

In [33]:
# Rebuild the forest from the selected parameters and score it with 5-fold
# cross-validation. This reads `grid_search`, so it must run right after the
# random-forest search.
# random_state is fixed because forest training is randomized; without it the
# printed scores change on every run.
rf_clf_best = RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_leaf_nodes=grid_search.best_params_['max_leaf_nodes'],
    n_jobs=-1,
    random_state=42)
scores = cross_val_score(rf_clf_best, lbp_hoc_norm, train_y, cv=5)
# Per-fold scores, then their mean and standard deviation.
print(scores)
print(scores.mean())
print(scores.std())

[0.71523179 0.72847682 0.71523179 0.72847682 0.72666667]
0.7228167770419425
0.006228289730040037
