In [1]:
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from PIL import Image
import pandas as pd

In [2]:
data = np.load('./data/data_pca_50_y_Xmean.pickle.npz')
data.files

['arr_0', 'arr_1', 'arr_2']

In [3]:
X = data['arr_0']
y = data['arr_1']
Xmean = data['arr_2']

In [4]:
#split into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify =y)
#stratify -> train and test sets have almost same ratio for diff y like approx same male:female ratio

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5826, 50), (1457, 50), (5826,), (1457,))

# Training

In [5]:
from sklearn.svm import SVC
svc = SVC(kernel ='rbf', gamma = 0.01, probability=True)
svc.fit(X_train, y_train)

SVC(gamma=0.01, probability=True)

In [6]:
svc.score(X_train, y_train)

0.8403707518022657

In [7]:
svc.score(X_test, y_test)

0.787920384351407

# Evaluation

In [8]:
y_pred = svc.predict(X_test)

In [9]:
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[447 175]
 [134 701]]


In [10]:
#classification report
cr = classification_report(y_test, y_pred, target_names= ['male', 'female'], output_dict = True)

In [11]:
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
male,0.769363,0.71865,0.743142,622.0
female,0.800228,0.839521,0.819404,835.0
accuracy,0.78792,0.78792,0.78792,0.78792
macro avg,0.784796,0.779085,0.781273,1457.0
weighted avg,0.787052,0.78792,0.786847,1457.0


# Hyper Parameter Tuning 

In [12]:
svc_tune = SVC()

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
param_grid = {'C': [1, 10, 20 , 30, 50, 100],
              'kernel': ['rbf'],
              'gamma': [0.01, 0.001, 0.05, 0.1, 0.002, 0.005],
              }
              

In [15]:
model_grid = GridSearchCV(svc_tune, param_grid =param_grid, scoring='accuracy', cv= 5, verbose=1)
model_grid.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 10, 20, 30, 50, 100],
                         'gamma': [0.01, 0.001, 0.05, 0.1, 0.002, 0.005],
                         'kernel': ['rbf']},
             scoring='accuracy', verbose=1)

In [16]:
model_grid.best_params_

{'C': 1, 'gamma': 0.05, 'kernel': 'rbf'}

In [17]:
model_grid.best_score_

0.8204044325612616

In [18]:
#with best parameters
svc = SVC(kernel ='rbf', gamma = 0.05, probability=True)
svc.fit(X_train, y_train)
svc.score(X_test, y_test)

0.814687714481812

In [19]:
y_pred = svc.predict(X_test)

In [20]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[457 165]
 [105 730]]


In [21]:
#classification report
cr = classification_report(y_test, y_pred, target_names= ['male', 'female'], output_dict = True)

In [22]:
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
male,0.813167,0.734727,0.771959,622.0
female,0.815642,0.874251,0.843931,835.0
accuracy,0.814688,0.814688,0.814688,0.814688
macro avg,0.814405,0.804489,0.807945,1457.0
weighted avg,0.814586,0.814688,0.813206,1457.0


In [23]:
#save model
import pickle 
pickle.dump(svc, open('./model/model_svm.pickle','wb'))
pickle.dump(Xmean, open('./model/mean_preprocess.pickle', 'wb'))