# Classify
* Load the feature matrix prepared in the previous notebook
* Split the data into train/validate datasets
* Compute a baseline
* Classify using a Support Vector Machine


In [1]:
import numpy as np
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pickle
import pandas as pd
import random

Load the feature matrix prepared in the previous notebook

In [14]:
# Relative directories for the input data and for the pickled models
DATA_PATH = 'data'
MODELS_PATH = 'models'

# Number of rows drawn from the train split to fit the models
NUM_SAMPLES = 1000
# Number of rows drawn from the validate split for the grid search
NUM_VAL_SAMPLES = 250


In [11]:
# Copy every array stored in the archive into a plain dict while the file is
# still open, then pick out the feature matrix and the labels.
archive_path = join(DATA_PATH, 'train_data.npz')
with np.load(archive_path) as data:
    data_dict = {key: value for key, value in data.items()}
X, y = data_dict['X'], data_dict['y']

Split the train data into train/validate datasets. The validate dataset will be used for tuning the hyperparameters. Make sure the train dataset has at least `NUM_SAMPLES` entries.

In [12]:
# Split the data into train and validate datasets (90% / 10%).
# random_state is fixed so that the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, random_state=0)

# The sub-sampling further down draws NUM_SAMPLES / NUM_VAL_SAMPLES rows from
# these splits, so fail early with a clear message if a split is too small.
assert len(X_train) >= NUM_SAMPLES, (
    'train dataset has {} entries, need at least {}'.format(len(X_train), NUM_SAMPLES))
assert len(X_val) >= NUM_VAL_SAMPLES, (
    'validate dataset has {} entries, need at least {}'.format(len(X_val), NUM_VAL_SAMPLES))

print('The training dataset has {} entries, the validate dataset {}.'.format(
    len(X_train), len(X_val)))

The training dataset has 10369 entries, the validate dataset 1153.


In [15]:
# Randomly sub-sample the train and validate datasets.
# While this is not strictly necessary, it keeps the number of samples small
# (and the fits fast) while trying things out.
def sample_rows(X_full, y_full, n_rows, rng):
    """Return `n_rows` randomly chosen rows of `X_full` with their labels.

    The rows are drawn without replacement using `rng` (a `random.Random`
    instance). The same indices are applied to `X_full` and `y_full`, so
    features and labels stay aligned.

    Raises ValueError (from `rng.sample`) if `n_rows` is larger than the
    number of rows available.
    """
    # range(len(X_full)) covers every row, including the last one
    idx = rng.sample(range(len(X_full)), n_rows)
    return X_full[idx], y_full[idx]


# Dedicated, seeded generator: a fresh run always draws the same subsets
rng = random.Random(0)

X_train_sub, y_train_sub = sample_rows(X_train, y_train, NUM_SAMPLES, rng)
X_val_sub, y_val_sub = sample_rows(X_val, y_val, NUM_VAL_SAMPLES, rng)


In [16]:
# Flatten every image into a 1-D feature vector, giving one 2-D array of
# shape (n_samples, n_features) per dataset instead of a Python list of arrays.
# Assumes all images in a dataset share the same shape — TODO confirm for the
# arrays stored in train_data.npz.
# Reshaping data that is already flat leaves it unchanged, so re-running this
# cell is safe.
X_train_sub = np.asarray(X_train_sub).reshape(len(X_train_sub), -1)
X_val_sub = np.asarray(X_val_sub).reshape(len(X_val_sub), -1)

## Compute a baseline

In [7]:
# Baseline model using the 'most_frequent' strategy, fitted on the train sub-sample
dummy = DummyClassifier(strategy='most_frequent').fit(X_train_sub, y_train_sub)

# Persist the fitted baseline next to the other models
dummy_path = join(MODELS_PATH, 'dummy.pickle')
with open(dummy_path, 'wb') as handle:
    pickle.dump(dummy, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM
Create a pipeline using a standard scaler and a support vector machine (SVM) model.

In [8]:
# RBF-kernel support vector classifier; C is tuned by the grid search below
svc = SVC(C=1, kernel='rbf', gamma='scale')

# Standardize the features before they reach the SVM.
# NOTE(review): the original author left it open whether standardization is
# more accurate, or whether dropping the scaler would be faster — not measured here.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('svc', svc),
]
pipe_svc = Pipeline(steps=pipeline_steps)

### Tune the SVM model
Tune the regularization parameter `C` using grid search. `gamma` is not tuned; it is left at `'scale'`.

In [9]:
%%time
# Candidate values for the SVM regularization parameter C
grid_svc = dict(svc__C=[0.75, 1, 2.5])

# Grid search over the candidates with 4-fold cross-validation and 4 parallel jobs;
# train scores are kept so they can be compared with the test scores
gridsearch_svc = GridSearchCV(
    estimator=pipe_svc,
    param_grid=grid_svc,
    cv=4,
    n_jobs=4,
    verbose=3,
    return_train_score=True,
)

# Run the search on the validate sub-sample
gridsearch_svc.fit(X_val_sub, y_val_sub)

Fitting 4 folds for each of 3 candidates, totalling 12 fits
CPU times: user 32.7 s, sys: 357 ms, total: 33 s
Wall time: 36.3 s


In [10]:
# Turn the raw cross-validation results into a table
results_svc = pd.DataFrame(gridsearch_svc.cv_results_)

# Keep the score summaries and the tuned parameter, best mean test score first
cols = [
    'mean_test_score',
    'std_test_score',
    'mean_train_score',
    'std_train_score',
    'param_svc__C',
]
sorted_svc = results_svc.loc[:, cols].sort_values(by='mean_test_score', ascending=False)
sorted_svc.head(10)

Unnamed: 0,mean_test_score,std_test_score,mean_train_score,std_train_score,param_svc__C
0,0.32,0.063246,0.57,0.050222,0.75
2,0.32,0.063246,0.993333,0.006667,2.5
1,0.3,0.066332,0.773333,0.009428,1.0


### Refit and save the model

In [11]:
# Take the winning hyperparameters straight from the grid search instead of
# from the sorted results table: this cell then no longer depends on the
# display cell having been run, and it automatically picks up any parameter
# that is later added to the grid.
best_params = gridsearch_svc.best_params_

# set the best parameters in the pipeline
pipe_svc.set_params(**best_params)

# fit the model on the train sub-sample (the search itself ran on the
# validate sub-sample)
pipe_svc.fit(X_train_sub, y_train_sub)

# save the model
with open(join(MODELS_PATH, 'pipe_svc.pickle'), 'wb') as handle:
    pickle.dump(pipe_svc, handle, protocol=pickle.HIGHEST_PROTOCOL)