# SVM Model Tuning

In this notebook, we will take the folds generated by the OU class in the previous notebook and try to find the best set of parameters for our SVM to perform binary classification.

In [12]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle

import OU

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Directory that holds the fold data and receives every output of this notebook.
# Keep the trailing slash: later cells build paths by plain string concatenation
# (e.g. save_dir + 'splits.npy').
save_dir = './data/'

In [10]:
# Load the per-fold data written by the previous notebook. Each element is
# indexed below as info[i]['train' | 'test']['df_scale' | 'labels'], i.e. the
# file stores Python objects, so NumPy has to unpickle it. Since NumPy 1.16.3
# np.load refuses to do that unless allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load files that were
# produced by this project.
info = np.load(save_dir + "/info.npy", allow_pickle=True)

First we need to recompile the fold dictionary into one large dataframe so that scikit-learn's GridSearchCV class can search for the optimal parameters efficiently.

In [None]:
# Stack every fold's train and test rows into one feature frame / label Series
# and record, per fold, which rows of the stacked data are train and which are
# test. GridSearchCV later consumes these (train, test) index pairs as its `cv`.
#
# The pieces are collected in lists and concatenated ONCE after the loop:
# appending to a DataFrame inside the loop re-copies all accumulated rows on
# every iteration (quadratic), and DataFrame.append / Series.append no longer
# exist in pandas >= 2.0.
fold_frames = []   # feature frames in stacking order: train_0, test_0, train_1, ...
fold_labels = []   # label Series in the same order
fold_bounds = []   # (start, middle, stop) row positions of each fold in the stack
row_offset = 0     # position in the stacked data where the current fold begins

for i in range(len(info)):
    train = info[i]['train']['df_scale']
    train_labels = info[i]['train']['labels']

    test = info[i]['test']['df_scale']
    test_labels = info[i]['test']['labels']

    train_len = train.shape[0]
    test_len = test.shape[0]

    # Features and labels must describe the same rows.
    assert len(train_labels) == train_len
    assert len(test_labels) == test_len

    # Queue this fold's rows: train block first, then test block
    fold_frames.extend([train, test])
    fold_labels.extend([train_labels, test_labels])

    # Track explicit positions instead of slicing from the end of the frame;
    # negative-offset slicing silently breaks when test_len == 0 (-0 == 0).
    train_stop = row_offset + train_len
    fold_stop = train_stop + test_len
    fold_bounds.append((row_offset, train_stop, fold_stop))
    row_offset = fold_stop

# Single concatenation; ignore_index=True yields a fresh 0..N-1 index, so row
# positions and index labels coincide.
if fold_frames:
    multi_cv_df = pd.concat(fold_frames, ignore_index=True)
    multi_cv_labels = pd.concat(fold_labels, ignore_index=True)
else:
    multi_cv_df = pd.DataFrame()
    multi_cv_labels = pd.Series(dtype='float64')

# One row per fold: column 0 holds the train index, column 1 the test index.
# The object array is filled element by element because the index lengths
# differ, and np.array() on such ragged input is rejected by newer NumPy.
splits = np.empty((len(fold_bounds), 2), dtype=object)
for i, (start, middle, stop) in enumerate(fold_bounds):
    splits[i, 0] = pd.RangeIndex(start, middle)
    splits[i, 1] = pd.RangeIndex(middle, stop)

    # Quality Assurance: the stacked rows of each fold equal the fold's data
    assert np.array_equal(multi_cv_df.iloc[start:middle].values,
                          info[i]['train']['df_scale'].values)
    assert np.array_equal(multi_cv_labels.iloc[start:middle].values,
                          info[i]['train']['labels'].values)
    assert np.array_equal(multi_cv_df.iloc[middle:stop].values,
                          info[i]['test']['df_scale'].values)
    assert np.array_equal(multi_cv_labels.iloc[middle:stop].values,
                          info[i]['test']['labels'].values)

# Object array -> stored via pickle; load with np.load(..., allow_pickle=True).
np.save(save_dir + 'splits.npy', splits)

In [11]:
# Write the stacked features and their labels next to the other artefacts
for table, filename in ((multi_cv_df, 'df.csv'), (multi_cv_labels, 'labels.csv')):
    table.to_csv(save_dir + filename)

In [None]:
# Gridsearch

We want to find the optimal hyperparameters for our SVM by exhaustively evaluating every combination of the candidate hyperparameter values defined below.


In [232]:
# Hyperparameter grid: one sub-grid per kernel. Every combination inside a
# sub-grid is a candidate (64 for rbf + 96 for poly = 160 in total).
# 'cache_size' has a single value, so it configures the SVC without enlarging
# the grid. 'class_weight' maps class label -> weight.
params = [
    # RBF kernel: 4 C x 4 gamma x 4 class weightings
    dict(
        kernel=['rbf'],
        C=[0.1, 1, 10, 100],
        gamma=[1, 0.1, 0.001, 0.0001],
        cache_size=[2000],
        class_weight=[
            {0: 0.5, 1: 0.5},
            {0: 0.6, 1: 0.4},
            {0: 0.7, 1: 0.3},
            {0: 0.8, 1: 0.2},
        ],
    ),
    # Polynomial kernel: 4 C x 4 gamma x 2 degrees x 3 class weightings
    dict(
        kernel=['poly'],
        C=[0.1, 1, 10, 100],
        gamma=[1, 0.1, 0.001, 0.0001],
        degree=[3, 5],
        cache_size=[2000],
        class_weight=[
            {0: 0.5, 1: 0.5},
            {0: 0.6, 1: 0.4},
            {0: 0.7, 1: 0.3},
        ],
    ),
]

In [233]:
# Exhaustive search over `params`, scored by precision on the pre-computed
# (train, test) index pairs in `splits`.
#   n_jobs=-1   -> run the fits in parallel on all available cores
#   refit=False -> only collect cross-validation scores; no final model is
#                  refit on the full data afterwards
gridcv = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=params,
    scoring=['precision'],
    cv=list(splits),
    refit=False,
    n_jobs=-1,
    verbose=1,
)

gridcv.fit(multi_cv_df, multi_cv_labels)

Fitting 2005 folds for each of 160 candidates, totalling 320800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 11218 tasks      

GridSearchCV(cv=[array([RangeIndex(start=0, stop=3000, step=1),
       RangeIndex(start=3000, stop=3100, step=1)], dtype=object), array([RangeIndex(start=3100, stop=6100, step=1),
       RangeIndex(start=6100, stop=6200, step=1)], dtype=object), array([RangeIndex(start=6200, stop=9200, step=1),
       RangeIndex...2400, stop=6215400, step=1),
       RangeIndex(start=6215400, stop=6215447, step=1)], dtype=object)],
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.001, 0.0001], 'cache_size': [2000], 'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4}, {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}]}, {'kernel': ['poly'], 'C': [0.1,

In [242]:
# Pickle the fitted GridSearchCV object so its results can be reloaded later
with open(save_dir+'gridsearch_results.pkl', 'wb') as pkl_out:
    pickle.dump(gridcv, file=pkl_out)