# SVM Classifier

In [13]:
# general imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, log_loss, precision_score

In [14]:
# loading in original data sets
df_training = pd.read_csv('../data/archive/train.csv')
df_testing = pd.read_csv('../data/archive/test.csv')

# loading in pca data sets
df_training_pca = pd.read_csv('../data/archive/train_pca.csv')
df_testing_pca = pd.read_csv('../data/archive/test_pca.csv')

In [15]:
#Separating X and y training and testing
x_train_full = df_training.iloc[:, :-1]
y_train_full = df_training.iloc[:,-1]

x_test = df_testing.iloc[:, :-1]
y_test = df_testing.iloc[:,-1]

# get the pca training and test data

x_train_pca = df_training_pca.iloc[:, :-1]
y_train_pca = df_training_pca.iloc[:,-1]

x_test_pca = df_testing_pca.iloc[:, :-1]
y_test_pca = df_testing_pca.iloc[:,-1]

In [16]:
#Making a grid of values we want our grid search to test to find the best parameters

grid_values = {'kernel': ['linear', 'rbf'], 'C': [1,10,100,1000], 'decision_function_shape': ['ovr', 'ovo']}

# With PCA

In [17]:
#getting all of our randomized samples from the training
all_samples = []

for sample in range(0,5):
    all_samples.append(df_training_pca.sample(n=5000, replace= True))

In [18]:
%%time
# ^ shows runtime of cell

#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
f1_best_params = []
precision_best_params = []

for sample in all_samples:
    svm = SVC()
    svc_classifier = GridSearchCV(estimator = svm, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'f1_micro', 'precision_micro'], refit = False, verbose = 0)
    
    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:,-1]
    
    svc_model = svc_classifier.fit(x_train, y_train)
    
    accuracy_best_params.append(svc_model.cv_results_['params'][ np.argmin(svc_model.cv_results_['rank_test_accuracy'])])
    f1_best_params.append(svc_model.cv_results_['params'][ np.argmin(svc_model.cv_results_['rank_test_f1_micro'])])
    precision_best_params.append(svc_model.cv_results_['params'][ np.argmin(svc_model.cv_results_['rank_test_precision_micro'])])

Wall time: 12min 23s


In [19]:
accuracy_best_params

[{'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'}]

In [20]:
f1_best_params

[{'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'}]

In [21]:
precision_best_params

[{'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'}]

In [35]:
%%time
# ^ shows runtime of cell

accuracy_test_score = []
f1_test_score = []
precision_test_score = []

for i in range(5):
    svc_clf = SVC(kernel = accuracy_best_params[i].get('kernel'), C = accuracy_best_params[i].get('C'),
              decision_function_shape = accuracy_best_params[i].get('decision_function_shape'))
    
    model = svc_clf.fit(x_train_pca, y_train_pca)
    y_pred = model.predict(x_test_pca)
    
    accuracy_test_score.append(accuracy_score(y_test_pca, y_pred))
    f1_test_score.append(f1_score(y_test_pca, y_pred, average = 'micro'))
    precision_test_score.append(precision_score(y_test_pca, y_pred, average = 'micro'))

Wall time: 2.98 s


In [36]:
accuracy_test_score

[0.9294197488971836,
 0.9233118425517476,
 0.9294197488971836,
 0.9233118425517476,
 0.9233118425517476]

In [37]:
f1_test_score

[0.9294197488971836,
 0.9233118425517476,
 0.9294197488971836,
 0.9233118425517476,
 0.9233118425517476]

In [38]:
precision_test_score

[0.9294197488971836,
 0.9233118425517476,
 0.9294197488971836,
 0.9233118425517476,
 0.9233118425517476]

# Without PCA

In [39]:
#getting all of our randomized samples from the training
all_samples = []

for sample in range(0,5):
    all_samples.append(df_training.sample(n=5000, replace= True))

In [40]:
%%time
# ^ shows runtime of cell

#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
f1_best_params = []
precision_best_params = []

for sample in all_samples:
    svm = SVC()
    svc_classifier = GridSearchCV(estimator = svm, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'f1_micro', 'precision_micro'], refit = False, verbose = 0)
    
    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:,-1]
    
    svc_model = svc_classifier.fit(x_train, y_train)
    
    accuracy_best_params.append(svc_model.cv_results_['params'][ np.argmin(svc_model.cv_results_['rank_test_accuracy'])])
    f1_best_params.append(svc_model.cv_results_['params'][ np.argmin(svc_model.cv_results_['rank_test_f1_micro'])])
    precision_best_params.append(svc_model.cv_results_['params'][ np.argmin(svc_model.cv_results_['rank_test_precision_micro'])])

Wall time: 16min 46s


In [41]:
accuracy_best_params

[{'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'}]

In [42]:
f1_best_params

[{'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'}]

In [43]:
precision_best_params

[{'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 1000, 'decision_function_shape': 'ovr', 'kernel': 'rbf'},
 {'C': 100, 'decision_function_shape': 'ovr', 'kernel': 'rbf'}]

In [44]:
%%time
# ^ shows runtime of cell

accuracy_test_score = []
f1_test_score = []
precision_test_score = []

for i in range(5):
    svc_clf = SVC(kernel = accuracy_best_params[i].get('kernel'), C = accuracy_best_params[i].get('C'),
              decision_function_shape = accuracy_best_params[i].get('decision_function_shape'))
    
    model = svc_clf.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    accuracy_test_score.append(accuracy_score(y_test, y_pred))
    f1_test_score.append(f1_score(y_test, y_pred, average = 'micro'))
    precision_test_score.append(precision_score(y_test, y_pred, average = 'micro'))

Wall time: 14.7 s


In [45]:
accuracy_test_score

[0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857]

In [46]:
f1_test_score

[0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857]

In [47]:
precision_test_score

[0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857,
 0.9467254835425857]