# Support vector machine

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [9]:
from sklearn import preprocessing

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# model validation
from sklearn.metrics import accuracy_score

In [10]:
train_labels = pd.read_csv("./data/train_labels.csv", header=None)
train_data = pd.read_csv("./data/train_data.csv", header=None)

In [11]:
# scale
scaled_features = preprocessing.scale(train_data)
pd.DataFrame(scaled_features).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,254,255,256,257,258,259,260,261,262,263
0,-1.571333,-1.423949,-0.935285,-1.169432,-0.772396,-1.258998,-0.811812,-0.985136,-0.581884,-0.756522,...,1.279245,1.266232,-0.158097,0.297566,0.164625,0.499226,0.921775,-0.185416,0.092879,0.006096
1,-0.602171,0.267213,0.055395,-0.711935,-1.073589,-0.876857,-0.373028,0.155161,0.39948,0.963212,...,-0.897103,-0.852736,0.074136,1.103637,0.111916,-0.726099,0.269456,-1.177036,-0.694615,-0.2238
2,-0.584983,0.148239,0.606352,0.806748,0.07589,-0.140405,-0.116555,0.220999,-0.266751,-0.562176,...,-0.406852,1.244176,0.375294,1.291144,0.824687,2.77078,-0.179373,-0.786025,0.560227,-0.548004
3,0.193432,1.292285,0.85447,0.486911,0.341579,0.529897,0.640292,0.450361,0.375602,0.654629,...,-0.975856,0.528008,-0.66158,0.90356,-1.144311,0.899273,-1.229087,0.110236,2.960029,-0.956088
4,-0.825078,-0.712986,-0.865418,-1.029277,-1.073589,-1.243794,-0.87486,-1.021802,-0.95912,-0.545164,...,-0.189255,1.433153,-0.525024,-1.507232,-0.966014,-0.535219,-0.432634,-1.336981,-1.421282,0.632431


In [12]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, train_labels[0], test_size=0.30)

In [13]:
svc_model = SVC()
svc_model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
predictions = svc_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6371275783040489

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001]} 

In [18]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   5.8s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


[CV] ................................... C=0.1, gamma=1, total=   5.5s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   5.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   5.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   5.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   5.7s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   4.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   3.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  4.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [19]:
grid_predictions = grid.predict(X_test)

In [20]:
accuracy_score(y_true=y_test, y_pred=grid_predictions)

0.6699770817417876

In [40]:
from sklearn.metrics import classification_report
print(classification_report(y_test,grid_predictions))

             precision    recall  f1-score   support

          1       0.68      0.90      0.78       636
          2       0.68      0.58      0.63       202
          3       0.66      0.70      0.68        97
          4       0.64      0.55      0.59        77
          5       0.55      0.15      0.24        71
          6       0.53      0.38      0.44        84
          7       0.00      0.00      0.00        27
          8       0.86      0.41      0.56        61
          9       0.56      0.29      0.38        31
         10       0.00      0.00      0.00        23

avg / total       0.64      0.67      0.64      1309



### train using full data set

In [38]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001]} 

grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(scaled_features, train_labels[0])

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=5, gamma=0.005 ................................................
[CV] ................................. C=5, gamma=0.005, total=   5.4s
[CV] C=5, gamma=0.005 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s remaining:    0.0s


[CV] ................................. C=5, gamma=0.005, total=   6.0s
[CV] C=5, gamma=0.005 ................................................
[CV] ................................. C=5, gamma=0.005, total=   5.7s
[CV] C=5, gamma=0.001 ................................................
[CV] ................................. C=5, gamma=0.001, total=   4.0s
[CV] C=5, gamma=0.001 ................................................
[CV] ................................. C=5, gamma=0.001, total=   3.8s
[CV] C=5, gamma=0.001 ................................................
[CV] ................................. C=5, gamma=0.001, total=   3.5s
[CV] C=5, gamma=0.0005 ...............................................
[CV] ................................ C=5, gamma=0.0005, total=   3.4s
[CV] C=5, gamma=0.0005 ...............................................
[CV] ................................ C=5, gamma=0.0005, total=   3.6s
[CV] C=5, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  3.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [5, 10, 50], 'gamma': [0.005, 0.001, 0.0005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [39]:
grid.best_params_

{'C': 5, 'gamma': 0.001}

### test on submission data

In [41]:
test_data = pd.read_csv("./data/test_data.csv", header=None)
scaled_test_data = preprocessing.scale(test_data)
pd.DataFrame(scaled_test_data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,254,255,256,257,258,259,260,261,262,263
0,0.043826,-0.809767,-0.945635,-0.571495,-0.451566,-0.539191,-0.568266,-0.369424,-0.299865,0.101217,...,0.988079,0.40627,1.051041,0.315653,1.091843,1.080988,0.829564,1.646998,0.159933,1.253805
1,-0.764994,-0.760131,-1.318111,-0.279499,-1.052338,-0.93628,-1.156958,-1.194894,-1.448273,-1.322496,...,-1.008087,-0.546926,0.945286,-0.057584,-0.624147,0.383785,0.286891,1.209392,1.329834,0.436842
2,-1.111097,-0.395324,-0.271093,-0.399065,-0.108006,-0.318027,-0.471264,-0.778783,-0.784574,-0.720827,...,-0.645886,0.708601,-0.063793,-0.748778,-1.639754,-0.541667,-0.921541,-0.184411,-0.864731,-0.883554
3,-0.238746,0.154644,0.249435,-0.011017,0.36387,0.735586,0.085907,0.430105,0.388821,1.164532,...,-0.687927,1.859212,-1.102982,0.289751,-0.835946,0.829969,-2.591491,-0.12366,0.844177,-1.695492
4,-1.17684,-1.313125,-1.168628,-1.241462,-1.013563,-0.95083,-0.793338,-0.848165,-0.870585,-0.55014,...,-0.943363,0.394816,-1.090551,1.65006,-1.533462,-1.704437,-1.215077,-0.021176,-0.107274,-0.871699


In [42]:
submission_predictions = grid.predict(scaled_test_data)

In [43]:
np.unique(submission_predictions)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [46]:
template = pd.read_csv("./data/dummy_solution_accuracy.csv")
submission = template.copy()
submission["Sample_label"] = submission_predictions
submission.head()

Unnamed: 0,Sample_id,Sample_label
0,1,3
1,2,9
2,3,1
3,4,1
4,5,1


In [47]:
submission.to_csv("./submission/accuracy_submission_svm.csv", index=False)

In [52]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001]} 

grid_prob = GridSearchCV(SVC(probability=True),param_grid,refit=True,verbose=2)
grid_prob.fit(scaled_features, train_labels[0])

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=  50.5s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.4s remaining:    0.0s


[CV] ................................... C=0.1, gamma=1, total=  48.1s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=  48.5s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=  48.7s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=  53.5s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=  52.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=  28.9s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=  27.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 31.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

### on submission data

In [56]:
submission_data = pd.read_csv("./data/test_data.csv", header=None)
submission_proba = grid_prob.predict_proba(submission_data)

In [57]:
dummy = pd.read_csv("./data/dummy_solution_accuracy.csv")
dummy.drop("Sample_label", inplace=True, axis=1)
print(dummy.shape)
dummy.head()

(6544, 1)


Unnamed: 0,Sample_id
0,1
1,2
2,3
3,4
4,5


In [58]:
submission2 = dummy.copy()

In [59]:
for i in range(10):
    submission2[f'Class_{i+1}'] = submission_proba[:, i]

In [60]:
submission2.head()

Unnamed: 0,Sample_id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,Class_10
0,1,0.374756,0.519683,0.027261,0.006329,0.020247,0.001659,0.035121,0.000478,0.008594,0.005872
1,2,0.374756,0.519683,0.027261,0.006329,0.020247,0.001659,0.035121,0.000478,0.008594,0.005872
2,3,0.374756,0.519683,0.027261,0.006329,0.020247,0.001659,0.035121,0.000478,0.008594,0.005872
3,4,0.374756,0.519683,0.027261,0.006329,0.020247,0.001659,0.035121,0.000478,0.008594,0.005872
4,5,0.374756,0.519683,0.027261,0.006329,0.020247,0.001659,0.035121,0.000478,0.008594,0.005872


In [61]:
submission2.to_csv("./submission/logloss_svm.csv", index=False)