In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
file_csv = "data/classify_reduced.csv"
svm_df = pd.read_csv(file_csv, encoding="ISO-8859-1")
svm_df.head()

Unnamed: 0,IsHoliday,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
0,0,24924.5,,,,,
1,0,41595.55,,,,,
2,0,19403.54,,,,,
3,0,21827.9,,,,,
4,0,21043.39,,,,,


In [3]:
# Replace missing values with 0
svm_df = svm_df.dropna()

In [4]:
X = svm_df.loc[:, svm_df.columns != 'IsHoliday']
X.head()

Unnamed: 0,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
86,18689.54,10382.9,6115.67,215.07,2406.62,6551.42
87,19050.66,6074.12,254.39,51.98,427.39,5988.57
88,25293.49,5629.51,68.0,1398.11,2084.64,20475.32
89,33305.92,4640.65,19.0,105.02,3639.42,14461.82
90,45773.03,5011.32,67.0,347.37,225.79,4011.37


In [5]:
y = svm_df['IsHoliday'].to_frame()
y.head()

Unnamed: 0,IsHoliday
86,0
87,0
88,0
89,0
90,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [7]:
model = SVC(cache_size=7000)

In [8]:
model.fit(X_train,y_train)

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [9]:
predictions = model.predict(X_test)

In [10]:
print(confusion_matrix(y_test,predictions))

[[17390     0]
 [ 2011    11]]


In [11]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     17390
           1       1.00      0.01      0.01      2022

   micro avg       0.90      0.90      0.90     19412
   macro avg       0.95      0.50      0.48     19412
weighted avg       0.91      0.90      0.85     19412



In [14]:
MAE = mean_absolute_error(y_test,predictions)
print(MAE)

0.10359571399134555


In [None]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [None]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [None]:
grid.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.1, gamma=1, kernel=rbf, score=0.89734178193339, total=20.4min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 23.8min remaining:    0.0s


In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
grid_predictions = grid.predict(X_test)

In [None]:
print(confusion_matrix(y_test,grid_predictions))

In [None]:
print(classification_report(y_test,grid_predictions))