In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

**MNIST Digits - Classification Using SVM**

**Objective:** We will develop a Support Vector Machine model that correctly classifies handwritten digits from 0-9, using the pixel values as features. This is therefore a 10-class classification problem.

**Data Description:** For this problem, we use the MNIST data, which is a large database of handwritten digits. The pixel values of each digit (image) are the features, and the actual digit between 0 and 9 is the label.

Since each image is 28 x 28 pixels and each pixel forms one feature, there are 784 features.

In [4]:
# Show the kernel's current working directory (relative file paths resolve against it).
import os
os.getcwd()

'C:\\Users\\Anam Fatima'

In [5]:
# Switch into the folder that holds the MNIST CSV files so that the relative
# read_csv paths used later resolve. The location can be overridden by setting
# the MNIST_DATA_DIR environment variable; when it is unset, the original
# machine-specific path is used as the fallback.
os.chdir(os.environ.get('MNIST_DATA_DIR', 'C:\\Users\\Anam Fatima\\Downloads\\mnist'))

In [6]:
# Read the training CSV from the working directory and preview the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Print the (rows, columns) shape, then a summary of the index, dtypes and memory use.
print('train_shape:', train.shape)
train.info()

train_shape: (42000, 785)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [8]:
# Count how many training rows carry each label, to see how the labels are distributed.
train['label'].value_counts()

1    4684
7    4401
3    4351
9    4188
2    4177
6    4137
0    4132
4    4072
8    4063
5    3795
Name: label, dtype: int64

In [9]:
# Read the test CSV from the working directory and preview the first rows.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Features are every column except the target; the target is the 'label' column.
X = train.drop(columns=['label'])
Y = train['label']

display('x_shape:', X.shape, 'y_shape:', Y.shape)

'x_shape:'

(42000, 784)

'y_shape:'

(42000,)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=10
)
display(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(33600, 784)

(33600,)

(8400, 784)

(8400,)

First, I will build a simple non-linear SVC model using the RBF kernel.

In [11]:
# Fit a support vector classifier with the RBF kernel; all other settings stay at their defaults.
rbf_model = SVC(kernel='rbf')
rbf_model.fit(x_train, y_train)

SVC()

In [12]:
# Predict a label for every row of the held-out split.
y_pred = rbf_model.predict(x_test)

In [13]:
# Fraction of held-out rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9757142857142858


K-Folds cross-validator

Provides train/test indices to split the data into train/test sets. It splits the dataset into k consecutive folds (without shuffling by default; the `KFold` call below passes `shuffle=True`).

Each fold is then used once as the validation set while the k - 1 remaining folds form the training set.

In [14]:
# 5-fold splitter; shuffling with a fixed seed keeps the fold assignment repeatable.
folds = KFold(n_splits=5, shuffle=True, random_state=10)

# Candidate values for the RBF kernel coefficient (gamma) and the
# regularisation parameter (C): 3 x 3 = 9 combinations.
# NOTE(review): the features appear to be used unscaled here - confirm these
# gamma values suit raw pixel intensities.
params = {
    'gamma': [0.01, 0.1, 10],
    'C': [0.1, 1.0, 10],
}

model = SVC(kernel='rbf')

# Exhaustive search over the grid, scored by accuracy, with jobs run in parallel.
grid_model = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=folds,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)



In [None]:
# Run the search on the training split: 9 candidates x 5 folds = 45 SVC fits.
# NOTE(review): the saved notebook shows no completed result for this cell -
# presumably the run was interrupted; confirm before relying on grid_model.
grid_model.fit(x_train, y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


# One-vs-One (OVO) and One-vs-Rest (OVR)

In [12]:
# Wrap a default SVC in a one-vs-rest classifier and fit it on the training split.
OVR_SVC = OneVsRestClassifier(estimator=SVC())
OVR_SVC.fit(x_train, y_train)

OneVsRestClassifier(estimator=SVC())

In [13]:
# Score the one-vs-rest model on the held-out split.
y_pred = OVR_SVC.predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9760714285714286


In [26]:
# Grid for the SVC wrapped inside the one-vs-rest classifier. The
# 'estimator__' prefix routes each setting to that inner estimator.
hyper_param = {
    'estimator__C': [0.01, 0.1, 10],
    'estimator__gamma': [1, 0.1, 0.01],
    'estimator__kernel': ['rbf'],
}

ovr = OneVsRestClassifier(estimator=SVC())

# 3-fold cross-validated search over the grid, scored by accuracy.
ovr_grid_mod = GridSearchCV(
    estimator=ovr,
    param_grid=hyper_param,
    cv=3,
    scoring='accuracy',
)

In [None]:
# Run the one-vs-rest grid search on the training split.
ovr_grid_mod.fit(x_train, y_train)

In [25]:
# List the parameter names the one-vs-rest wrapper exposes; settings of the
# inner SVC appear with the 'estimator__' prefix used as grid-search keys.
ovr.get_params().keys()

dict_keys(['estimator__C', 'estimator__break_ties', 'estimator__cache_size', 'estimator__class_weight', 'estimator__coef0', 'estimator__decision_function_shape', 'estimator__degree', 'estimator__gamma', 'estimator__kernel', 'estimator__max_iter', 'estimator__probability', 'estimator__random_state', 'estimator__shrinking', 'estimator__tol', 'estimator__verbose', 'estimator', 'n_jobs'])

In [None]:
# Best mean cross-validated accuracy found by the one-vs-rest grid search.
# The search object is created under the name `ovr_grid_mod`; the previous
# name `ovr_grid_model` was never defined and raised NameError.
ovr_grid_mod.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated accuracy.
# The search object is created under the name `ovr_grid_mod`; the previous
# name `ovr_grid_model` was never defined and raised NameError.
ovr_grid_mod.best_params_

In [None]:
# Predict the held-out split with the fitted grid search object.
# The search object is created under the name `ovr_grid_mod`; the previous
# name `ovr_grid_model` was never defined and raised NameError.
y_pred = ovr_grid_mod.predict(x_test)