In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm

In [2]:
# read in dataset
df = pd.read_csv('train.csv', header=0)

In [3]:
# check the data out
df.describe()
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,4.456643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.219286,0.117095,0.059024,0.02019,0.017238,0.002857,0.0,0.0,0.0,0.0
std,2.88773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.31289,4.633819,3.274488,1.75987,1.894498,0.414264,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Many pixels have zero or near-zero variance

In [4]:
# split data for validation
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0], \
                                                    test_size=0.2, random_state=10)

In [5]:
# Standardize data (normalize and center)
scalerX = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scalerX.transform(X_train)
X_test_scaled = scalerX.transform(X_test)

### Reduce dimensionality

In [6]:
# Low variance filter

# reduce number of dimensions by dropping columns with low variance
low_var_index = []
var_tol = 1 # might need to scan over this value
num_features = len(df.columns)-1

for i in range(num_features):
    if ( np.var(X_train_scaled[:, i]) > var_tol ): low_var_index.append(i)

X_train_scaled_dimred = X_train_scaled[:, low_var_index]
X_test_scaled_dimred = X_test_scaled[:, low_var_index]
X_train_scaled_dimred.shape
X_test_scaled_dimred.shape

(33600, 250)

(8400, 250)

### Train model

In [7]:
# SVM Classifier
# C value chosen by trial; performing a grid search was slower than simply choosing hyperparameter values by hand.

svc = svm.SVC(C=100)
svc.fit(X_train_scaled_dimred, y_train)
svc.score(X_test_scaled_dimred, y_test)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

0.9615476190476191

#### The model seems pretty good. So, we should now train on the entire training set to improve accuracy.

In [17]:
# recombine dataset splits
X_full = np.concatenate((X_train_scaled_dimred, X_test_scaled_dimred), axis=0)
y_full = np.concatenate((y_train, y_test), axis=0)

In [None]:
# train model on full training dataset
svc_full = svm.SVC(C=100)
svc_full.fit(X_full, y_full)

### Prediction

In [20]:
# read in test set
test_df = pd.read_csv('test.csv', header=0)
test_df.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# pre-process test set
test_scaled = scalerX.transform(test_df)
test_scaled_dimred = test_scaled[:, low_var_index]
test_scaled_dimred.shape

(28000, 250)

In [22]:
# make prediction
submission = svc_full.predict(test_scaled_dimred)

In [23]:
# convert prediction to a dataframe with the appropriate index and column label for submission
submission_df = pd.DataFrame(submission, columns=['Label'])
submission_df.index += 1
submission_df.head()

Unnamed: 0,Label
1,2
2,0
3,9
4,9
5,3


In [24]:
# write submission to csv
submission_df.to_csv('Kaggle_Digit_Recognizer-SVM_submission.csv', index_label='ImageId')

In [None]:
#end