In [1]:
"""
Running MNIST pipeline entirely online, rather than from csv files like currently done in the experiments folder
Steps:
    1. Import MNIST data (from Kaggle to be consistent with Sebastien)
    2. Blur the training images
    3. Reduce image matrices by principal component analysis
    4. classify using polynomial SVM

Outputs 0.937166667 accuracy on Kaggle test dataset after training on entire train dataset
"""
    
# from gcp_hpo.smart_search import SmartSearch
# from ..smart_search import SmartSearch # Alec edit
import numpy as np
import math
import pandas as pd
from scipy.ndimage.filters import gaussian_filter 
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [2]:
# Make iPython notebook take up whole screen

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Print more items in numpy arrays
np.set_printoptions(edgeitems=100)

In [None]:
# This cv2 implementation allows us to do the Gaussian blur, but only for odd values of the kernel size,
# so not incorporating for now
# from cv2 import GaussianBlur

# def gaussian_blur(img):
#     return GaussianBlur(img,(5,5),sigmaX=4)

# print gaussian_blur(X_train[0,:])

In [3]:
# Set parameters
blur_ksize = 1 # Fixed at this value
blur_sigma = 4
pca_dim = 60
degree_poly = 3
gamma = 10**(-3)

In [4]:
def import_data():
    data = np.loadtxt('/Users/aandersonlaptop/Desktop/MNIST_train.csv',skiprows=1,delimiter=',')
    train_x = data[:40000,1:]
    train_y = data[:40000,0]
    # print 'train Y',train_y
    # print 'train X',train_x
    test_x = data[40000:,1:]
    test_y = data[40000:,0]
    
    # print 'test X',test_x
    return train_x,train_y,test_x,test_y
    
    
X_train,y_train,X_test,y_test = import_data()

In [5]:
def gaussian_blur(X,stddev):
    # This fixes a kernel size, can incorporate changing kernel sizes later using scipy generic_filter or opencv
    return gaussian_filter(X,sigma=stddev,order=0)

blurred_X_train = gaussian_blur(X_train,stddev=blur_sigma)
print blurred_X_train.shape

(40000, 784)


In [6]:
def do_pca(X,num_components,fit,fitted_model=None):
    if fit:
        pca = PCA(n_components = num_components)
        pca = pca.fit(X)
        return pca,pca.transform(X)
    else:
        return fitted_model.transform(X)
    
pca_model,pca_X_train = do_pca(blurred_X_train,num_components=pca_dim,fit = True)

In [None]:
def random_sample(X,y,num_samples):
    idx = np.random.randint(X.shape[0],size=num_samples)
    return X[idx],y[idx]

X_sample,y_sample = random_sample(pca_X_train,y_train,15000)
print 'shape',X_sample.shape
print 'shape',y_sample.shape

In [None]:
# def do_svm(X,y,degree,gamma_coeff):
#     clf = SVC(kernel='poly',degree=degree,gamma=gamma_coeff)
#     clf.fit(X,y)
#     print 'accuracy on training set',clf.score(X,y)
#     return clf

# # model = do_svm(pca_X_train,y_train,degree=degree_poly,gamma_coeff=gamma) # Whole training set
# model = do_svm(pca_X_train,y_train,degree=degree_poly,gamma_coeff=gamma)
# model = do_svm(X_train[:5000,:],y_train[:5000],degree=degree_poly,gamma_coeff=gamma)
# model = do_svm(X_sample,y_sample,degree=degree_poly,gamma_coeff=gamma)

In [12]:
from sklearn.ensemble import RandomForestClassifier

def random_forest(X,y,num_estimators):
    clf = RandomForestClassifier(n_estimators=num_estimators)
    clf.fit(X,y)
    print 'accuracy on training set',clf.score(X,y)
    return clf

model = random_forest(pca_X_train,y_train,num_estimators=10)

accuracy on training set 0.979425


In [None]:
# ############################# Test model ############################
# def import_test_data():
#     # Import MNIST data
#     data = np.loadtxt('/Users/aandersonlaptop/Desktop/MNIST_test.csv',skiprows=1,delimiter=',') # way slower than pandas
#     new_x = data[:,1:]
#     new_y = data[:,0]
#     print 'Y',new_y
#     print 'X',new_x
#     return new_x,new_y

# X_test,y_test = import_data()

In [13]:
### Test model ###

blurred_X_test = gaussian_blur(X_test,stddev=blur_sigma)
pca_X_test = do_pca(blurred_X_test,num_components=pca_dim,fit=False,fitted_model=pca_model)
print model.score(pca_X_test,y_test)
# print model.score(X_test,y_test)

0.1695
