In [1]:
from __future__ import division
import numpy as np
import scipy as sp
import pandas as pd
import csv
import random as rn

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import KernelPCA, PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm, cross_validation, grid_search
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif, RFECV, SelectKBest

In [3]:
# Data IO: read the MNIST digit and label files for both the train and the
# test portion. All four files share one layout -- no header row and fields
# separated by three spaces (a multi-character separator, hence the python
# parsing engine).
train_digits, train_labels, test_digits, test_labels = (
    pd.read_csv(path, sep='   ', header=None, engine='python')
    for path in (
        "../datasets/mnist/traindigits",
        "../datasets/mnist/trainlabels",
        "../datasets/mnist/testdigits",
        "../datasets/mnist/testlabels",
    )
)

In [4]:
# Pool the supplied train and test files into a single data set; the
# train/test partition actually used for modelling is drawn from this pool
# in a later cell.
# pd.concat is used instead of DataFrame.append, which is deprecated and
# absent from current pandas releases. As with append, the original row
# index labels of each frame are kept.
digits = pd.concat([train_digits, test_digits])
labels = pd.concat([train_labels, test_labels])

In [5]:
# Feature matrix as a plain NumPy array, one row per sample.
# `.values` is used because the as_matrix() accessor no longer exists in
# current pandas; `.values` is available on old and new versions alike.
X = digits.values
# Collapse every label row to the column index of its largest entry, done
# in one vectorized call rather than a Python loop over all rows.
# NOTE(review): presumably each label row is a one-hot digit indicator, so
# this index is the digit class -- confirm against the label files.
y = np.argmax(labels.values, axis=1)

In [13]:
# Draw a seeded 70% / 30% train / held-out partition from the pooled data.
# NOTE(review): this cell's execution counter (13) is higher than those of
# the tuning cells further down (8-12), so their recorded outputs may come
# from an earlier split -- re-run the notebook top to bottom to confirm.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y,
    random_state=20,
    train_size=.7,
)

In [14]:
# Display the dimensions of the pooled feature matrix as (rows, columns).
X.shape

(70000, 784)

# MNIST Classifiers

In [8]:
# Logit: multinomial, L2-penalised logistic regression fitted with the
# newton-cg solver. The regularisation strength is picked from the Cs
# candidates by the estimator's own cross-validation.
logit_clf = LogisticRegressionCV(
    Cs=[.001, .01, .1, 1, 10, 100],
    penalty='l2',
    solver='newton-cg',
    multi_class='multinomial',
).fit(X_train, y_train)
# Accuracy on the held-out split (shown as the cell output).
logit_clf.score(X_test, y_test)

0.91086666666666671

In [9]:
# SVM: polynomial-kernel SVC. C and the polynomial degree are tuned with a
# 3-fold cross-validated grid search spread over 4 parallel jobs.
svm_params = {
    'C': [.01, 1, 10, 100, 500],
    'degree': [1, 2, 3],
}
clf = svm.SVC(kernel='poly')
svm_clf = grid_search.GridSearchCV(clf, svm_params, cv=3, n_jobs=4).fit(X_train, y_train)
# Best mean cross-validation accuracy, then the winning parameter
# combination (the latter is the cell output).
print(svm_clf.best_score_)
svm_clf.best_params_

0.9545


{'C': 500, 'degree': 2}

In [15]:
# Fit one degree-2 polynomial SVC with C=1000 (above the largest C in the
# grid searched earlier) on the training split and report its accuracy on
# the held-out split.
clf = svm.SVC(C=1000, degree=2, kernel='poly').fit(X_train, y_train)
clf.score(X_test, y_test)

0.98061904761904761

In [10]:
# Random Forests: forests of 1024 trees; only the number of features tried
# at each split is tuned, via a 3-fold cross-validated grid search spread
# over 4 parallel jobs.
rf_params = {
    "max_features": [1, 2, 4, 6, 8, 12, 16, 20],
    "n_estimators": [1024],
}
# Seed the estimator so its random sampling repeats from run to run;
# without a seed every execution of this cell yields slightly different
# scores. The value matches the seed used for the train/test split.
clf = RandomForestClassifier(random_state=20)
rf_clf = grid_search.GridSearchCV(clf, rf_params, cv=3, n_jobs=4)
rf_clf.fit(X_train, y_train)
# Best mean cross-validation accuracy, then the winning parameter
# combination (the latter is the cell output).
print(rf_clf.best_score_)
rf_clf.best_params_

0.9439


{'max_features': 8, 'n_estimators': 1024}

In [11]:
# KNN: k-nearest-neighbours classifier; the neighbourhood size is tuned
# with a 3-fold cross-validated grid search spread over 4 parallel jobs.
knn_params = {
    "n_neighbors": [1, 2, 3, 5, 7, 10, 15, 25, 50, 100, 500],
}
clf = KNeighborsClassifier()
knn_clf = grid_search.GridSearchCV(clf, knn_params, cv=3, n_jobs=4).fit(X_train, y_train)
# Best mean cross-validation accuracy, then the winning parameter
# combination (the latter is the cell output).
print(knn_clf.best_score_)
knn_clf.best_params_

0.9403


{'n_neighbors': 1}

In [None]:
# Gradient Boosting: the ensemble size and learning rate are tuned with a
# 3-fold cross-validated grid search spread over 4 parallel jobs.
# NOTE(review): this cell has no execution count or recorded output, and
# gb_clf is not part of the final comparison loop -- it appears never to
# have been run to completion.
gb_params = {
    "n_estimators": [512, 1024],
    "learning_rate": [.01, .1],
}
# Seed the estimator so repeated runs of this cell give the same model;
# the value matches the seed used for the train/test split.
clf = GradientBoostingClassifier(random_state=20)
gb_clf = grid_search.GridSearchCV(clf, gb_params, cv=3, n_jobs=4)
gb_clf.fit(X_train, y_train)
# Best mean cross-validation accuracy, then the winning parameter
# combination (the latter is the cell output).
print(gb_clf.best_score_)
gb_clf.best_params_

In [12]:
# Held-out accuracy of each fitted model, printed one per line in the
# order: logistic regression, SVM, random forest, KNN.
fitted_models = (logit_clf, svm_clf, rf_clf, knn_clf)
for model in fitted_models:
    print(model.score(X_test, y_test))

0.910866666667
0.962016666667
0.950283333333
0.948733333333
