In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Plot defaults
plt.rcParams['font.size'] = 11
# plt.rcParams['figure.figsize'] = [16, 10] # for big plots
# %matplotlib notebook

# Display defaults: show up to 500 columns before pandas truncates wide frames
pd.options.display.max_columns = 500
# pd.set_option('display.max_rows',100)
# clear output in Jupyter cell
from IPython.display import clear_output

In [2]:
# Import datasets, classifiers and performance metrics
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split

### Load data and split into train, validation, and test sets

In [3]:
# Load data from https://www.openml.org/d/554
# as_frame=False pins the return type to NumPy arrays. scikit-learn >= 0.24
# defaults to as_frame='auto', which returns a pandas DataFrame/Series for this
# dataset, so without the flag the downstream cells receive different types
# depending on the installed version.
# The labels are digit strings, not ints.
X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [8]:
# Stratified 70/15/15 split: hold out 30% first, then cut that hold-out in half
# to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [9]:
# Report how many rows ended up in each split
for size_template, split_features in (
    ('{} train instances', X_train),
    ('{} valid instances', X_valid),
    ('{} test instances', X_test),
):
    print(size_template.format(split_features.shape[0]))

49000 train instances
10500 valid instances
10500 test instances


# Train a few models

### SVM

In [None]:
from sklearn import svm

In [12]:
# RBF-kernel SVM. probability=True makes fit() run an additional internal
# cross-validation to calibrate predict_proba (used later for soft voting).
# That step shuffles the data, so random_state is fixed to keep the
# probabilities reproducible from run to run.
svm_clf = svm.SVC(kernel='rbf', C=10, probability=True, random_state=42)
# fit
svm_clf.fit(X_train, y_train)

SVC(C=10, probability=True)

In [13]:
# Validation-set hard labels and per-class probabilities from the fitted SVM
svm_preds, svm_probs = svm_clf.predict(X_valid), svm_clf.predict_proba(X_valid)

In [23]:
# Fraction of validation samples the SVM labels correctly
print('SVM accuracy = {:.3f}'.format(
    metrics.accuracy_score(y_true=y_valid, y_pred=svm_preds)
))

SVM accuracy = 0.981


### KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

In [21]:
# 4-nearest-neighbour classifier with distance-weighted votes
knn_clf = KNeighborsClassifier(
    n_neighbors=4,
    weights='distance',
)
# fit
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [24]:
# Validation-set hard labels and per-class probabilities from the fitted KNN
knn_preds, knn_probs = knn_clf.predict(X_valid), knn_clf.predict_proba(X_valid)

In [25]:
# Fraction of validation samples the KNN labels correctly
print('KNN accuracy = {:.3f}'.format(
    metrics.accuracy_score(y_true=y_valid, y_pred=knn_preds)
))

KNN accuracy = 0.972


### Gradient Boosting trees

In [30]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [31]:
# Histogram-based gradient boosting with default hyper-parameters.
# The estimator draws random numbers (binning subsample and, when early
# stopping is active, its internal train/validation split), so seed it to make
# the fitted model and its scores repeatable.
hgb_clf = HistGradientBoostingClassifier(random_state=42)
# fit
hgb_clf.fit(X_train, y_train)

HistGradientBoostingClassifier()

In [32]:
# Validation-set hard labels and per-class probabilities from the fitted HGB model
hgb_preds, hgb_probs = hgb_clf.predict(X_valid), hgb_clf.predict_proba(X_valid)

In [34]:
# Fraction of validation samples the HGB model labels correctly
print('HGB accuracy = {:.3f}'.format(
    metrics.accuracy_score(y_true=y_valid, y_pred=hgb_preds)
))

HGB accuracy = 0.972


# Ensembles

### Hard voting

In [53]:
# One column of predicted labels per model, one row per validation sample
catpreds = pd.DataFrame({
    'knn': knn_preds,
    'svm': svm_preds,
    'hgb': hgb_preds,
})

In [61]:
# Majority vote across the three models, row by row.
# NOTE(review): mode(axis=1) lists tied values in sorted order, so when all
# three models disagree column 0 is simply the smallest label.
print('hard voting accuracy = {:.3f}'.format(
    metrics.accuracy_score(y_true=y_valid, y_pred=catpreds.mode(axis=1)[0].to_numpy())
))

hard voting accuracy = 0.981


### Soft voting

This may be of limited utility because the KNN probabilities are not very informative (they come from only 4 distance-weighted neighbours).

Stack the three probability arrays along a new leading axis, giving an array of shape (n_models, n_samples, n_classes).

In [69]:
# Leading axis indexes the model (knn, svm, hgb); the remaining axes are those
# of each model's predict_proba output.
probs = np.stack([knn_probs, svm_probs, hgb_probs], axis=0)

In [71]:
# Sanity check on the stacked array's dimensions
np.shape(probs)

(3, 10500, 10)

Mean of the class probabilities across the three models:

In [82]:
# Soft vote (mean): average the three models' class probabilities and pick the
# most probable class for every validation sample.
# predict_proba columns follow each estimator's classes_ order, so a
# column-wise average is only meaningful when all three orderings agree.
assert np.array_equal(knn_clf.classes_, svm_clf.classes_)
assert np.array_equal(svm_clf.classes_, hgb_clf.classes_)
# Winning column index per sample (variable name kept for compatibility).
mean_probs = probs.mean(axis=0).argmax(axis=1)
# Map column indices to the real labels through classes_ rather than relying on
# int(label) happening to equal the column position.
mean_preds = svm_clf.classes_[mean_probs]
print('(mean) soft voting accuracy = {:.3f}'.format(metrics.accuracy_score(y_valid, mean_preds)))

(mean) soft voting accuracy = 0.981


Median of the class probabilities across the three models:

In [85]:
# Soft vote (median): take the per-class median of the three models'
# probabilities and pick the highest-scoring class for every validation sample.
# predict_proba columns follow each estimator's classes_ order, so a
# column-wise median is only meaningful when all three orderings agree.
assert np.array_equal(knn_clf.classes_, svm_clf.classes_)
assert np.array_equal(svm_clf.classes_, hgb_clf.classes_)
# Winning column index per sample (variable name kept for compatibility).
med_probs = np.median(probs, axis=0).argmax(axis=1)
# Map column indices to the real labels through classes_ rather than relying on
# int(label) happening to equal the column position.
med_preds = svm_clf.classes_[med_probs]
print('(median) soft voting accuracy = {:.3f}'.format(metrics.accuracy_score(y_valid, med_preds)))

(median) soft voting accuracy = 0.981


# Test set

In [86]:
# Test-set predictions from the already-fitted models.
# NOTE: this rebinds svm_preds / knn_preds / hgb_preds, which held the
# validation-set predictions up to this point.
svm_preds, knn_preds, hgb_preds = (
    fitted_clf.predict(X_test) for fitted_clf in (svm_clf, knn_clf, hgb_clf)
)

In [87]:
# Test-set accuracy of each individual model
for accuracy_template, model_test_preds in (
    ('SVM accuracy = {:.3f}', svm_preds),
    ('KNN accuracy = {:.3f}', knn_preds),
    ('HGB accuracy = {:.3f}', hgb_preds),
):
    print(accuracy_template.format(metrics.accuracy_score(y_test, model_test_preds)))

SVM accuracy = 0.984
KNN accuracy = 0.973
HGB accuracy = 0.977


In [88]:
# Majority vote on the test set: one column of predicted labels per model,
# then the row-wise mode (column 0 of the mode frame) is the ensemble's label.
catpreds = pd.DataFrame({
    'knn': knn_preds,
    'svm': svm_preds,
    'hgb': hgb_preds,
})
print('hard voting accuracy = {:.3f}'.format(
    metrics.accuracy_score(y_true=y_test, y_pred=catpreds.mode(axis=1)[0].to_numpy())
))

hard voting accuracy = 0.984
