In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Matplotlib defaults: base font size used by every figure in this notebook.
plt.rcParams['font.size'] = 11
# Optional tweaks, left disabled:
# plt.rcParams['figure.figsize'] = [16, 10]  # larger canvas for big plots
# %matplotlib notebook

# Pandas display: allow up to 500 columns before wide frames are truncated.
pd.set_option("display.max_columns", 500)
# pd.set_option('display.max_rows', 100)
# clear output in Jupyter cell
from IPython.display import clear_output

In [2]:
# Import datasets, classifiers and performance metrics
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split

### Load data and split into train, validation, and test sets

In [3]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML
# (https://www.openml.org/d/554).
# return_X_y=True unpacks the result directly into features X and labels y.
X, y = datasets.fetch_openml(
    name='mnist_784',
    version=1,
    return_X_y=True,
)

In [4]:
# Two-stage stratified split: first hold out 30% of the data, then cut that
# hold-out in half, giving 70% train / 15% validation / 15% test.
# Both stages stratify on the labels and share the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [9]:
# Report how many instances (rows) ended up in each split.
for template, split in (
    ('{} train instances', X_train),
    ('{} valid instances', X_valid),
    ('{} test instances', X_test),
):
    print(template.format(split.shape[0]))

49000 train instances
10500 valid instances
10500 test instances


# Train a few models

### SVM

In [5]:
from sklearn import svm

In [6]:
# First base model: support vector classifier with an RBF kernel and C=10.
# Probability estimates are switched off; only hard label predictions are
# used later in the notebook.
svm_clf = svm.SVC(C=10, kernel='rbf', probability=False)
# Train on the training split (the fitted estimator is echoed as cell output).
svm_clf.fit(X_train, y_train)

SVC(C=10)

In [7]:
# Predicted labels of the fitted SVM on the validation split.
svm_preds = svm_clf.predict(X_valid)

In [8]:
# Validation accuracy of the SVM, printed to three decimals.
svm_acc = metrics.accuracy_score(y_valid, svm_preds)
print('SVM accuracy = {:.3f}'.format(svm_acc))

SVM accuracy = 0.981


### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# Second base model: k-nearest neighbours with k=4, where each neighbour's
# vote is weighted by distance instead of counted equally.
knn_clf = KNeighborsClassifier(n_neighbors=4, weights='distance')
# Train on the training split (the fitted estimator is echoed as cell output).
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [11]:
# Predicted labels of the fitted KNN model on the validation split.
knn_preds = knn_clf.predict(X_valid)

In [12]:
# Validation accuracy of the KNN model, printed to three decimals.
knn_acc = metrics.accuracy_score(y_valid, knn_preds)
print('KNN accuracy = {:.3f}'.format(knn_acc))

KNN accuracy = 0.972


### Gradient Boosting trees

In [14]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [15]:
# Third base model: histogram-based gradient boosting with the library's
# default hyper-parameters.
# random_state is pinned (same seed as the data splits) so that a fresh
# "Restart & Run All" reproduces the same model: on a training set this large
# the estimator's automatic early stopping draws a random internal validation
# subset, which otherwise makes the fitted model vary between runs.
hgb_clf=HistGradientBoostingClassifier(random_state=42)
# fit
hgb_clf.fit(X_train,y_train)

HistGradientBoostingClassifier()

In [16]:
# Predicted labels of the fitted gradient-boosting model on the validation split.
hgb_preds = hgb_clf.predict(X_valid)

In [17]:
# Validation accuracy of the gradient-boosting model, printed to three decimals.
hgb_acc = metrics.accuracy_score(y_valid, hgb_preds)
print('HGB accuracy = {:.3f}'.format(hgb_acc))

HGB accuracy = 0.973


# Blender

Concatenate the predictions on the validation set into a single dataframe

In [18]:
# Meta-features for the blender: one column per base model, one row per
# validation instance, each cell holding that model's predicted label.
# NOTE(review): the predicted labels are fed to the blender as-is; consider
# one-hot encoding them so the blender does not treat digit classes as
# ordered quantities - TODO confirm label dtype.
catpreds = pd.DataFrame({
    'knn': knn_preds,
    'svm': svm_preds,
    'hgb': hgb_preds,
})

Train an SVM on the different models' predictions

In [21]:
# Blender: a second RBF-kernel SVM (same settings as the base SVM) that
# learns to map the base models' validation predictions to the true labels.
svm_clf_2 = svm.SVC(C=10, kernel='rbf', probability=False)
# Train on the validation-set meta-features, which the base models never saw
# during their own training.
svm_clf_2.fit(catpreds, y_valid)

SVC(C=10)

Get the different models' predictions on the test set

In [23]:
# Base-model predictions on the held-out test split, listed in the same
# order as the blender's feature columns (knn, svm, hgb).
knn_test_preds = knn_clf.predict(X_test)
svm_test_preds = svm_clf.predict(X_test)
hgb_test_preds = hgb_clf.predict(X_test)

Concatenate the predictions on the test set into a single dataframe

In [24]:
# Test-set meta-features for the blender. Column names and order must match
# the frame the blender was trained on: knn, svm, hgb.
catpreds=pd.DataFrame(data={'knn':knn_test_preds,'svm':svm_test_preds,'hgb':hgb_test_preds})

In [25]:
# get preds
svm_test_preds=svm_clf_2.predict(catpreds)

In [26]:
print('SVM blender accuracy = {:.3f}'.format(metrics.accuracy_score(y_test,svm_test_preds)))

SVM blender accuracy = 0.981
