In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# --- Matplotlib defaults -------------------------------------------------
plt.rcParams['font.size'] = 11
# Optional: larger canvas for big plots
# plt.rcParams['figure.figsize'] = [16, 10]
# Optional: interactive figures
# %matplotlib notebook

# --- pandas display defaults ---------------------------------------------
# Show up to 500 columns when a frame is displayed.
pd.set_option("display.max_columns", 500)
# pd.set_option('display.max_rows',100)

# Helper to clear the output area of a Jupyter cell
from IPython.display import clear_output

In [2]:
# Import datasets, classifiers and performance metrics
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split

### Load data and split into train and test sets

In [3]:
# Fetch the 'mnist_784' dataset (version 1) from OpenML:
# https://www.openml.org/d/554
# return_X_y=True yields the (features, labels) pair directly.
X, y = datasets.fetch_openml(
    name='mnist_784',
    version=1,
    return_X_y=True,
)

In [7]:
# Hold out 10000 instances for testing. The split is seeded for
# repeatability and stratified on the labels.
split_options = dict(test_size=10000, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Report how many instances (rows) ended up in each split.
split_sizes = (
    ('{} train instances', X_train.shape[0]),
    # ('{} valid instances', X_valid.shape[0]),
    ('{} test instances', X_test.shape[0]),
)
for template, n_rows in split_sizes:
    print(template.format(n_rows))

60000 train instances
10000 test instances


# Train w/o dimensionality reduction

In [12]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import time

In [15]:
# Time the fit of a histogram-based gradient boosting classifier on the
# full (un-reduced) training features.
# perf_counter() is a monotonic clock intended for measuring elapsed
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted during a long fit.
t_start = time.perf_counter()

# Seed the estimator (same seed as the train/test split) so repeated runs
# give the same model. Per the scikit-learn docs, random_state controls the
# binning subsample and the internal validation split used by early stopping.
hgb_clf = HistGradientBoostingClassifier(random_state=42)
hgb_clf.fit(X_train, y_train)

print('Training time: {:.1f} s'.format(time.perf_counter() - t_start))

Training time: 473.091 s


In [17]:
# Accuracy of the full-feature model on the held-out test set.
y_pred_full = hgb_clf.predict(X_test)
acc_full = metrics.accuracy_score(y_test, y_pred_full)
print('HGB accuracy w/ full training data = {:.3f}'.format(acc_full))

HGB accuracy w/ full training data = 0.976


# Train w/ PCA first

In [19]:
from sklearn.decomposition import PCA

In [20]:
# A float n_components between 0 and 1 is interpreted (per the scikit-learn
# PCA docs) as the fraction of variance the kept components must explain.
VARIANCE_TO_KEEP = 0.95

# Fit the projection on the training features only and reduce them in one go.
pca = PCA(n_components=VARIANCE_TO_KEEP)
X_reduced = pca.fit_transform(X_train)

In [21]:
# Time the fit of the same classifier on the PCA-reduced training features.
# NOTE: only the classifier fit is timed here; the PCA fit/transform runs in
# an earlier cell and is not included in this figure.
# perf_counter() is a monotonic clock intended for measuring elapsed
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted during a long fit.
t_start = time.perf_counter()

# Seed the estimator (same seed as the train/test split) so repeated runs
# give the same model. Per the scikit-learn docs, random_state controls the
# binning subsample and the internal validation split used by early stopping.
hgb_clf_pca = HistGradientBoostingClassifier(random_state=42)
hgb_clf_pca.fit(X_reduced, y_train)

print('Training time: {:.1f} s'.format(time.perf_counter() - t_start))

Training time: 141.856 s


In [23]:
# Project the test features with the PCA object fitted on the training set
# (transform only; nothing is refitted on the test data).
X_test_red = pca.transform(X_test)

In [25]:
# Accuracy of the PCA-feature model on the projected test set.
y_pred_pca = hgb_clf_pca.predict(X_test_red)
acc_pca = metrics.accuracy_score(y_test, y_pred_pca)
print('HGB accuracy w/ PCA data = {:.3f}'.format(acc_pca))

HGB accuracy w/ PCA data = 0.964
