In [1]:
from sklearn.datasets import fetch_openml

X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [2]:
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X,y,test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val,y_train_val,test_size=10000, random_state=42)

In [3]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

lsvc = LinearSVC(random_state=42)
rf = RandomForestClassifier(n_jobs=-1,random_state=42)
et = ExtraTreesClassifier(n_jobs=-1,random_state=42)

estimators = [lsvc, rf, et]

In [4]:
for estimator in estimators:
    print('Training', estimator)
    estimator.fit(X_train, y_train)

Training LinearSVC(random_state=42)




Training RandomForestClassifier(n_jobs=-1, random_state=42)
Training ExtraTreesClassifier(n_jobs=-1, random_state=42)


In [7]:
[estimator.score(X_val, y_val) for estimator in estimators]

[0.8695, 0.9692, 0.9715]

In [5]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

named_estimators = [
    ('linSVC',lsvc),
    ('randomforest',rf),
    ('extratrees',et)
]
vc = VotingClassifier(named_estimators, voting='soft', n_jobs=1)

In [8]:
vc.fit(X_train,y_train)



VotingClassifier(estimators=[('linSVC', LinearSVC(random_state=42)),
                             ('randomforest',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=42)),
                             ('extratrees',
                              ExtraTreesClassifier(n_jobs=-1,
                                                   random_state=42))],
                 n_jobs=1, voting='soft')

In [10]:
vc.voting='hard'

In [11]:
vc.score(X_val,y_val)

0.9699

It sucks a bit, Extra Trees gave us a better result on the validation set

In [12]:
vc.score(X_test, y_test)

0.9663

# Exercise 9

In [14]:
import numpy as np

X_val_pred = np.empty((len(X_val),len(estimators)))

for index, estimator in enumerate(estimators):
    X_val_pred[:,index] = estimator.predict(X_val)

In [15]:
et_blender = ExtraTreesClassifier(random_state=42, n_jobs=-1)

et_blender.fit(X_val_pred, y_val)

ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [17]:
et_blender.score(X_val_pred,y_val)

0.977

This stacked estimator improves substantially the result. Let's try it in the test set (without training)

In [18]:
X_test_pred = np.empty((len(X_test),len(estimators)))

for index, estimator in enumerate(estimators):
    X_test_pred[:,index] = estimator.predict(X_test)

In [20]:
et_blender.score(X_test_pred, y_test)

0.9669

The accuracy gain in the test set is far less impressive