In [1]:
from sklearn.datasets import fetch_openml #sklearn to download data
# Fetch version 1 of the MNIST dataset from OpenML; as_frame=False asks for
# array output rather than a pandas DataFrame.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)
# Display the fields available on the returned object.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [2]:
import numpy as np

# Feature matrix: one row per image.
X = mnist["data"]
# Cast the labels to small integers explicitly. OpenML is understood to serve
# the mnist_784 target as digit strings (to be confirmed for the installed
# scikit-learn version); later cells store predictions in float32 arrays, so
# numeric labels avoid relying on implicit string-to-number conversion there.
y = mnist["target"].astype(np.uint8)

In [3]:
# Number of samples (rows) in the full dataset.
X.shape[0]

70000

In [4]:
from sklearn.model_selection import train_test_split

# Set aside 20,000 samples; the next cell divides them into the validation
# and test sets. The remainder is the training set.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X,
    y,
    test_size=20000,
    random_state=42,
)

In [5]:
# Divide the held-out samples: 10,000 go to the test set, the rest to the
# validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=10000,
    random_state=42,
)

In [6]:
# Sanity check: sample counts of the validation and test sets.
print(X_val.shape[0], X_test.shape[0])

10000 10000


In [7]:
# Display the (n_samples, n_features) shape of the validation features.
X_val.shape

(10000, 784)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier

# Three base learners for the ensemble experiments below: two seeded tree
# ensembles of 100 trees each and a degree-4 polynomial-kernel SVM.
svm_clf = SVC(C=3, kernel="poly", degree=4, coef0=0.8)
ext_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=100)


In [9]:
# Collect the base learners in a fixed order (later cells rely on this order
# for the columns of the prediction matrices) and fit each one on the
# training split, announcing it first.
estimators = [rnd_clf, ext_clf, svm_clf]
for base_model in estimators:
    print("Training the", base_model)
    base_model.fit(X_train, y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the SVC(C=3, coef0=0.8, degree=4, kernel='poly')


In [10]:
# Validation accuracy of each base learner, in the order of `estimators`.
[base_model.score(X_val, y_val) for base_model in estimators]

[0.9677, 0.9689, 0.9788]

In [11]:
# Hard-voting ensemble over the three base learners. The short names are the
# keys used later to address individual members (e.g. via set_params).
voting_members = [
    ('ef', ext_clf),
    ('rf', rnd_clf),
    ('svc', svm_clf),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('ef', ExtraTreesClassifier(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc',
                              SVC(C=3, coef0=0.8, degree=4, kernel='poly'))])

In [12]:
from sklearn.metrics import accuracy_score
# Report test-set accuracy for each base learner and for the voting ensemble.
# Every model here was already fitted on (X_train, y_train) in the earlier
# cells, so it is evaluated as-is; fitting again would only repeat the same
# training (including the SVC and the whole ensemble) on the same data.
for clf in (ext_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

ExtraTreesClassifier 0.9692
RandomForestClassifier 0.9672
SVC 0.9785
VotingClassifier 0.9722


In [13]:
# Mark the 'svc' member as removed in the ensemble's `estimators` parameter.
# This edits the configuration only; the fitted estimator list is trimmed
# separately in a later cell.
# NOTE(review): newer scikit-learn releases document the string 'drop' as the
# sentinel for this - confirm against the installed version.
voting_clf.set_params(svc=None)

VotingClassifier(estimators=[('ef', ExtraTreesClassifier(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', None)])

In [14]:
# Display the configured (name, estimator) pairs to confirm 'svc' is now None.
voting_clf.estimators

[('ef', ExtraTreesClassifier(random_state=42)),
 ('rf', RandomForestClassifier(random_state=42)),
 ('svc', None)]

In [15]:
# Remove the fitted SVC (third member) from the already-fitted ensemble so it
# predicts with the two tree ensembles only, without refitting.
# Slicing rather than `del ...[2]` keeps this cell idempotent: running it a
# second time leaves the two remaining estimators in place instead of raising
# IndexError.
voting_clf.estimators_ = voting_clf.estimators_[:2]

In [16]:
# Validation accuracy of the ensemble after the SVC was removed.
accuracy_score(y_val, voting_clf.predict(X_val))

0.9685

In [17]:
# Switch the already-fitted ensemble to soft voting by setting the attribute
# directly, so no refit is triggered.
voting_clf.voting = "soft"

In [18]:

# Validation accuracy of the ensemble under soft voting.
accuracy_score(y_val, voting_clf.predict(X_val))

0.9682

In [19]:
# Go back to hard voting and measure the reduced ensemble's accuracy on the
# test set.
voting_clf.voting = "hard"
accuracy_score(y_test, voting_clf.predict(X_test))

0.9675

In [20]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the two tree ensembles next to the voting ensemble
# that combines them.
for model in (ext_clf, rnd_clf, voting_clf):
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(type(model).__name__, test_accuracy)

ExtraTreesClassifier 0.9692
RandomForestClassifier 0.9672
VotingClassifier 0.9675


In [23]:
import numpy as np
# Build the blender's training matrix: one row per validation sample, one
# column per base learner (same order as `estimators`), holding that
# learner's predicted label.
n_val_samples, n_base_models = len(X_val), len(estimators)
X_val_predictions = np.empty((n_val_samples, n_base_models), dtype=np.float32)
for column, base_model in enumerate(estimators):
    X_val_predictions[:, column] = base_model.predict(X_val)

In [24]:
# Inspect the prediction matrix: each row holds the three base learners'
# predicted labels for one validation sample.
X_val_predictions

array([[8., 8., 8.],
       [5., 5., 5.],
       [5., 5., 5.],
       ...,
       [3., 3., 3.],
       [7., 7., 7.],
       [0., 0., 0.]], dtype=float32)

In [25]:
# Blender: a random forest trained on the base learners' validation-set
# predictions. With oob_score=True the samples that were not used to train a
# given tree serve as its test data, so generalization can be monitored
# during training without a separate validation set.
rnd_forest_blender = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42,
)
rnd_forest_blender.fit(X_val_predictions, y_val)


RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)

In [26]:
# Out-of-bag accuracy of the blender, computed during training on the
# out-of-bag samples (available because oob_score=True was set).
rnd_forest_blender.oob_score_

0.9747

In [27]:
# Build the blender's test inputs the same way as for the validation set:
# one column of predicted labels per base learner, in `estimators` order.
n_test_samples, n_base_models = len(X_test), len(estimators)
X_test_predictions = np.empty((n_test_samples, n_base_models), dtype=np.float32)
for column, base_model in enumerate(estimators):
    X_test_predictions[:, column] = base_model.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score

# Let the blender turn the base learners' test-set predictions into final
# labels, then score those labels against the true test labels.
y_pred = rnd_forest_blender.predict(X_test_predictions)
accuracy_score(y_test, y_pred)

0.9754

In [None]:
# this time, the stacking score (0.9754) is better than the voting score (0.9722)
# stacking uses the entire dataset for training
# setting oob_score=True mitigates this somewhat