In [1]:
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [2]:
# Download the MNIST dataset ("mnist_784") from OpenML as NumPy arrays (as_frame=False).
# `parser` is passed explicitly: scikit-learn 1.2/1.3 emit a FutureWarning when it is omitted
# (the default switches from "liac-arff" to "auto" in 1.4), which is presumably the truncated
# `warn(` output recorded for this cell -- TODO confirm against the installed version.
mnist = fetch_openml("mnist_784", as_frame=False, parser="auto")

  warn(


In [3]:

# Carve the data into three partitions: first hold out 10,000 instances as the test set,
# then hold out another 10,000 of the remainder as the validation set. Whatever is left
# is the training set. Both splits use a fixed seed so the partition is reproducible.
X, y = mnist.data, mnist.target
X_temp, X_test, y_temp, y_test = train_test_split(
  X, y, test_size=10_000, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
  X_temp, y_temp, test_size=10_000, random_state=42
)

# Report the feature-matrix and label-vector shapes of every partition.
for split_name, split_X, split_y in (
  ("Train set: ", X_train, y_train),
  ("Val set: ", X_val, y_val),
  ("Test set: ", X_test, y_test),
):
  print(split_name, split_X.shape, split_y.shape)


Train set:  (50000, 784) (50000,)
Val set:  (10000, 784) (10000,)
Test set:  (10000, 784) (10000,)


In [8]:
# Individual baselines: fit each tree ensemble on the training set and print its
# accuracy on the test set, one after the other.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)

for display_name, clf in (("Random forest", forest_clf), ("Extra trees", extra_trees_clf)):
  clf.fit(X_train, y_train)
  test_accuracy = clf.score(X_test, y_test)
  print(f"{display_name} accuracy (test): {test_accuracy}")

# SVM baseline, left disabled in this notebook (the reason is not recorded here):
# svm_clf = SVC(random_state=42)
# svm_clf.fit(X_train, y_train)
# print (f"SVM accuracy (test): {svm_clf.score(X_test, y_test)}")

Random forest accuracy (test): 0.9645
Extra trees accuracy (test): 0.9674


In [9]:
# Soft-voting ensemble over new, unfitted instances of the two tree models
# (100 trees each, seed 42). With voting="soft" the ensemble combines the members'
# predicted class probabilities rather than their hard label votes.
base_estimators = [
  ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
  ('et', ExtraTreesClassifier(n_estimators=100, random_state=42)),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting="soft")
voting_clf.fit(X_train, y_train)

In [10]:
# Accuracy of the fitted soft-voting ensemble on the test set.
voting_test_accuracy = voting_clf.score(X_test, y_test)
print(f"Voting CLF accuracy (test): {voting_test_accuracy}")

Voting CLF accuracy (test): 0.9681


In [23]:
# Building a stacking ensemble classifier
#
# Step 1: create the blender's training features. Every fitted estimator inside the
# voting ensemble predicts the validation set, and each estimator's predictions
# become one column of the new dataset.
per_estimator_val_preds = [
  estimator.predict(X_val) for estimator in voting_clf.estimators_
]

# Stacked row-per-estimator, then transposed to (num_examples, num_estimators).
estimators_preds = np.array(per_estimator_val_preds).T

estimators_preds.shape

(10000, 2)

In [24]:
# Step 2: fit the blender, i.e. the model that learns from the base estimators' outputs.
# Its features are the per-estimator validation-set predictions; its targets are still
# the true validation labels -- only the feature side of the dataset has changed.
forest_blender_clf = RandomForestClassifier(random_state=42)
forest_blender_clf.fit(X=estimators_preds, y=y_val)

In [27]:
# Step 3: predict with the hand-built stack. Each base estimator predicts the test set,
# those predictions are laid out one column per estimator, and the blender is scored
# on that matrix against the true test labels.
preds = np.array(
  [estimator.predict(X_test) for estimator in voting_clf.estimators_]
).T

forest_blender_clf.score(preds, y_test)

0.9657

In [34]:
# Using an actual stacking classifier from sklearn
#
# Same two base models and a random-forest final estimator as the hand-built stack.
# StackingClassifier is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place. cv=2 sets the number of cross-validation folds
# scikit-learn uses on the training data to produce the final estimator's inputs.
stack_clf = StackingClassifier(
  estimators=[
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=100, random_state=42)),
  ],
  final_estimator=RandomForestClassifier(random_state=42),
  cv=2,
)
stack_clf.fit(X_train, y_train)

In [35]:
# Test-set accuracy of the fitted scikit-learn stacking ensemble.
stack_clf.score(X=X_test, y=y_test)

0.9744