# Load the MNIST dataset (introduced in Chapter 3), and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing). Then train various classifiers, such as a random forest classifier, an extra-trees classifier, and an SVM classifier. Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

# Step 1: Load MNIST dataset

In [1]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# load MNIST dataset from OpenML.org
from sklearn.datasets import fetch_openml
X_mnist, y_mnist = fetch_openml('mnist_784', return_X_y=True, as_frame=False)

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  warn(


# Step 2: Split into 5:1:1 ratio for training, valid, test sets.

In [3]:
# split data into train, validation, and test sets in a 5:1:1 ratio
X_train, y_train = X_mnist[:50_000], y_mnist[:50_000]
X_valid, y_valid = X_mnist[50_000:60_000], y_mnist[50_000:60_000]
X_test, y_test = X_mnist[60_000:], y_mnist[60_000:]

# print lengths of each set
print("Train set length:", len(X_train))
print("Validation set length:", len(X_valid))
print("Test set length:", len(X_test))

Train set length: 50000
Validation set length: 10000
Test set length: 10000


# Step 3: Train various classifiers, such as a random forest classifier, an extra-trees classifier, and an SVM classifier.

In [4]:
# train random forest, extra-trees, and SVM classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)

In [5]:
# iterate through each estimator and fit it to the training data
estimators = [random_forest_clf, extra_trees_clf, svm_clf]
for estimator in estimators:
    estimator.fit(X_train, y_train)

In [6]:
# evaluate each estimator on the validation set
from sklearn.metrics import accuracy_score

for estimator in estimators:
        y_pred = estimator.predict(X_valid)
        print(estimator.__class__.__name__, accuracy_score(y_valid, y_pred))

RandomForestClassifier 0.9736
ExtraTreesClassifier 0.9743
LinearSVC 0.8662


# Step 4: Combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting.

In [7]:
# combine into ensemble that outperforms each individual classifier on the validation set using hard voting
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
        estimators=[('rf', random_forest_clf), ('et', extra_trees_clf), ('svm', svm_clf)],
        voting='hard'
)

voting_clf.fit(X_train, y_train)

In [8]:
# evaluate ensemble on the validation set
y_pred = voting_clf.predict(X_valid)
print(accuracy_score(y_valid, y_pred))

0.9737


In [9]:
# convert classes to class indices
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_valid_encoded = encoder.fit_transform(y_valid)

In [10]:
# convert to integers
import numpy as np
y_valid_encoded = y_valid.astype(np.int64)

In [11]:
# evaluate classifier clones on the validation set
[estimator.score(X_valid, y_valid_encoded)
 for estimator in voting_clf.estimators_]

[0.9736, 0.9743, 0.8662]

In [12]:
# try dropping the SVM classifier to see if the ensemble performs better
voting_clf.set_params(svm=None)
voting_clf.estimators

[('rf', RandomForestClassifier(random_state=42)),
 ('et', ExtraTreesClassifier(random_state=42)),
 ('svm', None)]

In [13]:
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(max_iter=100, random_state=42, tol=20)]

In [14]:
voting_clf.named_estimators_

{'rf': RandomForestClassifier(random_state=42),
 'et': ExtraTreesClassifier(random_state=42),
 'svm': LinearSVC(max_iter=100, random_state=42, tol=20)}

# In order to remove the LinearSVC estimator, we need to either fit VotingClassifier again or just remove SVM from the list of trained estimators, both in estimators_ and named_estimators_.

In [15]:
# remove the SVM classifier from the list of estimators
svm_clf_trained = voting_clf.named_estimators_.pop("svm")
voting_clf.estimators_.remove(svm_clf_trained)

In [16]:
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42)]

In [17]:
# evaluate the voting classifier again without the SVM classifier
voting_clf.score(X_valid, y_valid)

0.9735

# As seen, the SVM classifier was slightly hurting the data.

In [18]:
# try using soft voting and see if it performs better than hard voting
voting_clf.voting = "soft"
voting_clf.score(X_valid, y_valid)

0.9749

# Step 5: Try it on the test set.

In [19]:
# evaluate the ensemble on the test set
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9691

In [20]:
[estimator.score(X_test, y_test.astype(np.int64))
 for estimator in voting_clf.estimators_]

[0.968, 0.9703]

# Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class. Train a classifier on this new training set. Congratulations—you have just trained a blender, and together with the classifiers it forms a stacking ensemble! Now evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier? Now try again using a StackingClassifier instead. Do you get better performance? If so, why?

# Step 1: # Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class. Train a classifier on this new training set.

In [21]:
# run individual classifiers to make predictions on the validation set
X_valid_predictions = np.empty((len(X_valid), len(estimators)), dtype=object)

# create a matrix of predictions for each estimator
for index, estimator in enumerate(estimators):
        X_valid_predictions[:, index] = estimator.predict(X_valid)

X_valid_predictions

array([['3', '3', '3'],
       ['8', '8', '8'],
       ['6', '6', '6'],
       ...,
       ['5', '5', '5'],
       ['6', '6', '6'],
       ['8', '8', '8']], dtype=object)

In [22]:
# create a new random forest classifier, but this time train it on the predictions of the other classifiers
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_valid_predictions, y_valid_encoded)

# evaluate the blender on the validation set
rnd_forest_blender.oob_score_

0.973

# Step 2: Now evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier? Now try again using a StackingClassifier instead. Do you get better performance? If so, why?

In [23]:
# evaluate ensemble on test set
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=object)

# create a matrix of predictions for each estimator
for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [24]:
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [25]:
accuracy_score(y_test, y_pred)

0.0

In [26]:
# try with a stacking classifier 
# join training and validation sets
X_train_full, y_train_full = X_mnist[:60_000], y_mnist[:60_000]

In [30]:
# train stacking classifier on combined set
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
        estimators=[('rf', random_forest_clf), ('et', extra_trees_clf), ('svm', svm_clf)],
        final_estimator=RandomForestClassifier(n_estimators=100, random_state=42)
)

stacking_clf.fit(X_train_full, y_train_full)

In [32]:
stacking_clf.score(X_test, y_test)

0.9773