# 8

Load MNIST.

Split into training (50,000), validation (10,000), and testing (10,000).

Train various classifiers: Random Forest, Extra-Trees, and SVM

Combine them into ensemble and attempt to get a better score than the individual classifiers - use hard and soft voting.

Once you have found one - use it on the test set.

How much better does it perform compared to individual classifiers?

In [1]:
# fetch MNIST
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist['data'], mnist['target']
print(f'X Shape: {X.shape}, Y Shape: {y.shape}')

X Shape: (70000, 784), Y Shape: (70000,)


In [2]:
# pop off training, validation, and test set
X_train, X_val, X_test, y_train, y_val, y_test = X[:50000], X[50000:60000], X[60000:], y[:50000], y[50000:60000], y[60000:]
print('Shape X Train: {}'.format(X_train.shape))
print('Shape Y Train: {}'.format(y_train.shape))
print('Shape X Validation: {}'.format(X_val.shape))
print('Shape Y Validation: {}'.format(y_val.shape))
print('Shape X Test: {}'.format(X_test.shape))
print('Shape Y Test: {}'.format(y_test.shape))

Shape X Train: (50000, 784)
Shape Y Train: (50000,)
Shape X Validation: (10000, 784)
Shape Y Validation: (10000,)
Shape X Test: (10000, 784)
Shape Y Test: (10000,)


In [19]:
# let's scale our features since we have to run an SVM classifier later
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_val_scale = scaler.transform(X_val)
X_test_scale = scaler.transform(X_test)

## Random Forest

In [21]:
# evaluate and train a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rf_params = {"n_estimators": [600], "max_depth": [30,60,150], "max_leaf_nodes": [250,500,1000]}

In [22]:
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=3, scoring="accuracy", n_jobs=-1)

In [23]:
%time rf_grid.fit(X_train_scale, y_train)

CPU times: user 2min 6s, sys: 1.59 s, total: 2min 8s
Wall time: 22min 19s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [24]:
# out of curiousity, let's check on the best params to see that we're not way off and need to re-run gridsearch
rf_grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=150, max_features='auto', max_leaf_nodes=1000,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
from sklearn.metrics import accuracy_score
y_rf_val_pred = rf_grid.predict(X_val_scale)        #<-- this predicts on the best, re-fitted model
print("Random Forest Validation Accuracy is: {}".format(accuracy_score(y_val, y_rf_val_pred)))

Random Forest Validation Accuracy is: 0.9639


## Extra-Trees

In [26]:
# now let's fit the extra trees classifier
from sklearn.ensemble import ExtraTreesClassifier
xt_params = {"n_estimators": [600], "max_depth": [200,400], "max_leaf_nodes": [1000,2000]}
xt_grid = GridSearchCV(ExtraTreesClassifier(), xt_params, cv=3, scoring="accuracy", n_jobs=-1)

In [27]:
%time xt_grid.fit(X_train_scale, y_train)

CPU times: user 1min 57s, sys: 960 ms, total: 1min 58s
Wall time: 14min 32s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                            criterion='gini', max_depth=None,
                                            max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators='warn', n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [200, 400],

In [28]:
xt_grid.best_estimator_

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=200, max_features='auto', max_leaf_nodes=2000,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=600,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [30]:
y_xt_val_pred = xt_grid.predict(X_val_scale)        #<-- this predicts on the best, re-fitted model
print("Extra-Trees Validation Accuracy is: {}".format(accuracy_score(y_val, y_xt_val_pred)))

Extra-Trees Validation Accuracy is: 0.9664


## SVM

In [31]:
# let's train a SVM using the grid searched hyperparameters from Chap. 5 exercises
from sklearn.svm import SVC
svc_model = SVC(C=3, gamma=0.001, kernel='rbf', probability=True, decision_function_shape='ovr')
#^ setting for "one-vs-rest" and getting prediction probs for compatability with other models when we combine them

In [32]:
svc_model.fit(X_train_scale, y_train)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [33]:
y_svc_val_pred = svc_model.predict(X_val_scale)        
print("SVM Validation Accuracy is: {}".format(accuracy_score(y_val, y_svc_val_pred)))

SVM Validation Accuracy is: 0.9731


## Voting Classifier

In [34]:
from sklearn.ensemble import VotingClassifier

In [35]:
ensemble_hard = VotingClassifier(
    estimators=[('rf',rf_grid.best_estimator_), ('xt',xt_grid.best_estimator_), ('svc',svc_model)],
    voting='hard')

ensemble_hard.fit(X_train_scale, y_train)
ensemble_hard_val_preds = ensemble_hard.predict(X_val_scale)

In [36]:
print('Ensemble Hard Voting Validation Accuracy: {}'.format(accuracy_score(y_val, ensemble_hard_val_preds)))

Ensemble Hard Voting Validation Accuracy: 0.968


In [37]:
ensemble_soft = VotingClassifier(
    estimators=[('rf',rf_grid.best_estimator_), ('xt',xt_grid.best_estimator_), ('svc',svc_model)],
    voting='soft')

ensemble_soft.fit(X_train_scale, y_train)
ensemble_soft_val_preds = ensemble_soft.predict(X_val_scale)

In [38]:
print('Ensemble Soft Voting Validation Accuracy: {}'.format(accuracy_score(y_val, ensemble_soft_val_preds)))

Ensemble Soft Voting Validation Accuracy: 0.9778


In [39]:
# now let's run our test set on the hard and soft voting ensembles
ensemble_hard_test_preds = ensemble_hard.predict(X_test_scale)
print('Ensemble Hard Voting Test Set Accuracy: {}'.format(accuracy_score(y_test, ensemble_hard_test_preds)))

Ensemble Hard Voting Test Set Accuracy: 0.9644


In [40]:
ensemble_soft_test_preds = ensemble_soft.predict(X_test_scale)
print('Ensemble Soft Voting Test Set Accuracy: {}'.format(accuracy_score(y_test, ensemble_soft_test_preds)))

Ensemble Soft Voting Test Set Accuracy: 0.9741


# 9