### Training ensemble-based classifiers on the MNIST dataset, using voting ensembles and stacking

In [5]:
import os
import sys
# Put the parent directory on the import path (once), presumably so that
# project-local modules one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the MNIST digits; the returned object is indexed with the 'data' and
# 'target' keys when the splits are built below.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# 0.22. On newer versions fetch_openml('mnist_784') is the documented
# replacement (it returns string labels) -- confirm the installed version
# before re-running this notebook.
minst_data = fetch_mldata("MNIST original")

### Split the data into train, validation and test sets using a 60/20/20 rule.
First we split 80/20. Then we split the 80% portion 75/25, since 75% of 80% is 60% of the original data.

In [8]:
# Step 1: hold out 20% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    minst_data['data'],
    minst_data['target'],
    test_size=0.2,
    random_state=42,
)
# Step 2: take 25% of the remaining 80% as the validation set (20% overall),
# leaving 60% of the original data for training.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)
# Report the shape of each split.
for template, split in (
    ('Train: {}', X_train),
    ('Validation: {}', X_validation),
    ('Test: {}', X_test),
):
    print(template.format(split.shape))

Train: (42000, 784)
Validation: (14000, 784)
Test: (14000, 784)


### Now let's train individual classifiers vs ensemble to check what their accuracy score is

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

In [31]:
# Base estimators, all built with the library's default hyper-parameters.
# NOTE(review): no random_state is set, so the scores vary from run to run.
svc = LinearSVC()
rf = RandomForestClassifier()
et = ExtraTreesClassifier()

# Hard voting takes the majority vote of the predicted class labels. Soft
# voting averages predicted class probabilities instead, which requires every
# estimator to expose predict_proba -- LinearSVC does not, so soft voting
# cannot be used while the SVC is part of the ensemble.
voting = VotingClassifier(
    estimators=[('svc', svc), ('rf', rf), ('et', et)],
    voting='hard',
)

# Fit each base estimator on its own and report its accuracy on the test set.
# score(X, y) expects the true labels and computes the predictions itself.
for clf in [svc, rf, et]:
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, clf.score(X_test, y_test))

LinearSVC 0.841642857143
RandomForestClassifier 0.920714285714
ExtraTreesClassifier 0.929857142857


Now let's see how the voting classifier performs.

In [32]:
# Fit the full ensemble, predict the held-out test set and report its accuracy.
voting.fit(X_train, y_train)
y_vote = voting.predict(X_test)
vote_accuracy = accuracy_score(y_test, y_vote)
print('Voting Classifier {}'.format(vote_accuracy))

Voting Classifier 0.9282142857142858


So we see that the voting classifier performs better than the individual estimators fitted inside it. But LinearSVC generally performs poorly; what if I remove it and vote over the best two?

In [33]:
# Test-set accuracy of each estimator fitted inside the voting classifier,
# printed one per line.
for estimator in voting.estimators_:
    print(estimator.__class__.__name__, estimator.score(X_test, y_test))

LinearSVC 0.851642857143
RandomForestClassifier 0.921714285714
ExtraTreesClassifier 0.925785714286


In [34]:
# Try to drop the LinearSVC from the ensemble by setting its named entry to
# None. The call's return value is echoed as this cell's output.
# NOTE(review): the already-fitted `voting.estimators_` list (inspected in the
# next cell) is not changed by this call -- presumably a re-fit is required
# for the change to take effect; confirm for the installed scikit-learn version.
voting.set_params(svc=None)

VotingClassifier(estimators=[('svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None...timators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)

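Interesting. Does `set_params` give me a new object, or does the classifier cache the fitted estimators? (`set_params` only changes the constructor parameters; the fitted estimators stored in `estimators_` stay as they are until `fit` is called again.) I'm just going to delete it.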

In [35]:
# Inspect the fitted sub-estimators currently held by the voting classifier.
voting.estimators_

[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
      verbose=0),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)]

In [36]:
# Remove the first fitted estimator (the LinearSVC, per the listing above)
# from the list in place.
# NOTE(review): this mutates a fitted attribute by hand and is not idempotent --
# re-running the cell removes one more estimator each time. Building a new
# VotingClassifier from only rf and et and re-fitting would be the cleaner route.
del voting.estimators_[0]

In [37]:
# Confirm which fitted estimators remain after the deletion.
voting.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)]

In [38]:
# Re-score the ensemble on the test set using only the remaining estimators.
y_vote = voting.predict(X_test)
two_model_accuracy = accuracy_score(y_test, y_vote)
print('Voting Classifier {}'.format(two_model_accuracy))

Voting Classifier 0.9134285714285715


Does soft voting give me better performance? Yes!

In [39]:
# Flip the already-fitted ensemble to soft voting (no re-fit) and score it
# again on the test set.
voting.voting = 'soft'
y_vote = voting.predict(X_test)
soft_accuracy = accuracy_score(y_test, y_vote)
print('Voting Classifier with soft voting {}'.format(soft_accuracy))

Voting Classifier with soft voting 0.9457142857142857


### Building a Stacking classifier on the validation set
Let's use the validation set to create a new training set and then evaluate a stacking strategy.

In [42]:
# Uninitialised buffer with one row per validation sample and one column per
# fitted base estimator; it is filled with predictions in the next cell.
n_validation_rows = len(X_validation)
n_base_estimators = len(voting.estimators_)
X_prediction_validation = np.empty(
    (n_validation_rows, n_base_estimators), dtype=np.float32
)

In [43]:
# Column k receives the k-th base estimator's predicted label for every
# validation sample; the filled matrix is then printed.
for column, base_estimator in enumerate(voting.estimators_):
    X_prediction_validation[:, column] = base_estimator.predict(X_validation)
print(X_prediction_validation)

[[ 1.  1.]
 [ 3.  3.]
 [ 0.  0.]
 ..., 
 [ 0.  0.]
 [ 7.  7.]
 [ 1.  1.]]


If these classifiers make orthogonal errors, this would be like training a classifier on those errors (similar to multi-stage boosting in regression).

In [44]:
# Blender for the stacking step: a 200-tree random forest with out-of-bag
# scoring switched on, so an accuracy estimate is available after fitting.
rf_blender = RandomForestClassifier(
    oob_score=True,
    n_estimators=200,
)

In [46]:
# Train the blender on the base estimators' validation predictions, then print
# its out-of-bag score (fit returns the estimator itself, so the call chains).
print(rf_blender.fit(X_prediction_validation, y_validation).oob_score_)

0.928214285714


Although this gives a better prediction than individual classifiers, it does not compare to the ensemble classifier. Maybe I should use the full dataset (X_test) to generate these predictions?

In [50]:
# Build the blender's test-set inputs: one column of predicted labels per
# fitted base estimator, one row per test sample.
test_shape = (len(X_test), len(voting.estimators_))
X_test_predictions = np.empty(test_shape, dtype=np.float32)
for column, base_estimator in enumerate(voting.estimators_):
    X_test_predictions[:, column] = base_estimator.predict(X_test)

# Score the stacked model (base estimators -> blender) on the test set.
stacked_labels = rf_blender.predict(X_test_predictions)
print('Stacking classifier {}'.format(accuracy_score(y_test, stacked_labels)))

Stacking classifier 0.9328571428571428


TODO: Add other types of multi class classifiers, maybe throw in a NN in there