# Ensemble learning exercise

Train a Random Forest, an Extra Trees and an SVM classifier on the MNIST data set and use the validation set to evaluate their performance. Then create an ensemble of the three classifiers using hard and soft voting and compare its performance on the validation set with that obtained previously. Has there been an overall improvement compared to each individual classifier?

 - [Splitting the data into training, validation and testing sets](#Spliting-the-data-into-training,-validation-and-testing-sets)
 - [Model training](#Model-training)
   - [Decision tree](#Decision-tree)
   - [Support vector machine](#Support-vector-machine)
   - [Random forest](#Random-forest)
   - [Logistic regression](#Logistic-regression)
 - [Ensemble model](#Ensemble-model)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the MNIST data set from OpenML.
from sklearn.datasets import fetch_openml
# as_frame=False keeps 'data' and 'target' as NumPy arrays. Recent scikit-learn
# releases otherwise return pandas objects for this data set, and the integer
# array indexing used for the train/validation/test split further down would
# then be treated as a column/label lookup and fail.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

In [None]:
# Show the data set's own description text.
print(mnist['DESCR'])

## Spliting the data into training, validation and testing sets

In [None]:
# Report the dimensions and container type of the features, then of the labels.
for field in ('data', 'target'):
    print(mnist[field].shape)
    print(type(mnist[field]))

In [None]:
# Split proportions for the training / validation / testing subsets.
training_ratio = 0.6
validation_ratio = 0.2
testing_ratio = 0.2

# Number of samples, looked up once instead of on every size computation.
n_samples = len(mnist['data'])
training_size = int(training_ratio * n_samples)
test_size = int(testing_ratio * n_samples)
# The validation set takes whatever remains, so the three sizes always add up
# to n_samples even when the ratio products are truncated by int().
validation_size = n_samples - training_size - test_size

# A fixed seed makes the shuffle (and therefore the split and every accuracy
# reported below) reproducible from one run of the notebook to the next.
# A dedicated RandomState is used so NumPy's global random state is untouched.
shuffled_index = np.random.RandomState(42).permutation(n_samples)

In [None]:
# Carve the shuffled index into three consecutive windows (head, middle, tail)
# and pull the matching feature rows for each subset.
X_training_set, X_validation_set, X_testing_set = (
    mnist['data'][shuffled_index[window]]
    for window in (
        slice(None, training_size),
        slice(training_size, -test_size),
        slice(-test_size, None),
    )
)

In [None]:
# One shape per line: training, validation, then testing features.
for subset in (X_training_set, X_validation_set, X_testing_set):
    print(subset.shape)

In [None]:
# Same three index windows as for the features, applied to the labels so that
# every row keeps its own target.
y_training_set, y_validation_set, y_testing_set = (
    mnist['target'][shuffled_index[window]]
    for window in (
        slice(None, training_size),
        slice(training_size, -test_size),
        slice(-test_size, None),
    )
)

In [None]:
# One shape per line: training, validation, then testing labels.
for subset in (y_training_set, y_validation_set, y_testing_set):
    print(subset.shape)

## Model training

Let's train decision tree, support vector machine, random forest and logistic regression classifiers on the training set, and check their performance using the validation set.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from time import time

In [None]:
# Instantiate the four base classifiers that are trained individually below
# and later combined into the voting ensemble.
tree_clf = DecisionTreeClassifier()
svm_clf = SVC(gamma='scale')
rf_clf = RandomForestClassifier(n_estimators=100)
# LogisticRegression rejects a negative max_iter at fit time (-1 means
# "no limit" only for SVC), so give the solver a generous positive cap instead.
lg_clf = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)

### Decision tree

In [None]:
# Train the decision tree on the training split and report the wall-clock time taken.
fit_start = time()
tree_clf.fit(X_training_set, y_training_set)
fit_seconds = time() - fit_start
print(f'Time elapsed: {fit_seconds:.2f} sec')

In [None]:
# Predict the validation labels with the decision tree and report its accuracy.
y_prediction_set = tree_clf.predict(X_validation_set)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy score: {accuracy_score(y_validation_set, y_prediction_set):.6f}')

### Support vector machine

In [None]:
# Train the support vector machine on the training split and report the wall-clock time taken.
fit_start = time()
svm_clf.fit(X_training_set, y_training_set)
fit_seconds = time() - fit_start
print(f'Time elapsed: {fit_seconds:.2f} sec')

In [None]:
# Predict the validation labels with the support vector machine and report its accuracy.
y_prediction_set = svm_clf.predict(X_validation_set)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy score: {accuracy_score(y_validation_set, y_prediction_set):.6f}')

### Random forest

In [None]:
# Train the random forest on the training split and report the wall-clock time taken.
fit_start = time()
rf_clf.fit(X_training_set, y_training_set)
fit_seconds = time() - fit_start
print(f'Time elapsed: {fit_seconds:.2f} sec')

In [None]:
# Predict the validation labels with the random forest and report its accuracy.
y_prediction_set = rf_clf.predict(X_validation_set)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy score: {accuracy_score(y_validation_set, y_prediction_set):.6f}')

### Logistic regression

In [None]:
# Train the logistic regression on the training split and report the wall-clock time taken.
fit_start = time()
lg_clf.fit(X_training_set, y_training_set)
fit_seconds = time() - fit_start
print(f'Time elapsed: {fit_seconds:.2f} sec')

In [None]:
# Predict the validation labels with the logistic regression and report its accuracy.
y_prediction_set = lg_clf.predict(X_validation_set)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy score: {accuracy_score(y_validation_set, y_prediction_set):.6f}')

## Ensemble model

In [None]:
from sklearn.ensemble import VotingClassifier

# Named members of the ensemble: the four classifiers created earlier.
ensemble_members = [
    ('tr', tree_clf),
    ('svc', svm_clf),
    ('rf', rf_clf),
    ('lg', lg_clf),
]
# Hard voting: the ensemble predicts the class chosen by the majority of its members.
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard', n_jobs=-1)

In [None]:
# Train the voting ensemble on the training split.
voting_clf.fit(X_training_set, y_training_set)

In [None]:
# Side-by-side validation accuracy of every individual classifier and of the
# voting ensemble. Each estimator is used as fitted by the cells above:
# refitting them here (the ensemble alone retrains all four of its members)
# would repeat the entire training cost without changing what is compared.
for clf in (tree_clf, svm_clf, rf_clf, lg_clf, voting_clf):
    y_pred = clf.predict(X_validation_set)
    print(f'Accuracy score for {clf.__class__.__name__}: {accuracy_score(y_validation_set, y_pred):.6f}')