This notebook runs a simple analysis of the Fashion-MNIST data to test my implementation of the super learner classifier. I assess the model's performance as well as the diversity and performance of the individual base estimators.

In [8]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report
from keras.datasets import fashion_mnist 
from keras.utils import to_categorical
from sklearn import tree
from scipy.stats.stats import pearsonr

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from superlearner import SuperLearnerClassifier

Let's begin by checking that the model works on the iris dataset.

In [2]:
# Sanity check: fit the super learner on iris and score it on the very same
# rows it was trained on. The number below is therefore training accuracy,
# not an estimate of generalisation performance.
iris = load_iris()

sl_model = SuperLearnerClassifier(use_stacked_prob=False)
sl_model.fit(pd.DataFrame(iris.data), iris.target)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the call now follows the documented argument order.
accuracy_score(iris.target, sl_model.predict(pd.DataFrame(iris.data)))

0.9933333333333333

Now let's test the super learner on a more challenging dataset: Fashion-MNIST.

In [40]:
# Load the Fashion-MNIST train/test splits and turn every image into one
# flat feature row of a DataFrame.
(train_X, train_Y), (test_X, test_Y) = fashion_mnist.load_data()

# Cast to float32 and divide by 255 to rescale the pixel values
# (assumes 8-bit intensities -- TODO confirm against the dataset docs).
train_X = train_X.astype('float32') / 255
test_X = test_X.astype('float32') / 255

# Flatten with a single vectorised reshape per split instead of looping over
# every image in Python and building the frame from a list of arrays.
train_X = pd.DataFrame(train_X.reshape(train_X.shape[0], -1))
test_X = pd.DataFrame(test_X.reshape(test_X.shape[0], -1))

# One-hot encoded copies of the labels.
# NOTE(review): no later cell shown in this notebook reads these, and
# train_Y_one_hot is not subsampled together with train_X below -- confirm
# whether they are still needed.
train_Y_one_hot = to_categorical(train_Y)
test_Y_one_hot = to_categorical(test_Y)

In [41]:
# Shrink the training set to a fixed-seed random subset so the fits below
# finish in reasonable time.
# NOTE: train_X and train_Y are overwritten in place, so re-run the loading
# cell before re-running this one.
prop = 0.2  # leaves 12000 observations
num_samples = int(np.round(len(train_X) * prop))
sampled_rows = train_X.sample(n=num_samples, random_state=1)
# The sampled frame keeps its original row labels; use them to pick the
# matching entries of the label array before rebinding train_X.
train_Y = train_Y[sampled_rows.index]
train_X = sampled_rows

In [42]:
# Quick baseline: a single decision tree trained on the subsampled data and
# scored on the test split.
# random_state is pinned so the baseline accuracy is reproducible from run to
# run (the tree builder is otherwise randomised).
mod = tree.DecisionTreeClassifier(random_state=1)
mod.fit(train_X, train_Y)
preds = mod.predict(test_X)
# accuracy_score takes (y_true, y_pred).
print('Accuracy: ', accuracy_score(test_Y, preds))

Accuracy:  0.7532


In [48]:
# Print the class documentation to see which constructor options are available.
help(SuperLearnerClassifier)

Help on class SuperLearnerClassifier in module superlearner:

class SuperLearnerClassifier(builtins.object)
 |  SuperLearnerClassifier(use_stacked_prob=False, stacked_classifier='decision_tree', estimators_to_remove=[], include_original_input=False)
 |  
 |  An ensemble classifier that uses heterogeneous models at the base layer and a aggregation model at the aggregation layer.
 |  
 |  
 |  Parameters
 |  ----------
 |  
 |  use_stacked_prob : bool, optional (default = False)
 |      Option to use probability estimates rather than classifiacations 
 |      for training at the stacked layer.
 |  
 |  stacked_classifier : string or None, optional (default = decision_tree)
 |      Choice of classifier on the stacked dataset Z. Options are: 
 |      "decision_tree", "logistic_regression", "k_nearest_neighbours", 
 |      "random_forest" or "most_frequent".
 |      
 |  estimators_to_remove : list, optional (default = [])
 |      Option to remove (in order to specify) one or more of the ba

In [43]:
# Fit the super learner on the training data (this can take some time).
# The classifier is constructed with no arguments, i.e. with its defaults.
# Note: this rebinds `mod`, which previously held the decision-tree baseline.
mod = SuperLearnerClassifier()
mod.fit(train_X, train_Y)

In [44]:
# Score the fitted super learner on the test split.
y_pred = mod.predict(test_X)

# Overall accuracy first, then the per-class precision / recall / F1 report.
accuracy = accuracy_score(test_Y, y_pred)
print(
    "Accuracy: " + str(accuracy),
    classification_report(test_Y, y_pred),
    sep="\n",
)

# Confusion matrix: true labels on the rows, predicted labels on the columns;
# margins=True adds the "All" totals.
print("Confusion Matrix")
pd.crosstab(
    index=np.array(test_Y),
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Accuracy: 0.8418
              precision    recall  f1-score   support

           0       0.74      0.84      0.79      1000
           1       0.98      0.95      0.96      1000
           2       0.69      0.77      0.73      1000
           3       0.84      0.85      0.84      1000
           4       0.73      0.74      0.74      1000
           5       0.96      0.94      0.95      1000
           6       0.68      0.51      0.58      1000
           7       0.91      0.94      0.92      1000
           8       0.94      0.94      0.94      1000
           9       0.95      0.94      0.94      1000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,836,5,21,37,7,1,77,0,16,0,1000
1,9,951,3,22,7,0,6,0,2,0,1000
2,23,3,774,12,129,0,49,0,10,0,1000
3,59,10,22,849,25,0,32,0,3,0,1000
4,3,0,141,50,743,0,59,0,4,0,1000
5,1,0,0,0,0,936,1,43,0,19,1000
6,181,2,151,39,92,0,509,0,26,0,1000
7,0,0,0,0,0,23,0,942,2,33,1000
8,11,3,10,4,8,2,19,5,938,0,1000
9,0,0,0,0,0,12,0,47,1,940,1000


Let's investigate the strength of the individual base estimators and the strength of the correlations between their predictions.

In [45]:
# Base estimators held by the fitted super learner; iterated as a
# name -> model mapping.
trained_base_models = mod.estimators

# Test-set predictions of every base estimator, keyed by estimator name.
predictions = {
    name: model.predict(test_X)
    for name, model in trained_base_models.items()
}

# Pairwise Pearson correlation between the estimators' predicted labels.
# Putting the predictions side by side (one column per estimator) lets pandas
# compute the whole matrix at once and yields a float table rather than an
# object-dtype table filled one cell at a time.
# NOTE(review): the predictions are class labels; Pearson correlation treats
# them as numbers, so the values depend on how the classes are numbered.
# An agreement rate or Cohen's kappa may be a better diversity measure.
pearson_table = pd.DataFrame(predictions).corr(method="pearson")

# Accuracy of each individual base estimator on the test set, as a one-row
# table with one column per estimator. accuracy_score takes (y_true, y_pred).
accuracy_table = pd.DataFrame(
    {
        name: accuracy_score(test_Y, predicted)
        for name, predicted in predictions.items()
    },
    index=["Accuracy"],
)

In [46]:
print("\nPearson table of correlation between predictions of base level models:")
pearson_table


Pearson table of correlation between predictions of base level models:


Unnamed: 0,decision_tree,random_forest,bagging,logistic_regression,k_nearest_neighbours,linear_svc
decision_tree,1.0,0.879585,0.883291,0.852862,0.852773,0.841616
random_forest,0.879585,1.0,0.91244,0.897859,0.905728,0.881959
bagging,0.883291,0.91244,1.0,0.876212,0.87688,0.865801
logistic_regression,0.852862,0.897859,0.876212,1.0,0.876294,0.920903
k_nearest_neighbours,0.852773,0.905728,0.87688,0.876294,1.0,0.867719
linear_svc,0.841616,0.881959,0.865801,0.920903,0.867719,1.0


In [47]:
print("\nTable of accuracy for the individual base estimators on the test set:")
accuracy_table


Table of accuracy for the individual base estimators on the test set:


Unnamed: 0,decision_tree,random_forest,bagging,logistic_regression,k_nearest_neighbours,linear_svc
Accuracy,0.7616,0.8428,0.8328,0.8307,0.8206,0.8244
