# COMP47590: Advanced Machine Learning
# Assignment 1: The Super Learner

## Import Packages Etc

In [28]:
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd
from IPython.display import display, HTML, Image
import matplotlib.pyplot as plt
from matplotlib import pyplot
from random import randint
import numpy as np
import sys
from sklearn import dummy
import os

from sklearn import svm
from sklearn import tree
from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats.stats import pearsonr

%matplotlib inline
#%qtconsole

## Define Super Learner Classifier

The *Super Learner* is a heterogeneous stacked ensemble classifier. This is a classification model that uses a set of base classifiers of different types, the outputs of which are then combined in another classifier at the stacked layer. The Super Learner was described in [(van der Laan et al, 2007)](https://pdfs.semanticscholar.org/19e9/c732082706f39d2ba12845851309714db135.pdf) but the stacked ensemble idea has been around for a long time. 

Figure 1 shows a flow diagram of the Super Learner process (this is from (van der Laan et al, 2007) and the process is also described in the COMP47590 lecture "[COMP47590 2017-2018 L04 Supervised Learning Ensembles 3](https://www.dropbox.com/s/1ksx94nxtuyn4l8/COMP47590%202017-2018%20L04%20Supervised%20Learning%20Ensembles%203.pdf?raw=1)"). The base classifiers are trained and their outputs are combined along with the training dataset labels into a training set for the stack layer classifier. To avoid overfitting the generation of the stacked layer training set uses a k-fold cross validation process (described as V-fold in Figure 1). To further add variety to the base estimators a bootstrapping selection (as is used in the bagging ensemble approach).
 
![Super Learner Process Flow](SuperLearnerProcessFlow.png "Logo Title Text 1")
Figure 1: A flow diagram for the Super Learner


### Define the SuperLearnerClassifier Class

In [2]:
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class SuperLearnerClassifier(BaseEstimator, ClassifierMixin):
    
    """
    An ensemble classifier that uses heterogeneous models at the base layer and a aggregation model at the aggregation layer.



    Parameters
    ----------
    
    use_stacked_prob : bool, optional (default = False)
        Option to use probability estimates rather than classifiacations 
        for training at the stacked layer.
    
    stacked_classifier : string or None, optional (default = decision_tree)
        Choice of classifier on the stacked dataset Z. Options are: 
        "decision_tree", "logistic_regression", "k_nearest_neighbours", 
        "random_forest" or "most_frequent".
        
    estimators_to_remove : list or None, optional (default = None)
        Option to remove (in order to specify) one or more of the base
        estimators. Choose from: ["Decision tree", "Random forest", 
        "Linear SVM", "Bagging", "K neighbours", "Logistic regression"]
        
    include_original_input : bool, optional (default = False)
        Include original input data, X, at the stacked layer.
        
        
        
        
    Attributes
    ----------
    
    output = A dictionary of the form {"Base estimators": ... , "Stacked layer": ... } 
        storing all the trained based estimators as well as the trained stacked layer 
        estimator.
        
    Z = Pandas DataFrame containing the stacked layer dataset Z (when: use_stacked_prob 
        = False)
        
    Z_prob = Pandas DataFrame containing the stacked layer dataset Z (when: 
        use_stacked_prob = True)


    Notes
    -----
    

    See also
    --------
    
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309
            
    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = SuperLearnerClassifier()
    >>> iris = load_iris()
    >>> clf.fit(pd.DataFrame(iris.data), iris.target)
    >>> cross_val_score(clf, pd.DataFrame(iris.data), iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, use_stacked_prob = False, stacked_classifier = "decision_tree", estimators_to_remove = None,\
                include_original_input = False):
        """Setup a SuperLearner classifier .
        Parameters
        ----------

        Returns
        -------

        """     
        self.decision_tree = tree.DecisionTreeClassifier(criterion="entropy", max_depth=7, min_samples_split=11)
        self.random_forest = ensemble.RandomForestClassifier(n_estimators=500, max_features = 4) #change_max_features
        self.bagging = ensemble.BaggingClassifier(base_estimator = tree.DecisionTreeClassifier(criterion="entropy"), n_estimators=10)     
        self.logistic_model = linear_model.LogisticRegression()
        self.k_nearest_neighbours = neighbors.KNeighborsClassifier(n_neighbors=5)
        self.linear_svc = svm.SVC(kernel="linear",C=1.0, probability=True)

        self.include_original_input = include_original_input
        self.use_stacked_prob = use_stacked_prob
        
        #stacked layer classifier
        if stacked_classifier == "decision_tree" or stacked_classifier == None:
            self.Z_classifier = tree.DecisionTreeClassifier(criterion="entropy")
        elif stacked_classifier == "logistic_regression":
            self.Z_classifier = linear_model.LogisticRegression()
        elif stacked_classifier == "k_nearest_neighbours":
            self.Z_classifier = neighbors.KNeighborsClassifier(n_neighbors=5)
        elif stacked_classifier == "random_forest":
            self.Z_classifier = ensemble.RandomForestClassifier(n_estimators= 500)
        elif stacked_classifier == "most_frequent":
            self.Z_classifier = dummy.DummyClassifier(strategy="most_frequent")
        else:
            sys.exit('Error: Not known classifier for stacked layer classifier, check spelling')
        
        #select estimators
        self.estimators_to_remove = estimators_to_remove
        
        
    # The fit function to train a classifier
    def fit(self, X, Y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """    
        
        
        #full dictionary of estimators
        estimators = {"Decision tree":self.decision_tree, "Random forest":self.random_forest, "Bagging":self.bagging, "Logistic regression":self.logistic_model,\
                      "K neighbours":self.k_nearest_neighbours, "Linear SVC":self.linear_svc}
        
        #optional remove certain estimators 
        #can use any subset of the availabe estimators
        if self.estimators_to_remove == None:
            pass
        else:
            for key in self.estimators_to_remove:
                key_to_remove = key
                estimators = {key: value for key, value in estimators.items() if key != key_to_remove}

        
        results_list = []
        results_list_prob = []
        
        if self.use_stacked_prob == False: #use classifications at stacked layer
            k_fold = KFold(5, shuffle=False, random_state=None)
            for k, (train, test) in enumerate(k_fold.split(X, Y)): #looping through folds
                prediction_list = [] #creating a new list for each estimator
                for name, model in estimators.items(): #looping through folds
                    model.fit(X.iloc[train,], Y[train]) #fitting each model to fold training data
                    pred = model.predict(X.iloc[test,]) #predicting each model on its folds test data
                    prediction_list.append(np.array(pred)) 
                fold_k = pd.DataFrame(prediction_list) #Storing this paricular folds results as a dataframe
                fold_k = fold_k.T
                results_list.append(fold_k) #appending this folds results to a list for all folds
                

        
            self.Z = pd.concat(results_list).reset_index(drop = True) #Creating the Z dataframe
            self.Z.columns = estimators.keys()
            
            #include original inputs data?
            if self.include_original_input == False:
                pass
            else:
                X.reset_index(drop = True)
                self.Z = pd.concat([self.Z, X.reset_index(drop = True)], axis=1, join_axes=[self.Z.index])
            
            #fit Z_classifier to Z
            self.Z_classifier.fit(self.Z, Y)
           

        elif self.use_stacked_prob == True:
            k_fold = KFold(5, shuffle=False, random_state=None)
            for k, (train, test) in enumerate(k_fold.split(X, Y)): #looping through folds
                prediction_df = pd.DataFrame() 
                for name, model in estimators.items(): #looping through folds
                    model.fit(X.iloc[train,], Y[train]) #fitting each model to fold training data
                    pred_prob = model.predict_proba(X.iloc[test,]) #predicting each model on its folds test data
                    pred_prob = pd.DataFrame(pred_prob) 
                    prediction_df = pd.concat([prediction_df, pred_prob], axis=1) 
 
                fold_k_prob = prediction_df #Storing this paricular folds results as a dataframe
                results_list_prob.append(fold_k_prob) #appending this folds results to a list for all folds

            self.Z_prob = pd.concat(results_list_prob).reset_index(drop = True) #Creating the Z dataframe
            
            #include original inputs data?
            if self.include_original_input == False:
                pass
            else:
                X.reset_index(drop = True)
                self.Z_prob = pd.concat([self.Z_prob, X.reset_index(drop = True)], axis=1, join_axes=[self.Z_prob.index])
            


            #fit decision tree to Z
            self.Z_classifier.fit(self.Z_prob, Y)

            #Now retrain all estimators using full dataset
        for key, model in estimators.items():
             model.fit(X,Y)
             estimators[key] = model

        #return a list containing all relevant trained classifiers
        self.output = {"Base estimators":estimators, "Stacked layer": self.Z_classifier}


        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        results_list2 = []
        
        if self.use_stacked_prob == False:
            for name, model in self.output["Base estimators"].items():
                results_list2.append(np.array(model.predict(X)))

            stacked_layer = pd.DataFrame(results_list2).T
            #check if predicting on input values too
            if self.include_original_input == False:
                pass
            else:
                X.reset_index(drop = True)
                stacked_layer = pd.concat([stacked_layer, X.reset_index(drop = True)], axis=1, join_axes=[stacked_layer.index])

        elif self.use_stacked_prob == True:
            for name, model in self.output["Base estimators"].items():
                results_list2.append(pd.DataFrame(model.predict_proba(X)))
        
            stacked_layer = pd.concat(results_list2, axis=1)
            #check if predicting on input values too
            if self.include_original_input == False:
                pass
            else:
                X.reset_index(drop = True)
                stacked_layer = pd.concat([stacked_layer, X.reset_index(drop = True)], axis=1, join_axes=[stacked_layer.index])



        return self.output["Stacked layer"].predict(stacked_layer)



### Test the SuperLearnerClassifier

Perform a simple test using the SuperLearnClassifier on the Iris dataset

In [3]:
from sklearn.datasets import load_iris
clf = SuperLearnerClassifier()
iris = load_iris()
clf.fit(pd.DataFrame(iris.data), iris.target)
cross_val_score(clf, pd.DataFrame(iris.data), iris.target, cv=10)


array([ 1.        ,  0.93333333,  1.        ,  0.93333333,  0.86666667,
        1.        ,  0.93333333,  1.        ,  1.        ,  1.        ])

## Load & Partition Data

### Setup - IMPORTANT

Take only a sample of the dataset for fast testing

In [4]:
data_sampling_rate = 0.05 

Setup the number of folds for all grid searches (should be 5 - 10)

In [5]:
cv_folds = 10

### Load Dataset

Load the dataset and explore it.

In [6]:
dataset = pd.read_csv('fashion-mnist_train.csv')
dataset = dataset.sample(frac=data_sampling_rate) #take a sample from the dataset so everyhting runs smoothly
num_classes = 10
classes = {0: "T-shirt/top", 1:"Trouser", 2: "Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle boot"}
display(dataset.head())

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
59794,0,0,0,0,0,0,0,0,60,179,...,78,73,90,71,0,2,1,0,0,0
17504,6,0,0,0,0,0,0,0,0,0,...,94,13,0,0,0,0,0,0,0,0
58278,3,0,0,0,0,0,0,0,0,0,...,10,0,3,0,0,0,0,0,0,0
47064,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4339,8,0,1,1,0,0,3,0,0,0,...,45,79,143,183,153,101,91,9,0,1


### Pre-process & Partition Data

Perform data pre-processing and manipulation as required

In [7]:
X = dataset[dataset.columns[1:]]
Y = np.array(dataset["label"])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

## Train and Evaluate a Simple Model

Train a Super Learner Classifier using the prepared dataset

Solution: To do a simple evaluation we split the dataset into training and testing, we then fit the basic superlearner on the training set and asses its accuracy on the test set. We can also construct a confusion matrix which compares the super classifier's prediction to the true labels. From these results we can see that the classifer is performing to a reasonable level and is predicting far better than random guessing. However further tuning and testing is required at this stage.

In [8]:
superlearner = SuperLearnerClassifier() #default model
superlearner.fit(X_train,Y_train)

SuperLearnerClassifier(estimators_to_remove=None,
            include_original_input=False, stacked_classifier=None,
            use_stacked_prob=False)

Evaluate the trained classifier

In [9]:
ypred = superlearner.predict(X_test)
acc = metrics.accuracy_score(ypred, Y_test) #accuracy of this model 
conf_matrix = metrics.confusion_matrix(ypred, Y_test)
print("Accuracy:", acc, "\n")
print("Report:\n",metrics.classification_report(Y_test, ypred))
print("Confusion Matrix:")
pd.crosstab(np.array(Y_test), ypred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.787878787879 

Report:
              precision    recall  f1-score   support

          0       0.78      0.81      0.79       108
          1       0.93      0.96      0.94        94
          2       0.60      0.66      0.63       103
          3       0.74      0.78      0.76        86
          4       0.63      0.52      0.57       109
          5       0.94      0.94      0.94       105
          6       0.52      0.50      0.51        88
          7       0.86      0.91      0.88       106
          8       0.92      0.93      0.93        87
          9       0.93      0.88      0.90       104

avg / total       0.79      0.79      0.79       990

Confusion Matrix:


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,87,0,0,5,1,0,13,0,2,0,108
1,1,90,1,2,0,0,0,0,0,0,94
2,4,4,68,3,16,0,6,0,2,0,103
3,6,2,1,67,7,0,3,0,0,0,86
4,2,0,28,4,57,0,17,0,1,0,109
5,0,0,0,0,0,99,0,2,2,2,105
6,12,1,14,8,9,0,44,0,0,0,88
7,0,0,0,0,0,5,0,96,0,5,106
8,0,0,2,1,0,0,2,1,81,0,87
9,0,0,0,0,0,1,0,12,0,91,104


## Cross Validation Experiment (Task 2)

Perfrom a 10-fold cross validation experiment to evaluate the performance of the SuperLearnerClassifier

In [10]:
cv_superlearner = SuperLearnerClassifier()
ten_fold_cv = cross_val_score(cv_superlearner, X, Y, cv=10)
print("Ten fold cross validation results:\n", ten_fold_cv, "\n\nMean accuracy score:", ten_fold_cv.mean())

Ten fold cross validation results:
 [ 0.79934211  0.79537954  0.80463576  0.83056478  0.81333333  0.81
  0.80936455  0.85284281  0.8590604   0.80952381] 

Mean accuracy score: 0.818404709226


## Comparing the Performance of Different Stack Layer Approaches (Task 5)

Compare the performance of the ensemble when a label based stack layer training set and a probability based stack layer training set is used.

In [11]:
#There are four possible combinations here: 
#probability based stacked layer with logistic regression
#probability based stacked layer with decision tree
#classification based stacked layer with logistic regression
#classification based stacked layer with decision tree

#First lets compare average accuracy when cross validation is applied
prob_logistic = SuperLearnerClassifier(use_stacked_prob=True, stacked_classifier="logistic_regression")
prob_logistic_acc = cross_val_score(prob_logistic, X, Y)

prob_tree = SuperLearnerClassifier(use_stacked_prob=True, stacked_classifier="decision_tree")
prob_tree_acc = cross_val_score(prob_tree, X, Y)

class_logistic = SuperLearnerClassifier(use_stacked_prob=False, stacked_classifier="logistic_regression")
class_logistic_acc = cross_val_score(class_logistic, X, Y)

class_tree = SuperLearnerClassifier(use_stacked_prob=False, stacked_classifier="decision_tree")
class_tree_acc = cross_val_score(class_tree, X, Y)

print("Mean accuracy scores:\nProbability based stacked layer with logistic regression:",prob_logistic_acc.mean()\
      ,"\nProbability based stacked layer with decision tree:", prob_tree_acc.mean()\
      ,"\nClassification based stacked layer with logistic regression:", class_logistic_acc.mean()\
      ,"\nClassification based stacked layer with decision tree:", class_tree_acc.mean())

#These accuracy scores indicate which of the combinations is performing well on this dataset

Mean accuracy scores:
Probability based stacked layer with logistic regression: 0.771306742201 
Probability based stacked layer with decision tree: 0.785683989744 
Classification based stacked layer with logistic regression: 0.811647322493 
Classification based stacked layer with decision tree: 0.81267237404


In [12]:
#Now lets examine is the most accurate combination statistically significantly better than the next best
#We can do this using McNemars test as follows:
#1. Retrain the estimators of interest on a training set
class_logistic.fit(X_train, Y_train)
class_tree.fit(X_train, Y_train)
#2. Create a bool vector of whether the predictions were correct or incorrect
class_logistic_predictions = class_logistic.predict(X_test)
class_logistic_predictions = (class_logistic_predictions==Y_test)*1
class_tree_predictions = class_tree.predict(X_test)
class_tree_predictions = (class_tree_predictions==Y_test)*1
#3. Create a contingincy table of agreement/disagreement between predictions
results_table = pd.crosstab(class_logistic_predictions,class_tree_predictions)
#4. Apply mcnemars test from the statsmodels library using chi-squared tables 
mcnemar_result = mcnemar(results_table, exact = False)

print("Chi-squared score is:", mcnemar_result.statistic,"\nGiving p-value of:",mcnemar_result.pvalue\
      ," \nIf chi-squared value is greater than 3.84 we may reject the null hypothesis that the classifiers agree"\
      ," \nThus we will accept the alternative hypothesis that there is a statistically significant difference between the two")


#This test could be repeated (4C2)=6 times to check all possible differences, however I will not do that here
#in order to avoid unnecessary additional computation

Chi-squared score is: 175.875796178 
Giving p-value of: 3.85441046655e-40  
If chi-squared value is greater than 3.84 we may reject the null hypothesis that the classifiers agree  
Thus we will accept the alternative hypothesis that there is a statistically significant difference between the two


## Grid Search Through SuperLearnerClassifier Architectures & Parameters (Task 7)

Perfrom a grid search experiment to detemrine the optimal architecture and hyper-parameter values for the SuperLearnClasssifier for the MNIST Fashion classification problem.

Solution: Note as discussed on the moodle, I have not included all possible subsets of the base classifiers. I instead have included the options to use them all or see if removing any single one might improve the performance. However one could easily grid search on all possible subsets of the base 

In [13]:
# Set up the parameter grid to seaerch
param_grid = [
    {"use_stacked_prob":[True, False]\
    ,"stacked_classifier": ["decision_tree", "logistic_regression", "k_nearest_neighbours", 
        "random_forest","most_frequent"]\
    ,"estimators_to_remove":[None, ["Decision tree"], ["Random forest"], 
        ["Linear SVM"], ["Bagging"], ["K neighbours"], ["Logistic regression"]]}
]

# Perform the search
my_tuned_model = GridSearchCV(SuperLearnerClassifier(), param_grid, cv=cv_folds, verbose = 2, n_jobs=-1)
my_tuned_model.fit(X_train, Y_train)

# Print details
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)

Fitting 10 folds for each of 70 candidates, totalling 700 fits
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.7min
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.7min
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.7min
[CV] estimators_to_remove=None, stacked_classifier=decision_tree, use_stacked_prob=True 
[C

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 24.6min


[CV]  estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=None, stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=None, stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=None, stacked_classifier=k_nearest_

[CV] estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=None, stacked_classifier=most_frequent, use_stacked_prob=True 
[CV]  estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=None, stacked_classifier=most_frequent, use_stacked_prob=True 
[CV]  estimators_to_remove=None, stacked_classifier=random_forest, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=N

[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.3min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.3min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=logistic_regression, use_stacked_prob=True 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=logistic_regression, use_stacked_prob=True 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=decision_tree, use_stacked_prob=False, to

[CV] estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classif

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 104.7min


[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=k_nearest_neighbour

[CV] estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['Decision tree'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimato

[CV] estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.1min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.2min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.2min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 2.1min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=logi

[CV] estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=True, total= 2.2min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=True, total= 2.3min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=True, total= 2.2min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=True, total= 2.2min
[CV] estimators_to_remove=['Random forest'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_t

[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.6min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.6min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=decision_tree, use_stacked_prob=True, total= 2.6min
[CV] estimators_to_remove=['Linea

[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stack

[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 231.6min


[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.6min
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=random_forest, use_stacked_prob=True, total= 2.6min
[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=random_forest, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=random_forest, use_stacked_prob=True, total= 2.6min
[CV] estim

[CV] estimators_to_remove=['Linear SVM'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifier=most_frequent, use_stacked_prob=False, total= 2.6min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=decision_tree, use_stacked_prob=True, total= 1.7min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=decision_tree, use_stacked_prob=True 
[CV]  estimators_to_remove=['Linear SVM'], stacked_classifie

[CV]  estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 1.9min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=logistic_regression, use_stacked_prob=False, total= 1.8mi

[CV]  estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Bagging'], stacked_classifier=random_forest, use_stacked_prob=False, total= 1.8min
[CV] estimators_to_remove=['Bagging'], stacked_class

[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=decision_tree, use_stacked_prob=False, total= 2.5min
[CV] es

[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.4min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False, total= 2.4min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=k_neare

[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True, total= 2.5min
[CV] estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=False 
[CV]  estimators_to_remove=['K neighbours'], stacked_classifier=most_frequent, use_stacked_prob=True, total= 2.5min
[CV] estimator

[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True, total= 1.9min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True, total= 2.0min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True, total= 1.9min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=logistic_regression, use_stacked_prob=True, total= 1.9min
[CV] estimators_to_remove=['

[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 393.9min


[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 1.9min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 1.9min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 2.0min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True, total= 1.9min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=k_nearest_neighbours, use_stacked_prob=True 
[CV]  estimators_to_

[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=random_forest, use_stacked_prob=False, total= 2.0min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=random_forest, use_stacked_prob=False 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=random_forest, use_stacked_prob=False, total= 2.0min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=most_frequent, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=random_forest, use_stacked_prob=False, total= 2.0min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=most_frequent, use_stacked_prob=True 
[CV]  estimators_to_remove=['Logistic regression'], stacked_classifier=random_forest, use_stacked_prob=False, total= 2.0min
[CV] estimators_to_remove=['Logistic regression'], stacked_classifier=

[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed: 425.3min finished


Best parameters set found on development set:
{'estimators_to_remove': ['Logistic regression'], 'stacked_classifier': 'random_forest', 'use_stacked_prob': False}
0.817412935323


Evaluate the performance of the model selected by the grid search on a hold-out dataset

In [14]:
# Make a set of predictions for the test data
y_pred = my_tuned_model.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(Y_test, y_pred) 
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(Y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix")
pd.crosstab(np.array(Y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True)


Accuracy: 0.8
             precision    recall  f1-score   support

          0       0.77      0.82      0.79       108
          1       0.97      0.97      0.97        94
          2       0.64      0.68      0.66       103
          3       0.74      0.81      0.77        86
          4       0.63      0.61      0.62       109
          5       0.90      0.91      0.91       105
          6       0.56      0.45      0.50        88
          7       0.89      0.90      0.89       106
          8       0.94      0.92      0.93        87
          9       0.94      0.90      0.92       104

avg / total       0.80      0.80      0.80       990

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,89,0,1,9,1,0,7,0,1,0,108
1,0,91,2,1,0,0,0,0,0,0,94
2,2,2,70,4,16,0,8,0,1,0,103
3,6,0,1,70,7,0,2,0,0,0,86
4,2,0,24,4,67,0,12,0,0,0,109
5,0,0,0,0,0,96,0,4,2,3,105
6,16,1,11,6,13,0,40,0,1,0,88
7,0,0,0,0,0,8,0,95,0,3,106
8,1,0,0,1,2,0,2,1,80,0,87
9,0,0,0,0,0,3,0,7,0,94,104


## Evaluating the Impact of Adding Original Descriptive Features at the Stack Layer (Task 8)

Evaluate the impact of adding original descriptive features at the stack layer.

Solution: To test this hypothesis I have included our optimal parameters as two identical classifier set ups. One uses the descriptive features at the stacked layer while the other doesn't. The experiment is set up using cross validation as usual. It seems that including the descriptive features at the stacked layer can in fact decrease the overall accuracy of the superlearner.

In [15]:
#Saving best parameters
my_estimators_to_remove = my_tuned_model.best_params_["estimators_to_remove"]
my_stacked_classifier = my_tuned_model.best_params_["stacked_classifier"]
my_use_stacked_prob = my_tuned_model.best_params_["use_stacked_prob"]


include_descriptive_features = SuperLearnerClassifier(estimators_to_remove=my_estimators_to_remove\
                                                     ,stacked_classifier = my_stacked_classifier\
                                                     ,use_stacked_prob= my_use_stacked_prob\
                                                     ,include_original_input = True)

exclude_descriptive_features = SuperLearnerClassifier(estimators_to_remove=my_estimators_to_remove\
                                                     ,stacked_classifier = my_stacked_classifier\
                                                     ,use_stacked_prob= my_use_stacked_prob\
                                                     ,include_original_input = False)


include_accuracy = cross_val_score(include_descriptive_features, X, Y, cv=10)

exclude_accuracy = cross_val_score(exclude_descriptive_features, X, Y, cv=10)


In [16]:
print("Including descriptive features accuracy score:", include_accuracy.mean()\
      , "\nExcluding descriptive features accuracy score:", exclude_accuracy.mean())

Including descriptive features accuracy score: 0.784721154893 
Excluding descriptive features accuracy score: 0.829364463945


## Explore the Ensemble Model (Task 9)

Perform an analysis to investigate the strength of the base estimators and the strengths of the correlations between them.


Solution: There are a number of methods for examining the diversity of the base estimators in the superlearner model. The approach I have chosen to take is to examine the pearson correlation coefficient for predictions made by each of the base estimators. This will test the correlation between the vectors of predicted values. In order to obtain a good ensemble we wish to minimise the correlation while still maintaining the accuracy of the individual estimators. This will mean we have diversity in the ensemble. The results here seem to lie mostly between 0.8 and 0.9, this indicates reasonably strong correlation between the predictions. If we wish to improve the accuracy of the overall superclassifier a good approach would be to increase the diversity of the base estimators. This can be done by tuning parameters of existing models or adding new diverse models. I have also included the individual accuracy of the base estimators in a table below. We wish to always maintain the performance of the base estimators while trying to increase diversity. 


In [21]:
diversity = SuperLearnerClassifier() #fitting another superlearner
diversity.fit(X_train, Y_train)
trained_base_models = diversity.output["Base estimators"]

predictions = {} #obtaining predictions
for key,model in trained_base_models.items():
    predictions[key] =  model.predict(X_test)
    
A= predictions.keys()

#filling in the pearson table using the pearsonr function
pearson_table = pd.DataFrame(index = A, columns=A)
for i in A:
    for j in A:
            X = predictions[i]
            Y = predictions[j]
            pearson_table.loc[i,j] = pearsonr(X,Y)[0]

#measuring accuracy of individual base estimators on test set            
accuracy_table = pd.DataFrame(index = ["Accuracy"], columns=A)
for i in A:
    X = metrics.accuracy_score(predictions[i], Y_test)
    accuracy_table.loc["Accuracy",i] = X

In [25]:
print("\nPearson table of correlation between predictions of base level models:")
pearson_table


Pearson table of correlation between predictions of base level models:


Unnamed: 0,Decision tree,Random forest,Bagging,Logistic regression,K neighbours,Linear SVC
Decision tree,1.0,0.873177,0.868155,0.813103,0.834124,0.821288
Random forest,0.873177,1.0,0.899728,0.867155,0.89999,0.877812
Bagging,0.868155,0.899728,1.0,0.831083,0.864899,0.856286
Logistic regression,0.813103,0.867155,0.831083,1.0,0.846256,0.897554
K neighbours,0.834124,0.89999,0.864899,0.846256,1.0,0.879707
Linear SVC,0.821288,0.877812,0.856286,0.897554,0.879707,1.0


In [26]:
print("\nTable of accuracy for the individual base estimators on the test set:")
accuracy_table


Table of accuracy for the individual base estimators on the test set:


Unnamed: 0,Decision tree,Random forest,Bagging,Logistic regression,K neighbours,Linear SVC
Accuracy,0.743434,0.835354,0.780808,0.768687,0.780808,0.806061


In [29]:
os.system('jupyter nbconvert --to html Assignment1_SuperLearnerClassifier.ipynb')

0