# Evaluation of Decision Tree Algorithm Performance #

In [4]:
import scipy.io as sio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
sns.set()

## Read In Data ##

In [5]:
# Load the previously fitted models.
# NOTE(security): pickle.load can execute arbitrary code -- only open a
# clfs.p file that was produced by this project.
with open('./clfs.p', 'rb') as models:
    fitted_models = pickle.load(models)

# Load the subject-1 data cube and the channel labels from the .mat files.
data = sio.loadmat('../Data/data_cube_subject1.mat')
channel_labels = sio.loadmat('../Data/channel_label.mat')
data_matrix = data['data_cube']
# Flatten the labels to 1-D without hard-coding the number of trials.
data_labels = data['event_label'].reshape(-1)

In [26]:
# Reuse the training indices stored alongside entry 1 of the loaded models
# (presumably the partition those models were fitted on -- TODO confirm).
train_idx = fitted_models[1]['train_idx']


#### Prepare Data for use in testing ####

In [18]:
def Format(X:np.ndarray,
           y:np.ndarray,
           trials_per_condition:int=100,
           reduction_method:str='Ave_over_samples'):
    """
    Prepare data for use in an ML algorithm.

    X and y are split by experiment condition; the location of the split is
    determined by trials_per_condition. X then has its number of dimensions
    reduced by one according to the chosen reduction method, and the result is
    transposed so that the first index is the trial number.

    Param:
        X:: ndarray of unlabeled EEG data with dimensions (channels, samples, trials).
        y:: ndarray of data labels for X with dimensions (trials).
        trials_per_condition:: number of trials done for each condition. The
            first trials_per_condition trials are treated as condition A and
            the remaining trials as condition B.
        reduction_method:: key into reduction_methods selecting a function that
            maps a (channels x samples x trials) ndarray to an ndarray with one
            fewer dimension.
    Returns:
        X_by_classes_reduced:: dict with keys 'A' and 'B' holding the reduced X
            for each condition. The first index is the trial number; the second
            index depends on the reduction method.
        y_by_classes:: dict with keys 'A' and 'B' holding y for each condition.
    Raises:
        KeyError: if reduction_method is not a key of reduction_methods.
    """
    X_by_classes, y_by_classes = class_split(X, y, trials_per_condition)
    reduce = reduction_methods[reduction_method]
    X_by_classes_reduced = {typ: reduce(X_by_classes[typ]).T
                            for typ in X_by_classes}
    return X_by_classes_reduced, y_by_classes

def class_split(X:np.ndarray, y:np.ndarray, trials_per_condition:int):
    """
    Helper for Format: separate X and y into condition types A and B.

    The split is made along the last axis of X and along y at index
    trials_per_condition.
    """
    AX, BX = X[:,:,:trials_per_condition], X[:,:,trials_per_condition:]
    Ay, By = y[:trials_per_condition], y[trials_per_condition:]
    X_dict, y_dict = {'A': AX, 'B': BX}, {'A': Ay, 'B': By}
    return X_dict, y_dict

# Reduction methods.
# All reduction methods take a np.ndarray of shape (channels x samples x trials)
# and return a np.ndarray of shape (dim x trials), where dim depends on the method.
reduction_methods = {
    'Ave_over_channels': lambda X: np.mean(X,0),
    'Ave_over_samples': lambda X: np.mean(X,1),
}

In [25]:
def split(X, y, train_idx):
    """
    Partition per-condition data into train and test sets.

    Param:
        X:: dict with keys 'A' and 'B'; each value is indexable along its first
            axis by a list of trial indices.
        y:: dict with keys 'A' and 'B' holding the matching labels.
        train_idx:: trial indices to use for training. Every other index in
            range(len(X['A'])) becomes a test index, in ascending order. The
            same indices are applied to both conditions.
    Returns:
        {'test': {'X': ..., 'y': ...}, 'train': {'X': ..., 'y': ...}} where each
        inner value is a dict keyed by 'A' and 'B'.
    """
    # Set lookup keeps the complement computation linear in the number of trials.
    train_lookup = set(train_idx)
    test_idx = [idx for idx in range(len(X['A'])) if idx not in train_lookup]
    X_train = {'A': X['A'][train_idx], 'B': X['B'][train_idx]}
    y_train = {'A': y['A'][train_idx], 'B': y['B'][train_idx]}
    X_test = {'A': X['A'][test_idx], 'B': X['B'][test_idx]}
    y_test = {'A': y['A'][test_idx], 'B': y['B'][test_idx]}
    test_dict = {'X': X_test, 'y': y_test}
    train_dict = {'X': X_train, 'y': y_train}
    return {'test': test_dict, 'train': train_dict}

# Averaging over samples: format into per-condition dicts, then partition them
# using the stored training indices.
X_ave_over_samples, y_ave_over_samples = Format(
    data_matrix,
    data_labels,
    reduction_method='Ave_over_samples',
)
data_ave_over_samples = split(X_ave_over_samples, y_ave_over_samples, train_idx)

# Same pipeline, averaging over channels instead.
X_ave_over_channels, y_ave_over_channels = Format(
    data_matrix,
    data_labels,
    reduction_method='Ave_over_channels',
)
data_ave_over_channels = split(X_ave_over_channels, y_ave_over_channels, train_idx)

{'test': {'X': {'A': array([[-1.32261406e-14, -5.49186600e-15,  4.68384451e-15, ...,
            5.31244354e-15,  2.80673315e-15, -1.93071475e-16],
          [-6.68553412e-15, -1.38140407e-15,  3.79938855e-15, ...,
           -4.28864358e-15, -4.64637177e-15, -9.01256655e-15],
          [-8.29165056e-15, -5.40997193e-16, -8.14970084e-16, ...,
            4.45603009e-15,  7.73278557e-16, -3.69962668e-15],
          ...,
          [-2.19332174e-14, -8.97732728e-16,  1.75551108e-15, ...,
           -1.06511924e-15, -1.07926458e-15, -4.96377333e-15],
          [ 1.46540754e-15, -1.15619538e-15, -1.67014271e-16, ...,
           -3.30057921e-16,  1.64929695e-15, -6.84882594e-15],
          [ 3.45444079e-16,  1.69744074e-15, -1.11524834e-15, ...,
           -3.12686451e-16,  1.27159157e-15, -4.84812898e-15]]),
   'B': array([[ 1.68552887e-15,  2.12180092e-17, -1.24603069e-15, ...,
            1.14056106e-15, -5.61470711e-18,  3.61425831e-15],
          [-9.34585060e-16, -1.37929468e-15,  1.30

In [7]:
# Registry of unfitted estimators with default hyper-parameters, keyed by the
# estimator's class name.
clfs = dict(
    RandomForestClassifier=RandomForestClassifier(),
    AdaBoostClassifier=AdaBoostClassifier(),
)

In [8]:
def train_test_partition(X_dict, y_dict, test_size=0.3, random_state=None):
    """
    Draw a random train/test partition and apply it to every condition.

    Param:
        X_dict:: dict of feature arrays keyed by condition; the number of trials
            is read from X_dict['A'].shape[0].
        y_dict:: dict of label arrays keyed by condition.
        test_size:: forwarded to sklearn's train_test_split.
        random_state:: forwarded to sklearn's train_test_split.
    Returns:
        (X_dict_train, X_dict_test, y_dict_train, y_dict_test, train_idx, test_idx)
        The same index lists are used for every key of X_dict and y_dict.
    """
    all_idx = list(range(X_dict['A'].shape[0]))
    train_idx, test_idx = train_test_split(all_idx, test_size=test_size, random_state=random_state)
    # Make the splits, using identical indices for each condition.
    X_dict_train = {key: X_dict[key][train_idx] for key in X_dict}
    X_dict_test = {key: X_dict[key][test_idx] for key in X_dict}
    y_dict_train = {key: y_dict[key][train_idx] for key in y_dict}
    y_dict_test = {key: y_dict[key][test_idx] for key in y_dict}

    return (X_dict_train,
            X_dict_test,
            y_dict_train,
            y_dict_test,
            train_idx,
            test_idx)

def recombine_test_train_partitions(X_train_dict, y_train_dict, X_test_dict, y_test_dict):
    """
    Undo a train/test partition by stacking train and test data per condition.

    For each of the conditions 'A' and 'B' the training rows come first,
    followed by the test rows.

    Returns:
        (X_dict, y_dict): dicts keyed by 'A' and 'B'.
    """
    XA_train, XB_train = X_train_dict['A'], X_train_dict['B']
    yA_train, yB_train = y_train_dict['A'], y_train_dict['B']
    XA_test, XB_test = X_test_dict['A'], X_test_dict['B']
    yA_test, yB_test = y_test_dict['A'], y_test_dict['B']
    XA = np.vstack([XA_train, XA_test])
    # Condition B is rebuilt from its own training rows (not A's test rows).
    XB = np.vstack([XB_train, XB_test])
    yA = np.hstack([yA_train, yA_test])
    yB = np.hstack([yB_train, yB_test])
    X_dict = {'A': XA, 'B': XB}
    y_dict = {'A': yA, 'B': yB}
    return X_dict, y_dict

In [3]:
# Cache of the most recent training run. retrain() compares new training data
# against these values and only refits when something has changed.
last_X_train = np.array([])
last_y_train = np.array([])
last_clf = None

def retrain(clf, X, y):
    """
    Fit a classifier on (X, y), reusing the last fitted model when possible.

    Param:
        clf:: dict with key 'clf_type' (a key of the module-level clfs registry)
            and key 'clf' (the estimator returned when no training data is given).
        X:: training features, or None to skip training.
        y:: training labels.
    Returns:
        clf['clf'] if X is None; otherwise the estimator clfs[clf['clf_type']],
        fitted on (X, y). The estimator is not refitted when the previous call
        trained the same estimator on identical data.
    """
    # The cache lives at module level; without this declaration the assignments
    # below would make the names local and the reads would fail.
    global last_X_train, last_y_train, last_clf

    # Exception prevention: with no training data, return the input classifier.
    if X is None:
        return clf['clf']

    clf_type = clf['clf_type']
    requested = clfs[clf_type]

    # All of these must hold for the previously trained model to be reused.
    # np.array_equal also covers the shape-mismatch case (returns False).
    cache_hit = (
        last_clf is requested
        and np.array_equal(last_X_train, X)
        and np.array_equal(last_y_train, y)
    )
    if cache_hit:
        return last_clf

    requested.fit(X, y)
    last_clf = requested
    # Store copies so later in-place edits by the caller cannot fake a cache hit.
    last_X_train = np.array(X, copy=True)
    last_y_train = np.array(y, copy=True)
    return requested

In [4]:
def evaluate_model_on_condition(clf, 
                   X_test:np.ndarray,
                   y_test:np.ndarray,
                   X_train:np.ndarray=None,
                   y_train:np.ndarray=None,
                   n_rounds:int=1,
                   n_train_sample_bootstrap:int=None,
                   n_test_sample_bootstrap:int=None):
    """
    Compute a distribution of ROC AUC scores for one condition.

    Each round optionally bootstraps the training data, obtains a model through
    retrain(), optionally bootstraps the test data and scores the model on it.

    Param:
        clf:: classifier specification passed through to retrain().
        X_test, y_test:: evaluation features and labels.
        X_train, y_train:: training features and labels; when X_train is None,
            retrain() returns the already-fitted estimator stored in clf.
        n_rounds:: number of scoring rounds.
        n_train_sample_bootstrap:: bootstrap sample size for the training data,
            or None to use the training data unchanged.
        n_test_sample_bootstrap:: bootstrap sample size for the test data, or
            None to use the test data unchanged.
    Returns:
        np.ndarray holding one AUC value per round.
    """
    aucs = []
    for _ in range(n_rounds):
        X_fit, y_fit = bootstrap(X_train, y_train, n_train_sample_bootstrap)
        mod = retrain(clf, X_fit, y_fit)
        X_eval, y_eval = bootstrap(X_test, y_test, n_test_sample_bootstrap)
        # Score with the model returned by retrain(); `clf` is only its spec.
        proba = np.asarray(mod.predict_proba(X_eval))
        # For a binary problem roc_auc_score expects the 1-D score of the
        # positive class, i.e. the second column of predict_proba.
        if proba.ndim == 2 and proba.shape[1] == 2:
            scores = proba[:, 1]
        else:
            scores = proba
        aucs.append(roc_auc_score(y_eval, scores))
    return np.array(aucs)
        
def evaluate_model(clf:dict,
                  X_test_dict:dict,
                  y_test_dict:dict,
                  X_train_dict:dict=None,
                  y_train_dict:dict=None,
                  n_rounds_bootstrap:int=1,
                  n_train_sample_bootstrap:int=None,
                  n_test_sample_bootstrap:int=None,
                  redo_test_train_partition:bool=False,
                  train_AB_separately:bool=False):
    """
    Evaluate a classifier on conditions A and B and return their AUC distributions.

    Param:
        clf:: classifier specification forwarded to evaluate_model_on_condition.
        X_test_dict, y_test_dict:: test features / labels keyed by 'A' and 'B'.
        X_train_dict, y_train_dict:: training features / labels keyed by 'A'
            and 'B', or None when no training data is supplied.
        n_rounds_bootstrap:: number of scoring rounds per condition.
        n_train_sample_bootstrap, n_test_sample_bootstrap:: bootstrap sample
            sizes forwarded to evaluate_model_on_condition.
        redo_test_train_partition:: when True, the supplied train and test data
            are recombined (if training data was given) and a fresh partition is
            drawn with train_test_partition.
        train_AB_separately:: when True each condition is trained on its own
            data; otherwise both conditions share the stacked A+B training data.
    Returns:
        {'A_aucs': ndarray, 'B_aucs': ndarray}
    """
    if redo_test_train_partition:
        if X_train_dict is not None:
            X_dict, y_dict = recombine_test_train_partitions(X_train_dict,
                                                             y_train_dict,
                                                             X_test_dict,
                                                             y_test_dict)
        else:
            X_dict, y_dict = X_test_dict, y_test_dict
        (X_train_dict,
         X_test_dict,
         y_train_dict,
         y_test_dict,
         _,
         _) = train_test_partition(X_dict, y_dict)

    if X_train_dict is None:
        # No training data in either mode: downstream uses the pre-fitted model.
        XA_train = XB_train = yA_train = yB_train = None
    elif train_AB_separately:
        XA_train, XB_train = X_train_dict['A'], X_train_dict['B']
        yA_train, yB_train = y_train_dict['A'], y_train_dict['B']
    else:
        # Both conditions share one training set built from A followed by B.
        XA_train = XB_train = np.vstack([X_train_dict['A'], X_train_dict['B']])
        yA_train = yB_train = np.hstack([y_train_dict['A'], y_train_dict['B']])

    A_aucs = evaluate_model_on_condition(clf,
                                         X_test_dict['A'],
                                         y_test_dict['A'],
                                         XA_train,
                                         yA_train,
                                         n_rounds_bootstrap,
                                         n_train_sample_bootstrap,
                                         n_test_sample_bootstrap)
    B_aucs = evaluate_model_on_condition(clf,
                                         X_test_dict['B'],
                                         y_test_dict['B'],
                                         XB_train,
                                         yB_train,
                                         n_rounds_bootstrap,
                                         n_train_sample_bootstrap,
                                         n_test_sample_bootstrap)
    return {'A_aucs': A_aucs, 'B_aucs': B_aucs}
    
     

def bootstrap(X,y, n_sample_bootstrap):
    """
    Draw a bootstrap sample (with replacement) of rows from X and y.

    Param:
        X:: array indexed by trial along its first axis, or None.
        y:: labels aligned with X, or None.
        n_sample_bootstrap:: number of rows to draw, or None to disable bootstrapping.
    Returns:
        (X, y) unchanged if any argument is None; otherwise
        (X[idx], y[idx]) for n_sample_bootstrap random row indices idx.
    """
    # Identity checks: `None in [X, ...]` would compare arrays element-wise and
    # raise on the ambiguous truth value.
    if X is None or y is None or n_sample_bootstrap is None:
        return X, y
    bootstrap_idx = np.random.randint(X.shape[0], size=n_sample_bootstrap)
    return X[bootstrap_idx], y[bootstrap_idx]
                   

In [2]:

# The following function calls evaluate_model on the specified
# input, and returns an auc distribution in a dictionary along
# with information specifying which model was tested
def get_model_eval(clf:dict,
                   X_test_dict:dict,
                   y_test_dict:dict,
                   X_train_dict:dict=None,
                   y_train_dict:dict=None,
                   n_rounds_bootstrap:int=1,
                   n_train_sample_bootstrap:int=None,
                   n_test_sample_bootstrap:int=None,
                   n_rounds_test_train_split:int=None,
                   separate_AB_models:bool=False):
    """
    Run evaluate_model and package the AUCs with a description of the model.

    Param:
        clf:: classifier specification; must contain 'clf_type' and
            'reduction_method', and is forwarded to evaluate_model.
        X_test_dict, y_test_dict, X_train_dict, y_train_dict:: forwarded to
            evaluate_model.
        n_rounds_bootstrap, n_train_sample_bootstrap, n_test_sample_bootstrap::
            forwarded to evaluate_model.
        n_rounds_test_train_split:: None to evaluate once on the given
            partition; otherwise the number of times a new train/test partition
            is drawn and evaluated.
        separate_AB_models:: forwarded as evaluate_model's train_AB_separately.
    Returns:
        dict with keys 'clf_type', 'reduction_method',
        'n_rounds_test_train_split', 'separate_AB_models' and 'aucs'.
        'aucs' is evaluate_model's own result ({'A_aucs', 'B_aucs'}) for a
        single evaluation, or {'A': list, 'B': list} accumulated over all
        partitions when n_rounds_test_train_split is given.
    """
    output_dict = {'clf_type': clf['clf_type'],
                   'reduction_method': clf['reduction_method'],
                   'n_rounds_test_train_split': n_rounds_test_train_split,
                   'separate_AB_models' : separate_AB_models}

    def _evaluate(redo_partition):
        # Single place that spells out the long positional call.
        return evaluate_model(clf,
                              X_test_dict,
                              y_test_dict,
                              X_train_dict,
                              y_train_dict,
                              n_rounds_bootstrap,
                              n_train_sample_bootstrap,
                              n_test_sample_bootstrap,
                              redo_partition,
                              separate_AB_models)

    if n_rounds_test_train_split is None:
        output_dict['aucs'] = _evaluate(False)
    else:
        auc_dict = {'A': [], 'B': []}
        for _ in range(n_rounds_test_train_split):
            auc_i = _evaluate(True)
            # evaluate_model reports its results under 'A_aucs' / 'B_aucs'.
            auc_dict['A'].extend(auc_i['A_aucs'])
            auc_dict['B'].extend(auc_i['B_aucs'])

        output_dict['aucs'] = auc_dict

    return output_dict


In [11]:
# Reload the fitted models from disk (the same file read in the data-loading cell).
# NOTE(review): pickle.load can execute arbitrary code -- only use a trusted clfs.p.
with open('./clfs.p', 'rb') as models:
    fitted_models = pickle.load(models)
    

{'clf_type': 'RandomForestClassifier',
 'reduction_method': 'Ave_over_samples',
 'train_idx': [67,
  99,
  54,
  95,
  88,
  40,
  48,
  59,
  23,
  34,
  86,
  53,
  77,
  15,
  83,
  41,
  45,
  91,
  26,
  98,
  43,
  55,
  24,
  4,
  58,
  49,
  21,
  87,
  3,
  74,
  30,
  66,
  70,
  42,
  47,
  89,
  8,
  60,
  0,
  90,
  57,
  22,
  61,
  63,
  7,
  96,
  13,
  68,
  85,
  14,
  29,
  28,
  11,
  18,
  20,
  50,
  25,
  6,
  71,
  76,
  1,
  16,
  64,
  79,
  5,
  75,
  9,
  72,
  12,
  37],
 'test_idx': [80,
  84,
  33,
  81,
  93,
  17,
  36,
  82,
  69,
  65,
  92,
  39,
  56,
  52,
  51,
  32,
  31,
  44,
  78,
  10,
  2,
  73,
  97,
  62,
  19,
  35,
  94,
  27,
  46,
  38],
 'clf': GridSearchCV(estimator=RandomForestClassifier(), n_jobs=7,
              param_grid={'criterion': ['gini', 'entropy'],
                          'max_depth': (None, 10, 50, 100, 500),
                          'max_features': ('auto', 'sqrt', 'log2', None),
                          'min_impuri