# Evaluation of Decision Tree Algorithm Performance #



## Import Statements ##

In [41]:
import scipy.io as sio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
sns.set()

## Constant Variable Definitions ##

In [42]:
# Registry of default-constructed estimators keyed by the string stored under
# 'clf_type' in each model record; retrain() looks an estimator up here by
# that key before applying the tuned parameters and fitting.
clfs = {
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier()
}

## Function Definitions ##
### Data Formatting and Preparation Functions ###

In [87]:
def Format(X:np.ndarray,
           y:np.ndarray,
           trials_per_condition:int=100,
           reduction_method:str='Ave_over_samples'):
    """
    Functionality: Prepares raw data for use in a ML algorithm.
        X and y are split into the two experiment conditions A and B; the
            split position along the trial axis is trials_per_condition.
        Each condition's X is passed through the selected reduction method
            and the result is transposed, so that the first index of every
            returned X array is the trial number.
    Param:
        X:: ndarray of unlabeled EEG data with dimensions (channels, samples, trials).
        y:: ndarray of data labels for X with dimensions (trials).
        trials_per_condition:: number of trials belonging to condition A; all
            remaining trials are treated as condition B.
        reduction_method:: name of the reduction to apply; must be a key of the
            module-level reduction_methods dictionary (KeyError otherwise).
    Returns:
        X_by_classes_reduced:: Dictionary with keys A,B holding the reduced and
            transposed X of each condition (first index = trial).
        y_by_classes:: Dictionary with keys A,B holding the labels of each condition.
    """
    X_split, y_split = class_split(X, y, trials_per_condition)
    reducer = reduction_methods[reduction_method]
    X_reduced = {}
    for condition, cube in X_split.items():
        # reducer output is (dim x trials); transpose to put trials first
        X_reduced[condition] = reducer(cube).T
    return X_reduced, y_split

def class_split(X:np.ndarray, y:np.ndarray, trials_per_condition:int):
    """
    Helper for Format: cuts X (along its third axis) and y (along its first
    axis) at trials_per_condition. Everything before the cut is condition 'A',
    everything from the cut onwards is condition 'B'.
    Returns:
        (X_dict, y_dict), each a dictionary with keys 'A' and 'B'.
    """
    cut = trials_per_condition
    X_dict = {'A': X[:, :, :cut], 'B': X[:, :, cut:]}
    y_dict = {'A': y[:cut], 'B': y[cut:]}
    return X_dict, y_dict
#Reduction methods
#Every reduction method takes a np.ndarray of shape (channels x samples x trials)
#and returns a np.ndarray of shape (dim x trials), where dim depends on the method
def _ave_over_channels(X):
    #mean along axis 0 (channels) -> (samples x trials)
    return np.mean(X, axis=0)


def _ave_over_samples(X):
    #mean along axis 1 (samples) -> (channels x trials)
    return np.mean(X, axis=1)


def _ch24_minus_ch8(X):
    #row 23 minus row 7 of the first axis, no averaging -> (samples x trials)
    return X[23, :] - X[7, :]


reduction_methods = {
    'Ave_over_channels': _ave_over_channels,
    'Ave_over_samples': _ave_over_samples,
    'Ave_over_samples_ch24_ch8': _ch24_minus_ch8,
}

def split(X,y, train_idx):
    """
    Functionality: Splits the per-condition data into a training part (the rows
        listed in train_idx) and a test part (all remaining rows). The same row
        indices are applied to condition A and condition B.
    Param:
        X:: Dictionary with keys A,B of arrays whose first index is the trial.
        y:: Dictionary with keys A,B of label arrays.
        train_idx:: sequence (list or 1-D array) of row indices used for training.
    Returns:
        Dictionary {'test': {'X': ..., 'y': ...}, 'train': {'X': ..., 'y': ...}}
        where every inner value is again a dictionary with keys A,B.
    """
    # set lookup: the complement is built in one pass instead of re-scanning
    # train_idx for every trial
    train_lookup = set(train_idx)
    # the row count is taken from condition A; B is indexed with the same rows
    test_idx = [idx for idx in range(len(X['A'])) if idx not in train_lookup]

    def take(rows):
        X_part = {'A': X['A'][rows], 'B': X['B'][rows]}
        y_part = {'A': y['A'][rows], 'B': y['B'][rows]}
        return {'X': X_part, 'y': y_part}

    return {'test': take(test_idx), 'train': take(train_idx)}

def train_test_partition(X_dict, y_dict, test_size=0.3, random_state=None):
    """
    Functionality: Draws one train/test split of the row indices with
        sklearn's train_test_split and applies that same split to every
        entry of X_dict and y_dict.
    Param:
        X_dict:: Dictionary of arrays indexed by trial along the first axis.
            The number of rows is read from X_dict['A'].
        y_dict:: Dictionary of label arrays.
        test_size, random_state:: forwarded unchanged to train_test_split.
    Returns:
        (X_dict_train, X_dict_test, y_dict_train, y_dict_test, train_idx, test_idx)
    """
    n_rows = X_dict['A'].shape[0]
    train_idx, test_idx = train_test_split(list(range(n_rows)),
                                           test_size=test_size,
                                           random_state=random_state)

    def subset(data, rows):
        #index every array of the dictionary with the same rows
        return {key: arr[rows] for key, arr in data.items()}

    X_dict_train = subset(X_dict, train_idx)
    X_dict_test = subset(X_dict, test_idx)
    y_dict_train = subset(y_dict, train_idx)
    y_dict_test = subset(y_dict, test_idx)
    return (X_dict_train, X_dict_test, y_dict_train, y_dict_test,
            train_idx, test_idx)

def recombine_test_train_partitions(X_train_dict, y_train_dict, X_test_dict, y_test_dict):
    """
    Functionality: Undoes a train/test partition by stacking, for each of the
        conditions A and B, the training rows followed by the test rows.
    Param:
        X_train_dict, X_test_dict:: Dictionaries with keys A,B of feature arrays.
        y_train_dict, y_test_dict:: Dictionaries with keys A,B of label arrays.
    Returns:
        (X_dict, y_dict):: Dictionaries with keys A,B; X is stacked with
            np.vstack and y with np.hstack, training part first.
    """
    conditions = ('A', 'B')
    X_dict = {cond: np.vstack([X_train_dict[cond], X_test_dict[cond]])
              for cond in conditions}
    y_dict = {cond: np.hstack([y_train_dict[cond], y_test_dict[cond]])
              for cond in conditions}
    return X_dict, y_dict

### Model Training Function ###

In [40]:
#retrain() refits a model only when necessary: it remembers the classifier
#type, tuned parameters and training data of the most recent fit and hands
#back that fitted model when all of them are unchanged.
last_X_train = np.array([])
last_y_train = np.array([])
last_clf_params = None
last_clf_type = None
last_clf = None

def retrain(clf:dict, X:np.ndarray, y:np.ndarray):
    """
    Functionality: Returns a model of type clf['clf_type'] fitted on (X, y)
        with the parameters in clf['clf'].best_params_. The most recent fit is
        cached in module-level globals and reused when the classifier type,
        the parameters and the training data all match.
    Param:
        clf:: model record; reads clf['clf'] (an object exposing best_params_)
            and clf['clf_type'] (a key of the module-level clfs dictionary).
        X:: training features, or None.
        y:: training labels.
    Returns:
        clf['clf'] unchanged when X is None; otherwise a fitted estimator.
    """
    global last_X_train
    global last_y_train
    global last_clf_params
    global last_clf_type
    global last_clf
    #sklearn is already a dependency of this notebook; imported locally so the
    #function is self-contained
    from sklearn.base import clone
    #if no training data is given, return input classifier
    if X is None:
        return clf['clf']
    clf_type = clf['clf_type']
    params = clf['clf'].best_params_
    #the cached model is reusable only if ALL of these hold; the type check
    #matters because different estimator types can share identical parameter
    #dictionaries. np.array_equal also covers shape mismatches.
    if (last_clf is not None
            and clf_type == last_clf_type
            and params == last_clf_params
            and np.array_equal(last_X_train, X)
            and np.array_equal(last_y_train, y)):
        return last_clf
    #fit a fresh copy so the prototype stored in clfs is never mutated and a
    #previously returned model is not refitted behind the caller's back
    clf_new = clone(clfs[clf_type])
    clf_new.set_params(**params)
    clf_new.fit(X, y)
    #update the cache only after a successful fit so a failed fit cannot
    #leave it half-updated
    last_clf = clf_new
    last_clf_type = clf_type
    last_clf_params = dict(params)
    last_X_train = X
    last_y_train = y
    return clf_new

### Model Evaluation Functions ###

In [96]:
def evaluate_model_on_condition(clf, 
                   X_test:np.ndarray,
                   y_test:np.ndarray,
                   X_train:np.ndarray=None,
                   y_train:np.ndarray=None,
                   n_rounds:int=1,
                   n_train_sample_bootstrap:int=None,
                   n_test_sample_bootstrap:int=None):
    """
    Functionality: Scores a model on one condition's test data n_rounds times.
        Each round (optionally) bootstraps the training data, obtains a model
        through retrain, (optionally) bootstraps the test data and computes
        the ROC AUC from the second column of predict_proba.
    Param:
        clf:: model record passed on to retrain.
        X_test, y_test:: test features and labels.
        X_train, y_train:: training features and labels; when X_train is None
            retrain returns the already fitted clf['clf'].
        n_rounds:: number of evaluation rounds.
        n_train_sample_bootstrap, n_test_sample_bootstrap:: bootstrap sample
            sizes; None disables the respective resampling.
    Returns:
        np.ndarray with one AUC per round. A round that raises ValueError is
        recorded as -1 instead of aborting the evaluation.
    """
    aucs = []
    for _ in range(n_rounds):
        try:
            X_fit, y_fit = bootstrap(X_train, y_train, n_train_sample_bootstrap)
            mod = retrain(clf, X_fit, y_fit)
            X_eval, y_eval = bootstrap(X_test, y_test, n_test_sample_bootstrap)
            y_pred = mod.predict_proba(X_eval)
            aucs.append(roc_auc_score(y_eval, y_pred[:, 1]))
        except ValueError:
            #deliberate best effort: presumably guards against resamples for
            #which the AUC is undefined (e.g. only one class drawn) -- confirm
            aucs.append(-1)
    return np.array(aucs)
        
def evaluate_model(clf:dict,
                  X_test_dict:dict,
                  y_test_dict:dict,
                  X_train_dict:dict=None,
                  y_train_dict:dict=None,
                  n_rounds_bootstrap:int=1,
                  n_train_sample_bootstrap:int=None,
                  n_test_sample_bootstrap:int=None,
                  redo_test_train_partition:bool=False,
                  train_AB_separately:bool=False):   
    """
    Functionality: Evaluates a model on conditions A and B and returns the AUC
        distribution of each.
    Param:
        clf:: model record passed on to evaluate_model_on_condition.
        X_test_dict, y_test_dict:: Dictionaries with keys A,B of test data.
        X_train_dict, y_train_dict:: Dictionaries with keys A,B of training
            data, or None to evaluate the already fitted model.
        n_rounds_bootstrap:: number of evaluation rounds per condition.
        n_train_sample_bootstrap, n_test_sample_bootstrap:: bootstrap sample
            sizes (None disables resampling).
        redo_test_train_partition:: if True, the given train and test data are
            recombined (or the test data used alone when no training data is
            given) and a new random train/test partition is drawn.
        train_AB_separately:: if True each condition is trained on its own
            training data; otherwise both conditions share one model trained
            on the stacked A and B training data.
    Returns:
        Dictionary {'A': A_aucs, 'B': B_aucs} of np.ndarrays.
    """
    conditions = ('A', 'B')
    if redo_test_train_partition:
        if X_train_dict is not None:
            X_dict, y_dict = recombine_test_train_partitions(X_train_dict,
                                                             y_train_dict, 
                                                             X_test_dict, 
                                                             y_test_dict)
        else:
            X_dict, y_dict = X_test_dict, y_test_dict
        X_train_dict, X_test_dict, y_train_dict, y_test_dict, _, _ = train_test_partition(X_dict, y_dict)
    if X_train_dict is None:
        #no training data in either mode: the fitted clf is used as is
        X_train = {cond: None for cond in conditions}
        y_train = {cond: None for cond in conditions}
    elif train_AB_separately:
        X_train, y_train = X_train_dict, y_train_dict
    else:
        #one pooled training set shared by both conditions
        pooled_X = np.vstack([X_train_dict['A'], X_train_dict['B']])
        pooled_y = np.hstack([y_train_dict['A'], y_train_dict['B']])
        X_train = {cond: pooled_X for cond in conditions}
        y_train = {cond: pooled_y for cond in conditions}
    return {cond: evaluate_model_on_condition(clf,
                                              X_test_dict[cond],
                                              y_test_dict[cond],
                                              X_train[cond],
                                              y_train[cond],
                                              n_rounds_bootstrap,
                                              n_train_sample_bootstrap,
                                              n_test_sample_bootstrap)
            for cond in conditions}
    

#Runs evaluate_model for one testing configuration and returns the
# resulting auc distribution in a dictionary together with the
# information specifying which model and configuration was tested
def get_model_eval(clf:dict,
                   X_test_dict:dict,
                   y_test_dict:dict,
                   X_train_dict:dict=None,
                   y_train_dict:dict=None,
                   n_rounds_bootstrap:int=1,
                   n_train_sample_bootstrap:int=None,
                   n_test_sample_bootstrap:int=None,
                   n_rounds_test_train_split:int=None,
                   separate_AB_models:bool=False):
    """
    Functionality: Calls evaluate_model once (n_rounds_test_train_split is
        None) or n_rounds_test_train_split times with a fresh train/test
        partition each time, concatenating the per-round AUCs.
    Returns:
        Dictionary with keys 'clf_type', 'reduction_method',
        'bootstrapped_training_data', 'n_rounds_test_train_split',
        'separate_AB_models' and 'aucs' ({'A': ndarray, 'B': ndarray}).
    """
    result = {'clf_type': clf['clf_type'],
              'reduction_method': clf['reduction_method'],
              'bootstrapped_training_data': n_train_sample_bootstrap,
              'n_rounds_test_train_split': n_rounds_test_train_split,
              'separate_AB_models' : separate_AB_models}

    def run_once(redo_partition):
        return evaluate_model(clf,
                              X_test_dict=X_test_dict,
                              y_test_dict=y_test_dict,
                              X_train_dict=X_train_dict,
                              y_train_dict=y_train_dict,
                              n_rounds_bootstrap=n_rounds_bootstrap,
                              n_train_sample_bootstrap=n_train_sample_bootstrap,
                              n_test_sample_bootstrap=n_test_sample_bootstrap,
                              redo_test_train_partition=redo_partition,
                              train_AB_separately=separate_AB_models)

    if n_rounds_test_train_split is None:
        result['aucs'] = run_once(False)
        return result

    per_round = [run_once(True) for _ in range(n_rounds_test_train_split)]
    #start from an empty float array so that zero rounds still yield an array
    result['aucs'] = {
        cond: np.concatenate([np.array([])] + [np.ravel(aucs[cond]) for aucs in per_round])
        for cond in ('A', 'B')
    }
    return result

        

def bootstrap(X,y, n_sample_bootstrap):
    """
    Functionality: Draws n_sample_bootstrap rows with replacement (uniform
        random indices from NumPy's global random state) and applies the
        same indices to X and y.
    Returns:
        (X, y) untouched if any of the three arguments is None, otherwise the
        resampled (X, y).
    """
    if any(arg is None for arg in (X, y, n_sample_bootstrap)):
        return X, y
    n_rows = X.shape[0]
    picks = np.random.randint(n_rows, size=n_sample_bootstrap)
    return X[picks], y[picks]
                   

In [44]:
def run_model_tests(data_matrix:np.ndarray, 
                       data_labels:np.ndarray,
                       clf_list:list, 
                       n_rounds_bootstrap:int=300, 
                       n_train_sample_bootstrap:int=500, 
                       n_test_sample_bootstrap:int=30, 
                       n_rounds_test_train_split:int=300):
    """
    Functionality: Evaluates every model record in clf_list under all
        combinations of the testing options
            training data bootstrapped (n_train_sample_bootstrap) / not (None)
            repeated train/test re-partitioning (n_rounds_test_train_split) / not (None)
            separate models for A and B (True) / one shared model (False)
        i.e. eight evaluations per record.
    Param:
        data_matrix, data_labels:: raw data and labels handed to Format.
        clf_list:: list of model records; each must provide 'reduction_method'
            and 'train_idx' plus whatever get_model_eval reads from it.
        n_rounds_bootstrap:: evaluation rounds used when the train/test
            partition is NOT redone; with re-partitioning a single round per
            partition is used.
        n_train_sample_bootstrap, n_test_sample_bootstrap:: bootstrap sizes.
        n_rounds_test_train_split:: number of re-partitioning rounds.
    Returns:
        List of the dictionaries returned by get_model_eval.
    """
    test_results = []
    #define testing options
    bootstrap_cond = (n_train_sample_bootstrap, None)
    redo_test_train_cond = (n_rounds_test_train_split, None)
    separate_AB_model_cond = (True, False)
    for clf in clf_list:
        #formatting and splitting depend only on the model record, so they
        #are done once per record instead of once per option combination
        X, y = Format(data_matrix, data_labels, reduction_method=clf['reduction_method'])
        data_dict = split(X, y, clf['train_idx'])
        for training_bootstraps in bootstrap_cond:
            for splits in redo_test_train_cond:
                #one round per partition when re-partitioning is active
                bootstrap_rounds = 1 if splits is not None else n_rounds_bootstrap
                for AB_model in separate_AB_model_cond:
                    test_results.append(get_model_eval(
                        clf=clf,
                        X_test_dict=data_dict['test']['X'],
                        y_test_dict=data_dict['test']['y'],
                        X_train_dict=data_dict['train']['X'],
                        y_train_dict=data_dict['train']['y'],
                        n_rounds_bootstrap=bootstrap_rounds,
                        n_train_sample_bootstrap=training_bootstraps,
                        n_test_sample_bootstrap=n_test_sample_bootstrap,
                        n_rounds_test_train_split=splits,
                        separate_AB_models=AB_model))

    return test_results


## Read in and prepare Data ##

In [56]:
# Load the previously fitted model records from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load clfs.p if it comes from a trusted source.
fitted_models = None
with open('./clfs.p', 'rb') as models:
    fitted_models = pickle.load(models)
# Overwrite random_state in every record's tuned parameters with None
# (presumably so that refits are not pinned to one seed -- confirm).
for mod in fitted_models:
    mod['clf'].best_params_['random_state'] = None

# Load the data cube and channel labels from the MATLAB files.
data = sio.loadmat('../Data/data_cube_subject1.mat')
channel_labels = sio.loadmat('../Data/channel_label.mat')
data_matrix = data['data_cube']
# Flatten the labels to 1-D. The hard-coded 200 presumably equals
# 2 conditions x 100 trials (Format's default trials_per_condition) -- confirm.
data_labels = data['event_label'].reshape((200,))

## Test Models ##

In [92]:
# Run the full evaluation with run_model_tests' default round counts; the
# IPython %time magic reports how long the call takes.
%time test_results = run_model_tests(data_matrix, data_labels, fitted_models)

#save test results to file
with open('../Data/test_results.p', 'wb') as results_file:
    pickle.dump(test_results, results_file)

1
1
1


KeyboardInterrupt: 

NameError: name 'test_results' is not defined

In [None]:
#test_results

In [98]:
# Re-run the evaluation using the channel-difference reduction method.
# NOTE: new_rf / new_ada are references to the records inside fitted_models,
# not copies, so the assignments below also change fitted_models[0] and
# fitted_models[3].
# NOTE(review): indices 0 and 3 are presumably the random-forest and AdaBoost
# records -- confirm against the contents of clfs.p.
new_rf = fitted_models[0]
new_rf['reduction_method'] = 'Ave_over_samples_ch24_ch8'
new_ada = fitted_models[3]
new_ada['reduction_method'] = 'Ave_over_samples_ch24_ch8'
new_models = [new_rf, new_ada]

# Positional arguments: n_rounds_bootstrap=1, n_train_sample_bootstrap=500,
# n_test_sample_bootstrap=30, n_rounds_test_train_split=3.
%time reduced_ds_test_results = run_model_tests(data_matrix, data_labels, new_models, 1, 500,30,3)

# Save the results of the reduced-dataset run to file.
with open('../Data/CH24_minus_CH8_test_results.p','wb') as results_file:
        pickle.dump(reduced_ds_test_results, results_file)
        
reduced_ds_test_results

CPU times: user 1min 31s, sys: 312 ms, total: 1min 32s
Wall time: 1min 32s


[{'clf_type': 'RandomForestClassifier',
  'reduction_method': 'Ave_over_samples_ch24_ch8',
  'bootstrapped_training_data': 500,
  'n_rounds_test_train_split': 3,
  'separate_AB_models': True,
  'aucs': {'A': array([0.6937799 , 0.88392857, 0.88392857]),
   'B': array([0.84598214, 0.59330144, 0.52631579])}},
 {'clf_type': 'RandomForestClassifier',
  'reduction_method': 'Ave_over_samples_ch24_ch8',
  'bootstrapped_training_data': 500,
  'n_rounds_test_train_split': 3,
  'separate_AB_models': False,
  'aucs': {'A': array([0.85267857, 0.74444444, 0.86444444]),
   'B': array([0.92410714, 0.81100478, 0.95535714])}},
 {'clf_type': 'RandomForestClassifier',
  'reduction_method': 'Ave_over_samples_ch24_ch8',
  'bootstrapped_training_data': 500,
  'n_rounds_test_train_split': None,
  'separate_AB_models': True,
  'aucs': {'A': array([0.96296296]), 'B': array([0.69444444])}},
 {'clf_type': 'RandomForestClassifier',
  'reduction_method': 'Ave_over_samples_ch24_ch8',
  'bootstrapped_training_data': 

In [84]:
# Scratch check: fit a default RandomForestClassifier on the first 100 rows of
# the row-23-minus-row-7 difference and report a single ROC AUC.
data_matrix
# Same difference as reduction_methods['Ave_over_samples_ch24_ch8'], followed
# by a transpose.
f= lambda X: (X[23,:] - X[7,:]).T

# NOTE(review): f already transposes, so this second .T undoes it. If
# data_matrix is (channels, samples, trials) as Format's docstring states, X
# ends up (samples, trials) and the [:100] slices below cut along samples
# rather than trials -- confirm the intended orientation.
X = f(data_matrix).T



# First 100 rows / labels vs. the remainder (the A/B cut used by class_split
# with the default trials_per_condition=100).
XA = X[:100]
XB = X[100:]
yA = data_labels[:100]
yB = data_labels[100:]

# Default random forest, default train_test_split (no fixed random_state, so
# the score varies between runs).
mod = RandomForestClassifier()
XA_train, XA_test, yA_train, yA_test = train_test_split(XA, yA)
mod.fit(XA_train,yA_train)
# Score using the second column of predict_proba.
y_pred = mod.predict_proba(XA_test)[:,1]
roc_auc_score(yA_test, y_pred)
#print(y_pred)
#data_matrix\


0.5625