In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.metrics
from sklearn.inspection import plot_partial_dependence, permutation_importance

import autosklearn.classification
import autosklearn.metrics
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

In [2]:
def collect_dataset_all_participants(csv_path='df_SPSS_Final_all_VP_with_recording_problems_removed.csv'):
    """Load the wide per-participant table and reshape it to one row per (participant, condition).

    The CSV is expected to contain a ``VP`` column plus one column per
    ``<condition>-<feature>`` pair, for the seven conditions and eight
    features listed below.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the wide CSV file. Defaults to the file name the notebook
        has always read from the current working directory.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Participant``, the eight feature columns,
        ``label_task_class`` (the condition name), ``label_mental_load_level``
        (the part of the condition name after the underscore, or the full name
        when there is no underscore, i.e. ``'baseline'``), and integer-coded
        versions of both labels (``*_factorized``, numbered by first
        appearance in condition order). Rows are sorted by ``Participant`` and
        then by ``label_task_class_factorized``, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        Raised by pandas if ``VP`` or any expected ``<condition>-<feature>``
        column is missing from the file.
    """
    # Order matters: factorized label codes follow this condition order, and
    # the feature order defines the column order of the returned table.
    conditions = ['baseline', 'Nback_LL', 'Nback_ML', 'Nback_HL', 'VIS_LL', 'VIS_ML', 'VIS_HL']
    features = ['eye_0_lhipa', 'eye_1_lhipa', 'HR', 'HRV', 'HR_std', 'HR_max', 'HR_min', 'Driving_Performance']

    wide_table = pd.read_csv(csv_path).sort_values('VP').reset_index(drop=True)

    per_condition_tables = []
    for condition in conditions:
        source_columns = ['VP'] + [f'{condition}-{feature}' for feature in features]
        condition_table = wide_table[source_columns].copy()
        # Strip the condition prefix so every slice shares the same column names.
        condition_table.columns = ['Participant'] + features
        condition_table['label_task_class'] = condition
        # 'Nback_LL' -> 'LL'; 'baseline' has no level suffix and is kept as is.
        condition_table['label_mental_load_level'] = (
            condition.split('_')[1] if '_' in condition else condition
        )
        per_condition_tables.append(condition_table)

    long_table = pd.concat(per_condition_tables, ignore_index=True)
    # Factorize before sorting so codes follow the condition order above.
    long_table['label_task_class_factorized'] = pd.factorize(long_table['label_task_class'])[0]
    long_table['label_mental_load_level_factorized'] = pd.factorize(long_table['label_mental_load_level'])[0]

    return (
        long_table
        .sort_values(by=['Participant', 'label_task_class_factorized'])
        .reset_index(drop=True)
    )

In [3]:
def class_type_to_number_of_classes(classes_type):
    """Translate a class-configuration name into its number of target classes.

    Recognised names are 'Mental_load_nback_only' (3),
    'Mental_load_and_Sec_task' (7) and 'Mental_load_two_levels' (2).
    Any other value yields None.
    """
    known_configurations = (
        ('Mental_load_nback_only', 3),
        ('Mental_load_and_Sec_task', 7),
        ('Mental_load_two_levels', 2),
    )
    for configuration_name, class_count in known_configurations:
        if classes_type == configuration_name:
            return class_count
    # Unknown configuration names fall through without a class count.
    return None

In [4]:
def Dataset_adjustor(no_classes):
    """Build the feature matrix and label vector for the requested class setup.

    Parameters
    ----------
    no_classes : int
        7 keeps every task class produced by
        ``collect_dataset_all_participants``; 3 keeps only the rows labelled
        'Nback_LL', 'Nback_ML' and 'Nback_HL'; 2 keeps only 'Nback_LL' and
        'Nback_ML'.

    Returns
    -------
    x : numpy.ndarray of float
        One row per kept sample with the columns HR, HRV, HR_std, HR_max,
        HR_min, eye_0_lhipa, eye_1_lhipa, Driving_Performance (in that order).
    y : numpy.ndarray of int
        Task-class codes. For the filtered setups the codes are re-numbered
        from 0 in order of first appearance within the kept rows.

    Raises
    ------
    ValueError
        If ``no_classes`` is not 2, 3 or 7. Previously such a value only
        failed at the ``return`` statement with an UnboundLocalError, after
        the dataset had already been loaded.
    """
    task_classes_by_count = {
        3: ['Nback_LL', 'Nback_ML', 'Nback_HL'],
        2: ['Nback_LL', 'Nback_ML'],
    }
    feature_columns = ['HR', 'HRV', 'HR_std', 'HR_max', 'HR_min',
                       'eye_0_lhipa', 'eye_1_lhipa', 'Driving_Performance']

    # Validate before touching the data so a bad configuration fails fast.
    if no_classes not in (2, 3, 7):
        raise ValueError(
            f"Unsupported number of classes: {no_classes!r} (expected 2, 3 or 7)"
        )

    dataset__ = collect_dataset_all_participants()

    if no_classes == 7:
        # All task classes: reuse the codes assigned when the table was built.
        y = dataset__['label_task_class_factorized'].values
    else:
        kept_rows = dataset__['label_task_class'].isin(task_classes_by_count[no_classes])
        dataset__ = dataset__[kept_rows].reset_index(drop=True)
        # Re-number the remaining classes so the labels start at 0.
        y = pd.factorize(dataset__['label_task_class'])[0]

    x = dataset__[feature_columns].values.astype(float)

    return x, y
        

In [5]:
def main_inner_loop_cv(X_train, X_test, y_train, y_test):
    """Fit an auto-sklearn classifier on one fold and report how it performs.

    The classifier is created with time_left_for_this_task=360,
    per_run_time_limit=30, seed=1514 and balanced accuracy as the search
    metric; other constructor options (e.g. ensemble_nbest,
    include_estimators, resampling_strategy) are left at their library
    defaults.

    Printed/plotted, in order: train accuracy and classification report, test
    accuracy and classification report, the leaderboard, the sprint
    statistics, then permutation importance (box plot plus a text listing)
    for the train split and for the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices; columns are assumed to follow the order of
        ``features_name`` below -- that list is only used for labelling.
    y_train, y_test : array-like
        Target labels for the two splits.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by
        ``sklearn.metrics.accuracy_score``.
    """
    features_name = ['HR','HRV','HR_std','HR_max','HR_min','eye_0_lhipa','eye_1_lhipa','Driving_Performance']

    def _report_permutation_importance(header, X, y, seed):
        # Shared by the train and test reports; only data and seed differ.
        print(header)

        r = permutation_importance(cls, X, y, n_repeats=30, random_state=seed)

        ascending_idx = r.importances_mean.argsort()
        descending_idx = ascending_idx[::-1]

        # Box plot shows the most important feature first ...
        tick_names = [features_name[i] for i in descending_idx]
        plt.boxplot(r.importances[descending_idx].T, labels=tick_names)
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

        # ... while the text listing runs from least to most important.
        for i in ascending_idx:
            print(f"{features_name[i]:10s}: {r.importances_mean[i]:.3f} +/- "
                  f"{r.importances_std[i]:.3f}")

        print("#"*80)

    print("#"*80)
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=360,
        per_run_time_limit=30,
        seed=1514,
        metric=autosklearn.metrics.balanced_accuracy,
    )
    cls.fit(X_train, y_train)

    # --- train split ---
    train_predictions = cls.predict(X_train)
    train_accuracy_per_outer_fold = sklearn.metrics.accuracy_score(y_train, train_predictions)
    train_report = sklearn.metrics.classification_report(y_train, train_predictions)
    print("Train Accuracy score", train_accuracy_per_outer_fold)
    print(f"Train Classification report for classifier {cls}:\n" f"{train_report}\n")

    # --- test split ---
    predictions = cls.predict(X_test)
    test_accuracy_per_outer_fold = sklearn.metrics.accuracy_score(y_test, predictions)
    test_report = sklearn.metrics.classification_report(y_test, predictions)
    print("Test Accuracy score", test_accuracy_per_outer_fold)
    print(f"Test Classification report for classifier {cls}:\n" f"{test_report}\n")
    print("#"*80)

    # --- search summary ---
    for summary in (cls.leaderboard, cls.sprint_statistics):
        print(summary())
        print("#"*80)

    # --- feature importance ---
    _report_permutation_importance('train permutation importance', X_train, y_train, 120)
    _report_permutation_importance('test permutation importance', X_test, y_test, 230)

    return train_accuracy_per_outer_fold, test_accuracy_per_outer_fold

In [6]:
# Experiment configuration.
# classes_type selects which rows/labels are used; the names understood by
# class_type_to_number_of_classes are:
#   'Mental_load_nback_only'   -> 3 classes (Nback_LL, Nback_ML, Nback_HL)
#   'Mental_load_and_Sec_task' -> 7 classes (every task class in the dataset)
#   'Mental_load_two_levels'   -> 2 classes (Nback_LL, Nback_ML)
# Any other name maps to None, which Dataset_adjustor cannot handle.
classes_type = 'Mental_load_nback_only'
no_classes = class_type_to_number_of_classes(classes_type)
k_folds = 5



# Load the feature matrix and label vector for the chosen class setup.
x_all , y_all = Dataset_adjustor(no_classes)
# x_all , y_all = shuffle(x_all , y_all)
# Define the K-fold cross validator. KFold is used without shuffling, so each
# fold is a contiguous range of rows in the order returned by Dataset_adjustor.
kfold = KFold(n_splits=k_folds)

# Sanity print: shapes and full contents of the features and labels.
print('--------------------------------')
print(x_all.shape, x_all , y_all.shape , y_all)



--------------------------------
(135, 8) [[78.36956522 11.44072156  0.7330311  ...  1.84444444  1.81231932
   0.22411563]
 [80.67391304 13.95085566  2.04359389 ...  2.22222222  2.33333333
   0.19627159]
 [80.06521739 12.88111319  1.13064379 ...  2.33333333  2.3
   0.09948809]
 ...
 [93.5        18.73675356  1.99183114 ...  2.12222222  2.03718134
   0.21394494]
 [93.66666667 17.51710128  2.4712911  ...  2.1         2.11111111
   0.23378205]
 [90.80645161 18.25573168  3.7019555  ...  1.86895094  1.87777778
   0.72466483]] (135,) [0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0
 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1
 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2
 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2]


In [None]:
# Alternative single hold-out split (unused):
#X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(x_all, y_all)
#print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Per-fold results: one accuracy value is appended per fold (the means are
# computed in a later cell).
train_accuracy_average = []
test_accuracy_average = []

# Outer cross-validation loop: split row indices, slice the data for the
# fold, then fit and evaluate a fresh auto-sklearn model on it.
for fold, (train_ids, test_ids) in enumerate(kfold.split(np.arange(len(x_all)))):

    # Fold header plus the index sets, to make the split visible in the log.
    print('-=========================================-')
    print(f'FOLD {fold}')
    print('------------')
    print(train_ids, test_ids)
    print(len(train_ids), len(test_ids))
#     print(x_all[test_ids])
#     print(y_all[test_ids])

    X_train = x_all[train_ids]
    y_train = y_all[train_ids]

    X_test = x_all[test_ids]
    y_test = y_all[test_ids]

    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    # Fits the model and prints its reports; returns (train, test) accuracy.
    train_accuracy, test_accuracy = main_inner_loop_cv(X_train, X_test, y_train, y_test)
    train_accuracy_average.append(train_accuracy)
    test_accuracy_average.append(test_accuracy)

FOLD 0
------------
[ 27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44
  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62
  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80
  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]
108 27
(108, 8) (108,)
(27, 8) (27,)
################################################################################


In [None]:
# Summary: the per-fold train/test accuracies, then their means across folds.
print(train_accuracy_average,test_accuracy_average)
print(np.mean(train_accuracy_average), np.mean(test_accuracy_average))
print("#"*80)