# Predicting with EEG

In [None]:
import warnings
warnings.filterwarnings('ignore')
from utils import create_dataset_mri, create_dataset_eeg, create_dataset_eeg_old
import numpy as np
import tensorflow as tf
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.decomposition import PCA

### Utilities

In [None]:
def drop_nans_based_on_column(dataframe, column):
    '''
    Return a copy of the dataframe without the rows that have NaN in `column`.

    Rows are selected with a boolean mask instead of positional indices, so the
    function also works when the index is not 0..n-1 (e.g. after a previous
    filtering step or a groupby(...).get_group(...)).

    dataframe: pd.DataFrame to filter (it is not modified)
    column: label of the column to inspect
    => returns a new pd.DataFrame; the number of dropped rows is printed
    '''
    nan_mask = dataframe.loc[:, column].isna()
    print('dropped', int(nan_mask.sum()), 'samples')

    return dataframe.loc[~nan_mask].copy()

In [None]:
def drop_nulls_based_on_column(dataframe, column):
    '''
    Return a copy of the dataframe without the rows that are null in `column`.

    Rows are selected with a boolean mask instead of positional indices, so the
    function also works when the index is not 0..n-1 (e.g. after a previous
    filtering step or a groupby(...).get_group(...)).

    dataframe: pd.DataFrame to filter (it is not modified)
    column: label of the column to inspect
    => returns a new pd.DataFrame; the number of dropped rows is printed
    '''
    null_mask = dataframe.loc[:, column].isnull()
    print('dropped', int(null_mask.sum()), 'samples')

    return dataframe.loc[~null_mask].copy()

In [None]:
def get_statistics_column(dataframe, column, show_histo=True):
    '''
    Count how many rows of the dataframe carry each distinct value of `column`.

    dataframe: pd.DataFrame to inspect
    column: label of the column whose values are counted
    show_histo: when True, also draw a bar chart of the counts (bars are
                labelled 1..n on the x axis, not with the values themselves)
    => returns dict {value: number of rows with that value}
    '''
    distinct_values = set(dataframe.loc[:, column].values)
    grouped = dataframe.groupby(column)
    counts = {
        value: len(grouped.get_group(value).index)
        for value in distinct_values
    }
    if show_histo == True:
        positions = range(len(counts))
        plt.bar(positions, list(counts.values()), align='center', color='lightblue')
        plt.xticks(positions, range(1, len(counts) + 1))
        plt.show()
    return counts

In [None]:
# helper: prefix estimator parameters so they address the 'model_class' step of a sklearn pipeline
def make_pipe_model_params(params):
    '''
    params: dict {parameter name: value} for the estimator
    => returns a new dict whose keys are 'model_class__<parameter name>'
    '''
    return {'model_class__' + name: value for name, value in params.items()}

In [None]:
# helper: prefix parameters for a sklearn pipeline that also contains a 'pca' step
def make_pipe_model_params_pca(params):
    '''
    params: dict {parameter name: value}
    => returns a new dict where 'n_components' is addressed to the 'pca' step
       ('pca__n_components') and every other key to the 'model_class' step
    '''
    prefixed = {}
    for name, value in params.items():
        step_prefix = 'pca__' if name == 'n_components' else 'model_class__'
        prefixed[step_prefix + name] = value
    return prefixed

In [None]:
# Helper function for cross-validation in One-Class Classification
def cv_one_class_classification(model, data, labels, n_splits = 5):
    '''
    Cross-validate a classifier and collect the scores obtained on each fold.

    model: must be a sklearn object with .fit and .predict methods
    data: the X matrix containing the features, can be a pd.DataFrame or a np object (array or matrix)
    labels: y, can be a pd.DataFrame or a np array
    n_splits: number of desired folds
    => returns dict {'balanced_accuracy': [...], 'f1_score': [...]} holding one
       score per fold, in fold order

    The folds come from KFold(shuffle=True) without a fixed seed, so the scores
    change from one call to the next.
    '''
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=n_splits, shuffle=True)
    # positional indexing below requires plain arrays
    data = np.array(data)
    labels = np.array(labels)
    scores = {
        'balanced_accuracy' : [],
        'f1_score': []
    }
    for fold_number, (train, test) in enumerate(kf.split(data), start=1):
        # "\r" keeps the progress message on a single line
        print("Split: {}".format(fold_number), end="\r")
        X_train, X_test, y_train, y_test = data[train], data[test], labels[train], labels[test]
        model.fit(X=X_train, y=y_train)
        pred = model.predict(X_test)
        scores['balanced_accuracy'].append(balanced_accuracy_score(y_true=y_test, y_pred=pred))
        scores['f1_score'].append(f1_score(y_true=y_test, y_pred=pred))
    return scores

In [None]:
# Model classes are always searched in this order, whatever the order of the `models` dict
_ONE_CLASS_SEARCH_ORDER = (
    'RidgeClassifier',
    'LinearSVC',
    'SVC',
    'ExtraTreesClassifier',
    'RandomForestClassifier',
    'AdaBoostClassifier',
    'XGBClassifier',
)

# Model classes for which every sampled parameter combination is printed
_ONE_CLASS_ECHO_PARAMS = ('RidgeClassifier', 'LinearSVC')


def _one_class_search_space(model_class_name, verbose):
    '''
    Build the estimator and the parameter grid of one supported model class.
    The model class is imported only here, so a library that is not requested
    (e.g. xgboost) does not need to be installed.
    => returns (estimator instance, param_grid dict)
    '''
    if model_class_name == 'RidgeClassifier':
        from sklearn.linear_model import RidgeClassifier
        return RidgeClassifier(), {
            'alpha' : [0.5, 0.7, 1.0, 1.1, 1.3, 1.5, 2.0],
        }

    if model_class_name == 'LinearSVC':
        from sklearn.svm import LinearSVC
        return LinearSVC(), {
            'penalty': ['l1', 'l2'],
            'loss' : ['squared_hinge', 'hinge'],
            'dual' : [False],                   #according to scikit-learn better False if n_samples > n_features
            'tol' : [0.0001],
            'C': [1.0, 2.0, 5.0, 10.0, 0.5, 0.2],
            'fit_intercept' : [False],          #since data is assumed to be already centered
            'class_weight': ['balanced'],
            'max_iter' : [1000],
            'verbose' : [verbose],
        }

    if model_class_name == 'SVC':
        from sklearn.svm import SVC
        return SVC(), {
            'C' : [1.0, 2.0, 5.0, 10.0, 0.5, 0.2],
            'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
            'coef0': [0.0, 0.1, 0.2, 0.5],
            'class_weight': ['balanced'],
            'shrinking' : [True, False],
            'verbose' : [verbose],
        }

    if model_class_name == 'ExtraTreesClassifier':
        from sklearn.ensemble import ExtraTreesClassifier
        return ExtraTreesClassifier(), {
            'n_estimators' : [200, 250, 300],
            'criterion' : ['gini', 'entropy'],
            'min_samples_split' : [2, 3, 4],
            'bootstrap' : [True, False],
            'class_weight' : ['balanced', 'balanced_subsample'],
            'verbose' : [verbose],
            'n_jobs' : [3],
        }

    if model_class_name == 'RandomForestClassifier':
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(), {
            'n_estimators' : [150, 175, 200, 250],
            'criterion' : ['gini', 'entropy'],
            'min_samples_split' : [2, 3, 4],
            'bootstrap' : [True, False],
            'class_weight' : ['balanced', 'balanced_subsample'],
            'verbose' : [verbose],
            'n_jobs' : [3],
        }

    if model_class_name == 'AdaBoostClassifier':
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier
        model_class = AdaBoostClassifier()
        # Recent scikit-learn releases call the weak learner `estimator` and reject
        # `base_estimator`; with the wrong name set_params raises ValueError and every
        # combination would silently be recorded as 'non-valid'.
        if 'estimator' in model_class.get_params(deep=False):
            weak_learner_key = 'estimator'
        else:
            weak_learner_key = 'base_estimator'
        return model_class, {
            weak_learner_key : [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2)],
            'n_estimators' : [75, 100, 150],
            'learning_rate' : [1.0, 1.5, 2.0, 1.75],
        }

    if model_class_name == 'XGBClassifier':
        from xgboost import XGBClassifier
        return XGBClassifier(), {
            'max_depth' : [1, 2, 3],
            'learning_rate' : [0.1, 0.2, 0.3, 0.4],
            'n_estimators' : [150, 200, 250, 300],
            'verbosity' : [verbose],
            'objective' : ['binary:logistic'],
            'n_jobs' : [3],
            'gamma' : [0, 0.1, 0.5, 1],
            # NOTE(review): the grid used to list 'reg_alpha' twice, so only this second
            # list was ever used; if one of the two was meant to be 'reg_lambda', add it here.
            'reg_alpha' : [0, 0.5, 1, 2],
            'scale_pos_weight': [6.6],     # computed as negative samples/positive samples
        }

    raise KeyError(model_class_name)


def _search_one_class_model(model_class_name, model_class, param_grid, n_param_combinations,
                            X, y, CV_n_splits, impute_strategy):
    '''
    Cross-validate up to `n_param_combinations` randomly drawn (without
    replacement) parameter combinations of one model class.
    => returns list of [params, scores]; a combination rejected with a
       ValueError is stored as ['non-valid', 0]
    '''
    import random
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import ParameterGrid
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    p_grids = list(ParameterGrid(param_grid))
    total_combinations = len(p_grids)
    if n_param_combinations >= total_combinations:
        print('trying all possible combiations of parameters for ', model_class_name)
    else:
        print('trying ', n_param_combinations, ' possible combiations of parameters for ', model_class_name)

    model_class_results = []
    n_trials = int(min(n_param_combinations, total_combinations))
    for i in range(1, n_trials + 1):
        params = random.choice(p_grids)
        if model_class_name in _ONE_CLASS_ECHO_PARAMS:
            print(params)
        # remove the drawn combination so it cannot be drawn again
        p_grids.remove(params)
        try:
            pipe = Pipeline([
                ('imputing', SimpleImputer(strategy=impute_strategy)),
                ('scaling', StandardScaler()),
                ('model_class', model_class),
            ])
            model = pipe.set_params(**make_pipe_model_params(params))
            scores = cv_one_class_classification(model=model, data=X, labels=y, n_splits=CV_n_splits)
        except ValueError:
            # raised by sklearn for unsupported parameter combinations
            print('non supported combination of parameters')
            model_class_results.append(['non-valid', 0])
        else:
            model_class_results.append([params, scores])
        print('done ', i, ' out of ', n_param_combinations, ' combinations')
    return model_class_results


def _best_one_class_scores(model_class_results):
    '''
    Summarise the [params, scores] list of one model class.
    => returns the dict stored in `bests` for that model class: for each metric,
       the combination with the highest mean score over the folds
    '''
    metrics = ('balanced_accuracy', 'f1_score')
    means = {metric: [] for metric in metrics}
    medians = {metric: [] for metric in metrics}
    for params, scores in model_class_results:
        for metric in metrics:
            if params != 'non-valid':
                means[metric].append(np.mean(np.asarray(scores[metric])))
                medians[metric].append(np.median(np.asarray(scores[metric])))
            else:
                # sentinel below any real score, so invalid combinations never win
                means[metric].append(-1)
                medians[metric].append(-1)

    summary = {}
    for metric, other_metric in (metrics, metrics[::-1]):
        best_index = np.argmax(np.asarray(means[metric]))
        summary[metric] = {
            'mean': np.asarray(means[metric])[best_index],
            'median': np.asarray(medians[metric])[best_index],
            'parameters': model_class_results[best_index][0],
            # score of the same combination on the other metric
            other_metric: {
                'mean': np.asarray(means[other_metric])[best_index],
                'median': np.asarray(medians[other_metric])[best_index],
            },
        }
    return summary


def one_class_classify_CV(X, y, models=None, CV_n_splits = 5, verbose=2, impute_strategy='mean'):
    '''
    implements CV using 'CV helper function for One-Class Classification'

    Every model runs inside a pipeline: SimpleImputer(strategy=impute_strategy)
    -> StandardScaler -> model.

    X: dataset (features)
    y: labels
    models: dict 'model name' : number of combinations of parameters to try;
            supported names are RidgeClassifier, LinearSVC, SVC,
            ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier,
            XGBClassifier (other names are ignored). Defaults to {'LinearSVC': 5}.
    CV_n_splits: number of folds used for every combination
    verbose: verbosity passed to the models that accept it
    impute_strategy: strategy of the SimpleImputer step
    => returns (results, bests)
       results: {model name: list of [params, scores]}, with ['non-valid', 0]
                for combinations rejected by sklearn
       bests:   {model name: {'balanced_accuracy': {...}, 'f1_score': {...}}},
                each holding 'mean', 'median', 'parameters' of the best
                combination for that metric plus its scores on the other metric;
                model classes without any tried combination are left out
    '''
    if models is None:
        models = {'LinearSVC': 5}

    results = {}
    for model_class_name in _ONE_CLASS_SEARCH_ORDER:
        if model_class_name not in models:
            continue
        model_class, param_grid = _one_class_search_space(model_class_name, verbose)
        results[model_class_name] = _search_one_class_model(
            model_class_name, model_class, param_grid, models.get(model_class_name),
            X, y, CV_n_splits, impute_strategy,
        )
        print(model_class_name, ' done')

    bests = {}
    for model_class_name, model_class_results in results.items():
        # nothing was tried (0 combinations requested): argmax would fail on an empty list
        if not model_class_results:
            continue
        bests[model_class_name] = _best_one_class_scores(model_class_results)
    return results, bests

### Importing data 

In [None]:
#import data non-extensive (only clusters and ratios)
# create_dataset_eeg comes from the project's utils module; the result is used below as a
# pd.DataFrame that contains (at least) the diagnosis column 'DX_01_Cat'
eeg_clusters = create_dataset_eeg(SCORE = 'Age', clusters = True, ratios = True)

#drop samples with Nan in 'DX_01_Cat'
eeg_clusters = drop_nans_based_on_column(eeg_clusters, 'DX_01_Cat')

# cell output: (number of samples, number of columns) after the drop
eeg_clusters.shape

In [None]:
# cell output: display the dataset left after dropping the samples without 'DX_01_Cat'
eeg_clusters

### Statistics

In [None]:
#show statistics for DISEASE CATEGORIES 'DX_01_Cat' 
# draws the bar chart (show_histo defaults to True) and returns {category: number of samples},
# which is displayed as the cell output
get_statistics_column(eeg_clusters, 'DX_01_Cat')

We see that 'Neurodevelopmental Disorders' is by far the most frequent category, so we compute further statistics on it.

In [None]:
# Neurodevelopmental Disorders data stats
#create dataset for only ND patients and drop patients with nulls in 'DX_01_Sub', 'DX_01'
# NOTE(review): get_group keeps the row labels of eeg_clusters, so ND does not have a
# 0..n-1 index; the drop helper has to select rows by label, not by position - verify
ND = eeg_clusters.groupby('DX_01_Cat').get_group('Neurodevelopmental Disorders')
ND = drop_nans_based_on_column(ND, 'DX_01_Sub')
ND = drop_nans_based_on_column(ND, 'DX_01')

# show stats for subcategories in ND data
# (only the bar chart is shown: the returned dict is not the last expression of the cell)
get_statistics_column(ND, 'DX_01_Sub')

# show stats for 'DX_01' in ND data
get_statistics_column(ND, 'DX_01')

#### Some takeaway points are: 
- Neurodevelopmental Disorders (ND) samples have no NaNs in subcategories or diagnoses, and they make up most of the data (833/1305)
- The ADHD subcategory accounts for 577 samples out of 1305 (around 0.44) --> we can try to predict it against all the others

## Predicting ADHD (as a Subcategory)

### predicting ADHD subcat: preprocessing

In [None]:

# collect the distinct subcategories found in the data
subcats = list(set(eeg_clusters['DX_01_Sub'].values))
print(subcats)

# build the mapping ADHD-->1, every other subcat-->0
adhd_subcat = 'Attention-Deficit/Hyperactivity Disorder'
subcats.remove(adhd_subcat)
d = {adhd_subcat: 1, **{subcat: 0 for subcat in subcats}}
print(d)

# build the ADHD dataset on a copy of the data:
#  - replace each subcategory with its 0/1 value
#  - remove the other diagnosis-related columns
#  - rename 'DX_01_Sub'-->'label'
ADHD = (
    eeg_clusters
    .assign(DX_01_Sub=eeg_clusters['DX_01_Sub'].map(d))
    .drop(['DX_01_Cat', 'DX_01'], axis=1)
    .rename(columns={'DX_01_Sub': 'label'})
)

ADHD

In [None]:
# load the behavioral summaries: their SWAN columns will be used as a 'baseline' for predicting ADHD
behavioral = pd.read_csv('data/Behavioral/cleaned/HBNFinalSummaries.csv')

# names of the columns containing 'SWAN'
swan_cols = behavioral.filter(like='SWAN').columns.tolist()

# keep patient id, sex and the SWAN columns; the id column is renamed 'EID'-->'id'
SWAN = behavioral[['EID', 'Sex'] + swan_cols].rename(columns={'EID': 'id'})

SWAN

In [None]:
#join on patient id and get ADHD_SWAN dataset

# validate='one_to_one' makes the merge raise MergeError when an id is repeated in either table
try: 
    ADHD_SWAN = pd.merge(SWAN, ADHD, on='id', how='inner',validate='one_to_one')
except pd.errors.MergeError: 
    # repeated ids: keep the first row of each id in both tables (SWAN and ADHD are
    # rebound to the de-duplicated versions) and merge again
    SWAN = SWAN.drop_duplicates(subset=['id'], keep='first')
    ADHD = ADHD.drop_duplicates(subset=['id'], keep='first')
    ADHD_SWAN = pd.merge(SWAN, ADHD, on='id', how='inner',validate='one_to_one')

ADHD_SWAN

In [None]:
#plot distributions of SWAN columns values: one histogram per column
# (bar_color=None keeps matplotlib's default color)
for col, bar_color in (('SWAN_IN_Avg', None), ('SWAN_HY_Avg', 'green'), ('SWAN_Avg', 'orange')):
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('n° of patients')
    plt.hist(ADHD_SWAN[col], bins=50, color=bar_color)
    plt.show()

# correlation matrix for SWAN columns
ax = plt.axes()
swan_correlations = ADHD_SWAN[swan_cols].corr()
sns.heatmap(swan_correlations, annot=True, cmap=sns.color_palette("coolwarm", 4), ax=ax)
ax.set_title('Correlation matrix SWAN columns')
plt.show()


SWAN_Avg is highly correlated (about 0.9) with the other two SWAN columns, so it is better to drop it.

In [None]:
#drop SWAN Avg
ADHD_SWAN = ADHD_SWAN.drop('SWAN_Avg',axis=1)
# keep the list of SWAN column names in sync with the dataset
# (raises ValueError if this cell is run twice: the name is already gone)
swan_cols.remove('SWAN_Avg')

In [None]:
# cell output: display the dataset after dropping 'SWAN_Avg'
ADHD_SWAN

In [None]:
# dealing with nans: print name and number of missing values of every column that has some
nan_counts = ADHD_SWAN.isna().sum()
for column, nan_sum in nan_counts[nan_counts >= 1].items():
    print(column, '\t', nan_sum)

### predicting ADHD subcat: modelling

In [None]:
# get list for test ids
test_ids = pd.read_csv('data/test_IDS.csv')
# values of the 'ID' column, first as an array and then as a plain list
test_ids_a = test_ids['ID'].values
test_ids_l = list(test_ids_a)

In [None]:
# split train and test data ADHD_SWAN: the patients listed in test_ids_l form the test set
is_test = ADHD_SWAN['id'].isin(test_ids_l)
# explicit copies: the columns popped below are removed from these two frames only,
# without pandas treating them as slices of ADHD_SWAN
ADHD_SWAN_test = ADHD_SWAN.loc[is_test].copy()
ADHD_SWAN_train = ADHD_SWAN.loc[~is_test].copy()

#pop id and label column train (pop removes the column from the frame and returns it)
id_column_train = ADHD_SWAN_train.pop('id')
label_train = ADHD_SWAN_train.pop('label')

#pop id and label column test 
id_column_test = ADHD_SWAN_test.pop('id')
label_test = ADHD_SWAN_test.pop('label')

In [None]:
# dealing with Nans in training data
# print name and number of missing values of every training column that has some
for column in ADHD_SWAN_train.columns: 
    nan_sum = ADHD_SWAN_train.loc[:, column].isna().sum()
    if nan_sum >= 1: 
        print(column, '\t', nan_sum)
# The string below is disabled code (manual imputing and scaling), kept for reference only:
# it is never executed. Being the last expression of the cell, its text is what the cell displays.
'''
# imputing missing values: strategy= median
from sklearn.impute import SimpleImputer
impute_strategy = 'median'
imputer = SimpleImputer(strategy=impute_strategy)

imputer.fit(ADHD_SWAN_train)
ADHD_SWAN_train = pd.DataFrame(imputer.transform(ADHD_SWAN_train), index=ADHD_SWAN_train.index, columns=ADHD_SWAN_train.columns)
ADHD_SWAN_test = pd.DataFrame(imputer.transform(ADHD_SWAN_test), index=ADHD_SWAN_test.index, columns=ADHD_SWAN_test.columns)

# THIS PART IS REMOVED SINCE THE SCALING IS DONE DIRECTLY IN THE PIPELINE

# scaling data 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(ADHD_SWAN_train)

ADHD_SWAN_train = pd.DataFrame(scaler.transform(ADHD_SWAN_train), index=ADHD_SWAN_train.index, columns=ADHD_SWAN_train.columns)
ADHD_SWAN_test = pd.DataFrame(scaler.transform(ADHD_SWAN_test), index=ADHD_SWAN_test.index, columns=ADHD_SWAN_test.columns)
'''

#### predicting ADHD using only SWAN (baseline)

In [None]:
# Baseline design matrices: keep only the SWAN columns of each split.
X_swan_train = ADHD_SWAN_train.loc[:, swan_cols]
X_swan_test = ADHD_SWAN_test.loc[:, swan_cols]

##### SWAN: impute strategy='median'

In [None]:
# PIPELINE 1
# Hyper-parameter search for every model class on the SWAN-only features,
# with missing values imputed by the median.

imp_strategy = 'median'

# model class name -> integer passed to one_class_classify_CV for that class
# (NOTE(review): presumably the number of parameter combinations to try --
# confirm against the definition of one_class_classify_CV)
models = {
    'RidgeClassifier' : 10,
    'LinearSVC': 10,
    'SVC' : 15,
    'RandomForestClassifier' : 15,
    'ExtraTreesClassifier' : 15,
    'AdaBoostClassifier' : 15,
    'XGBClassifier' : 15
}
results_SWAN, bests_SWAN = one_class_classify_CV(X=X_swan_train, y=label_train, models=models, CV_n_splits=5, verbose=0, impute_strategy=imp_strategy)

# PIPELINE 2
# For every model class, take the parameters stored under the selected metric
# (balanced accuracy), wrap the estimator in an impute -> scale -> estimator
# pipeline and cross-validate it again on the training split.
selected_metric = 'balanced_accuracy'

model_classes_dict = {
    'RidgeClassifier' : RidgeClassifier(),
    'LinearSVC': LinearSVC(),
    'SVC' : SVC(),
    'RandomForestClassifier' : RandomForestClassifier(),
    'ExtraTreesClassifier' : ExtraTreesClassifier(),
    'AdaBoostClassifier' : AdaBoostClassifier(),
    'XGBClassifier' : XGBClassifier()
}

for model_class_name, model_class in model_classes_dict.items():
    parameters = bests_SWAN[model_class_name][selected_metric]['parameters']
    # set_params returns the estimator itself, configured with the best parameters
    estimator = model_class.set_params(**parameters)
    model = Pipeline([('imputing',SimpleImputer(strategy=imp_strategy)),('scaling', StandardScaler()), ('estimator', estimator)])
    results = cv_one_class_classification(model, X_swan_train, label_train, n_splits=5)
    b_acc_med = np.median(np.array(results['balanced_accuracy']))
    f1_score_med = np.median(np.array(results['f1_score']))
    b_acc_m = np.array(results['balanced_accuracy']).mean()
    f1_score_m = np.array(results['f1_score']).mean()
    # the f1_score line is tab-indented like the balanced_accuracy line
    # (the tab was missing here, unlike in the other report cells)
    print('\n\n',
          model_class_name, '\n\t balanced_accuracy:  mean {} \t median {}'
          '\n\t f1_score:  mean {} \t median {} \n\n'.format(b_acc_m, b_acc_med, f1_score_m, f1_score_med))
    
    

In [None]:
# PIPELINE 3
# Fit the manually chosen model class on the whole training split and
# score its predictions on the test split.
best_model_class = 'XGBClassifier'

best_model_parameters = bests_SWAN[best_model_class][selected_metric]['parameters']
best_estimator = model_classes_dict[best_model_class].set_params(**best_model_parameters)
best_model = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy=imp_strategy)),
    ('scaling', StandardScaler()),
    ('model', best_estimator),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
best_model_preds = best_model.fit(X_swan_train, label_train).predict(X_swan_test)

best_model_test_b_accuracy = balanced_accuracy_score(label_test, best_model_preds)
best_model_test_f1_score = f1_score(label_test, best_model_preds)

test_report = '\n\n Best model results test set: \n balanced_accuracy {} \t f1_score {} \n'
print(test_report.format(best_model_test_b_accuracy, best_model_test_f1_score))

##### SWAN: impute strategy='mean'

In [None]:
# PIPELINE 1
# Same search as for the median strategy, but missing values are
# imputed with the column mean.
imp_strategy = 'mean'

# model class name -> integer handed to one_class_classify_CV for that class
# (NOTE(review): looks like a per-model search budget -- confirm in its definition)
models = {
    'RidgeClassifier': 10,
    'LinearSVC': 10,
    'SVC': 15,
    'RandomForestClassifier': 15,
    'ExtraTreesClassifier': 15,
    'AdaBoostClassifier': 15,
    'XGBClassifier': 15,
}
results_SWAN, bests_SWAN = one_class_classify_CV(
    X=X_swan_train,
    y=label_train,
    models=models,
    CV_n_splits=5,
    verbose=0,
    impute_strategy=imp_strategy,
)

# PIPELINE 2
# Cross-validate, for each model class, the parameters stored under the
# selected metric inside an impute -> scale -> estimator pipeline.
selected_metric = 'balanced_accuracy'

model_classes_dict = {
    'RidgeClassifier': RidgeClassifier(),
    'LinearSVC': LinearSVC(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
}

summary_template = (
    '\n\t balanced_accuracy:  mean {} \t median {}'
    '\n\t f1_score:  mean {} \t median {} \n\n'
)

for model_class_name, model_class in model_classes_dict.items():
    parameters = bests_SWAN[model_class_name][selected_metric]['parameters']
    estimator = model_class.set_params(**parameters)
    model = Pipeline(steps=[
        ('imputing', SimpleImputer(strategy=imp_strategy)),
        ('scaling', StandardScaler()),
        ('estimator', estimator),
    ])
    results = cv_one_class_classification(model, X_swan_train, label_train, n_splits=5)

    # mean and median of the per-fold scores
    b_acc_scores = np.array(results['balanced_accuracy'])
    f1_scores = np.array(results['f1_score'])
    b_acc_m, b_acc_med = b_acc_scores.mean(), np.median(b_acc_scores)
    f1_score_m, f1_score_med = f1_scores.mean(), np.median(f1_scores)

    print('\n\n', model_class_name,
          summary_template.format(b_acc_m, b_acc_med, f1_score_m, f1_score_med))
    
    

In [None]:
# PIPELINE 3
# Train the hand-picked model class on the full training split and
# evaluate it on the test split.
best_model_class = 'LinearSVC'

best_model_parameters = bests_SWAN[best_model_class][selected_metric]['parameters']
best_estimator = model_classes_dict[best_model_class].set_params(**best_model_parameters)
best_model = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy=imp_strategy)),
    ('scaling', StandardScaler()),
    ('model', best_estimator),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
best_model_preds = best_model.fit(X_swan_train, label_train).predict(X_swan_test)

best_model_test_b_accuracy = balanced_accuracy_score(label_test, best_model_preds)
best_model_test_f1_score = f1_score(label_test, best_model_preds)

test_report = '\n\n Best model results test set: \n balanced_accuracy {} \t f1_score {} \n'
print(test_report.format(best_model_test_b_accuracy, best_model_test_f1_score))

#### predicting using only EEG 

In [None]:
# Design matrices for the EEG experiments: each split without the
# Sex, Age and SWAN columns.
non_eeg_cols = ['Sex', 'SWAN_IN_Avg', 'SWAN_HY_Avg', 'Age']
X_EEGxADHD_train = ADHD_SWAN_train.drop(columns=non_eeg_cols)
X_EEGxADHD_test = ADHD_SWAN_test.drop(columns=non_eeg_cols)

##### predicting using only EEG: impute strategy 'median'

In [None]:
# PIPELINE 1
# Hyper-parameter search for each model class on the EEG-only features,
# imputing missing values with the median.
imp_strategy = 'median'

# model class name -> integer handed to one_class_classify_CV for that class
# (NOTE(review): looks like a per-model search budget -- confirm in its definition)
models = {
    'RidgeClassifier': 10,
    'LinearSVC': 15,
    'SVC': 20,
    'RandomForestClassifier': 20,
    'ExtraTreesClassifier': 20,
    'AdaBoostClassifier': 20,
    'XGBClassifier': 20,
}
results_EEGxADHD, bests_EEGxADHD = one_class_classify_CV(
    X=X_EEGxADHD_train,
    y=label_train,
    models=models,
    CV_n_splits=5,
    verbose=0,
    impute_strategy=imp_strategy,
)

# PIPELINE 2
# Cross-validate, for each model class, the parameters stored under the
# selected metric inside an impute -> scale -> estimator pipeline.
selected_metric = 'balanced_accuracy'

model_classes_dict = {
    'RidgeClassifier': RidgeClassifier(),
    'LinearSVC': LinearSVC(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
}

summary_template = (
    '\n\t balanced_accuracy:  mean {} \t median {}'
    '\n\t f1_score:  mean {} \t median {} \n\n'
)

for model_class_name, model_class in model_classes_dict.items():
    parameters = bests_EEGxADHD[model_class_name][selected_metric]['parameters']
    estimator = model_class.set_params(**parameters)
    model = Pipeline(steps=[
        ('imputing', SimpleImputer(strategy=imp_strategy)),
        ('scaling', StandardScaler()),
        ('estimator', estimator),
    ])
    results = cv_one_class_classification(model, X_EEGxADHD_train, label_train, n_splits=5)

    # mean and median of the per-fold scores
    b_acc_scores = np.array(results['balanced_accuracy'])
    f1_scores = np.array(results['f1_score'])
    b_acc_m, b_acc_med = b_acc_scores.mean(), np.median(b_acc_scores)
    f1_score_m, f1_score_med = f1_scores.mean(), np.median(f1_scores)

    print('\n\n', model_class_name,
          summary_template.format(b_acc_m, b_acc_med, f1_score_m, f1_score_med))
    
    

In [None]:
# PIPELINE 3
# Fit the manually selected model class on the EEG training features and
# measure it on the EEG test features.
best_model_class = 'XGBClassifier'

best_model_parameters = bests_EEGxADHD[best_model_class][selected_metric]['parameters']
best_estimator = model_classes_dict[best_model_class].set_params(**best_model_parameters)
best_model = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy=imp_strategy)),
    ('scaling', StandardScaler()),
    ('model', best_estimator),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
best_model_preds = best_model.fit(X_EEGxADHD_train, label_train).predict(X_EEGxADHD_test)

best_model_test_b_accuracy = balanced_accuracy_score(label_test, best_model_preds)
best_model_test_f1_score = f1_score(label_test, best_model_preds)

test_report = '\n\n Best model results test set: \n balanced_accuracy {} \t f1_score {} \n'
print(test_report.format(best_model_test_b_accuracy, best_model_test_f1_score))

In [None]:
# Feature importances of an XGBoost model fitted on the EEG features.
# NOTE(review): this cell reads the 'accuracy' entry of bests_EEGxADHD and fits
# on X_EEGxADHD / label_column, whereas the modelling cells select
# 'balanced_accuracy' and work on X_EEGxADHD_train / label_train -- confirm
# that these names exist and are the intended ones.

from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib.pyplot import figure

xgb_parameters = bests_EEGxADHD['XGBClassifier']['accuracy']['parameters']
model = XGBClassifier()
model.set_params(**xgb_parameters)

model.fit(X_EEGxADHD, label_column)

plot_importance(model)
plt.show()

feature_importances = model.feature_importances_

# positions of the n_features largest importances, largest first
n_features = 5
n_most_imp_feat = feature_importances.argsort()[-n_features:][::-1]

# print name and importance of those features, in column order
for i, feature in enumerate(X_EEGxADHD.columns):
    if i not in n_most_imp_feat:
        continue
    print(feature, feature_importances[i])

#### predicting using SWAN + EEG + Sex + Age

##### predicting using impute strategy 'median'

In [None]:
# PIPELINE 1
# Hyper-parameter search for each model class on all features of the
# training split (SWAN + EEG + Sex + Age), with median imputation.
imp_strategy = 'median'

# model class name -> integer handed to one_class_classify_CV for that class
# (NOTE(review): looks like a per-model search budget -- confirm in its definition)
models = {
    'RidgeClassifier': 10,
    'LinearSVC': 15,
    'SVC': 20,
    'RandomForestClassifier': 20,
    'ExtraTreesClassifier': 20,
    'AdaBoostClassifier': 20,
    'XGBClassifier': 25,
}
results_EEGSWAN, bests_EEGSWAN = one_class_classify_CV(
    X=ADHD_SWAN_train,
    y=label_train,
    models=models,
    CV_n_splits=5,
    verbose=0,
    impute_strategy=imp_strategy,
)

# PIPELINE 2
# Cross-validate, for each model class, the parameters stored under the
# selected metric inside an impute -> scale -> estimator pipeline.
selected_metric = 'balanced_accuracy'

model_classes_dict = {
    'RidgeClassifier': RidgeClassifier(),
    'LinearSVC': LinearSVC(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
}

summary_template = (
    '\n\t balanced_accuracy:  mean {} \t median {}'
    '\n\t f1_score:  mean {} \t median {} \n\n'
)

for model_class_name, model_class in model_classes_dict.items():
    parameters = bests_EEGSWAN[model_class_name][selected_metric]['parameters']
    estimator = model_class.set_params(**parameters)
    model = Pipeline(steps=[
        ('imputing', SimpleImputer(strategy=imp_strategy)),
        ('scaling', StandardScaler()),
        ('estimator', estimator),
    ])
    results = cv_one_class_classification(model, ADHD_SWAN_train, label_train, n_splits=5)

    # mean and median of the per-fold scores
    b_acc_scores = np.array(results['balanced_accuracy'])
    f1_scores = np.array(results['f1_score'])
    b_acc_m, b_acc_med = b_acc_scores.mean(), np.median(b_acc_scores)
    f1_score_m, f1_score_med = f1_scores.mean(), np.median(f1_scores)

    print('\n\n', model_class_name,
          summary_template.format(b_acc_m, b_acc_med, f1_score_m, f1_score_med))
    
    

In [None]:
# PIPELINE 3
# Fit the manually selected model class on all training features and
# evaluate its predictions on the test split.
best_model_class = 'XGBClassifier'

best_model_parameters = bests_EEGSWAN[best_model_class][selected_metric]['parameters']
best_estimator = model_classes_dict[best_model_class].set_params(**best_model_parameters)
best_model = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy=imp_strategy)),
    ('scaling', StandardScaler()),
    ('model', best_estimator),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
best_model_preds = best_model.fit(ADHD_SWAN_train, label_train).predict(ADHD_SWAN_test)

best_model_test_b_accuracy = balanced_accuracy_score(label_test, best_model_preds)
best_model_test_f1_score = f1_score(label_test, best_model_preds)

test_report = '\n\n Best model results test set: \n balanced_accuracy {} \t f1_score {} \n'
print(test_report.format(best_model_test_b_accuracy, best_model_test_f1_score))

In [None]:
# Feature importances of the fitted best_model pipeline
# (SWAN + EEG + Sex + Age features).

from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib.pyplot import figure

best_model.fit(ADHD_SWAN_train, label_train)

# the final pipeline step is registered under the name 'model'
feature_importances = best_model['model'].feature_importances_

# positions of the n_features largest importances, largest first
n_features = 10
n_most_imp_feat = feature_importances.argsort()[-n_features:][::-1]

# Importances are positional with respect to the matrix the pipeline was
# fitted on, so the names must come from ADHD_SWAN_train.columns.
# ADHD_SWAN.columns still contains 'id' and 'label' (they were popped from the
# train/test frames only), which would pair importances with the wrong names.
# NOTE(review): assumes the imputing step keeps every input column -- confirm
# that no feature column is entirely NaN.
feature_names = ADHD_SWAN_train.columns
for i in n_most_imp_feat:
    # printed in decreasing order of importance
    print(feature_names[i], feature_importances[i])

In [None]:
# Scatter plot of two EEG features against each other, coloured by label_column.
x_feature = 'eyesclosed_fooof_aperiodic_slope_lfront'
y_feature = 'eyesclosed_fband_alpha_absmean_mfront'

figure(figsize=(15, 10))
plt.scatter(
    ADHD_SWAN[x_feature],
    ADHD_SWAN[y_feature],
    alpha=0.2,
    s=5,
    c=label_column,
    cmap='RdBu',
)
plt.xlabel(x_feature)
plt.ylabel(y_feature)

In [None]:
# Plot the distribution of 'eyesclosed_ratios_lower2alpha_beta_lfront_mpari'
# in the training split for positive (label == 1) vs negative (label == 0)
# samples: one figure per group, then both groups overlaid.

col_name = 'eyesclosed_ratios_lower2alpha_beta_lfront_mpari'
column = ADHD_SWAN_train[[col_name]].copy()
column['label'] = label_train   # aligned on the index

column_pos = column.loc[column['label'] == 1]
column_neg = column.loc[column['label'] == 0]

# norm_hist=True makes distplot draw a density, not a count of patients,
# so the y axis is labelled accordingly.
y_axis_label = 'density'

# one figure per group (color=None keeps the default colour)
for group_title, group_frame, group_color in (
    ('Patients with ADHD', column_pos, None),
    ('Patients without ADHD', column_neg, 'orange'),
):
    figure(figsize = (11,8))
    sns.distplot(group_frame[col_name], bins=50, color=group_color, norm_hist=True)
    plt.title(group_title)
    plt.xlabel(col_name)
    plt.ylabel(y_axis_label)
    plt.show()

# both groups on the same axes, with a legend to tell them apart
figure(figsize = (11,8))
sns.distplot(column_pos[col_name], bins=50, norm_hist=True, label='ADHD')
sns.distplot(column_neg[col_name], bins=50, color='orange', norm_hist=True, label='no ADHD')
plt.xlabel(col_name)
plt.ylabel(y_axis_label)
plt.legend()
plt.show()

In [None]:
# see if there is correlation between ratios and ADHD
#
# Keep only the rows whose DX_01_Cat is 'No Diagnosis Given' (healthy) or
# whose DX_01_Cat is 'Neurodevelopmental Disorders' with DX_01_Sub equal to
# 'Attention-Deficit/Hyperactivity Disorder'. Rows with a missing category or
# sub-category never match and are dropped, as before.
# A single boolean mask replaces the two row-by-row in-place drop loops.
is_healthy = eeg_clusters['DX_01_Cat'] == 'No Diagnosis Given'
is_adhd = (
    (eeg_clusters['DX_01_Cat'] == 'Neurodevelopmental Disorders')
    & (eeg_clusters['DX_01_Sub'] == 'Attention-Deficit/Hyperactivity Disorder')
)

# .copy() so that later column assignments do not touch eeg_clusters
eeg_clusters_c = eeg_clusters.loc[is_healthy | is_adhd].copy()
        


In [None]:
# Show the filtered eeg_clusters_c dataframe as the cell output.
eeg_clusters_c

In [None]:
# Ratio columns that are aggregated into a single score.
sig_ratios = [
    'eyesclosed_ratios_theta_lower2alpha_mfront_mpari',
    'eyesclosed_ratios_theta_lower2alpha_rpari_mpari',
    'eyesclosed_ratios_lower1alpha_lower2alpha_rfront_lfront',
    'eyesclosed_ratios_lower2alpha_beta_lfront_mpari',
    'eyesopen_ratios_theta_lower1alpha_rpari_lfront',
    'eyesopen_ratios_lower2alpha_beta_mfront_lpari',
]

# Row-wise sum of the selected ratio columns, stored as a new column.
eeg_clusters_c['sum_sig_ratios'] = eeg_clusters_c.loc[:, sig_ratios].sum(axis='columns')

In [None]:
# Plot the distribution of 'sum_sig_ratios' for healthy rows
# ('No Diagnosis Given') vs ADHD rows ('Neurodevelopmental Disorders' in the
# filtered eeg_clusters_c): one figure per group, then both groups overlaid.

col_name = 'sum_sig_ratios'
column = eeg_clusters_c[[col_name, 'DX_01_Cat']].rename(columns={'DX_01_Cat': 'label'})

column_h = column.loc[column['label'] == 'No Diagnosis Given' ]
column_adhd = column.loc[column['label'] == 'Neurodevelopmental Disorders']

# norm_hist=True makes distplot draw a density, not a count of patients,
# so the y axis is labelled accordingly.
y_axis_label = 'density'

# one figure per group (color=None keeps the default colour)
for group_title, group_frame, group_color in (
    ('Patients with ADHD', column_adhd, None),
    ('Healthy patients', column_h, 'orange'),
):
    figure(figsize = (11,8))
    sns.distplot(group_frame[col_name], bins=1000, color=group_color, norm_hist=True)
    plt.title(group_title)
    plt.xlabel(col_name)
    plt.ylabel(y_axis_label)
    plt.show()

# both groups on the same axes, with a legend to tell them apart
figure(figsize = (11,8))
sns.distplot(column_adhd[col_name], bins=1000, norm_hist=True, label='ADHD')
sns.distplot(column_h[col_name], bins=1000, color='orange', norm_hist=True, label='healthy')
plt.xlabel(col_name)
plt.ylabel(y_axis_label)
plt.legend()
plt.show()

In [None]:
# try to aggregate ratios
#
# Two-column frame: binary diagnosis flag and the summed ratios.
# .copy() gives an independent frame, so the assignment below does not write
# into a selection of eeg_clusters_c.
sum_ratios_vs_diag = eeg_clusters_c[['DX_01_Cat', 'sum_sig_ratios']].copy()

# 1 for 'Neurodevelopmental Disorders', 0 for everything else
# (vectorised replacement of the per-row .loc assignments)
sum_ratios_vs_diag['DX_01_Cat'] = (
    sum_ratios_vs_diag['DX_01_Cat'] == 'Neurodevelopmental Disorders'
).astype('int64')


print(sum_ratios_vs_diag)

# summed ratios of the flagged rows vs the remaining rows
flagged_rows = sum_ratios_vs_diag['DX_01_Cat'] == 1
s_ratios_adhd = sum_ratios_vs_diag.loc[flagged_rows, 'sum_sig_ratios'].values
s_ratios_h = sum_ratios_vs_diag.loc[~flagged_rows, 'sum_sig_ratios'].values

# mean and median of each group
print(s_ratios_adhd.mean(),np.median(s_ratios_adhd),  s_ratios_h.mean(), np.median(s_ratios_h))



In [None]:
# Sanity check: cross-validate a few tuned models again to see whether they
# always predict the most frequent class.
# NOTE(review): this cell reads the 'accuracy' entry of the search results,
# uses X_EEGxADHD / label_column / ADHD_SWAN as inputs and refers to
# bests_SWAN_EEG, while the modelling cells select 'balanced_accuracy', work on
# the *_train frames and store the SWAN + EEG search in bests_EEGSWAN --
# confirm that all these names are defined and intended.

from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

# --- EEG ---
best_model_svc = SVC()
best_model_svc.set_params(**bests_EEGxADHD['SVC']['accuracy']['parameters'])
best_model_xgb = XGBClassifier()
best_model_xgb.set_params(**bests_EEGxADHD['XGBClassifier']['accuracy']['parameters'])
best_model_et = ExtraTreesClassifier()
best_model_et.set_params(**bests_EEGxADHD['ExtraTreesClassifier']['accuracy']['parameters'])

best_models_list = [best_model_svc, best_model_xgb, best_model_et]

for model in best_models_list:
    scores = cv_one_class_classification(model, X_EEGxADHD, label_column, n_splits=5)

# --- SWAN + EEG + Sex + Age ---
best_model_svc = SVC()
best_model_svc.set_params(**bests_SWAN_EEG['SVC']['accuracy']['parameters'])
best_model_xgb = XGBClassifier()
best_model_xgb.set_params(**bests_SWAN_EEG['XGBClassifier']['accuracy']['parameters'])
best_model_et = ExtraTreesClassifier()
best_model_et.set_params(**bests_SWAN_EEG['ExtraTreesClassifier']['accuracy']['parameters'])

best_models_list = [best_model_svc, best_model_xgb, best_model_et]

for model in best_models_list:
    scores = cv_one_class_classification(model, ADHD_SWAN, label_column, n_splits=5)
    


The classifiers do not always predict the most frequent class, which means that the model is not 'ill-posed'.

## Predicting diagnosis vs no-diagnosis

We try to predict from EEG data whether a patient is healthy (no-diagnosis given, positive samples) or ill (diagnosis given, negative samples).

In [None]:

# Distinct values found in the DX_01_Cat column (a set, so in arbitrary order).
dx_category_values = eeg_clusters['DX_01_Cat'].values
cats = list(set(dx_category_values))

print(cats)


In [None]:
# Healthy/ill (HI) dataset: a copy of eeg_clusters without the rows whose
# category is 'No Diagnosis Given: Incomplete Eval'.

HI = eeg_clusters.copy()

# index labels of the incomplete evaluations
incomplete_eval = HI['DX_01_Cat'] == 'No Diagnosis Given: Incomplete Eval'
to_drop_inc_eval = list(HI.index[incomplete_eval])

HI = HI.drop(index=to_drop_inc_eval)


In [None]:
# dict for healthy/ill mapping
# 'No Diagnosis Given' maps to 1, every other category in cats maps to 0.
# Note that 'No Diagnosis Given' is removed from cats in place.
d = {'No Diagnosis Given': 1}
cats.remove('No Diagnosis Given')
d.update(dict.fromkeys(cats, 0))
d

In [None]:
# Turn DX_01_Cat into the binary label: categories are mapped through d,
# values without an entry in d become 0.
mapped_categories = HI['DX_01_Cat'].map(d)
HI['DX_01_Cat'] = mapped_categories.fillna(0).astype('int64')

# remove the other diagnosis-related columns
HI = HI.drop(columns=['DX_01_Sub', 'DX_01'])

# the mapped column is the label
HI = HI.rename(columns={'DX_01_Cat': 'label'})

HI

In [None]:
# Remove the Age column from HI in place, then show the dataframe.
del HI['Age']
HI

In [None]:
# Split HI into test / train rows depending on whether the row's id is listed
# in test_ids_l.
# .copy() makes both parts independent dataframes, so the in-place pop()
# calls below never act on a selection of HI (no chained-assignment ambiguity).
in_test_split = HI['id'].isin(test_ids_l)
HI_test = HI.loc[in_test_split].copy()
HI_train = HI.loc[~in_test_split].copy()

# pop() removes the column from the frame and returns it:
# after this, the train frame holds features only.
id_column_HI_train = HI_train.pop('id')
label_HI_train = HI_train.pop('label')

# same for the test frame
id_column_HI_test = HI_test.pop('id')
label_HI_test = HI_test.pop('label')

In [None]:
# Show the HI training features as the cell output.
HI_train

In [None]:
# PIPELINE 1
# Hyper-parameter search for each model class on the healthy/ill training
# features, imputing missing values with the median.
imp_strategy = 'median'

# model class name -> integer handed to one_class_classify_CV for that class
# (NOTE(review): looks like a per-model search budget -- confirm in its definition)
models = {
    'RidgeClassifier': 10,
    'LinearSVC': 20,
    'SVC': 20,
    'RandomForestClassifier': 20,
    'ExtraTreesClassifier': 20,
    'AdaBoostClassifier': 20,
    'XGBClassifier': 20,
}
results_HI_EEG, bests_HI_EEG = one_class_classify_CV(
    X=HI_train,
    y=label_HI_train,
    models=models,
    CV_n_splits=5,
    verbose=0,
    impute_strategy=imp_strategy,
)

# PIPELINE 2
# Cross-validate, for each model class, the parameters stored under the
# selected metric inside an impute -> scale -> model pipeline.
selected_metric = 'balanced_accuracy'

model_classes_dict = {
    'RidgeClassifier': RidgeClassifier(),
    'LinearSVC': LinearSVC(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
}

summary_template = (
    '\n\t balanced_accuracy:  mean {} \t median {}'
    '\n\t f1_score:  mean {} \t median {} \n\n'
)

for model_class_name, model_class in model_classes_dict.items():
    parameters = bests_HI_EEG[model_class_name][selected_metric]['parameters']
    # NOTE(review): presumably rewrites the parameter names so they address the
    # 'model_class' pipeline step -- confirm in make_pipe_model_params
    params = make_pipe_model_params(parameters)
    model = Pipeline(steps=[
        ('imputing', SimpleImputer(strategy=imp_strategy)),
        ('scaling', StandardScaler()),
        ('model_class', model_class),
    ])
    model.set_params(**params)
    results = cv_one_class_classification(model, HI_train, label_HI_train, n_splits=5)

    # mean and median of the per-fold scores
    b_acc_scores = np.array(results['balanced_accuracy'])
    f1_scores = np.array(results['f1_score'])
    b_acc_m, b_acc_med = b_acc_scores.mean(), np.median(b_acc_scores)
    f1_score_m, f1_score_med = f1_scores.mean(), np.median(f1_scores)

    print('\n\n', model_class_name,
          summary_template.format(b_acc_m, b_acc_med, f1_score_m, f1_score_med))
    
    

In [None]:
# PIPELINE 3
# Fit the manually selected model class on the healthy/ill training split
# and evaluate its predictions on the test split.
best_model_class = 'XGBClassifier'

best_model_parameters = bests_HI_EEG[best_model_class][selected_metric]['parameters']
best_model_params = make_pipe_model_params(best_model_parameters)
best_estimator = model_classes_dict[best_model_class]
best_model = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy=imp_strategy)),
    ('scaling', StandardScaler()),
    ('model_class', best_estimator),
])
best_model.set_params(**best_model_params)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
best_model_preds = best_model.fit(HI_train, label_HI_train).predict(HI_test)

best_model_test_b_accuracy = balanced_accuracy_score(label_HI_test, best_model_preds)
best_model_test_f1_score = f1_score(label_HI_test, best_model_preds)

test_report = '\n\n Best model results test set: \n balanced_accuracy {} \t f1_score {} \n'
print(test_report.format(best_model_test_b_accuracy, best_model_test_f1_score))

In [None]:

# try with randomized search and multiple metrics instead of my function (NOT WORKING, SLOW)
# The whole experiment is disabled by wrapping it in a triple-quoted string:
# nothing below is executed, the cell only evaluates the string itself.
# NOTE(review): if this is ever revived --
#   * RandomizedSearchCV is not imported inside the function nor in the
#     notebook's first import cell; confirm it is in scope;
#   * the nested scores are summarised with `.median()`, but cross_val_score
#     returns a NumPy array, which has no median method (np.median does that);
#   * every search is fitted once on the full data and then cloned and refitted
#     inside cross_val_score, which is likely part of why it is slow.
'''
def rando_search_model_class (X, y, model_class, model_class_name, param_grid, n_combinations):
    
    from sklearn.metrics import make_scorer
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.model_selection import cross_val_score
    
    inner_cv = KFold(n_splits=5, shuffle=True)   # no random state passed
    outer_cv = KFold(n_splits=5, shuffle=True)   # no random state passed
    
    scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score), 
           'recall': make_scorer(recall_score)}
    
    # accuracy re-fit and nested scoring
    rgs_acc = RandomizedSearchCV(estimator=model_class, param_distributions=param_grid, n_iter=n_combinations, scoring=scoring, cv=inner_cv, refit='accuracy')
    rgs_acc.fit(X,y)
    model_nested_score_acc = cross_val_score(rgs_acc, X=X, y=y, cv=outer_cv)
    print('DONE ACCURACY')

    # precision re-fit and nested scoring
    rgs_prc = RandomizedSearchCV(estimator=model_class, param_distributions=param_grid, n_iter=n_combinations, scoring=scoring, cv=inner_cv, refit='precision')
    rgs_prc.fit(X,y)
    model_nested_score_prc = cross_val_score(rgs_prc, X=X, y=y, cv=outer_cv)
    print('DONE PRECISION')

    # recall re-fit and nested scoring
    rgs_rec = RandomizedSearchCV(estimator=model_class, param_distributions=param_grid, n_iter=n_combinations, scoring=scoring, cv=inner_cv, refit='recall')
    rgs_rec.fit(X,y)
    model_nested_score_rec = cross_val_score(rgs_rec, X=X, y=y, cv=outer_cv)
    print('DONE RECALL')

    best_models = {
        'accuracy': rgs_acc, 
        'precision': rgs_prc, 
        'recall': rgs_rec
    }
    best_results = {
        'accuracy': [model_nested_score_acc.mean(), model_nested_score_acc.median()],
        'precision': [model_nested_score_prc.mean(), model_nested_score_prc.median()], 
        'recall': [model_nested_score_rec.mean(), model_nested_score_rec.median()]
    }

    return best_models, best_results
    
model_class = ExtraTreesClassifier()
model_class_name = 'ExtraTreesClassifier'

param_grid ={
    'n_estimators' : [200, 250, 300],
    'criterion' : ['gini', 'entropy'],
    'min_samples_split' : [2, 3, 4],
    'bootstrap' : [True, False],
    'class_weight' : ['balanced', 'balanced_subsample'],
    'verbose' : [0],
    'n_jobs' : [3]
    }

best_models,best_results = rando_search_model_class(X_swan_train,label_train, model_class,model_class_name,param_grid, 3 )
'''