In [1]:
import pandas as pd
import numpy as np
import scipy.io as sio
from scipy import signal

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold, GridSearchCV  , StratifiedGroupKFold, cross_validate
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor, MLPClassifier

from random import sample, shuffle
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone
from copy import deepcopy

import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
dataset_path_root = "/home/bruno/Academico/Doctorado/Neuro_Fairness/Shu_Dataset/004_Dataset_6_Random_Feature_Extraction/"
participants=["sub-001","sub-002","sub-003","sub-004","sub-005",
              "sub-006","sub-007","sub-008","sub-009","sub-010",
              "sub-011","sub-012","sub-013","sub-014","sub-015",
              "sub-016","sub-017","sub-018","sub-019","sub-020",
              "sub-021","sub-022","sub-023","sub-024","sub-025"]
sessions = ["ses-01","ses-02","ses-03","ses-04","ses-05"]

In [3]:
dataset={}
for participant in participants:
    dataset[participant]={}
    data_path=participant+"_task_motorimagery_eeg_preprocessing_random_extractor_feature.mat"
    data=sio.loadmat(dataset_path_root + data_path)
    for session in sessions:
        dataset[participant][session +'_data_random']=data[session +'_data_random']
        dataset[participant][session +'_labels_trials']=data[session +'_labels_trials']
    dataset[participant]['sfreq']=np.squeeze(data['sfreq'])
    dataset[participant]['age']=np.squeeze(data['age'])
    dataset[participant]['gender']=data['gender'][0]
    dataset[participant]['group_medidator']=data['group_medidator'][0]
    dataset[participant]['id_participant']=data['id_participant'][0]

In [4]:
index_female = []
index_male = []
for participant in participants:
    if dataset[participant]['gender'] == 'M':
        index_male.append(participant)
    elif dataset[participant]['gender'] == 'F':
        index_female.append(participant)
print(f"Participantes hombres: {index_male}")
print(f"Participantes mujeres: {index_female}")

Participantes hombres: ['sub-001', 'sub-002', 'sub-008', 'sub-012', 'sub-013', 'sub-015', 'sub-017', 'sub-018', 'sub-019', 'sub-021', 'sub-022', 'sub-023', 'sub-025']
Participantes mujeres: ['sub-003', 'sub-004', 'sub-005', 'sub-006', 'sub-007', 'sub-009', 'sub-010', 'sub-011', 'sub-014', 'sub-016', 'sub-020', 'sub-024']


In [5]:
for participant in participants:
    data_ = np.concatenate((dataset[participant]['ses-01_data_random'],
                            dataset[participant]['ses-02_data_random'],
                            dataset[participant]['ses-03_data_random'],
                            dataset[participant]['ses-04_data_random'],
                            dataset[participant]['ses-05_data_random']),axis=0)
    
    dataset[participant]['data_random'] = data_
    dataset[participant]['data_gender'] =  np.array(list(dataset[participant]['gender']) * data_.shape[0])
    dataset[participant]['group_participant'] =  np.array(list([participant]) * data_.shape[0])

In [6]:
n_features = data_.shape[-1]
N_it = 20
n_test_participant = 2
n_val_participant = 2 
n_ign_participant = 1

max_iter = 200
patience = 50 
info_exp = {}
info_clfs = {}
for it in range(N_it):
    dic_aux = {}
    
    X_train = None
    X_val = None
    X_test = None 
    
    X_train_ = None
    X_val_ = None
    X_test_ = None
    
    idx_male = index_male.copy()
    idx_female = index_female.copy()
    
    # TEST PARTICIPANTS:
    idx_male_test = sample(idx_male, n_test_participant)
    idx_female_test = sample(idx_female, n_test_participant)
    for it_ in range(n_test_participant):
        idx_male.remove(idx_male_test[it_])
        idx_female.remove(idx_female_test[it_])
    idx_test = idx_male_test + idx_female_test
    dic_aux['reg_idx_test'] = idx_test   
    
    # VALIDATION PARTICIPANTS:
    idx_male_val = sample(idx_male, n_val_participant)
    idx_female_val = sample(idx_female, n_val_participant)
    for it_ in range(n_val_participant):
        idx_male.remove(idx_male_val[it_])
        idx_female.remove(idx_female_val[it_])
    idx_val = idx_male_val + idx_female_val
    dic_aux['reg_idx_val'] = idx_val  
    
    # TRAIN PARTICIPANTS:
    idx_male_ignore = sample(idx_male, n_ign_participant)
    for it_ in range(n_ign_participant):
        idx_male.remove(idx_male_ignore[it_])
    idx_male_train = idx_male.copy()
    idx_female_train = idx_female.copy()
    idx_train = idx_male_train + idx_female_train
    dic_aux['reg_idx_train'] = idx_train    
        
    
    # CONCATENAMOS EL CONJUNTO DE DATOS
    # TEST
    X_test = np.zeros((1,n_features))
    Y_test = np.zeros(1)
    for participant in idx_test:
        X_test = np.concatenate((X_test, dataset[participant]['data_random']),axis=0)
        Y_test = np.concatenate((Y_test, dataset[participant]['data_gender']),axis=0)
    X_test = X_test[1:,:]
    Y_test = Y_test[1:]
    dic_aux['n_trials_test'] = {'male':np.sum(Y_test == 'M'),'female':np.sum(Y_test == 'F')}
    dic_aux['proportion_trials_test'] = {'male':np.sum(Y_test == 'M')/(np.sum(Y_test == 'M')+np.sum(Y_test == 'F')),
                                      'female':np.sum(Y_test == 'F')/(np.sum(Y_test == 'M')+np.sum(Y_test == 'F'))}
    # VALIDATION
    X_val = np.zeros((1,n_features))
    Y_val = np.zeros(1)
    for participant in idx_val:
        X_val = np.concatenate((X_val, dataset[participant]['data_random']),axis=0)
        Y_val = np.concatenate((Y_val, dataset[participant]['data_gender']),axis=0)
    X_val = X_val[1:,:]
    Y_val = Y_val[1:]
    dic_aux['n_trials_val'] = {'male':np.sum(Y_val == 'M'),'female':np.sum(Y_val == 'F')}
    dic_aux['proportion_trials_val'] = {'male':np.sum(Y_val == 'M')/(np.sum(Y_val == 'M')+np.sum(Y_val == 'F')),
                                      'female':np.sum(Y_val == 'F')/(np.sum(Y_val == 'M')+np.sum(Y_val == 'F'))}
        
    # TRAIN
    X_train = np.zeros((1,n_features))
    Y_train = np.zeros(1)
    for participant in idx_train:
        X_train = np.concatenate((X_train, dataset[participant]['data_random']),axis=0)
        Y_train = np.concatenate((Y_train, dataset[participant]['data_gender']),axis=0)
    X_train = X_train[1:,:]
    Y_train = Y_train[1:]
    dic_aux['n_trials_train'] = {'male':np.sum(Y_train == 'M'),'female':np.sum(Y_train == 'F')}
    dic_aux['proportion_trials_train'] = {'male':np.sum(Y_train == 'M')/(np.sum(Y_train == 'M')+np.sum(Y_train == 'F')),
                                      'female':np.sum(Y_train == 'F')/(np.sum(Y_train == 'M')+np.sum(Y_train == 'F'))}   
    
    info_exp[f'it_{it}']=dic_aux
    dic_aux={}
    
    #-----------------------CLASSIFIER 1 -----------------------------#
    acc_train = None
    acc_val = None
    acc_test = None
    
    tol = 1e-4
    scaler = StandardScaler()
    clf = LinearDiscriminantAnalysis(tol=tol)
    
    X_train_ = scaler.fit_transform(X_train, Y_train)
    X_val_ = scaler.transform(X_val)
    X_test_ = scaler.transform(X_test)
    
    
    clf.fit(X_train_, Y_train)
    acc_train = clf.score(X_train_ , Y_train)
    acc_val = clf.score(X_val_ , Y_val)
    acc_test = clf.score(X_test_, Y_test)
    
    dic_aux['clf_1']={'estimator': clf, 'acc_train': acc_train, 'acc_val' : acc_val, 'acc_test' : acc_test}
    
    #-----------------------CLASSIFIER 2 -----------------------------#
    acc_train = None
    acc_val = None
    acc_test = None
    
    tol = 1e-4
    scaler = StandardScaler()
    clf = svm.SVC(C=1.0, kernel='linear', tol=tol)
    
    X_train_ = scaler.fit_transform(X_train, Y_train)
    X_val_ = scaler.transform(X_val)
    X_test_ = scaler.transform(X_test)
    
    
    clf.fit(X_train_, Y_train)
    acc_train = clf.score(X_train_ , Y_train)
    acc_val = clf.score(X_val_ , Y_val)
    acc_test = clf.score(X_test_, Y_test)
    
    dic_aux['clf_2']={'estimator': clf, 'acc_train': acc_train, 'acc_val' : acc_val, 'acc_test' : acc_test}
    
    #-----------------------CLASSIFIER 3 -----------------------------#
    acc_train = None
    acc_val = None
    acc_test = None
    
    tol = 1e-4
    scaler = StandardScaler()
    clf = svm.SVC(C=1.0, kernel='rbf', tol=tol)
    
    X_train_ = scaler.fit_transform(X_train, Y_train)
    X_val_ = scaler.transform(X_val)
    X_test_ = scaler.transform(X_test)
    
    
    clf.fit(X_train_, Y_train)
    acc_train = clf.score(X_train_ , Y_train)
    acc_val = clf.score(X_val_ , Y_val)
    acc_test = clf.score(X_test_, Y_test)
    
    dic_aux['clf_3']={'estimator': clf, 'acc_train': acc_train, 'acc_val' : acc_val, 'acc_test' : acc_test}
    
    
    #-----------------------CLASSIFIER 4 -----------------------------#
    acc_train_scores = []
    acc_val_scores = []
    acc_train = 0
    acc_val = 0
    acc_test = 0
    
    # Counter for patience
    it_patience = 0
    
    # Best classifier in early stopping
    best_clf = None
    it_stop = 0 
    
    scaler = StandardScaler()
    clf = MLPClassifier(hidden_layer_sizes=(10,8,5), activation='relu', solver='adam', alpha=0.0001,
                        learning_rate='constant', learning_rate_init=0.01,max_iter=max_iter)
    classes = np.unique(Y_train)

    for it_ in range(max_iter):
        # Scaler fit/transform
        scaler.partial_fit(X_train, Y_train)
        X_train_ = scaler.transform(X_train)
        X_val_ = scaler.transform(X_val)
        
        # Classifier fit / evaluate
        clf.partial_fit(X_train_,Y_train,classes=classes)
        acc_train_scores.append(clf.score(X_train_,Y_train))
        acc_val_scores.append(clf.score(X_val_,Y_val))
        
        if acc_val_scores[-1] >= acc_val:
            acc_val = acc_val_scores[-1]
            best_clf = deepcopy(clf)
            it_stop = it_
            it_patience = 0
        else: 
            it_patience += 1
        
        if it_patience >= patience:
            break
        
    X_test_ = scaler.transform(X_test)    
    acc_test = best_clf.score(X_test_,Y_test)
    acc_train = acc_train_scores[it_stop] 
     
    dic_aux['clf_4']={'estimator': clf, 'acc_train': acc_train, 'acc_val' : acc_val, 'acc_test' : acc_test,
                      'acc_train_scores': acc_train_scores, 'acc_val_scores': acc_val_scores, 'it_stop':it_stop}
    
    #-----------------------CLASSIFIER 5 -----------------------------#
    acc_train_scores = []
    acc_val_scores = []
    acc_train = 0
    acc_val = 0
    acc_test = 0
    
    # Counter for patience
    it_patience = 0
    
    # Best classifier in early stopping
    best_clf = None
    it_stop = 0 
    
    scaler = StandardScaler()
    clf = MLPClassifier(hidden_layer_sizes=(10,8,3), activation='relu', solver='adam', alpha=0.0001,
                        learning_rate='constant', learning_rate_init=0.01,max_iter=max_iter)
    classes = np.unique(Y_train)

    for it_ in range(max_iter):
        # Scaler fit/transform
        scaler.partial_fit(X_train, Y_train)
        X_train_ = scaler.transform(X_train)
        X_val_ = scaler.transform(X_val)
        
        # Classifier fit / evaluate
        clf.partial_fit(X_train_,Y_train,classes=classes)
        acc_train_scores.append(clf.score(X_train_,Y_train))
        acc_val_scores.append(clf.score(X_val_,Y_val))
        
        if acc_val_scores[-1] >= acc_val:
            acc_val = acc_val_scores[-1]
            best_clf = deepcopy(clf)
            it_stop = it_
            it_patience = 0
        else: 
            it_patience += 1
        
        if it_patience >= patience:
            break
        
    X_test_ = scaler.transform(X_test)    
    acc_test = best_clf.score(X_test_,Y_test)
    acc_train = acc_train_scores[it_stop] 
     
    dic_aux['clf_5']={'estimator': clf, 'acc_train': acc_train, 'acc_val' : acc_val, 'acc_test' : acc_test,
                      'acc_train_scores': acc_train_scores, 'acc_val_scores': acc_val_scores, 'it_stop':it_stop}
    
    #-----------------------CLASSIFIER 6 -----------------------------#
    acc_train_scores = []
    acc_val_scores = []
    acc_train = 0
    acc_val = 0
    acc_test = 0
    
    # Counter for patience
    it_patience = 0
    
    # Best classifier in early stopping
    best_clf = None
    it_stop = 0 
    
    scaler = StandardScaler()
    clf = MLPClassifier(hidden_layer_sizes=(10,4,2), activation='relu', solver='adam', alpha=0.0001,
                        learning_rate='constant', learning_rate_init=0.01,max_iter=max_iter)
    classes = np.unique(Y_train)

    for it_ in range(max_iter):
        # Scaler fit/transform
        scaler.partial_fit(X_train, Y_train)
        X_train_ = scaler.transform(X_train)
        X_val_ = scaler.transform(X_val)
        
        # Classifier fit / evaluate
        clf.partial_fit(X_train_,Y_train,classes=classes)
        acc_train_scores.append(clf.score(X_train_,Y_train))
        acc_val_scores.append(clf.score(X_val_,Y_val))
        
        if acc_val_scores[-1] >= acc_val:
            acc_val = acc_val_scores[-1]
            best_clf = deepcopy(clf)
            it_stop = it_
            it_patience = 0
        else: 
            it_patience += 1
        
        if it_patience >= patience:
            break
        
    X_test_ = scaler.transform(X_test)    
    acc_test = best_clf.score(X_test_,Y_test)
    acc_train = acc_train_scores[it_stop] 
     
    dic_aux['clf_6']={'estimator': clf, 'acc_train': acc_train, 'acc_val' : acc_val, 'acc_test' : acc_test,
                      'acc_train_scores': acc_train_scores, 'acc_val_scores': acc_val_scores, 'it_stop':it_stop}
    
    info_clfs[f'it_{it}']=dic_aux

In [7]:
#Generamos la tabla de los acc para cada iteración y clasificador
n_clfs = 6

matrix_acc_train = np.zeros((n_clfs,N_it))
matrix_acc_val = np.zeros((n_clfs,N_it))
matrix_acc_test = np.zeros((n_clfs,N_it))

for it0 in range(N_it):
    for it1 in range(n_clfs):
        matrix_acc_train[it1,it0] = info_clfs[f'it_{it0}'][f'clf_{it1+1}']['acc_train']
        matrix_acc_val[it1,it0] = info_clfs[f'it_{it0}'][f'clf_{it1+1}']['acc_val']
        matrix_acc_test[it1,it0] = info_clfs[f'it_{it0}'][f'clf_{it1+1}']['acc_test']

acc_train_clfs_mean = matrix_acc_train.mean(axis=1,keepdims=True)
acc_train_clfs_std = np.std(matrix_acc_train,axis=1,keepdims=True)

acc_val_clfs_mean = matrix_acc_val.mean(axis=1,keepdims=True)
acc_val_clfs_std = np.std(matrix_acc_val,axis=1,keepdims=True)

acc_test_clfs_mean = matrix_acc_test.mean(axis=1,keepdims=True)
acc_test_clfs_std = np.std(matrix_acc_test,axis=1,keepdims=True)

In [8]:
accs_train_df = pd.DataFrame(matrix_acc_train,index=['clf_1','clf_2','clf_3','clf_4','clf_5','clf_6'])
accs_val_df = pd.DataFrame(matrix_acc_val,index=['clf_1','clf_2','clf_3','clf_4','clf_5','clf_6'])
accs_test_df = pd.DataFrame(matrix_acc_test,index=['clf_1','clf_2','clf_3','clf_4','clf_5','clf_6'])
accs_test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
clf_1,0.552008,0.423536,0.562338,0.531599,0.466838,0.483299,0.550155,0.490206,0.53057,0.525494,0.524277,0.592344,0.48488,0.531892,0.466528,0.489902,0.560606,0.440382,0.414545,0.56112
clf_2,0.523687,0.475894,0.535954,0.5077,0.506427,0.472338,0.519131,0.472737,0.52487,0.49948,0.540289,0.508536,0.470015,0.496574,0.528282,0.480062,0.514629,0.492316,0.466494,0.556812
clf_3,0.558187,0.438569,0.538541,0.513542,0.477121,0.48643,0.515512,0.454738,0.537824,0.498959,0.482955,0.528195,0.48488,0.508171,0.415153,0.482651,0.472832,0.447271,0.433247,0.561659
clf_4,0.562822,0.50959,0.531816,0.481678,0.46581,0.478601,0.515512,0.465326,0.529534,0.529657,0.493285,0.555096,0.49308,0.493938,0.429165,0.483687,0.492685,0.45257,0.392208,0.508347
clf_5,0.556643,0.451011,0.53492,0.50239,0.459126,0.473904,0.523268,0.511911,0.531606,0.522893,0.477273,0.559234,0.475654,0.542963,0.423975,0.520974,0.507837,0.417594,0.416104,0.533118
clf_6,0.546859,0.497667,0.52716,0.485396,0.475064,0.477035,0.513961,0.511911,0.515544,0.526015,0.485021,0.508536,0.501281,0.508698,0.503373,0.481098,0.530825,0.434022,0.383377,0.491653


In [9]:
results_df = pd.DataFrame(np.concatenate((acc_train_clfs_mean,acc_train_clfs_std,
                                          acc_val_clfs_mean,acc_val_clfs_std,
                                          acc_test_clfs_mean,acc_test_clfs_std),axis=1),
                          columns = ['TRAIN MEAN', 'TRAIN STD','VAL MEAN', 'VAL STD','TEST MEAN', 'TEST STD']
                          ,index=['clf_1','clf_2','clf_3','clf_4','clf_5','clf_6'])
results_df

Unnamed: 0,TRAIN MEAN,TRAIN STD,VAL MEAN,VAL STD,TEST MEAN,TEST STD
clf_1,0.530961,0.016044,0.507276,0.046478,0.509126,0.048578
clf_2,0.520984,0.014518,0.503293,0.028143,0.504611,0.025526
clf_3,0.555641,0.010357,0.492028,0.034115,0.491822,0.040359
clf_4,0.542257,0.021659,0.521478,0.026976,0.49322,0.039831
clf_5,0.540794,0.021161,0.520173,0.03423,0.49712,0.0441
clf_6,0.525154,0.022135,0.521308,0.026734,0.495225,0.035226


In [10]:
from scipy.stats import ks_1samp
from scipy.stats import ttest_1samp
from scipy.stats import wilcoxon
from scipy import stats

In [11]:
alpha = 0.05
alpha_bonferroni = alpha / 6

test_norm = pd.DataFrame(columns=['statistics','p-value','H0'])
clf_names = accs_test_df.index.values.tolist()

for clf in clf_names: 
    acc_test = accs_test_df.loc[clf]
    norm_test = ks_1samp(acc_test,stats.norm.cdf)
    if norm_test.pvalue <= alpha_bonferroni:
        test_norm.loc[clf]=[norm_test.statistic,norm_test.pvalue,False]
    else:
        test_norm.loc[clf]=[norm_test.statistic,norm_test.pvalue,True]

print(f'Significancia: {alpha_bonferroni:.2}')
print(f'Intervalo de confianza: {100*(1-alpha_bonferroni)}')
test_norm

Significancia: 0.0083
Intervalo de confianza: 99.16666666666667


Unnamed: 0,statistics,p-value,H0
clf_1,0.660763,4.199124e-09,False
clf_2,0.679569,1.105249e-09,False
clf_3,0.660985,4.134972e-09,False
clf_4,0.652548,7.368066e-09,False
clf_5,0.661333,4.036507e-09,False
clf_6,0.64928,9.183885e-09,False


In [12]:
mu = 0.5
test = pd.DataFrame(columns=['statics','p-value','H0'])

for clf in clf_names: 
    acc_test = accs_test_df.loc[clf]
    w_test = wilcoxon(acc_test-mu)
    if w_test.pvalue <= alpha_bonferroni:
        test.loc[clf]=[w_test.statistic,w_test.pvalue,False]
    else: 
        test.loc[clf]=[w_test.statistic,w_test.pvalue,True]

print(f'Significancia: {alpha_bonferroni:.2}')
print(f'Intervalo de confianza: {100*(1-alpha_bonferroni)}')
test

Significancia: 0.0083
Intervalo de confianza: 99.16666666666667


Unnamed: 0,statics,p-value,H0
clf_1,82.0,0.409098,True
clf_2,87.0,0.521673,True
clf_3,81.0,0.388376,True
clf_4,89.0,0.570597,True
clf_5,103.0,0.956329,True
clf_6,103.0,0.956329,True
