### Définition des imports

In [3]:
import pandas as pd
import numpy as np
import random
from path import Path as path
from sklearn.model_selection import LeavePOut, cross_val_score, StratifiedKFold, permutation_test_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier as RF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC as SVM
from sklearn.tree import DecisionTreeClassifier
from numpy.random import permutation
from scipy.io import savemat, loadmat
from tools import *

### Chargement des données

In [4]:
# --- Analysis parameters -------------------------------------------------
rep_number = 100       # for unbalanced classes: number of bootstraps
n_permutations = 1000  # number of permutations for the permutation test

# --- Input / output locations --------------------------------------------
# NOTE(review): absolute, machine-specific path; anyone else running this
# notebook has to edit it before the cell can succeed.
data_path = path('/home/tarek/Documents/arthur/Lab2/Loubna')  # directory holding the data
save_path = data_path / '../results'  # directory where results are saved
if not save_path.isdir():  # create the results directory if it does not exist yet
    save_path.mkdir()

# --- Data ------------------------------------------------------------------
df = pd.read_csv(data_path / 'BD_17_11.csv')  # database loaded as a pandas DataFrame
subject_list = df['CODE']  # subjects are told apart by their code

# Préparation des conditions

## Conditions originales

In [9]:
''' A exécuter en deuxième à chaque fois'''
def CondNames(cond):
    '''Convert a numeric condition such as ((1, 2), 0) into readable group names.

    `cond` is a pair ``((code_a, code_b), code_c)`` of 'Type de Conversion'
    codes: 0 = Disease Free, 1 = Parkinson Disease, 2 = Dementia Lewy bodies,
    3 = Control.

    * First group: when both codes are equal the group is named after that
      code; when they differ the group gathers several diagnoses and is
      named 'Converted' (e.g. (1, 2) is Parkinson + Dementia).
    * Second group: named after its single code.

    Example: ((1, 2), 0) -> ('Converted', 'Disease Free').

    Returns:
        (name1, name2): names of the first and second group.

    Raises:
        ValueError: if a code has no known name. Previously this surfaced as
            an UnboundLocalError, e.g. for a second group equal to 1.
    '''
    code_names = {
        0: 'Disease Free',
        1: 'Parkinson Disease',
        2: 'Dementia Lewy bodies',
        3: 'Control',
    }

    def _name_of(code):
        # Single lookup shared by both groups so that they accept the same codes.
        try:
            return code_names[code]
        except (KeyError, TypeError):
            raise ValueError('Unknown condition code: %r' % (code,))

    # First group (index 0): identical codes -> a single diagnosis,
    # different codes -> the pooled "converted" subjects.
    if cond[0][0] == cond[0][1]:
        name1 = _name_of(cond[0][0])
    else:
        name1 = 'Converted'

    # Second group (index 1): always a single code.
    name2 = _name_of(cond[1])
    return name1, name2

def CreateLabels(dataset, cond):
    '''Split the rows of `dataset` into the two groups described by `cond`.

    The 'Type de Conversion' value of every row is compared with the codes in
    `cond` (``((code_a, code_b), code_c)``):

    * rows equal to code_a or code_b go to the first returned list,
    * rows equal to code_c go to the second returned list,
    * any other row is left out of both lists.

    Returns:
        (label0_index, label1_index): two lists of index labels of `dataset`.
    '''
    group0_rows, group1_rows = [], []
    for row_label, record in dataset.iterrows():
        conversion = record['Type de Conversion']
        # Pick the destination list first, then append once.
        destination = None
        if conversion == cond[0][0] or conversion == cond[0][1]:
            destination = group0_rows
        elif conversion == cond[1]:
            destination = group1_rows
        if destination is not None:
            destination.append(row_label)
    return group0_rows, group1_rows

# Conditions to analyse: comment / uncomment entries to add or remove one.
# Each entry is ((code_a, code_b), code_c) using the 'Type de Conversion' codes
# (0 = Disease Free, 1 = Parkinson, 2 = Dementia, 3 = Control).
conds_list = [
    # ((2, 2), (3)),  # Dementia vs Control
    ((1, 1), (2)),    # Parkinson vs Dementia
    # ((1, 1), (3)),  # PD vs Control
    # ((2, 2), (0)),  # Dementia vs DF
    # ((1, 1), (0)),  # Parkinson vs DF
    # ((1, 2), (0)),  # Conv vs DF
    # ((1, 2), (3)),  # Conv vs Control
    # ((0, 0), (3)),  # DF vs Control
]

# Columns handed to CleanDataset for removal.
columns_to_drop = ['Conversion', 'PDvsDLB', 'DLBvs Ctrl', 'Age']
MCI = False  # this cell configures the original (non-MCI) conditions
print('Tout est OK')

Tout est OK


## Conditions MCI

In [None]:
def ConditionsMCI(o):
    '''Return the group names and the conversion codes used by MCI analysis `o`.

    o == 0: 'RBDMCI' vs 'RBDnoMCI', codes [0, 1, 2]
    o == 1: 'RBD' vs 'Control', codes [0, 1, 2, 3]
    o == 2: 'RBDnoMCI + Control' vs 'RBDMCI', codes [0, 1, 2, 3]

    Any other value yields empty names and an empty code list. Previously the
    names defaulted to '' but `conditions` was left unbound, so the function
    raised UnboundLocalError instead of returning.

    Returns:
        ((name1, name2), conditions): the two names and a fresh list of codes.
    '''
    # NOTE(review): for o == 2 the first name is the non-MCI group, whereas
    # CreateLabelsMCI puts 'MCI au T1' == 1 rows in its first list for both
    # o == 0 and o == 2. Confirm the name order against how callers index it.
    settings = {
        0: (('RBDMCI', 'RBDnoMCI'), (0, 1, 2)),
        1: (('RBD', 'Control'), (0, 1, 2, 3)),
        2: (('RBDnoMCI + Control', 'RBDMCI'), (0, 1, 2, 3)),
    }
    try:
        names, codes = settings.get(o, (('', ''), ()))
    except TypeError:
        # Unhashable `o`: treat it like any other unknown analysis index.
        names, codes = ('', ''), ()
    # A new list on every call, as before, so callers may mutate it freely.
    return names, list(codes)

def CreateLabelsMCI(dataset, o):
    '''Split the rows of `dataset` into two groups for MCI analysis `o`.

    * o in (0, 2): rows are split on the 'MCI au T1' column (converted to
      float): value 1 -> first list, value 0 -> second list.
    * o == 1: rows are split on 'Type de Conversion': 0, 1 or 2 -> first
      list, 3 -> second list.

    Rows matching neither case are left out of both lists. `dataset` itself
    is never modified. The previous version called `dataset.drop(index, 0)`
    for such rows, but that only rebound a local variable (no effect on the
    caller or on the iteration) and the positional axis argument raises a
    TypeError on pandas >= 2.0.

    Returns:
        (label0_index, label1_index): two lists of index labels of `dataset`.
    '''
    label0_index = []
    label1_index = []

    if o in (0, 2):
        # Only this mode needs 'MCI au T1', so it is read (and converted) here.
        for index, value in dataset['MCI au T1'].items():
            mci_status = float(value)
            if mci_status == 1:
                label0_index.append(index)
            elif mci_status == 0:
                label1_index.append(index)
    elif o == 1:
        for index, conversion in dataset['Type de Conversion'].items():
            if conversion in (0, 1, 2):
                label0_index.append(index)
            elif conversion == 3:
                label1_index.append(index)

    return label0_index, label1_index

# MCI configuration: running this cell overrides the `conds_list`,
# `columns_to_drop` and `MCI` values set by the original-conditions cell.
MCI = True

# Placeholder entries; presumably only the number of entries matters
# (one per MCI analysis index) -- TODO confirm against the MCI loop.
conds_list = [
    'trois',
    'trucs',
    'random',
]

columns_to_drop = [
    'ss-type MCI T1 (DxBrain)',
    'RBD_MCI single/multiple domain',
]

# Classification with selected features

In [10]:
if __name__ == '__main__':
    # Permutation-tested classification on a hand-picked feature set.
    # 'Type de Conversion' codes for the Parkinson / Dementia classification:
    # DLB=2 PD=1 Normal=0 Control=3
    # `o` is the index of the feature list in `columns_to_keep` used for the
    # current condition; it is incremented once per condition.
    o=0
#     columns_to_keep = [['TrailBris'] + ['Empanpon', 'MCI au T1', 'UPDRS3'],
#                        ['Educ', 'Stroop43err','TrailBris'],
#                        ['MCI au T1'],
#                        ['TrailBris'],
#                        ['ReyTot', 'ReyRI', 'Age'] + ['ReyB', 'EmpanIndir'],
#                        ['Educ', 'Sex']]

#     columns_to_keep = [['TrailBris', 'Empanpon' ]] # change the features here

    rep_number = 5          # number of balanced random subsets requested below
    n_permutations = 1000   # number of label permutations
    columns_to_keep = [['TrailBris', 'ReyTot']] # change the features here
    conds_list = [conds_list[0]] # select which condition of conds_list to run

    for cond in conds_list:
        names = CondNames(cond)

        # Keep only the chosen features plus the two bookkeeping columns.
        # A single drop with an explicit `axis=1` replaces the former
        # per-column `drop(column, 1)` wrapped in a bare `except:`; on
        # pandas >= 2.0 the positional axis raises TypeError, which the bare
        # except swallowed, so no column was actually removed.
        columns_to_keep[o] += ['CODE', 'Type de Conversion']
        unwanted_columns = [column for column in df.columns
                            if column not in columns_to_keep[o]]
        dataset = df.drop(unwanted_columns, axis=1)

        # SelectSubjects / CleanDataset / FindMinorClass /
        # CreateRandomBalancedDataset come from `tools`; their exact
        # behaviour is not visible here.
        conditions = [cond[0][0], cond[0][1], cond[1]]
        dataset = SelectSubjects(dataset, conditions)
        dataset, dropped_columns, dropped_subjects = CleanDataset(dataset, columns_to_drop)
        label0_index, label1_index = CreateLabels(dataset, cond)
        # The label column must not be available to the classifier.
        dataset = dataset.drop('Type de Conversion', axis=1)

        kept_features = list(dataset.columns)

        print('\n%s features were dropped. Kept features :' % len(dropped_columns))
        print(kept_features)
        print(len(dropped_subjects), 'subjects were dropped :')
        print(dropped_subjects)

        m_class, M_class, m_class_index, M_class_index = FindMinorClass(label0_index, label1_index)
        nb_minority_class = len(m_class_index)

        print('%i %s vs %i %s' % (nb_minority_class, names[m_class], len(M_class_index), names[M_class]))
        if nb_minority_class > 4:
            print("Il y a %i features et %i sujets." % (dataset.shape[1], len(dataset)))
            file_name = '%svs%s_with_selfeatures.mat' % (names[m_class], names[M_class])
            file_path = save_path / file_name
            # NOTE(review): the cache key is only the pair of group names. A
            # cached file computed with other features, `rep_number` or
            # `n_permutations` is reused as-is and then sliced with the
            # current `rep_number` below.
            if not file_path.isfile():
                # True labels, in the row order built below:
                # majority-class rows first, minority-class rows second.
                labels = [M_class]*nb_minority_class + [m_class]*nb_minority_class
                labels = np.asarray(labels, dtype=int)

                number_of_folds = int(nb_minority_class/2)  # equivalent to a stratified leave-4-subjects-out
                cv = StratifiedKFold(n_splits=number_of_folds, shuffle=True)
                clf_choice = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=200, learning_rate=1)

                random_sets = CreateRandomBalancedDataset(dataset, m_class_index, M_class_index, rep_number)

                # The balanced feature matrices do not depend on the label
                # permutation, so they are built once instead of once per
                # permutation.
                balanced_data = [
                    np.asarray(pd.concat([dataset.loc[my_set], dataset.loc[m_class_index]]))
                    for my_set in random_sets
                ]

                # Round 0 scores the true labels; each later round scores one
                # random permutation of the labels on every balanced subset.
                accuracies = []
                for perm in range(n_permutations+1):
                    for data in balanced_data:
                        accuracies.append(cross_val_score(clf_choice, X=data, y=labels, cv=cv, n_jobs=-1).mean())

                    labels = permutation(labels)
                donnees = {'data':accuracies}
                savemat(file_path, donnees)

            else:
                accuracies = loadmat(file_path)['data'].ravel()

            # The first `rep_number` scores are taken as the true-label scores
            # and the rest as the permutation scores (assumes `random_sets`
            # holds `rep_number` subsets -- TODO confirm).
            accuracies = np.asarray(accuracies, dtype=float).ravel()
            observed_scores = accuracies[:rep_number]
            permuted_scores = accuracies[rep_number:]
            observed_mean = np.mean(observed_scores)
            # Fraction of permutation scores strictly above the observed mean.
            # NOTE(review): this estimate can be exactly 0; the usual
            # permutation p-value is (count + 1) / (n + 1).
            pvalue = np.sum(permuted_scores > observed_mean) / (n_permutations*rep_number)

            print('%0.2f (+/-%0.2f) significatif a p=%0.4f\n' % (observed_mean, np.std(observed_scores), pvalue))

        else:
            print('Not enough subjects to perform classification\n')
        o += 1


1 features were dropped. Kept features :
['TrailBris', 'ReyTot']
0 subjects were dropped :
[]
16 Dementia Lewy bodies vs 18 Parkinson Disease
Il y a 2 features et 34 sujets.


KeyboardInterrupt: 