### Définition des imports

In [None]:
import pandas as pd
import numpy as np
import random
from path import Path as path
from sklearn.model_selection import LeavePOut, cross_val_score, StratifiedKFold, permutation_test_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier as RF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC as SVM
from sklearn.tree import DecisionTreeClassifier
from numpy.random import permutation
from scipy.io import savemat, loadmat
from tools import *

### Chargement des données

In [2]:
# Parameter definitions
data_path = path('/home/tarek/Documents/arthur/Lab2/Loubna') # directory that holds the input data
save_path = data_path / '../results' # directory where results are going to be saved
if not save_path.isdir(): # create the save directory if it does not exist yet
    save_path.mkdir()

df = pd.read_csv(data_path / 'BD_17_11.csv') # load the data as a pandas DataFrame

rep_number = 100 # for unbalanced classes: number of bootstraps
n_permutations = 1000 # for the permutation test
subject_list = df['CODE'] # subjects are told apart in the database by their code

# Préparation des conditions

## Conditions originales

In [3]:
''' A exécuter en deuxième à chaque fois'''
def CondNames(cond):
    """Convert a numeric condition into two human-readable group names.

    ``cond`` has the form ``((a, b), c)``, e.g. ``((1, 2), 0)``. Codes map to
    names as follows: 0 -> 'Disease Free', 1 -> 'Parkinson Disease',
    2 -> 'Dementia Lewy bodies', 3 -> 'Control'.

    * First group ``(a, b)``: when both codes are equal the group is named
      after that code; when they differ the group mixes several diagnoses and
      is named 'Converted'.
    * Second group ``c``: named after its code.

    Example: ``((1, 2), 0)`` gives ``('Converted', 'Disease Free')``.

    Returns:
        tuple: ``(name1, name2)``, the names of the first and second group.

    Raises:
        ValueError: if a code that has to be named is not one of 0, 1, 2, 3.
    """
    # One table for both groups, so every code is accepted on either side.
    code_names = {
        0: 'Disease Free',
        1: 'Parkinson Disease',
        2: 'Dementia Lewy bodies',
        3: 'Control',
    }

    first_a, first_b = cond[0][0], cond[0][1]
    second = cond[1]

    if first_a != first_b:
        # Two different codes in the first group: we study the "converted".
        name1 = 'Converted'
    elif first_a in code_names:
        name1 = code_names[first_a]
    else:
        raise ValueError('Unknown condition code in first group: %r' % (first_a,))

    if second not in code_names:
        raise ValueError('Unknown condition code in second group: %r' % (second,))
    name2 = code_names[second]

    return name1, name2

def CreateLabels(dataset, cond):
    """Split the row labels of ``dataset`` into the two groups of ``cond``.

    Each row is classified by its 'Type de Conversion' value: rows equal to
    ``cond[0][0]`` or ``cond[0][1]`` go to the first list, rows equal to
    ``cond[1]`` go to the second list, and all other rows are ignored.

    Returns:
        tuple: ``(label0_index, label1_index)``, two lists of row labels.
    """
    buckets = ([], [])
    for row_label, row in dataset.iterrows():
        code = row['Type de Conversion']
        if code == cond[0][0] or code == cond[0][1]:
            buckets[0].append(row_label)
            continue
        if code == cond[1]:
            buckets[1].append(row_label)
    first_group, second_group = buckets
    return first_group, second_group

# The conditions are defined here: add or remove conditions as needed.
# Each entry has the form ((a, b), c); the codes are DLB=2, PD=1, Disease Free=0, Control=3.
conds_list = [#((2, 2), (3)), #  Dementia vs Control
              ((1, 1), (2))] #  Parkinson vs Dementia
#               ((1, 1), (3)), #  PD vs Control
#               ((2, 2), (0)), #  Dementia vs DF
#               ((1, 1), (0)), #  Parkinson vs DF
#               ((1, 2), (0)), #  Conv vs DF
#               ((1, 2), (3)), #  Conv vs Control
#               ((0, 0), (3))] #  DF vs Control

# columns_to_drop = ['Conversion', 'PDvsDLB', 'DLBvs Ctrl', 'Type de Conversion']
columns_to_drop = ['Age']  # handed to CleanDataset by the analysis cells
MCI = False  # the analysis cells use CondNames / CreateLabels when this is False
print('Tout est OK')

Tout est OK


## Conditions MCI

In [None]:
def ConditionsMCI(o):
    """Return the group names and the subject codes for MCI option ``o``.

    Supported options:

    * 0: 'RBDMCI' vs 'RBDnoMCI', codes ``[0, 1, 2]``
    * 1: 'RBD' vs 'Control', codes ``[0, 1, 2, 3]``
    * 2: 'RBDnoMCI + Control' vs 'RBDMCI', codes ``[0, 1, 2, 3]``

    NOTE(review): for ``o == 2`` CreateLabelsMCI puts rows whose 'MCI au T1'
    equals 1 in label 0, while the first name returned here is
    'RBDnoMCI + Control' - confirm that this name order is intended.

    Returns:
        tuple: ``((name1, name2), conditions)`` where ``conditions`` is a
        new list on every call, so callers may modify it freely.

    Raises:
        ValueError: if ``o`` is not 0, 1 or 2.
    """
    options = {
        0: (('RBDMCI', 'RBDnoMCI'), (0, 1, 2)),
        1: (('RBD', 'Control'), (0, 1, 2, 3)),
        2: (('RBDnoMCI + Control', 'RBDMCI'), (0, 1, 2, 3)),
    }
    if o not in options:
        # Previously this fell through and failed on an unbound local.
        raise ValueError('Unknown MCI option: %r (expected 0, 1 or 2)' % (o,))
    names, codes = options[o]
    return names, list(codes)

def CreateLabelsMCI(dataset, o):
    """Split the row labels of ``dataset`` into two groups for MCI option ``o``.

    * ``o`` in (0, 2): rows are classified on 'MCI au T1' (converted to
      float); value 1 goes to the first list, value 0 to the second list.
    * ``o == 1``: rows are classified on 'Type de Conversion'; values 0, 1
      and 2 go to the first list, value 3 to the second list.
    * any other ``o``: both lists stay empty.

    Rows matching neither group are left out of both lists. ``dataset`` itself
    is never modified: the former in-loop ``dataset.drop(index, 0)`` only
    rebound a local name, so it had no effect for the caller, and its
    positional ``axis`` argument is rejected by pandas >= 2.0.

    Returns:
        tuple: ``(label0_index, label1_index)``, two lists of row labels.
    """
    label0_index = []
    label1_index = []

    if o in (0, 2):
        for index, value in zip(dataset.index, dataset['MCI au T1']):
            mci = float(value)
            if mci == 1:
                label0_index.append(index)
            elif mci == 0:
                label1_index.append(index)
    elif o == 1:
        for index, code in zip(dataset.index, dataset['Type de Conversion']):
            if code in (0, 1, 2):
                label0_index.append(index)
            elif code == 3:
                label1_index.append(index)

    return label0_index, label1_index

# In MCI mode the analysis cells only use the position of each entry (passed
# as option `o` through enumerate); the strings themselves are placeholders.
conds_list = ['trois', 'trucs', 'random']

# Columns handed to CleanDataset by the analysis cells in MCI mode.
columns_to_drop = ['ss-type MCI T1 (DxBrain)', 'RBD_MCI single/multiple domain']
MCI = True  # the analysis cells use ConditionsMCI / CreateLabelsMCI when this is True

# Dataset info for conditions

In [None]:
if __name__ == '__main__':
    # For every configured condition, report which features and subjects are
    # kept after cleaning and how many subjects fall in each class.
    # Codes for the Parkinson/Dementia classification:
    # DLB=2 PD=1 Normal=0 Control=3
    for o, cond in enumerate(conds_list):
        dataset = df
        if MCI:
            names, conditions = ConditionsMCI(o)
        else:
            names = CondNames(cond)
            conditions = [cond[0][0], cond[0][1], cond[1]]
        # select the subjects for this condition
        dataset = SelectSubjects(dataset, conditions)
        # clean the data
        dataset, dropped_columns, dropped_subjects = CleanDataset(dataset, columns_to_drop)
        # create the labels, then remove the columns they were derived from
        # (axis is passed by keyword: the positional form was removed in pandas 2.0)
        if MCI:
            label0_index, label1_index = CreateLabelsMCI(dataset, o)
            dataset = dataset.drop('MCI au T1', axis=1)
        else:
            label0_index, label1_index = CreateLabels(dataset, cond)
        dataset = dataset.drop('Type de Conversion', axis=1)
        kept_features = list(dataset.columns)

        print('\n%s features were dropped. Kept features :' % len(dropped_columns))
        print(kept_features)
        print(len(dropped_subjects), 'subjects were dropped :')
        print(dropped_subjects)
        # find out which class is the minority one; m_class / M_class are used
        # below as positions in `names`
        m_class, M_class, m_class_index, M_class_index = FindMinorClass(label0_index, label1_index)
        nb_minority_class = len(m_class_index)
        print('%i %s vs %i %s' % (nb_minority_class, names[m_class], len(M_class_index), names[M_class]))
        print("Il y a %i features et %i sujets." % (dataset.shape[1], len(dataset)))

# Feature selection

In [None]:
if __name__ == '__main__':
    # For every configured condition, run a backward feature selection on each
    # balanced random subset and save the selected feature names to a .mat file.
    # Codes for the Parkinson/Dementia classification:
    # DLB=2 PD=1 Normal=0 Control=3
    rep_number = 100
    for o, cond in enumerate(conds_list):
        dataset = df

        if MCI:
            names, conditions = ConditionsMCI(o)
        else:
            names = CondNames(cond)
            conditions = [cond[0][0], cond[0][1], cond[1]]

        dataset = SelectSubjects(dataset, conditions)
        dataset, dropped_columns, dropped_subjects = CleanDataset(dataset, columns_to_drop)
        # create the labels, then remove the columns they were derived from
        # (axis is passed by keyword: the positional form was removed in pandas 2.0)
        if MCI:
            label0_index, label1_index = CreateLabelsMCI(dataset, o)
            dataset = dataset.drop('MCI au T1', axis=1)
        else:
            label0_index, label1_index = CreateLabels(dataset, cond)
        dataset = dataset.drop('Type de Conversion', axis=1)
        kept_features = list(dataset.columns)

        print('\n%s features were dropped. Kept features :' % len(dropped_columns))
        print(kept_features)
        print(len(dropped_subjects), 'subjects were dropped :')
        print(dropped_subjects)

        m_class, M_class, m_class_index, M_class_index = FindMinorClass(label0_index, label1_index)
        nb_minority_class = len(m_class_index)

        print('%i %s vs %i %s' % (nb_minority_class, names[m_class], len(M_class_index), names[M_class]))
        # as an array so it can be indexed with the selected feature positions
        kept_features = np.asarray(kept_features)
        if nb_minority_class > 4:
            print("Il y a %i features et %i sujets." % (dataset.shape[1], len(dataset)))
            # Same layout as `data` below: the drawn subset rows first
            # (labelled M_class), then the minority rows (labelled m_class).
            # The labels are identical for every subset, so build them once.
            labels = [M_class]*nb_minority_class + [m_class]*nb_minority_class
            labels = np.asarray(labels, dtype=int)

            # Original note: equivalent to a stratified leave-4-subjects-out.
            # Not used in this cell (SFS below runs with cv=5); kept, together
            # with `accuracies` and `pvalue`, in case later cells read them.
            number_of_folds = int(nb_minority_class/2)
            clf_choice = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=200, learning_rate=1)

            random_sets = CreateRandomBalancedDataset(dataset, m_class_index, M_class_index, rep_number)

            accuracies = []
            pvalue = 0
            file_name = '%svs%s_features.mat' % (names[m_class], names[M_class])
            file_path = save_path / file_name
            feats = []
            # results are computed only once per condition: skip if already saved
            if not file_path.isfile():
                for my_set in random_sets:
                    data = pd.concat([dataset.loc[my_set], dataset.loc[m_class_index]])
                    data = np.asarray(data)

                    clf = clf_choice
                    # NOTE(review): SFS is not imported explicitly in this file;
                    # presumably it comes from `from tools import *` - confirm.
                    sfs1 = SFS(estimator=clf,
                               k_features=(1,data.shape[1]),
                               forward=False,
                               floating=False,
                               scoring='accuracy',
                               verbose=1,
                               cv=5,
                               n_jobs=6)

                    sfs1.fit(data, labels)
                    selected = kept_features[list(sfs1.k_feature_idx_)].tolist()
                    print('Best combin (ACC= %.3f): %s' % (sfs1.k_score_, selected))
                    feats.append(selected)
                # Nothing to save when no subset was produced (this used to
                # crash on the undefined `sfs1`).
                if feats:
                    # pad every selection to the same length so that the
                    # result is rectangular
                    longest = max(len(feat) for feat in feats)
                    for feat in feats:
                        feat.extend(['empty'] * (longest - len(feat)))
                    # NOTE: only the score of the last subset is stored.
                    savemat(file_path, {'features': feats, 'score': sfs1.k_score_ })
        else:
            print('Not enough subjects to perform classification\n')

In [None]:
import operator
from collections import Counter

# For every saved feature-selection result, count how many times each feature
# was selected and print the features from most to least frequent.
features_suffix = '_features.mat'
for file in [f for f in save_path.files() if f.endswith(features_suffix)]:
    a = loadmat(file)
    # All spaces are removed from the loaded names (presumably to get rid of
    # the padding of fixed-width strings - TODO confirm); note that this also
    # removes spaces inside feature names.
    features = [feature.replace(" ", "") for feature in a['features'].ravel()]
    scores = a['score']
    del a
    compte = Counter(features)
    # 'empty' is only padding; it is absent when all selections have the same
    # length, so remove it without failing in that case.
    compte.pop('empty', None)
    best_features = sorted(compte.items(), key=operator.itemgetter(1), reverse=True)
    print(file.name[:-len(features_suffix)].replace('vs', ' vs ') + ':', best_features, '\n')