# Testing

### Prueba de evaluacion

El framework se evalúa con un enfoque de validación cruzada. Las métricas utilizadas son las AUC bajo el ROC y la precisión.

In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from problem import get_cv

def evaluation(X, y):
    pipe = make_pipeline(FeatureExtractor(), Classifier())
    cv = get_cv(X, y)
    results = cross_validate(pipe, X, y, scoring=['roc_auc', 'accuracy'], cv=cv,
                             verbose=1, return_train_score=True,
                             n_jobs=1)
    
    return results

### Data

In [2]:
from problem import get_train_data
data_train, labels_train = get_train_data()

#data_train_anatomy = data_train[[col for col in data_train.columns if col.startswith('anatomy')]]
#data_train_functional = data_train[[col for col in data_train.columns if col.startswith('fmri')]]

## FeatureExtractor

FeatureExtractor extrae información de conectividad estructural y funcional y los *concatena*. 
Tener en cuenta que cada columna contendrá en su nombre ya sea connectome o anatomía dependiendo del tipo de característica. Se usará para entrenar diferentes clasificadores más adelante.

Connectome funcional: Una matriz de correlación, también se puede ver como un "gráfico": un conjunto de nodos, conectados por bordes. Cuando estos nodos son regiones del cerebro y los bordes capturan las interacciones entre ellos, este gráfico es un "conectoma funcional".

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn import preprocessing
from nilearn.connectome import ConnectivityMeasure


def _load_fmri(fmri_filenames):
    """cargar series de tiempo extraídas de la fMRI usando un atlas específico"""
    return np.array([pd.read_csv(subject_filename,
                                 header=None).values
                     for subject_filename in fmri_filenames])

# BaseEstimator: Clase base para todos los estimadores en scikit-learn
# TransformerMixin: Clase Mixin para todos los transformadores en scikit-learn
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        # transformador que cargará las series de tiempo de un fmri y calcula matrix connectome
        # ConnectivityMeasure: conectividad funcional entre regiones de interés
        self.transformer_fmri = make_pipeline(
            FunctionTransformer(func=_load_fmri, validate=False),
            ConnectivityMeasure(kind='correlation', vectorize=True))

        
    def fit(self, X_df, y):       
        # especifico que fmri usar 
        fmri_filenames = X_df['fmri_msdl'] 
        self.transformer_fmri.fit(fmri_filenames, y) 
        fmri_filenames2 = X_df['fmri_power_2011'] 
        self.transformer_fmri.fit(fmri_filenames2, y)
        '''
        fmri = ['fmri_basc064', 'fmri_basc122', 'fmri_basc197', 'fmri_craddock_scorr_mean',
        'fmri_harvard_oxford_cort_prob_2mm', 'fmri_motions', 'fmri_msdl (atlas)', 'fmri_power_2011 (atlas)',
        'fmri_select']
        '''
        return self

    def transform(self, X_df):
        # obtengo fmri especifico. sujeto - atlas(./data/fmri/msdl/1932355398536124106/run_1/193...)
        fmri_filenames = X_df['fmri_msdl']  
        # data = connectome: es un conjunto de conexiones que representa las interacciones cerebrales entre regiones
        X_connectome = self.transformer_fmri.transform(fmri_filenames)
        X_connectome = pd.DataFrame(X_connectome, index=X_df.index)
        X_connectome.columns = ['connectome_{}'.format(i)
                                for i in range(X_connectome.columns.size)]     
        fmri_filenames2 = X_df['fmri_power_2011']
        X_connectome2 = self.transformer_fmri.transform(fmri_filenames2)
        X_connectome2 = pd.DataFrame(X_connectome2, index=X_df.index)
        X_connectome2.columns = ['connectome2_{}'.format(i)
                                for i in range(X_connectome2.columns.size)]   
                
        # obtiene la información anatómica
        X_anatomy = X_df[[col for col in X_df.columns
                          if col.startswith('anatomy')]]       
        
        X_participants = X_df[[col for col in X_df.columns
                          if col.startswith('participants')]]
                       
        # concatenar la data
        concat = pd.concat([X_connectome, X_connectome2, X_anatomy, X_participants], axis=1)
    
        concat[(concat['anatomy_select'] == 1)]    
        concat['participants_sex'] = concat['participants_sex'].map({'F': 1, 'M': 0})

        return concat

## Classifier

Formando 2 clasificadores independientes en las funciones derivadas de sMRI y fMRI. Luego, se usará un meta clasificador para combinar ambas informaciones. Dejamos afuera algunos datos para poder entrenar el meta clasificador.

In [2]:
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

#Classifiers
#from sklearn.linear_model import LogisticRegression
#from sklearn.svm import LinearSVC
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#from sklearn.dummy import DummyRegressor
#from sklearn.ensemble import ExtraTreesClassifier

#preprocessing
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.preprocessing import minmax_scale
#from sklearn.preprocessing import MaxAbsScaler
#from sklearn.preprocessing import RobustScaler
#from sklearn.preprocessing import Normalizer
#from sklearn.preprocessing.data import QuantileTransformer

class Classifier(BaseEstimator):
    def __init__(self):  
        
        # StandardScaler: Estandariza las características eliminando la media y escalando a la varianza de la unidad    
        # Prueba de clasificadores:
        # MLPClassifier : Clasificador de perceptrón multicapa. Optimiza la función de pérdida de registros usando 
        #  LBFGS (Broyden–Fletcher–Goldfarb–Shanno algorithm): método iterativo para resolver probl no lineales de optimización no lineal 
        #  o SGD (descenso de gradiente estocástico): aprendizaje discriminatorio de clasificadores lineales bajo func de pérdida convexa
                
        self.clf_connectome = make_pipeline(StandardScaler(),
                                            MLPClassifier(alpha=1))
        self.clf_connectome2 = make_pipeline(StandardScaler(),
                                            MLPClassifier(alpha=1))
        self.clf_anatomy = make_pipeline(StandardScaler(),
                                         MLPClassifier(alpha=1))       
        self.clf_participants = make_pipeline(StandardScaler(),
                                         MLPClassifier(alpha=1))
        self.meta_clf = MLPClassifier(alpha=1)
        

    def fit(self, X, y):        
        # obtener data
        X_anatomy = X[[col for col in X.columns 
                       if col.startswith('anatomy')]]
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]
        X_connectome2 = X[[col for col in X.columns
                          if col.startswith('connectome2')]]
        X_participants = X[[col for col in X.columns
                            if col.startswith('participants')]]
        
        # entrenamiento y validacion 
        train_idx, validation_idx = train_test_split(range(y.size),
                                                     test_size=0.33, 
                                                     shuffle=True,
                                                     random_state=None) 
        
        # pandas.DataFrame.iloc: Indización basada en la ubicación de enteros 
        # para la selección por posición.
        X_anatomy_train = X_anatomy.iloc[train_idx]
        X_anatomy_validation = X_anatomy.iloc[validation_idx]
        X_connectome_train = X_connectome.iloc[train_idx]
        X_connectome_validation = X_connectome.iloc[validation_idx]
        X_connectome2_train = X_connectome2.iloc[train_idx]
        X_connectome2_validation = X_connectome2.iloc[validation_idx]
        X_participants_train = X_participants.iloc[train_idx]
        X_participants_validation = X_participants.iloc[validation_idx]
        
        y_train = y[train_idx]
        y_validation = y[validation_idx]

        self.clf_connectome.fit(X_connectome_train, y_train)
        self.clf_connectome2.fit(X_connectome2_train, y_train)
        self.clf_anatomy.fit(X_anatomy_train, y_train)
        self.clf_participants.fit(X_participants_train, y_train)

        y_connectome_pred = self.clf_connectome.predict_proba(
            X_connectome_validation)
        y_connectome2_pred = self.clf_connectome2.predict_proba(
            X_connectome2_validation)
        y_anatomy_pred = self.clf_anatomy.predict_proba(
            X_anatomy_validation)
        y_participants_pred = self.clf_participants.predict_proba(
            X_participants_validation)

        self.meta_clf.fit(
            np.concatenate([y_connectome_pred, y_connectome2_pred,  
                            y_anatomy_pred, y_participants_pred], axis=1),
            y_validation)
        
        return self
    
    # Predecir usando el clasificador
    # Realizar la clasificación en una matriz de vectores de prueba X.
    def predict(self, X):
        X_anatomy = X[[col for col in X.columns 
                       if col.startswith('anatomy')]]
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]
        X_connectome2 = X[[col for col in X.columns
                          if col.startswith('connectome2')]]

        X_participants = X[[col for col in X.columns
                            if col.startswith('participants')]]        

        y_anatomy_pred = self.clf_anatomy.predict_proba(X_anatomy)
        y_connectome_pred = self.clf_connectome.predict_proba(X_connectome)
        y_connectome2_pred = self.clf_connectome2.predict_proba(X_connectome2)
        y_participants_pred = self.clf_participants.predict_proba(X_participants)

        return self.meta_clf.predict(
            np.concatenate([y_connectome_pred, y_connectome2_pred,  
                            y_anatomy_pred, y_participants_pred], axis=1))

    # metodo: Estimaciones de probabilidad de retorno para el vector de prueba X.
    def predict_proba(self, X):
        X_anatomy = X[[col for col in X.columns 
                       if col.startswith('anatomy')]]
        X_connectome = X[[col for col in X.columns
                          if col.startswith('connectome')]]
        X_connectome2 = X[[col for col in X.columns
                          if col.startswith('connectome2')]]
        X_participants = X[[col for col in X.columns
                            if col.startswith('participants')]]

        y_anatomy_pred = self.clf_anatomy.predict_proba(X_anatomy)
        y_connectome_pred = self.clf_connectome.predict_proba(X_connectome)
        y_connectome2_pred = self.clf_connectome2.predict_proba(X_connectome2)
        y_participants_pred = self.clf_participants.predict_proba(X_participants)

        return self.meta_clf.predict_proba(
            np.concatenate([y_connectome_pred, y_connectome2_pred,  
                            y_anatomy_pred, y_participants_pred], axis=1))

## Test

In [None]:
import numpy as np

results = evaluation(data_train, labels_train)

print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(results['train_roc_auc']),
                                                        np.std(results['train_roc_auc'])))
print("Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(np.mean(results['test_roc_auc']),
                                                          np.std(results['test_roc_auc'])))

print("Training score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['train_accuracy']),
                                                         np.std(results['train_accuracy'])))
print("Validation score accuracy: {:.3f} +- {:.3f}".format(np.mean(results['test_accuracy']),
                                                           np.std(results['test_accuracy'])))

Test_05:

Training score ROC-AUC: 0.935 +- 0.010
Validation score ROC-AUC: 0.704 +- 0.037 

Training score accuracy: 0.871 +- 0.016
Validation score accuracy: 0.652 +- 0.030

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 18.9min finished