# Introduction to classification with Clinica:
## Classifying normal subjects versus Alzheimer’s disease patients from neuroimaging and clinical data from ADNI database
### - T1 MRI all-voxels SVM classification
### - FDG PET region based Logistic regression classification
### - Clinical data Random forest classification

## Prerequisites

The original [ADNI](http://adni.loni.usc.edu/) dataset should be downloaded and used as is, without any modification.
Set the paths to where the data is stored on your computer:

In [None]:
import os

# NOTE: `!export VAR=...` would run in a temporary subshell that exits as soon
# as the line finishes, so the variable would be visible neither to later
# `!clinica ...` commands nor to `os.environ`. Setting `os.environ` changes the
# kernel process itself, and every subprocess started with `!` inherits it.
# `expanduser` is required because a "~" inside quotes is never expanded by
# the shell.
os.environ['ADNI_PATH'] = os.path.expanduser('~/Aramis/Data/ADNI')
os.environ['OUT_PATH'] = os.path.expanduser('~/Aramis/Data/OUTPUT')
os.environ['WORKING_DIR'] = os.path.expanduser('~/Aramis/Data/tmp/WORKING_DIR')

## 1. Convert datasets into BIDS format

In [None]:
# Run Clinica's adni-to-bids converter on the raw ADNI download:
#   $ADNI_PATH/IMAGES         -> input images
#   $ADNI_PATH/CLINICAL_DATA  -> input clinical data
#   $OUT_PATH/ADNI/BIDS       -> output BIDS dataset
# `-m T1 PET_FDG` presumably restricts the conversion to the T1 MRI and
# FDG-PET modalities -- confirm against the Clinica documentation.
!clinica convert adni-to-bids $ADNI_PATH/IMAGES $ADNI_PATH/CLINICAL_DATA $OUT_PATH/ADNI/BIDS -m T1 PET_FDG

Define folders for the next steps:

In [None]:
import os

# Location of the ADNIMERGE.csv file; replace this placeholder with the real path.
adnimerge = 'PATH/TO/ADNIMERGE.csv'


def _required_env_dir(name):
    """Return the directory stored in environment variable `name`.

    A leading "~" is expanded to the user's home directory.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check
            `os.path.join(None, ...)` would fail with an unrelated-looking
            TypeError.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError("Environment variable %s is not set; "
                           "define it before running this cell." % name)
    return os.path.expanduser(value)


# Every ADNI folder used by the next steps lives under $OUT_PATH/ADNI.
_out_path = _required_env_dir('OUT_PATH')

adni_bids = os.path.join(_out_path, 'ADNI/BIDS')
adni_tsv_dir = os.path.join(_out_path, 'ADNI/TSV')
adni_caps_dir = os.path.join(_out_path, 'ADNI/CAPS')
adni_output_dir = os.path.join(_out_path, 'ADNI/OUTPUT')

# The working directory stays optional: it remains None when $WORKING_DIR is unset.
working_dir = os.environ.get('WORKING_DIR')
if working_dir is not None:
    working_dir = os.path.expanduser(working_dir)


## 2. Create the subjects lists
Choose the subjects at baseline with available T1 MRI for ADNI, AIBL and OASIS, and with FDG-PET for ADNI:

In [None]:
from Code.subjects_lists.subject_lists import run_subjects_lists

### ADNI dataset
database = 'ADNI'

# Build one subjects list per modality, in order: 'T1' first, then 'PET' (FDG-PET).
for subjects_list in ('T1', 'PET'):
    run_subjects_lists(adni_bids, adni_tsv_dir, database, subjects_list, adnimerge)


## 3. Create demographic information tables
Get demographic information of the different populations:

In [None]:
from Code.subjects_lists.lists_stats import run_lists_stats

### ADNI dataset
database = 'ADNI'

# Compute the statistics once per modality, in order: 'T1' first, then 'PET' (FDG-PET).
for subjects_list in ('T1', 'PET'):
    run_lists_stats(adni_bids, adni_tsv_dir, database, subjects_list, adnimerge)


## 4. Run Clinica image processing pipelines
We make use of pipelines integrated into Clinica software

In [None]:
### ADNI T1
# Tissue segmentation
# Group template creation
# Registration to MNI space
# Parcellation into anatomical regions
!clinica run t1-volume $OUT_PATH/ADNI/BIDS $OUT_PATH/ADNI/CAPS/ ADNIbl -tsv /SUBJECTS_DIR/subjects_T1_PET.tsv -wd $WORKING_DIR -np 8


### ADNI FDG-PET
# Registration to T1 space
# Partial value correction (PVC) (optional)
# Registration to MNI space
# Intensity normalization to SUVR
# Parcellation into anatomical regions
!clinica run pet-volume $OUT_PATH/ADNI/BIDS $OUT_PATH/ADNI/CAPS/ ADNIbl -tsv $OUT_PATH/ADNI/TSV/subjects_T1_PET.tsv -pet_tracer fdg -wd $WORKING_DIR -np 8


## 5. Run classification tasks on imaging data
### Classifications using T1-weighted MRI and FDG-PET images from ADNI dataset

In [None]:
from os import path
import clinica.pipelines.machine_learning.ml_workflows as ml_wf

# Settings reused by the imaging classification cells that follow.
group_id = 'ADNIbl'
n_threads = 8
n_iterations = 250

# Both input TSV files are located in the ADNI TSV directory.
diagnoses_tsv = path.join(adni_tsv_dir, 'CN_vs_AD_diagnoses.tsv')
subjects_visits_tsv = path.join(adni_tsv_dir, 'CN_vs_AD_subjects_sessions.tsv')


### Voxel based SVM classification for T1 images with 8mm of smoothing

In [None]:
# Experiment-specific inputs: T1 images with a smoothing value (fwhm) of 8.
image_type = 'T1'
fwhm = 8

# Results are written below <adni_output_dir>/T1/voxel_based/linear_svm/CN_vs_AD;
# the folder is created on first use.
classification_dir = path.join(adni_output_dir, 'T1', 'voxel_based', 'linear_svm', 'CN_vs_AD')
if not path.exists(classification_dir):
    os.makedirs(classification_dir)

# Keyword options forwarded to the workflow constructor.
svm_options = dict(fwhm=fwhm, n_iterations=n_iterations, n_threads=n_threads)

wf = ml_wf.VoxelBasedRepHoldOutDualSVM(
    adni_caps_dir, subjects_visits_tsv, diagnoses_tsv,
    group_id, image_type, classification_dir,
    **svm_options)

print("Running %s" % classification_dir)
wf.run()


### Region based logistic regression classification for FDG PET images using AAL2 atlas

In [None]:
# Experiment-specific inputs: FDG PET images parcellated with the AAL2 atlas.
image_type = 'fdg'
atlas = 'AAL2'

# Results are written below <adni_output_dir>/fdg/region_based/logistic_reg/CN_vs_AD;
# the folder is created on first use.
classification_dir = path.join(adni_output_dir, 'fdg', 'region_based', 'logistic_reg', 'CN_vs_AD')
if not path.exists(classification_dir):
    os.makedirs(classification_dir)

# Keyword options forwarded to the workflow constructor.
logreg_options = dict(n_iterations=n_iterations, n_threads=n_threads)

wf = ml_wf.RegionBasedRepHoldOutLogisticRegression(
    adni_caps_dir, subjects_visits_tsv, diagnoses_tsv,
    group_id, image_type, atlas, classification_dir,
    **logreg_options)

print("Running %s" % classification_dir)
wf.run()


## 6. Preparing clinical data

Data from different sources (clinical data from BIDS and ADNIMERGE data) are joined into one file (`CN_vs_AD_clinical_data.tsv`).

In [None]:
import pandas as pd
from os import path

# Gather clinical data for every subject/session listed in the diagnoses file.
# That file is assumed to provide 'participant_id', 'session_id' and 'diagnosis' fields.
diagnoses_tsv = path.join(adni_tsv_dir, 'CN_vs_AD_diagnoses.tsv')
subj_sessions = pd.read_csv(diagnoses_tsv, sep='\t')

# Examples of participant-level columns available in ADNI BIDS (participants.tsv):
participant_columns = ["sex", "education_level", "marital_status", "apoe4", "apoe_gen1", "apoe_gen2"]

# Examples of session-level columns available in ADNI BIDS (<subject>_sessions.tsv):
session_columns = [
    "age",
    # Cognitive measures
    "MMSE", "cdr_sb", "cdr_global", "adas11", "adas13",
    "adas_memory", "adas_language", "adas_concentration", "adas_praxis", "ravlt_immediate", "moca",
    "TMT_A", "TMT_B", "dsst", "logmem_delay", "logmem_imm",
    # T1 measures
    "adni_ventricles_vol", "adni_hippocampus_vol", "adni_brain_vol", "adni_entorhinal_vol",
    "adni_fusiform_vol", "adni_midtemp_vol", "adni_icv",
    # PET measures
    "adni_fdg", "adni_pib", "adni_av45",
    # CSF measures
    "adni_abeta", "adni_tau", "adni_ptau",
]

# One list of collected values per requested column, filled row by row below.
participant_series = {col: [] for col in participant_columns}
session_series = {col: [] for col in session_columns}

participants_tsv = pd.read_csv(path.join(adni_bids, "participants.tsv"), sep='\t')

# Walk through the subject/session pairs and pick the requested values.
for _, subj_sess in subj_sessions.iterrows():
    subject_id = subj_sess.participant_id

    # Participant-level values: first row of participants.tsv matching the subject.
    subject_rows = participants_tsv[participants_tsv.participant_id == subject_id]
    selected_participant = subject_rows.iloc[0]
    for col in participant_columns:
        participant_series[col].append(selected_participant[col])

    # Session-level values: first row of the subject's sessions file matching the session.
    sessions_file = path.join(adni_bids, subject_id, subject_id + "_sessions.tsv")
    session_tsv = pd.read_csv(sessions_file, sep='\t')
    session_rows = session_tsv[session_tsv.session_id == subj_sess.session_id]
    selected_session = session_rows.iloc[0]
    for col in session_columns:
        session_series[col].append(selected_session[col])

# Attach the collected values to the subjects table, participant columns first.
for column_names, collected in ((participant_columns, participant_series),
                                (session_columns, session_series)):
    for col in column_names:
        subj_sessions.loc[:, col] = pd.Series(collected[col], index=subj_sessions.index)

# Encode the textual sex values as numbers: 'F' -> 1, 'M' -> 0.
female_rows = subj_sessions[subj_sessions.sex == 'F'].index
subj_sessions.loc[female_rows, 'sex'] = 1
male_rows = subj_sessions[subj_sessions.sex == 'M'].index
subj_sessions.loc[male_rows, 'sex'] = 0

# Write the merged table into <adni_tsv_dir>/clinical_data, creating the folder if needed.
clinical_dir = path.join(adni_tsv_dir, 'clinical_data')
if not path.exists(clinical_dir):
    os.makedirs(clinical_dir)

clinical_data_tsv = path.join(clinical_dir, 'CN_vs_AD_clinical_data.tsv')
subj_sessions.to_csv(clinical_data_tsv, sep='\t', index=False)


### Filtering data according to columns that are going to be used as input to classification

A population is determined by data availability, and a separate input data file is created.


In [None]:
import os
import pandas as pd
from os import path

subj_sessions = pd.read_csv(path.join(clinical_dir, 'CN_vs_AD_clinical_data.tsv'), sep='\t')

# Columns used as input of our model.
model_1_columns = ["sex", "education_level", "apoe4", "MMSE", "cdr_sb", "adas_memory",
                   "adas_language", "adas_concentration", "adas_praxis", "ravlt_immediate"]

# We select the population for which there are no missing values in any of
# the desired fields.
model_1 = subj_sessions.dropna(subset=model_1_columns)

# `to_csv` does not create missing folders, so make sure the output directory
# exists before writing.
input_models_dir = path.join(adni_tsv_dir, 'clinical_data', 'input_models')
if not path.exists(input_models_dir):
    os.makedirs(input_models_dir)

model_1.to_csv(path.join(input_models_dir, 'CN_vs_AD_model_1.tsv'), sep='\t', index=False)


## 7. Run classification tasks on clinical data
### Random forest classification using demographic data and cognitive tests from ADNI dataset

In [None]:
from os import path
import clinica.pipelines.machine_learning.ml_workflows as ml_wf

n_threads = 8
n_iterations = 250

# Columns of the input table handed to the workflow.
columns = [
    "sex", "education_level", "apoe4", "MMSE", "cdr_sb", "adas_memory",
    "adas_language", "adas_concentration", "adas_praxis", "ravlt_immediate",
]

# Input table written by the filtering step.
data_tsv = path.join(adni_tsv_dir, 'clinical_data', 'input_models', 'CN_vs_AD_model_1.tsv')

# Results are written below <adni_output_dir>/clinical_data/random_forest/CN_vs_AD;
# the folder is created on first use.
classification_dir = path.join(adni_output_dir, 'clinical_data', 'random_forest', 'CN_vs_AD')
if not path.exists(classification_dir):
    os.makedirs(classification_dir)

# Keyword options forwarded to the workflow constructor.
rf_options = dict(n_threads=n_threads, n_iterations=n_iterations, inner_cv=True)

wf = ml_wf.TsvRepHoldOutRandomForest(data_tsv, columns, classification_dir, **rf_options)

print("Running %s" % classification_dir)
wf.run()
