**This script compares the quality of model fit between visits**

- 100 repeated random train/test splits per visit (33 % of the controls held out for testing)

alternatives:
    - trained on V1, tested on V1
    - trained on V2, tested on V2
    - trained on V1, tested on V2

- folds are drawn separately for each visit, so individual folds are not comparable between visits
- directory containing results: ESO/models/adaptation_comparison

In [1]:
# load packages
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import pcntoolkit as pcn
import pickle
import scipy

from scipy.stats import shapiro

from enigmatoolbox.utils.parcellation import parcel_to_surface
from enigmatoolbox.plotting import plot_cortical, plot_subcortical
from enigmatoolbox.utils.useful import reorder_sctx

from pcntoolkit.normative import estimate, predict, evaluate
from pcntoolkit.util.utils import compute_MSLL, create_design_matrix

# Directory holding the external braincharts helper module `nm_utils`.
# The working directory is switched there BEFORE the import below, so the
# statement order matters (presumably the import is resolved relative to the
# current working directory -- TODO confirm against sys.path).
ext_scripts_dir = ('/home/barbora/Documents/Projects/Normative_Models/ESO/braincharts/scripts')
os.chdir(ext_scripts_dir)

from nm_utils import remove_bad_subjects, load_2d

# Directory holding the project's own helper module `clinics_desc_functions`;
# again the working directory is changed first and the import follows.
code_dir = ('/home/barbora/Documents/Projects/Normative_Models/ESO/code')
os.chdir(code_dir)

# importing custom functions
# NOTE(review): later cells call `pretrained_adapt_small` and
# `train_test_split`, neither of which is imported here or in the import
# block above -- confirm where they are expected to come from.
from clinics_desc_functions import prepare_data, plot_quality, trajectory_plotting, dk_roi_viz, load_clinics, en_qc, pretrained_adapt, set_seed

# set seed
# Called without arguments; the captured cell output reports that seed 42 was set.
set_seed()

# formatting
%matplotlib inline
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}

Random seed 42 has been set.


In [2]:
# Absolute, machine-specific locations used throughout the notebook.

# project root
main_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO'

# inputs
cdata_dir = '/home/barbora/Documents/Projects/2021_06_AZV_ESO/data'
fsdata_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/fs_stats'
bdata_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/backup'
pretrained_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/braincharts'

# outputs
images_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/img'
models_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/models/adaptation_comparison'

# The results directory is the only one created here; it is a no-op when it
# already exists.
os.makedirs(models_dir, exist_ok=True)

In [3]:
###
# Pretrained model: training-site ids and the list of IDPs
###
model_name = 'lifespan_57K_82sites'
site_names = 'site_ids_82sites.txt'

# all list files live in the 'docs' folder of the pretrained model
docs_dir = os.path.join(pretrained_dir, 'docs')


def _read_lines(path):
    """Return the lines of the text file at *path*, without line terminators."""
    with open(path) as handle:
        return handle.read().splitlines()


# site ids the pretrained model was trained on; this must match the training data
site_ids_tr = _read_lines(os.path.join(docs_dir, site_names))

# IDP names for the left hemisphere, the right hemisphere and the subcortical regions
idp_ids_lh = _read_lines(os.path.join(docs_dir, 'phenotypes_lh.txt'))
idp_ids_rh = _read_lines(os.path.join(docs_dir, 'phenotypes_rh.txt'))
idp_ids_sc = _read_lines(os.path.join(docs_dir, 'phenotypes_sc.txt'))

# all IDPs are processed
idp_ids = [*idp_ids_lh, *idp_ids_rh, *idp_ids_sc]

# Our dataset currently lacks these two variables (to be extracted from a2009s),
# so they are dropped from the list. list.remove raises ValueError if a name is
# not present.
for absent_idp in ('lh_MeanThickness_thickness', 'rh_MeanThickness_thickness'):
    idp_ids.remove(absent_idp)

# For a quick trial run, restrict the list to a single IDP:
#idp_ids = ['CSF']

In [4]:
###
# Clinical descriptors and thickness features for both visits
###

# Visit 1: the clinical sheet is passed through load_clinics and then joined
# column-wise with the thickness table, keeping only rows present in both.
c_v1 = load_clinics(pd.read_excel(os.path.join(cdata_dir, 'visit1_desc.xlsx')))
d_v1 = pd.read_csv(
    os.path.join(bdata_dir, 'fit_external_thickness_1.txt'),
    sep=';',
    index_col=0,
)
v1_concat = pd.concat([c_v1, d_v1], axis=1, join="inner").assign(sitenum=1000)

# Visit 2: same procedure on the second-visit files.
c_v2 = load_clinics(pd.read_excel(os.path.join(cdata_dir, 'visit2_desc.xlsx')))
d_v2 = pd.read_csv(
    os.path.join(bdata_dir, 'fit_external_thickness_2.txt'),
    sep=';',
    index_col=0,
)
v2_concat = pd.concat([c_v2, d_v2], axis=1, join="inner").assign(sitenum=1000)

# Quality control based on the Euler number: QC images are saved to
# images_dir and not displayed inline.
save_img = True
img_dir = images_dir
show_img = False
qc_options = dict(save_img=save_img, img_dir=img_dir, show_img=show_img)

v1_clean = en_qc(v1_concat, **qc_options)
v2_clean = en_qc(v2_concat, **qc_options)

In [5]:
###
# Split each cleaned visit into patients and controls
###

# visit 1
v1_is_patient = v1_clean['Category'] == 'Patient'
v1_is_control = v1_clean['Category'] == 'Control'
v1_pat = v1_clean.loc[v1_is_patient]
v1_cont = v1_clean.loc[v1_is_control]

# sorted unique site ids, taken from the visit-1 patients
site_ids_te = sorted(set(v1_pat['site'].tolist()))

# visit 2
v2_is_patient = v2_clean['Category'] == 'Patient'
v2_is_control = v2_clean['Category'] == 'Control'
v2_pat = v2_clean.loc[v2_is_patient]
v2_cont = v2_clean.loc[v2_is_control]


In [7]:
###
# Covariate configuration
###

# data columns used as covariates
cols_cov = ['age', 'sex']

# lower / upper limit for the cubic B-spline basis
xmin, xmax = -5, 110

# Absolute Z threshold above which a sample is considered to be an outlier
# (without fitting any model)
outlier_thresh = 7

In [164]:
###
# Run models
# -> folders containing models
#
# Two setups are supported and each has to be run separately:
#   12 : the controls of V1 and of V2 are each split `nfolds` times into an
#        adaptation set and a test set (33 % test); results are written to
#        <models_dir>/V12/V<visit>/<idp>/fold_<k>
#   11 : the V1 controls are the adaptation set and the V2 controls the test
#        set; a single run is written to <models_dir>/V11/<idp>/fold_1
#
# NOTE(review): `train_test_split` and `pretrained_adapt_small` are not
# imported in the visible cells of this notebook -- confirm they are available
# in the session before running.
###

# Which method do we want to test?
method_setup = 12
if method_setup not in (11, 12):
    # previously an unknown value only surfaced later as a NameError on visit_dir
    raise ValueError('method_setup must be 11 or 12, got ' + repr(method_setup))

method_dir = os.path.join(models_dir, 'V'+str(method_setup))
os.makedirs(method_dir, exist_ok=True)

nfolds = 100

for ivisit in range(1,3):

    if method_setup == 12:
        # one sub-directory per visit, nfolds random splits of that visit's controls
        visit_dir = os.path.join(method_dir,'V'+str(ivisit))
        os.makedirs(visit_dir, exist_ok=True)
        nofolds = range(0,nfolds)
        visit_controls = v1_cont if ivisit == 1 else v2_cont
    else:
        # method 11 is a single run, so there is nothing to do for the second visit
        if ivisit == 2:
            break
        visit_dir = method_dir
        nofolds = range(1,2)  # single run, stored as fold_1

    for ifold in nofolds:

        if method_setup == 12:
            # a fresh random split per fold; the same split is used for every IDP
            df_ad, df_te = train_test_split(visit_controls, test_size=0.33, shuffle=True)
        else:
            # V1 controls are training; V2 controls testing
            df_ad, df_te = v1_cont, v2_cont

        for idp_num, idp in enumerate(idp_ids):
            print('Running IDP', idp_num, idp, ':')

            # create a directory for every fold (parents are created as
            # needed) and make it the working directory before calling
            # pretrained_adapt_small
            idp_visit_dir = os.path.join(visit_dir,idp)
            idp_fold_dir = os.path.join(idp_visit_dir,'fold_'+str(ifold))
            os.makedirs(idp_fold_dir, exist_ok=True)
            os.chdir(idp_fold_dir)

            pretrained_adapt_small(idp, site_ids_tr, site_ids_te, pretrained_dir, idp_fold_dir, df_ad, df_te)
    

Running IDP 0 lh_G&S_frontomargin_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 1 lh_G&S_occipital_inf_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 2 lh_G&S_paracentral_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 3 lh_G&S_subcentral_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 4 lh_G&S_transv_frontopol_thickness :
Some sites missing from the training data. Adapt

In [169]:
###
# Collect, for every IDP and every run, the Shapiro-Wilk p-value of the
# predicted z-scores
# -> df_results
###
from scipy import stats

nfolds = 100


def _shapiro_p(fold_dir):
    """Return the Shapiro-Wilk p-value of the values in <fold_dir>/Z_predict.txt."""
    z_pred = pd.read_csv(os.path.join(fold_dir, 'Z_predict.txt'), header=None)
    return stats.shapiro(z_pred)[1]


# One row per IDP; columns: nfolds for V1, nfolds for V2, plus one final column
# for the single method-11 run. Every cell is overwritten below.
results = np.empty([len(idp_ids), nfolds * 2 + 1])

# --- method 12: V1 and V2 adapted separately ---
method_setup = 12
method_dir = os.path.join(models_dir, 'V' + str(method_setup))

for ivisit in (1, 2):
    visit_dir = os.path.join(method_dir, 'V' + str(ivisit))
    # V1 folds occupy columns 0..nfolds-1, V2 folds the next nfolds columns
    col_offset = (ivisit - 1) * nfolds

    for n_idp, idp in enumerate(idp_ids):
        for ifold in range(nfolds):
            fold_dir = os.path.join(visit_dir, idp, 'fold_' + str(ifold))
            results[n_idp, col_offset + ifold] = _shapiro_p(fold_dir)

# --- method 11: a single run, stored under fold_1 ---
method_setup = 11
method_dir = os.path.join(models_dir, 'V' + str(method_setup))
single_fold = 1

for n_idp, idp in enumerate(idp_ids):
    fold_dir = os.path.join(method_dir, idp, 'fold_' + str(single_fold))
    results[n_idp, -1] = _shapiro_p(fold_dir)

# Column labels: V<visit>_f<fold> for the method-12 folds, 'V11' for the
# method-11 run.
colnames = ['V' + str(vis) + '_f' + str(fold) for vis in (1, 2) for fold in range(nfolds)] + ['V11']

df_results = pd.DataFrame(results, index=idp_ids, columns=colnames)



In [174]:
# Write the p-value table (rows: IDPs, columns: runs) as a space-separated file.
z_all_path = os.path.join(models_dir, 'z_score_all.csv')
df_results.to_csv(z_all_path, sep=' ', header=True, index=True)

In [170]:
###
# Per IDP, plot the distribution of Shapiro-Wilk p-values across the folds of
# V11 (first nfolds columns) and V22 (next nfolds columns), together with the
# single value for V12 (last column). One PNG per IDP is written to
# <models_dir>/img.
###

mimg_dir = os.path.join(models_dir,'img')
os.makedirs(mimg_dir, exist_ok=True)
os.chdir(mimg_dir)

# the style is global state, so it only needs to be set once
sns.set(style='whitegrid')

# NOTE(review): sns.distplot is deprecated in current seaborn releases; it is
# kept so the figures stay unchanged, but it will need replacing
# (histplot/kdeplot) when seaborn is upgraded.
for idp in idp_ids:#['lh_G&S_paracentral_thickness']:
    idp_pvals = df_results.loc[idp]
    fig, ax = plt.subplots(1,1)

    # explicit positional access: the row is labelled by column name, and
    # integer keys in plain [] on such a Series are deprecated in recent pandas
    sns.distplot(idp_pvals.iloc[:nfolds], ax=ax, norm_hist=False, color = 'darkgoldenrod', label='V11')
    sns.distplot(idp_pvals.iloc[nfolds:nfolds*2], ax=ax, norm_hist=False, color = 'lightseagreen', label='V22')
    ax.axvline(idp_pvals.iloc[-1], color='dimgray', label='V12')
    # dashed reference line at p = 0.05
    ax.axvline(0.05, color='dimgray', linestyle = '--')
    ax.legend()
    fig.savefig(os.path.join(mimg_dir, idp+'.png'))
    plt.close(fig)
    


In [36]:
###
# Collect the Shapiro-Wilk p-values of the predicted z-scores, tested
# separately for the two values (0 and 1) found in column 2 of the test
# design matrix
# -> df_results_f, df_results_m
###
from scipy import stats

nfolds = 100


def _shapiro_p_by_sex(fold_dir):
    """Return the Shapiro-Wilk p-values (code 0, code 1) for one fold directory.

    Column 2 of cov_bspline_te.txt is used as the grouping variable
    (presumably the sex covariate, with 0 = female and 1 = male -- TODO confirm
    against the design-matrix layout).
    """
    covs = pd.read_csv(os.path.join(fold_dir, 'cov_bspline_te.txt'), header=None, sep=' ')
    z_pred = pd.read_csv(os.path.join(fold_dir, 'Z_predict.txt'), header=None)
    p_code0 = stats.shapiro(z_pred[covs[2] == 0])[1]
    p_code1 = stats.shapiro(z_pred[covs[2] == 1])[1]
    return p_code0, p_code1


# One row per IDP; columns: nfolds for V1, nfolds for V2, plus one final column
# for the single method-11 run. Every cell is overwritten below.
results_f = np.empty([len(idp_ids), nfolds * 2 + 1])
results_m = np.empty([len(idp_ids), nfolds * 2 + 1])

# --- method 12: V1 and V2 adapted separately ---
method_setup = 12
method_dir = os.path.join(models_dir, 'V' + str(method_setup))

for ivisit in (1, 2):
    visit_dir = os.path.join(method_dir, 'V' + str(ivisit))
    # V1 folds occupy columns 0..nfolds-1, V2 folds the next nfolds columns
    col_offset = (ivisit - 1) * nfolds

    for n_idp, idp in enumerate(idp_ids):
        for ifold in range(nfolds):
            fold_dir = os.path.join(visit_dir, idp, 'fold_' + str(ifold))
            p_f, p_m = _shapiro_p_by_sex(fold_dir)
            results_f[n_idp, col_offset + ifold] = p_f
            results_m[n_idp, col_offset + ifold] = p_m

# --- method 11: a single run, stored under fold_1 ---
method_setup = 11
method_dir = os.path.join(models_dir, 'V' + str(method_setup))
single_fold = 1

for n_idp, idp in enumerate(idp_ids):
    fold_dir = os.path.join(method_dir, idp, 'fold_' + str(single_fold))
    p_f, p_m = _shapiro_p_by_sex(fold_dir)
    results_f[n_idp, -1] = p_f
    results_m[n_idp, -1] = p_m

# Column labels: V<visit>_f<fold> for the method-12 folds, 'V11' for the
# method-11 run.
colnames = ['V' + str(vis) + '_f' + str(fold) for vis in (1, 2) for fold in range(nfolds)] + ['V11']

df_results_f = pd.DataFrame(results_f, index=idp_ids, columns=colnames)
df_results_m = pd.DataFrame(results_m, index=idp_ids, columns=colnames)



In [37]:
# Write both sex-specific p-value tables as space-separated files.
for sex_table, file_name in ((df_results_f, 'z_score_f.csv'), (df_results_m, 'z_score_m.csv')):
    sex_table.to_csv(os.path.join(models_dir, file_name), sep=' ', header=True, index=True)

In [46]:
###
# Per IDP, plot the density of Shapiro-Wilk p-values across the folds of V11
# (first nfolds columns) and V22 (next nfolds columns), separately for the
# `_f` and `_m` tables, together with the single V12 value (last column) of
# each table. One PNG per IDP is written to <models_dir>/img_s.
###

mimg_dir = os.path.join(models_dir,'img_s')
os.makedirs(mimg_dir, exist_ok=True)
os.chdir(mimg_dir)

# the style is global state, so it only needs to be set once
sns.set(style='whitegrid')

# NOTE(review): sns.distplot is deprecated in current seaborn releases; it is
# kept so the figures stay unchanged, but it will need replacing (kdeplot)
# when seaborn is upgraded.
for idp in idp_ids:#['lh_G&S_paracentral_thickness']:
    pvals_f = df_results_f.loc[idp]
    pvals_m = df_results_m.loc[idp]
    fig, ax = plt.subplots(1,1, figsize=(7,5))

    # explicit positional access: the rows are labelled by column name, and
    # integer keys in plain [] on such a Series are deprecated in recent pandas
    sns.distplot(pvals_f.iloc[:nfolds], ax=ax, norm_hist=False, color = 'lightcoral', label='V11_f', hist=False)
    sns.distplot(pvals_f.iloc[nfolds:nfolds*2], ax=ax, norm_hist=False, color = 'darkviolet', label='V22_f', hist=False)
    ax.axvline(pvals_f.iloc[-1], color='red', label='V12_f')

    sns.distplot(pvals_m.iloc[:nfolds], ax=ax, norm_hist=False, color = 'lightseagreen', label='V11_m', hist=False)
    sns.distplot(pvals_m.iloc[nfolds:nfolds*2], ax=ax, norm_hist=False, color = 'blue', label='V22_m', hist=False)
    ax.axvline(pvals_m.iloc[-1], color='navy', label='V12_m')

    # dashed reference line at p = 0.05
    ax.axvline(0.05, color='dimgray', linestyle = '--')
    ax.legend()
    fig.savefig(os.path.join(mimg_dir, idp+'_s.png'))
    plt.close(fig)
    


In [13]:
###
# Is there a systematic bias in V22?
###
# Reload the previously saved table; the first column holds the row labels.
z_all_path = os.path.join(models_dir, 'z_score_all.csv')
df_z = pd.read_csv(z_all_path, sep=' ', index_col=0)

In [46]:
# Pool the values of all rows of df_z into one flat array per visit: every
# column whose name starts with 'V1_' or 'V2_' respectively (the final 'V11'
# column matches neither prefix and is left out).
v1_all = df_z.loc[:, df_z.columns.str.startswith('V1_')].to_numpy().ravel()
v2_all = df_z.loc[:, df_z.columns.str.startswith('V2_')].to_numpy().ravel()

# Set context and style in ONE call: a separate sns.set(style=...) afterwards
# would reset the context back to its default and discard 'paper'.
sns.set_theme(context='paper', style='whitegrid')

# plt.subplots returns (figure, axes); unpack both and draw on the axes explicitly
fig, ax = plt.subplots(1)
sns.histplot(v1_all, color='tomato', label='V1', ax=ax)
sns.histplot(v2_all, color='teal', label='V2', ax=ax)

sns.despine(fig=fig)
ax.legend()
fig.savefig(os.path.join(models_dir,'v1_v2_z_comparison.png'))
plt.close(fig)