In [1]:
# load packages
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import scipy.stats as ss

# Folder holding the external braincharts helper scripts.
# The working directory is switched there right before the import below -
# presumably so that `nm_utils` is resolved from the current directory
# (NOTE(review): confirm; adding the folder to sys.path would avoid changing cwd).
ext_scripts_dir = ('/home/barbora/Documents/Projects/Normative_Models/ESO/braincharts/scripts')
os.chdir(ext_scripts_dir)

from nm_utils import remove_bad_subjects, load_2d

# Folder holding this project's own code; again the working directory is
# switched before the import that follows, so the order of these lines matters.
code_dir = ('/home/barbora/Documents/Projects/Normative_Models/ESO/code')
os.chdir(code_dir)

# project-specific helper functions, referred to as `custom` in the rest of the notebook
import clinics_desc_functions as custom

# use a white figure background for all seaborn/matplotlib figures
sns.set(rc={'figure.facecolor':'white'})

In [21]:
import prepare_models_for_analysis

# Settings of the pretrained model, as returned by the project helper module
(model_name,
 site_names,
 site_ids_tr,
 idp_ids) = custom.pretrained_ini()

# --- where things are ---
# project root and the folder the fitted models are written to
main_dir = '/home/barbora/Documents/Projects/Normative_Models/COINS'
models_dir = '/home/barbora/Documents/Projects/Normative_Models/COINS/models'
# input tables and the pretrained reference model
fsdata_dir = '/home/barbora/Documents/Projects/Normative_Models/COINS/backup'
pretrained_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/braincharts/models/lifespan_57K_82sites'
# figures are stored in a sub-folder of the models folder
images_dir = os.path.join(models_dir, 'img')

# make sure both output folders exist before anything is written
for out_folder in (models_dir, images_dir):
    os.makedirs(out_folder, exist_ok=True)

# model outputs go straight into the models folder
out_dir = models_dir


Load FS info
-------------

In [3]:
# Visit-1 thickness table (semicolon separated)
v1_fs_path = os.path.join(fsdata_dir, 'fit_external_long_thickness_1.txt')
v1_fs = pd.read_csv(v1_fs_path, sep=';')

# Subject key: the part of `id` after the last '.', minus its first two characters
v1_fs.index = v1_fs['id'].map(lambda raw_id: raw_id.split('.')[-1][2:]).tolist()

# The raw id column is no longer needed; order the rows by subject key
v1_fs = v1_fs.drop(columns='id').sort_index()

In [4]:
# Visit-2 thickness table (semicolon separated)
v2_fs_path = os.path.join(fsdata_dir, 'fit_external_long_thickness_2.txt')
v2_fs = pd.read_csv(v2_fs_path, sep=';')

# Subject key: the part of `id` after the last '.', minus its first two characters
v2_fs.index = v2_fs['id'].map(lambda raw_id: raw_id.split('.')[-1][2:]).tolist()

# The raw id column is no longer needed; order the rows by subject key
v2_fs = v2_fs.drop(columns='id').sort_index()

In [5]:
# Sanity check: both visits must list exactly the same subjects in the same order,
# because later cells drop rows from and index into both tables with one set of labels.
# Index.equals also copes with tables of different length (an elementwise `==`
# would raise there), and a mismatch now stops the notebook instead of passing silently.
if not v1_fs.index.equals(v2_fs.index):
    raise ValueError('Indices of v1_fs and v2_fs differ; the two visits are not in sync.')
print('Indicies acoss the two dataframes are in sync!')

Indicies acoss the two dataframes are in sync!


Load clinics
-----------

In [142]:
# Phenotypic table of the CoRR sample
corr_clin_path = os.path.join(fsdata_dir, 'CoRR_AggregatedPhenotypicData.csv')
corr_clin = pd.read_csv(corr_clin_path)

# Use the subject id, converted to a string, as the row label and order the rows by it
pk = corr_clin['SUBID'].map(str).tolist()
corr_clin.index = pk
corr_clin = corr_clin.sort_index()

# Harmonise the covariate column names with the ones used in the rest of the notebook
corr_clin = corr_clin.rename(columns={'SEX':'sex', 'AGE_AT_SCAN_1':'age', 'SITE':'site'})

In [143]:
# delete subjects that were not processed (i.e. have no row in v1_fs)
del_from_clin = list(corr_clin.index.difference(v1_fs.index))
corr_clin = corr_clin.drop(index=del_from_clin)

# keep the baseline session only; .copy() makes the result an independent frame
# so the column assignments below do not trigger SettingWithCopyWarning
corr_clin = corr_clin[corr_clin['SESSION']=='Baseline'].copy()

# relabel SEX: code 1 -> 0 and code 2 -> 1 (original note: 0-females; 1-males).
# The column is converted to numbers first, so the recode works whether the CSV
# parser delivered the codes as strings or as integers (a string-keyed replace
# silently does nothing on a numeric column).
corr_clin['sex'] = pd.to_numeric(corr_clin['sex']).astype(int).replace({1: 0, 2: 1})

# change age to number
corr_clin['age'] = pd.to_numeric(corr_clin['age'])

In [144]:
# create sitenum for distinct sites: sites are numbered 1001, 1002, ... in the
# (sorted) order in which groupby lists them
sites = corr_clin['site'].unique()
df_sites = corr_clin.groupby('site').agg('nunique')
# size the range from the grouped table itself so the lengths always match
df_sites['sitenum'] = range(1001, 1001 + len(df_sites))
dict_sites = df_sites['sitenum'].to_dict()
# map() assigns a fresh column; calling replace(..., inplace=True) on the selected
# column is chained assignment and does not update the frame under pandas Copy-on-Write
corr_clin['sitenum'] = corr_clin['site'].map(dict_sites)

# Delete sites with less than 20 subjects
min_site_size = 20
sites_pivot = corr_clin.groupby('site').agg('count')['SUBID']
sites_small = sites_pivot[sites_pivot < min_site_size].index
corr_clin = corr_clin[~corr_clin['site'].isin(sites_small)].copy()

# extract a list of unique site ids from the test set
site_ids_te = sorted(set(corr_clin['site'].to_list()))

In [10]:
# subjects present in the FS tables but no longer in the clinical table
del_from_fs = list(v1_fs.index.difference(corr_clin.index))

# remove them from both visits
v1_fs = v1_fs.drop(index=del_from_fs)
v2_fs = v2_fs.drop(index=del_from_fs)

# append the covariate and site columns to both visits, keeping shared subjects only
clin_cols = ['sex', 'age', 'site', 'sitenum']
clin_part = corr_clin[clin_cols]
v1_fs = pd.concat([v1_fs, clin_part], axis=1, join='inner')
v2_fs = pd.concat([v2_fs, clin_part], axis=1, join='inner')

**Run Normative models**
-------------------------

In [101]:
from pcntoolkit.normative import estimate, predict, evaluate
from pcntoolkit.util.utils import compute_MSLL, create_design_matrix

####
# Getting a pretrained model
# The helper returns four values that are unpacked into the model name, the site
# names, the site ids used for training and the list of IDP ids.
# NOTE(review): the same call already appears in the path-configuration cell near
# the top of the notebook; this cell repeats it.
# ###
model_name, site_names, site_ids_tr, idp_ids = custom.pretrained_ini()

In [171]:
# train_test_split - split within every site: 30% of a site's subjects go to the
# adaptation set (index_split_ad), the remaining 70% to the test set (index_split_te)
from sklearn.model_selection import train_test_split

# Every site now uses the same fraction and the same seeded shuffle. Before, only
# the first site was split 70/30 (without shuffling) while all other sites fell
# back to the 75/25 default.
adapt_fraction = 0.3
split_seed = 42

parts_ad = []
parts_te = []
for isite in site_ids_te:
    site_spec_ind = corr_clin.index[corr_clin['site'] == isite]
    te_part, ad_part = train_test_split(site_spec_ind,
                                        test_size=adapt_fraction,
                                        shuffle=True,
                                        random_state=split_seed)
    parts_te.append(pd.Index(te_part))
    parts_ad.append(pd.Index(ad_part))

# Join the per-site pieces once; starting from an empty slice of the real index
# keeps the dtype and still yields a valid (empty) index when there are no sites
index_split_ad = corr_clin.index[:0].append(parts_ad)
index_split_te = corr_clin.index[:0].append(parts_te)


**Set `which_visit` in the next cell to 1 and then to 2, re-running that cell and the model-fitting cell once per visit**

In [175]:
###
# Configure covariates
###
# which data columns do we wish to use as covariates?
cols_cov = ['age','sex']

# limits for cubic B-spline basis
xmin = -5
xmax = 110

# Absolute Z threshold above which a sample is considered to be an outlier (without fitting any model)
outlier_thresh = 7

# which visit to analyze? (1 or 2)
which_visit = 2

# Pick the correct dataset for modelling. Anything other than 1 or 2 is rejected,
# because the model-fitting cell builds its folder name from which_visit and would
# otherwise look in a different folder than the one written here.
fs_by_visit = {1: v1_fs, 2: v2_fs}
if which_visit not in fs_by_visit:
    raise ValueError('which_visit must be 1 or 2, got ' + repr(which_visit))

# the visit folder has to exist before the tables can be saved into it
visit_dir = os.path.join(models_dir, 'V' + str(which_visit))
os.makedirs(visit_dir, exist_ok=True)

df_ad = fs_by_visit[which_visit].loc[index_split_ad]
df_te = fs_by_visit[which_visit].loc[index_split_te]
df_ad.to_csv(os.path.join(visit_dir, 'df_ad.csv'), sep=' ', index=True)
df_te.to_csv(os.path.join(visit_dir, 'df_te.csv'), sep=' ', index=True)

In [176]:
# Running the models
#
# For every IDP: write the test responses and design matrix to the IDP's output
# folder, then either predict directly with the pretrained model (all test sites
# known to it) or adapt the pretrained model to the new sites using the
# adaptation subjects before predicting.

# The site check does not depend on the IDP, so it is evaluated once up front
all_sites_known = set(site_ids_te).issubset(site_ids_tr)

for idp_num, idp in enumerate(idp_ids):
    print('Running IDP', idp_num, idp, ':')
    idp_dir = os.path.join(out_dir, 'V'+str(which_visit), idp)

    os.makedirs(idp_dir, exist_ok=True)
    # NOTE(review): the working directory is switched to the IDP folder,
    # presumably because predict() writes its outputs there - confirm
    os.chdir(idp_dir)

    # the pretrained model for this IDP; both branches below load it from here
    # (the direct-prediction branch used to point at the output folder instead)
    idp_model_path = os.path.join(pretrained_dir, idp, 'Models')

    # extract and save the response variables for the test set
    y_te = df_te[idp].to_numpy()
    resp_file_te = os.path.join(idp_dir, 'resp_te.txt')
    np.savetxt(resp_file_te, y_te)

    # configure and save the design matrix
    cov_file_te = os.path.join(idp_dir, 'cov_bspline_te.txt')
    X_te = create_design_matrix(df_te[cols_cov],
                                site_ids = df_te['site'],
                                all_sites = site_ids_tr,
                                basis = 'bspline',
                                xmin = xmin,
                                xmax = xmax)
    np.savetxt(cov_file_te, X_te)

    # check whether all sites in the test set are represented in the training set
    if all_sites_known:
        print('All sites are present in the training data')

        # just make predictions
        yhat_te, s2_te, Z = predict(cov_file_te,
                                    alg='blr',
                                    respfile=resp_file_te,
                                    model_path=idp_model_path)
    else:
        print('Some sites missing from the training data. Adapting model')

        # save the covariates for the adaptation data
        X_ad = create_design_matrix(df_ad[cols_cov],
                                    site_ids = df_ad['site'],
                                    all_sites = site_ids_tr,
                                    basis = 'bspline',
                                    xmin = xmin,
                                    xmax = xmax)
        cov_file_ad = os.path.join(idp_dir, 'cov_bspline_ad.txt')
        np.savetxt(cov_file_ad, X_ad)

        # save the responses for the adaptation data
        resp_file_ad = os.path.join(idp_dir, 'resp_ad.txt')
        y_ad = df_ad[idp].to_numpy()
        np.savetxt(resp_file_ad, y_ad)

        # save the site ids for the adaptation data
        sitenum_file_ad = os.path.join(idp_dir, 'sitenum_ad.txt')
        site_num_ad = df_ad['sitenum'].to_numpy(dtype=int)
        np.savetxt(sitenum_file_ad, site_num_ad)

        # save the site ids for the test data
        sitenum_file_te = os.path.join(idp_dir, 'sitenum_te.txt')
        site_num_te = df_te['sitenum'].to_numpy(dtype=int)
        np.savetxt(sitenum_file_te, site_num_te)

        yhat_te, s2_te, Z = predict(cov_file_te,
                                    alg = 'blr',
                                    respfile = resp_file_te,
                                    model_path = idp_model_path,
                                    adaptrespfile = resp_file_ad,
                                    adaptcovfile = cov_file_ad,
                                    adaptvargroupfile = sitenum_file_ad,
                                    testvargroupfile = sitenum_file_te)

Running IDP 0 lh_G&S_frontomargin_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 1 lh_G&S_occipital_inf_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 2 lh_G&S_paracentral_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 3 lh_G&S_subcentral_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 4 lh_G&S_transv_frontopol_thickness :
Some sites missing from the training data. Adapt

Analysis
--------

**Load the values across idps and visits and compare across sites**

In [177]:
# Collect the per-IDP 'Z_predict.txt' outputs of each visit via the project helper;
# its return value is passed to read_csv below, i.e. it is used as the location
# of the combined table
v1_z_file = custom.idp_concat(os.path.join(models_dir, 'V1'), 'Z_predict.txt',
                              idp_ids, 'CORR_V1_Z.csv', t_dir=models_dir)
v2_z_file = custom.idp_concat(os.path.join(models_dir, 'V2'), 'Z_predict.txt',
                              idp_ids, 'CORR_V2_Z.csv', t_dir=models_dir)

# Read the combined (space separated) tables back in, one frame per visit
v1_z = pd.read_csv(v1_z_file, sep=' ')
v2_z = pd.read_csv(v2_z_file, sep=' ')

In [178]:
# test-retest data: the test-set tables saved for each visit
v1_orig_file = os.path.join(models_dir, 'V1', 'df_te.csv')
v2_orig_file = os.path.join(models_dir, 'V2', 'df_te.csv')
v1_orig = pd.read_csv(v1_orig_file, sep=' ', index_col=0)
v2_orig = pd.read_csv(v2_orig_file, sep=' ', index_col=0)

# ESO controls data, one table per visit, stored in the ESO analysis folder
eso_data_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/analyses/01_PANSS/data'
v1_cont = pd.read_csv(os.path.join(eso_data_dir, 'v1_cont.txt'), sep=' ', index_col=0)
v2_cont = pd.read_csv(os.path.join(eso_data_dir, 'v2_cont.txt'), sep=' ', index_col=0)

In [179]:
# The z-score tables carry no subject labels of their own, so each visit takes
# the index and the site/age/sex columns of the test-set table of the SAME visit
# (visit 2 used to copy these columns from the visit-1 table).
meta_cols = ['site', 'age', 'sex']

v1_z.index = v1_orig.index
for meta_col in meta_cols:
    v1_z[meta_col] = v1_orig[meta_col]

v2_z.index = v2_orig.index
for meta_col in meta_cols:
    v2_z[meta_col] = v2_orig[meta_col]

# adding ESO data: stack the control subjects below the test-retest subjects
keep_cols = idp_ids + cols_cov + ['site']
v1_z_all = pd.concat([v1_z[keep_cols], v1_cont[keep_cols]])
v2_z_all = pd.concat([v2_z[keep_cols], v2_cont[keep_cols]])

# diff: visit 2 minus visit 1 for every IDP; pandas aligns the rows on the subject label
diff = v2_z_all[idp_ids] - v1_z_all[idp_ids]
diff['site'] = v1_z_all['site']
diff['sex'] = v1_z_all['sex']

In [207]:
# Plotting differences between visits across sites with respect to sex
#
# One figure per IDP: the upper panel shows the distribution of the visit
# difference per site split by sex, the lower panel the number of rows per site.

FM_colors = ["coral", "darkseagreen"]
# color_palette() returns the palette object; set_palette() only installs it as the
# default and returns None, so its result cannot be passed on as `palette`
cp = sns.color_palette(FM_colors)
sns.set_palette(cp)


for idp in idp_ids:
    fig, ax = plt.subplots(2, 1, figsize=(15, 6))

    sns.boxplot(x="site", y=idp, data=diff, hue='sex', palette=cp, ax=ax[0])
    ax[0].set(ylim=(-3, 3))
    # reference line at "no change between visits"
    ax[0].axhline(0, linewidth=3, color='#94B0DA')
    # NOTE(review): vertical separator after the 15th site category - presumably
    # the boundary between the two datasets; confirm when the site list changes
    ax[0].axvline(14.5, linewidth=3, color='#94B0DA', linestyle='--')

    # draw explicitly on the lower panel instead of relying on the current axes
    sns.countplot(data=diff, x='site', ax=ax[1])
    fig.suptitle(idp)

    fig.savefig(os.path.join(models_dir, 'img', idp), bbox_inches='tight')
    # close this figure explicitly so figures do not pile up in memory
    plt.close(fig)