In [1]:
# load packages
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import pcntoolkit as pcn
import pickle

from scipy.stats import shapiro

from enigmatoolbox.utils.parcellation import parcel_to_surface
from enigmatoolbox.plotting import plot_cortical, plot_subcortical
from enigmatoolbox.utils.useful import reorder_sctx

from pcntoolkit.normative import estimate, predict, evaluate
from pcntoolkit.util.utils import compute_MSLL, create_design_matrix

# Local helper modules are imported right after switching the working
# directory to the folder that holds them - presumably so they resolve from
# the current directory (TODO confirm; a sys.path entry would avoid changing
# global state). The order of the chdir/import statements therefore matters.
ext_scripts_dir = ('/home/barbora/Documents/Projects/Normative_Models/ESO/braincharts/scripts')
os.chdir(ext_scripts_dir)

# helpers imported while the braincharts scripts folder is the working directory
from nm_utils import remove_bad_subjects, load_2d

# after this chdir the project's code folder is the working directory
code_dir = ('/home/barbora/Documents/Projects/Normative_Models/ESO/code')
os.chdir(code_dir)

# importing custom functions
from clinics_desc_functions import prepare_data, plot_quality, trajectory_plotting, dk_roi_viz, load_clinics, en_qc, pretrained_adapt_small, set_seed

# set seed (called with its default arguments; the recorded cell output reports seed 42)
set_seed()

# formatting
%matplotlib inline
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}



Random seed 42 has been set.


In [2]:
# Project locations: inputs (clinical tables, FreeSurfer stats, backups,
# pretrained braincharts models) and outputs (models folder + its img subfolder).
main_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO'
cdata_dir = '/home/barbora/Documents/Projects/2021_06_AZV_ESO/data'
fsdata_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/fs_stats'
bdata_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/backup'
pretrained_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/braincharts'

# output folders for this analysis
models_dir = '/home/barbora/Documents/Projects/Normative_Models/ESO/models/adaptation_comparison_paired'
images_dir = os.path.join(models_dir, 'img')

# make sure the output folders exist (parent first, then the image subfolder)
for output_dir in (models_dir, images_dir):
    os.makedirs(output_dir, exist_ok=True)

In [6]:
####
# Getting a pretrained model
####
model_name = 'lifespan_57K_82sites'
site_names = 'site_ids_82sites.txt'

# all id lists live in the 'docs' folder of the pretrained models
docs_dir = os.path.join(pretrained_dir, 'docs')

# site ids of this model, one per line. This must match the training data
with open(os.path.join(docs_dir, site_names)) as site_file:
    site_ids_tr = site_file.read().splitlines()

# idp lists for the left hemisphere, the right hemisphere and subcortical regions
with open(os.path.join(docs_dir, 'phenotypes_lh.txt')) as lh_file:
    idp_ids_lh = lh_file.read().splitlines()
with open(os.path.join(docs_dir, 'phenotypes_rh.txt')) as rh_file:
    idp_ids_rh = rh_file.read().splitlines()
with open(os.path.join(docs_dir, 'phenotypes_sc.txt')) as sc_file:
    idp_ids_sc = sc_file.read().splitlines()

# all idps are processed ...
idp_ids = [*idp_ids_lh, *idp_ids_rh, *idp_ids_sc]

# ... except the two mean-thickness variables our dataset currently misses
# (to be extracted from a2009s); list.remove raises if one is not in the list
for missing_idp in ('lh_MeanThickness_thickness', 'rh_MeanThickness_thickness'):
    idp_ids.remove(missing_idp)


In [5]:
###
# Load clinics and run the Euler-number quality control
# NOTE: the next two cells repeat exactly these statements (loading, then QC).
###

# visit 1: clinical description joined (inner, on the index) with the thickness table
c_v1 = load_clinics(pd.read_excel(os.path.join(cdata_dir, 'visit1_desc.xlsx')))
d_v1 = pd.read_csv(os.path.join(bdata_dir, 'fit_external_thickness_1.txt'), sep=';', index_col=0)
v1_concat = pd.concat([c_v1, d_v1], axis=1, join="inner")
v1_concat["sitenum"] = 1000  # every subject gets the same site number

# visit 2: same steps on the second-visit files
c_v2 = load_clinics(pd.read_excel(os.path.join(cdata_dir, 'visit2_desc.xlsx')))
d_v2 = pd.read_csv(os.path.join(bdata_dir, 'fit_external_thickness_2.txt'), sep=';', index_col=0)
v2_concat = pd.concat([c_v2, d_v2], axis=1, join="inner")
v2_concat["sitenum"] = 1000

# Quality control based on Euler Number; figures are saved to images_dir, not shown
save_img, img_dir, show_img = True, images_dir, False
qc_kwargs = {'save_img': save_img, 'img_dir': img_dir, 'show_img': show_img}
v1_clean = en_qc(v1_concat, **qc_kwargs)
v2_clean = en_qc(v2_concat, **qc_kwargs)

In [6]:
# Build one table per visit: clinical description + cortical thickness,
# keeping only subjects present in both sources (inner join on the index).

# first visit
v1_clinics_path = os.path.join(cdata_dir, 'visit1_desc.xlsx')
v1_thickness_path = os.path.join(bdata_dir, 'fit_external_thickness_1.txt')
c_v1 = load_clinics(pd.read_excel(v1_clinics_path))
d_v1 = pd.read_csv(v1_thickness_path, delimiter=';', index_col=0)
v1_concat = pd.concat([c_v1, d_v1], axis=1, join="inner").assign(sitenum=1000)

# second visit
v2_clinics_path = os.path.join(cdata_dir, 'visit2_desc.xlsx')
v2_thickness_path = os.path.join(bdata_dir, 'fit_external_thickness_2.txt')
c_v2 = load_clinics(pd.read_excel(v2_clinics_path))
d_v2 = pd.read_csv(v2_thickness_path, delimiter=';', index_col=0)
v2_concat = pd.concat([c_v2, d_v2], axis=1, join="inner").assign(sitenum=1000)

In [34]:
# Quality control based on Euler Number (figures saved to images_dir, not displayed)
save_img, img_dir, show_img = True, images_dir, False
qc_options = dict(save_img=save_img, img_dir=img_dir, show_img=show_img)
v1_clean = en_qc(v1_concat, **qc_options)
v2_clean = en_qc(v2_concat, **qc_options)

# remove visit from index: drop the last two characters of every label so
# that the same subject carries the same id in both visits
v1_clean.index = v1_clean.index.str[:-2].to_numpy()
v2_clean.index = v2_clean.index.str[:-2].to_numpy()

# order both tables by subject id
v1_clean = v1_clean.sort_index()
v2_clean = v2_clean.sort_index()

In [35]:
# Split each visit into patients and controls using the 'Category' column
v1_is_patient = v1_clean['Category'] == 'Patient'
v1_is_control = v1_clean['Category'] == 'Control'
v1_pat = v1_clean.loc[v1_is_patient]
v1_cont = v1_clean.loc[v1_is_control]

v2_is_patient = v2_clean['Category'] == 'Patient'
v2_is_control = v2_clean['Category'] == 'Control'
v2_pat = v2_clean.loc[v2_is_patient]
v2_cont = v2_clean.loc[v2_is_control]

# (disabled) random 50/50 split of the controls into training and adaptation sets
#v1_cont_train = v1_cont.sample(frac = 0.5)
#v1_cont_ad = v1_cont.drop(v1_cont_train.index)
#v2_cont_train = v2_cont.sample(frac = 0.5)
#v2_cont_ad = v2_cont.drop(v2_cont_train.index)

# sorted list of the unique site ids found among first-visit patients
site_ids_te = sorted({*v1_pat['site'].tolist()})

In [164]:
# Controls scanned at both visits - these are the people we loop over
common = v1_cont.index.intersection(v2_cont.index)

# membership of each control in `common`, computed once per visit
v1_in_common = v1_cont.index.isin(common)
v2_in_common = v2_cont.index.isin(common)

# positional row numbers of common controls with sex == 0 ...
v1_common_f_id = np.flatnonzero((v1_cont['sex'] == 0) & v1_in_common)
v2_common_f_id = np.flatnonzero((v2_cont['sex'] == 0) & v2_in_common)

# ... and with sex == 1
v1_common_m_id = np.flatnonzero((v1_cont['sex'] == 1) & v1_in_common)
v2_common_m_id = np.flatnonzero((v2_cont['sex'] == 1) & v2_in_common)

# the matching index labels (subject ids)
v1_common_f = v1_cont.index[v1_common_f_id]
v2_common_f = v2_cont.index[v2_common_f_id]
v1_common_m = v1_cont.index[v1_common_m_id]
v2_common_m = v2_cont.index[v2_common_m_id]

Running IDP 0 lh_G&S_frontomargin_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 1 lh_G&S_occipital_inf_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 2 lh_G&S_paracentral_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 3 lh_G&S_subcentral_thickness :
Some sites missing from the training data. Adapting model
Loading data ...
Prediction by model  1 of 1
Evaluating the model ...
Evaluations Writing outputs ...
Writing outputs ...
Running IDP 4 lh_G&S_transv_frontopol_thickness :
Some sites missing from the training data. Adapt

In [169]:
###
# Running models
# leave one out crossvalidation for all subjects that have both visits
#
# For every visit used for adaptation (V1, V2) and every idp:
#   * each subject in `common` is held out in turn,
#   * the pretrained model is adapted on the remaining controls of that visit,
#   * the held-out subject's V1 and V2 rows are predicted together,
#   * yhat / ys2 / Z for both rows and six quality metrics are collected and
#     written to <models_dir>/V<visit>/<idp>/.
###

qm = ['EXPV_predict.txt', 'MSLL_predict.txt', 'pRho_predict.txt', 'Rho_predict.txt',  'RMSE_predict.txt',  'SMSE_predict.txt']
qm_colnames = ['EXPV', 'MSLL', 'pRho', 'Rho', 'RMSE', 'SMSE']

# controls used for adaptation, keyed by visit number (replaces the if/elif on
# ivisit, which left df_ad unbound for any other visit value)
adaptation_controls = {1: v1_cont, 2: v2_cont}
n_common = len(common)

for ivisit, cont_ad in adaptation_controls.items():
    for idp in idp_ids: #['lh_G&S_paracentral_thickness']:
        print(idp)

        # per-subject results for this idp; every slot is filled in the loop below
        v1_yhat, v2_yhat = np.empty(n_common), np.empty(n_common)
        v1_ys2, v2_ys2 = np.empty(n_common), np.empty(n_common)
        v1_Z, v2_Z = np.empty(n_common), np.empty(n_common)
        quality = np.empty([n_common, len(qm)])

        # create folder structure; model_name is used instead of repeating the
        # model folder literal so the two cannot drift apart
        idp_dir = os.path.join(pretrained_dir, 'models', model_name, idp)
        idp_cv_dir = os.path.join(models_dir, 'V'+str(ivisit), idp)
        os.makedirs(idp_cv_dir, exist_ok=True)
        # kept from the original: pretrained_adapt_small may rely on the working
        # directory (TODO confirm); all reads/renames below use explicit paths
        os.chdir(idp_cv_dir)

        for icont, subject in enumerate(common): # we are only running the cross-validation for subjects with both visits

            # adaptation set: this visit's controls without the held-out subject
            df_ad = cont_ad.drop(index=subject)

            # test set: the held-out subject's rows from V1 and V2, in that order
            df_te = pd.concat([v1_cont[v1_cont.index == subject], v2_cont[v2_cont.index == subject]])

            pretrained_adapt_small(idp, site_ids_tr, site_ids_te, pretrained_dir, idp_cv_dir, idp_dir, df_ad, df_te)

            # add data to arrays; each prediction file is unpacked into the V1 and V2 value
            v1_yhat[icont], v2_yhat[icont] = np.genfromtxt(os.path.join(idp_cv_dir,'yhat_predict.txt'), delimiter=' ')
            v1_ys2[icont], v2_ys2[icont] = np.genfromtxt(os.path.join(idp_cv_dir,'ys2_predict.txt'), delimiter=' ')
            v1_Z[icont], v2_Z[icont] = np.genfromtxt(os.path.join(idp_cv_dir,'Z_predict.txt'), delimiter=' ')
            # one scalar per quality file, read from idp_cv_dir explicitly rather
            # than through the current working directory
            quality[icont] = [np.genfromtxt(os.path.join(idp_cv_dir, metric_file)).reshape(1)[0] for metric_file in qm]

        # write
        np.savetxt(os.path.join(idp_cv_dir, 'v1_yhat.txt'), v1_yhat)
        np.savetxt(os.path.join(idp_cv_dir, 'v2_yhat.txt'), v2_yhat)
        np.savetxt(os.path.join(idp_cv_dir, 'v1_ys2.txt'), v1_ys2)
        np.savetxt(os.path.join(idp_cv_dir, 'v2_ys2.txt'), v2_ys2)
        np.savetxt(os.path.join(idp_cv_dir, 'v1_Z.txt'), v1_Z)
        np.savetxt(os.path.join(idp_cv_dir, 'v2_Z.txt'), v2_Z)

        df_quality = pd.DataFrame(quality, columns=qm_colnames)
        df_quality.to_csv(os.path.join(idp_cv_dir, 'quality.txt'), sep=' ', header=True, index=False)

        # we are only running this to have big design matrices ready for plotting
        # NOTE(review): both calls adapt on v1_cont, whichever visit is being
        # processed; only the test design matrix and response files are kept.
        pretrained_adapt_small(idp, site_ids_tr, site_ids_te, pretrained_dir, idp_cv_dir, idp_dir, v1_cont, v1_cont[v1_cont.index.isin(common)])
        os.replace(os.path.join(idp_cv_dir, 'cov_bspline_te.txt'), os.path.join(idp_cv_dir, 'v1_cov_bspline_te.txt'))
        os.replace(os.path.join(idp_cv_dir, 'resp_te.txt'), os.path.join(idp_cv_dir, 'v1_resp_te.txt'))

        pretrained_adapt_small(idp, site_ids_tr, site_ids_te, pretrained_dir, idp_cv_dir, idp_dir, v1_cont, v2_cont[v2_cont.index.isin(common)])
        os.replace(os.path.join(idp_cv_dir, 'cov_bspline_te.txt'), os.path.join(idp_cv_dir, 'v2_cov_bspline_te.txt'))
        os.replace(os.path.join(idp_cv_dir, 'resp_te.txt'), os.path.join(idp_cv_dir, 'v2_resp_te.txt'))


In [40]:
###
# Correlate Z scores between V1 and V2
# For each idp, compare the V2 z-scores obtained after adapting on V1 with
# those obtained after adapting on V2.
###
from scipy import stats
from scipy import spatial
from scipy.stats import spearmanr

method_setup = 1 # adapt on V1 and V2 separately
v1_dir = os.path.join(models_dir, 'V'+str(1))
v2_dir = os.path.join(models_dir, 'V'+str(2))

# one row per idp: cosine similarity, Spearman correlation and its p-value
colnames = ['cosine_similarity', 'R', 'pR']
results = np.empty([len(idp_ids), 3])

for n_idp, idp in enumerate(idp_ids):#enumerate(['lh_G&S_frontomargin_thickness']):

    # z-scores of the second visit, written by the runs adapted on V1 and on V2
    z_adapt_v1 = np.genfromtxt(os.path.join(v1_dir, idp, 'v2_Z.txt'), delimiter=' ')
    z_adapt_v2 = np.genfromtxt(os.path.join(v2_dir, idp, 'v2_Z.txt'), delimiter=' ')

    # scipy returns the cosine distance, hence 1 - distance
    cosine_sim = 1 - spatial.distance.cosine(z_adapt_v1, z_adapt_v2)
    rho, rho_pvalue = spearmanr(z_adapt_v1, z_adapt_v2)

    results[n_idp] = (cosine_sim, rho, rho_pvalue)

df_results = pd.DataFrame(results, columns=colnames, index=idp_ids)

df_results.to_csv(os.path.join(models_dir, 'v2_prediction_similarity.csv'), sep=' ', header=True, index=True)


In [58]:
# Two side-by-side histograms over idps: cosine similarity (left) and
# Spearman R (right). The figure is saved to models_dir and closed, so
# nothing is displayed inline.
sns.set_style("white")

fig, axes = plt.subplots(1, 2, sharex=False, figsize=(10, 5))
fig.suptitle('LOO CV - similarity of prediction on controls between adaptation on V1 and V2')

# (column of df_results, panel title, bar colour) for each panel, left to right
panels = [
    ("cosine_similarity", 'Cosine similarity', 'teal'),
    ("R", 'R', 'tomato'),
]
for panel_ax, (column, panel_title, bar_colour) in zip(axes, panels):
    panel_ax.set_title(panel_title)
    sns.histplot(data=df_results, x=column, alpha=.6, linewidth=1.5, ax=panel_ax, color=bar_colour)

sns.despine()
fig.savefig(os.path.join(models_dir, 'v2_fit.png'))
plt.close(fig)

In [46]:
# display the Spearman p-value column (one value per idp)
df_results['pR']

lh_G&S_frontomargin_thickness        1.088497e-110
lh_G&S_occipital_inf_thickness       6.365639e-138
lh_G&S_paracentral_thickness         1.853630e-148
lh_G&S_subcentral_thickness          5.383316e-124
lh_G&S_transv_frontopol_thickness    5.181537e-112
                                         ...      
SubCortGrayVol                       5.383316e-124
TotalGrayVol                         6.365639e-138
SupraTentorialVol                    1.853630e-148
SupraTentorialVolNotVent             1.853630e-148
EstimatedTotalIntraCranialVol        5.383316e-124
Name: pR, Length: 185, dtype: float64