In [2]:
import pandas as pd
import numpy as np
import nibabel as nib
import os
import scipy.stats as scp
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import itertools
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm
from itertools import combinations
import statsmodels.formula.api as smf

# Scalers used only by this preprocessing cell.
from sklearn.preprocessing import PowerTransformer, StandardScaler

# Global plotting style for every figure in the notebook.
sns.set(context='talk', style='white', font='Arial')

# Date stamp formatted as YYYYMMDD.
today = date.today().strftime('%Y%m%d')

# Project layout; the clinical output folder is created if it is missing.
project_dir = '/Users/catcamacho/Library/CloudStorage/Box-Box/CCP/HBN_study/'
data_dir = project_dir + 'proc/group/parcel_timeseries/sub_ts/'
out_dir = project_dir + 'proc/clin/'
os.makedirs(out_dir, exist_ok=True)

# --- Demographics: keep a single column, relabelled 'age', indexed by 'sub-<id>' ---
# NOTE(review): presumably BASIC1_003 is the participant ID and BASIC1_005 is age — confirm against the data dictionary.
demo_csv = os.path.join(project_dir, 'phenotypic_data', '9994_Basic_Demos_20210322.csv')
demo_data = pd.read_csv(demo_csv, skiprows=1, index_col='BASIC1_003')
demo_data = demo_data.loc[:, 'BASIC1_005'].to_frame()
demo_data.index = [f'sub-{sub_id}' for sub_id in demo_data.index]
demo_data.index.name = 'sub'
demo_data.columns = ['age']

# --- Clinical scores: re-index by 'sub-<id>' so the rows line up with demo_data ---
clin_csv = project_dir + 'phenotypic_data/full_sample_internqs_20210527.csv'
clin_data = pd.read_csv(clin_csv, index_col=0)
clin_data = clin_data.assign(sub=['sub-' + raw_id for raw_id in clin_data.index]).set_index('sub')

clinscores = ['MDD_mean', 'SocAnx_mean','CBCL_AD','CBCL_WD','CBCL_Int', 'MFQ_P_Total',
              'SCARED_P_SC','MFQ_SR_Total' ,'SCARED_SR_SC']

# Keep the score columns plus the last 17 columns of the file, after removing duplicate rows.
keep_cols = clinscores + clin_data.columns[-17:].to_list()
clin_data = clin_data.drop_duplicates().loc[:, keep_cols]

# The first two scores are z-scored; the remaining ones go through a power transform.
zscored_cols = clinscores[:2]
power_cols = clinscores[2:]
clin_data.loc[:, zscored_cols] = StandardScaler().fit_transform(clin_data.loc[:, zscored_cols])
clin_data.loc[:, power_cols] = PowerTransformer().fit_transform(clin_data.loc[:, power_cols])

#clin_data.to_csv(os.path.join(out_dir, 'depanx_scores_preproc_20220510.csv'))
# merge with age (left join keeps every clinical row even when age is missing)
clin_data = clin_data.merge(demo_data, how='left', left_index=True, right_index=True)
clin_data['age_std'] = StandardScaler().fit_transform(clin_data[['age']])

# Summary of the transformed scores (displayed as the cell output).
clin_data.loc[:, clinscores].describe()

Unnamed: 0,MDD_mean,SocAnx_mean,CBCL_AD,CBCL_WD,CBCL_Int,MFQ_P_Total,SCARED_P_SC,MFQ_SR_Total,SCARED_SR_SC
count,2840.0,2837.0,3257.0,3256.0,3257.0,2816.0,2973.0,2263.0,2324.0
mean,-0.000447,0.000462,-0.000222,-0.000963,-0.000508,-0.000978,0.00045,0.000233,0.000234
std,1.000202,1.000105,0.99998,1.000048,0.999809,0.999897,1.000414,0.999839,0.999816
min,-3.662724,-2.901837,-1.630222,-1.364322,-1.978256,-1.758914,-1.431764,-2.092768,-1.725568
25%,-0.412988,-0.441262,-0.937802,-1.364322,-0.659073,-0.596781,-0.759201,-0.736508,-0.846095
50%,0.160495,-0.13369,0.126053,0.093827,-0.004191,0.105426,0.011675,0.008687,0.029898
75%,0.542817,0.63524,0.746377,0.747555,0.752986,0.741494,0.895234,0.662152,0.708849
max,3.792553,2.634457,2.563155,2.253625,3.215135,2.657899,1.807714,2.596235,1.794867


In [None]:
# plot overall sxs correlations
# Pairwise correlation matrix of the clinical scores, drawn on an explicit 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
score_corr = clin_data[clinscores].corr()
sns.heatmap(score_corr, ax=ax)

In [None]:
# are the same kids identified as high anx/dep for parent and child?
# "High" here means at or above the 80th percentile of the finite scores,
# computed separately for the parent-report and self-report SCARED columns.
parent_scores = clin_data['SCARED_P_SC']
self_scores = clin_data['SCARED_SR_SC']

parent_cutoff = np.percentile(parent_scores[np.isfinite(parent_scores)], 80)
self_cutoff = np.percentile(self_scores[np.isfinite(self_scores)], 80)

highanxp = clin_data.loc[parent_scores >= parent_cutoff].index.to_list()
highanxsr = clin_data.loc[self_scores >= self_cutoff].index.to_list()

# Participants flagged by both informants, in parent-report order.
self_flagged = set(highanxsr)
overlap = [sub for sub in highanxp if sub in self_flagged]

# Group sizes: parent-flagged, self-flagged, flagged by both.
print(len(highanxp), len(highanxsr), len(overlap), sep='\n')

In [None]:
# Restrict to rows with no missing value in any column before repeating the comparison.
t = clin_data.dropna()

# are the same kids identified as high anx/dep for parent and child?
# "High" here means at or above the 80th percentile of the finite scores,
# computed separately for the parent-report and self-report SCARED columns.
parent_scores = t['SCARED_P_SC']
self_scores = t['SCARED_SR_SC']

parent_cutoff = np.percentile(parent_scores[np.isfinite(parent_scores)], 80)
self_cutoff = np.percentile(self_scores[np.isfinite(self_scores)], 80)

highanxp = t.loc[parent_scores >= parent_cutoff].index.to_list()
highanxsr = t.loc[self_scores >= self_cutoff].index.to_list()

# Participants flagged by both informants, in parent-report order.
self_flagged = set(highanxsr)
overlap = [sub for sub in highanxp if sub in self_flagged]

# Group sizes: parent-flagged, self-flagged, flagged by both.
print(len(highanxp), len(highanxsr), len(overlap), sep='\n')

In [3]:
# regress age from all dimensional measures and save
# For each measure, fit `measure ~ age_std` and store the residuals (the part of
# the score not linearly explained by age) in a new `<measure>_regage` column.
# The measure must be the dependent variable: with the formula the other way
# round, the residuals would be age adjusted for the measure instead.
for a in ['MDD_mean','SocAnx_mean','MFQ_P_Total','SCARED_P_SC','MFQ_SR_Total','SCARED_SR_SC']:
    print(a)
    resid_col = '{0}_regage'.format(a)
    # Rows with a missing score or age are dropped by the formula interface,
    # so the residual series only covers the rows that entered the fit.
    res = smf.ols('{0} ~ age_std'.format(a), data=clin_data).fit()
    resids = res.resid.to_frame()
    resids.columns = [resid_col]
    # Remove any column left over from a previous run of this cell; otherwise the
    # merge would produce `_x`/`_y` suffixed duplicates instead of replacing it.
    clin_data = clin_data.drop(columns=[resid_col], errors='ignore')
    # Left join on the index keeps every row; rows excluded from the fit get NaN.
    clin_data = clin_data.merge(resids, how='left', left_index=True, right_index=True)
#clin_data.to_csv(os.path.join(out_dir, 'depanx_scores_preproc_20220518.csv'))

MDD_mean
SocAnx_mean
MFQ_P_Total
SCARED_P_SC


In [None]:
# Pairwise Pearson correlations across the columns of the merged table.
clin_data.corr(method='pearson')

In [None]:
# For every clinical score: print Pearson r and p against age, then draw a
# scatter plot with a fitted regression line.
for clin in clinscores:
    # Keep only rows where both age and the score are finite numbers.
    has_both = np.isfinite(clin_data['age']) & np.isfinite(clin_data[clin])
    t = clin_data.loc[has_both, :]

    r, p = scp.pearsonr(t['age'], t[clin])
    print(round(r, 2), round(p, 3))

    sns.lmplot(
        x='age',
        y=clin,
        data=t,
        line_kws={'lw': 4, 'color': 'k'},
        scatter_kws={'alpha': 0.5, 'color': 'gray'},
    )
    plt.show()
    # Release the figure so repeated plots do not accumulate in memory.
    plt.close()