In [None]:
# Import packages
import numpy as np
import pandas as pd
from glob import glob
import nibabel as nib
from nilearn import plotting, datasets, image
from scipy.stats import pearsonr, zscore
import statsmodels.api as sm
import seaborn as sns
import os
import copy
import matplotlib.pyplot as plt
from datetime import date
# Date stamp that is formatted into the output CSV filename at the end of the notebook
today = date.today()

# Raise the pandas display limits so long/wide frames are not truncated when shown
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 999)

In [None]:
# Set paths and variables
# Root locations; the derived paths below are built from these
home = '/gpfs/milgram/pi/gee_dylan/candlab/data'
datapath = '/gpfs/milgram/pi/gee_dylan/candlab/analyses/shapes/shapes_phenotyping'
fslpath = '/home/tjk33/project/SHAPES_task_act/out'
newri = '/gpfs/milgram/pi/gee_dylan/lms233/RI_Data/coded_output'

# Locations under `home` (recondata is a glob pattern, not a single directory)
hpcdata = '{}/mri/hcp_pipeline_preproc/shapes'.format(home)
taskfiles = '{}/behavioral/shapes/task_design_trialwise'.format(home)
recondata = '{}/mri/bids_recon/shapes/*/ses-shapesV1/func'.format(home)

# Folder the input spreadsheets are read from and the final CSV is written to
analysis = '{}/Analysis'.format(datapath)

In [None]:
# Written subject list for this analysis; subject IDs are read from its 'Record ID' column
subs = pd.read_csv(analysis + '/subjectlist_n=139_2024-01-29.csv')

# One row per unique ID, in sorted order. Iterating a set() of strings is not
# order-stable between kernel sessions, so sort explicitly to keep the row order
# (and everything merged onto it) reproducible.
sublist = (
    subs['Record ID']
    .drop_duplicates()
    .sort_values()
    .rename('Subject')
    .reset_index(drop=True)
    .to_frame()
)

### Compare scan data against RI data to check for missing subjects

In [None]:
# Import RI data
rigendate = '2023-04-03' #Validity check on data from Jan 27 

ri = pd.read_csv(newri + '/Cleaned_WIDE_all_endorsements_n=191_{}.csv'.format(rigendate), header = 0, index_col=0).set_index('ucla_a_id')

# Column labels are expected to look like "('endorse_any', 0.0)"; reduce them to "all_0.0".
# str.lstrip/str.rstrip strip *sets of characters*, not literal affixes, so the previous
# lstrip("('endorse_any', ") only worked because digits are not in that character set.
# Remove the exact prefix and the single closing parenthesis instead. Labels that do not
# carry the prefix keep their text (minus a trailing ')') and still receive 'all_'.
ri.columns = 'all_' + (
    ri.columns
    .str.replace(r"^\('endorse_any', ", '', regex=True)
    .str.replace(r"\)$", '', regex=True)
)
ri = ri.reset_index().rename(columns = {'ucla_a_id':'Subject'})
ri['Subject'] = 'sub-' + ri['Subject']  # match the 'sub-XXXX' IDs used by the other frames
ri = ri.drop([0], axis=0).reset_index(drop=True)  # discard the row labelled 0, then renumber

In [None]:
# Subject IDs present in the RI table, and how many there are
ri_subs = ri.loc[:, 'Subject']
n_ri_subs = len(ri_subs)
print('{} subs have adult RI data'.format(n_ri_subs))

In [None]:
# Get subjects with shapes run 3 data (second testing run)
run3_suffix = '/MNINonLinear/Results/ses-shapesV1_task-shapes3_bold/ses-shapesV1_task-shapes3_bold_8dv.nii.gz'
shapes_subs_paths = glob(hpcdata + '/sub-*' + run3_suffix)
# Strip the directory prefix and the run-3 suffix so only the 'sub-XXXX' folder name is left.
# regex=False: these are literal paths, not patterns ('.' would otherwise act as a wildcard).
shapes_subs_full = (
    pd.Series(shapes_subs_paths)
    .str.replace(hpcdata + '/', '', regex=False)
    .str.replace(run3_suffix, '', regex=False)
)
print('{} subs have shapes scan data'.format(len(shapes_subs_full)))

#Merge with RI data to see who has both scan and RI data
has_both = pd.merge(shapes_subs_full.to_frame('Subject'), ri_subs, how='inner')
print('{} subs have both RI and scan data'.format(len(has_both)))

#See who processed (one 'sub-*' folder per subject under fslpath)
# regex=False is passed explicitly because the default of str.replace differs across pandas versions
proc_subs = pd.Series(glob(fslpath + '/sub-*'), name='Subject').str.replace(fslpath + '/', '', regex=False)
has_proc = pd.merge(pd.DataFrame(proc_subs), has_both)

# Identify subjects not currently included:
# compare the globbed scan+RI subjects with the written subject list
union = pd.Series(np.union1d(has_both['Subject'], sublist['Subject']))
# intersection of the series
intersect = pd.Series(np.intersect1d(has_both['Subject'], sublist['Subject']))
# elements found in only one of the two series
notcommonseries = union[~union.isin(intersect)]
# displaying the result
print('Not included in both globbed subjectlist and written subjectlist: \n', notcommonseries) 

# Same comparison between the globbed scan+RI subjects and those with FSL output.
# has_proc is an inner merge with has_both, so this lists subjects lacking FSL output.
union2 = pd.Series(np.union1d(has_both['Subject'], has_proc['Subject']))
# intersection of the series
intersect2 = pd.Series(np.intersect1d(has_both['Subject'], has_proc['Subject']))
# elements found in only one of the two series
notcommonseries = union2[~union2.isin(intersect2)]
print('Not included in both globbed subjectlist and FSL processing: \n', notcommonseries) 

## SEE SHAPES CODA TRACKER FOR DETAILED INFORMATION ON WHY EXCLUDED

### Create Dataset

In [None]:
# Import demographic data (age at ASR completion and sex at birth)
demo_raw = pd.read_csv(analysis + '/Demographics_3.9.22.csv',
                       header = 0).rename(columns = {'subj_id':'Subject', 
                                                     'branch_a_sex':'sex', 
                                                     'branch_a_gender':'gender',
                                                     'maca_a_3':'years_education',
                                                     'maca_a_9':'combined_income'})
# .copy() gives an independent frame; assigning into a bare column selection of demo_raw
# raises SettingWithCopyWarning and leaves it ambiguous which object is modified.
demo = demo_raw[["Subject", "sex", 'gender', 'asr_age', "years_education", 'combined_income']].copy()
# Replace the column outright (rather than writing through .loc) so introducing NaN into
# an integer-coded column does not go through an in-place dtype upcast.
demo['combined_income'] = demo['combined_income'].replace([10, 11], np.nan) #Replace don't know and decline to answer with NaN

# Read in age at scan
aas = pd.read_csv(analysis + '/age_at_scan_2024-04-11.csv', index_col=0)

#Diagnostic status
diag = pd.read_csv(analysis + '/DiagnosticStatus.csv', 
                   header = 0).rename(columns = {'record_id':'Subject', 'cc_group':'diagnostic_group'})

diag_only = diag[['Subject', 'diagnostic_group']]

demo_data = pd.merge(demo, diag_only, how = 'inner', on = 'Subject') #Retain subjects with demog and diag info
demo_data = pd.merge(demo_data, aas, on='Subject', how='inner') #...and with an age-at-scan record
demo_data['Subject'] = 'sub-' + demo_data['Subject'] # match the 'sub-XXXX' IDs used by the other frames

In [None]:
# Select only number of endorsements; drop average severity
# (takes the first 33 columns by position -- presumably Subject plus the endorsement
# counts; verify against the CSV layout). Missing counts are treated as 0.
ri_num_ends = ri.iloc[:, 0:33].set_index('Subject').replace(np.nan, 0.0)

# Inclusive column-label range summed for each bin
age_bins = {
    'Early_Childhood': ("all_0.0", "all_5.0"),
    'Mid_Childhood': ("all_6.0", "all_12.0"),
    'Adolescence': ("all_13.0", "all_17.0"),
    'Adulthood': ("all_18.0", "all_30.0"),
    'Total_Events': ("all_0.0", "all_999.0"),
}

# Code into bins: one row per subject, one row-wise sum per label range
ri_summed = pd.DataFrame(index = ri['Subject'])
for bin_name, (first_label, last_label) in age_bins.items():
    bin_block = ri_num_ends.loc[:, first_label:last_label].astype(float)
    ri_summed[bin_name] = np.nansum(bin_block, axis=1)

In [None]:
def regress_behav_covariates(df, behav_df, covariates=('age_at_ri_z', 'sex')):
    """Regress covariates out of every column of ``behav_df`` and return the residuals.

    One zero-inflated negative binomial model (statsmodels
    ``ZeroInflatedNegativeBinomialP``) is fitted per column of ``behav_df``, with
    that column as the dependent variable and the covariates plus an intercept as
    predictors. For each fit the column mean and variance, the model summary and
    the BIC are printed, and a histogram of the residuals is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the covariate columns; its rows must line up with ``behav_df``.
    behav_df : pandas.DataFrame
        Outcome columns. Each column is cast to float before fitting.
    covariates : sequence of str, optional
        Columns of ``df`` used as predictors; defaults to ``('age_at_ri_z', 'sex')``.
        They are used exactly as supplied -- no z-scoring or categorical encoding
        is done inside this function.

    Returns
    -------
    pandas.DataFrame
        One ``<column>_regr`` column of residuals (the fitted result's ``resid``)
        per column of ``behav_df``. The frame has a default integer index, in the
        same row order as ``behav_df``.

    Raises
    ------
    Whatever statsmodels raises for missing values, since the model is built with
    ``missing='raise'``.
    """
    regressed_output = np.zeros((len(behav_df), len(behav_df.columns)))

    # Predictors plus an explicit intercept column, per
    # https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
    regressors = sm.add_constant(df[list(covariates)])

    for i, colname in enumerate(behav_df.columns):
        # Label this column's block of printed output
        print('--- {} ---'.format(colname))

        # Select the ith column by position and make sure it is float data
        col = behav_df.iloc[:, i].astype(float)
        print('Mean: ', col.mean())
        print('Var: ', col.var())

        # Endog is the dependent variable (one outcome column); exog is the regressor matrix
        model1 = sm.ZeroInflatedNegativeBinomialP(endog = col, exog=regressors, missing = 'raise')
        result1 = model1.fit(maxiter = 10000)
        regressed_output[:, i] = result1.resid
        print(result1.summary())
        print('\nBIC: {}\n'.format(result1.bic))

        # Distribution of the residuals for this column
        sns.histplot(result1.resid)
        plt.show()

    regressed_df = pd.DataFrame(regressed_output, columns = behav_df.columns + '_regr')

    return regressed_df

In [None]:
# Merge RI and demographic data on subject to ensure alignment
ri_demo = pd.merge(demo_data, ri_summed.reset_index(), on='Subject', how = 'inner')
print('{} subjects have both RI and diag/demo info'.format(len(ri_demo)))

# Drop rows missing any value used below. 'age_at_ri' is included because it is z-scored
# and used as a model covariate: scipy's zscore propagates NaN by default, so a single
# missing age would turn the whole z-scored column into NaN and the model (missing='raise')
# would then refuse to fit.
ri_demo = ri_demo.dropna(subset = ['age_at_scan', 'age_at_ri', 'Adulthood', 'sex']).copy()
print('{} subjects remain after dropping missing covariates'.format(len(ri_demo)))
ri_demo['age_at_scan_z'] = zscore(ri_demo['age_at_scan'])
ri_demo['age_at_ri_z'] = zscore(ri_demo['age_at_ri'])

# Perform regressions on the four age-bin columns (Early_Childhood through Adulthood)
ri_bins_regr = regress_behav_covariates(ri_demo, ri_demo.loc[:, "Early_Childhood":'Adulthood'])
# The function returns rows in the same order as ri_demo, so attach Subject positionally
ri_bins_regr = ri_bins_regr.set_index(ri_demo['Subject']).reset_index() #Add subject column back into df

In [None]:
# One histogram panel per regressed age bin, laid out left to right
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize = (20, 5))

regr_panels = zip(
    ('Early_Childhood_regr', 'Mid_Childhood_regr', 'Adolescence_regr', 'Adulthood_regr'),
    (ax1, ax2, ax3, ax4),
)
for regr_col, panel in regr_panels:
    sns.histplot(data = ri_bins_regr, x = regr_col, ax = panel)

# histplot returns the Axes it drew on, so ending on ax4 keeps the cell's displayed value
ax4

In [None]:
# Attach the regressed bin scores to the RI/demographic frame, matching rows on Subject
ri_merged = ri_demo.merge(ri_bins_regr, on='Subject')
# The merge must neither drop nor duplicate subjects
assert len(ri_demo) == len(ri_merged)

In [None]:
# Load the already-scored CTQ table and prefix subject IDs with 'sub-'
# so they match the IDs used by the other frames
ctq_scored = pd.read_csv(analysis + '/CTQ_scored.csv')
ctq_scored = ctq_scored.assign(Subject=lambda frame: 'sub-' + frame['Subject'])

In [None]:
# Load the scored ASR workbook and add a 'sub-'-prefixed Subject column
asr = pd.read_excel(analysis + '/ASR_Scored_Data_1.28.22.xlsx', header = 0, engine = 'openpyxl')
asr['Subject'] = 'sub-' + asr['subj_id']

# Scales carried forward; subjects missing any of them are dropped
asr_keep_cols = [
    "Subject",
    "Internalizing_Problems_TScore", "Internalizing_Problems_Total",
    "Total_Problems_TScore", "Total_Problems_Total",
    "Externalizing_Problems_TScore", "Externalizing_Problems_Total",
    "Anxiety_Problems_Total", "Anxiety_Problems_TScore",
]
asr_small = asr.loc[:, asr_keep_cols].dropna()
print(len(asr_small))

In [None]:
# Load the RI PTSD scores and give the ID and total columns their working names
ri_ptsd = pd.read_csv(analysis + '/RI_LMS_PTSD_3.13.24.csv')
ri_ptsd = ri_ptsd.rename(columns = {'ucla_a_id':'Subject', 'ucla_a_ptsd_p1_rein_31':'ri_ptsd_total'})
ri_ptsd = ri_ptsd.assign(
    ri_ptsd_total = ri_ptsd['ri_ptsd_total'].replace(999, np.nan),  # 999 = Missing
    Subject = 'sub-' + ri_ptsd['Subject'],
)
# Only the ID and the total score are merged into the final dataset
ri_small = ri_ptsd.loc[:, ["Subject", "ri_ptsd_total"]]

In [None]:
# Load intracranial volume / scan-site table; the eTIV column is discarded here
icv = pd.read_csv(analysis + "/IntracranialVolumes_ScanSites_2024-03-01.csv")
icv = icv.drop('eTIV', axis=1)

# Recode scanner site into a binary dummy variable: MRRC -> 1, BIC -> 0.
# 300 Cedar and MRRC are the same site, so that label is also coded 1.
site_codes = icv['site'].replace('MRRC', 1)
site_codes = site_codes.replace('BIC', 0)
icv['site_bin'] = site_codes.replace('Cedar_300_New_Haven_CT_US_06519', 1)

# Subject column mirrors 'subjectid', with 'A616b' renamed to 'A616'
# (scanner crashed partway through so data sent in two parts).
# NOTE(review): Series.replace only matches whole cell values, and no 'sub-' prefix is
# added here unlike the other frames. Presumably 'subjectid' already carries the prefix,
# in which case 'A616b' would never match -- confirm against the CSV.
icv['Subject'] = icv['subjectid'].replace('A616b', 'A616')

In [None]:
# Load the CD-RISC questionnaire export, indexed by subject; drop the completion flag
# and keep only subjects with every remaining column answered
risc = pd.read_csv(analysis + '/QuestionnaireDataCom_DATA_2022-11-07_2003.csv', index_col = ['subj_id'])
risc = risc.drop(columns = ['cdrisc_complete']).dropna(how='any', axis=0)

# Total score = row-wise sum over all remaining columns
risc['cdirisc_sum'] = np.nansum(risc, axis=1)

# Bring the ID back as a 'sub-'-prefixed Subject column
risc = risc.reset_index().rename(columns = {'subj_id':'Subject'})
risc['Subject'] = 'sub-' + risc['Subject']

In [None]:
# Load the imaging QA summary and add a 'sub-'-prefixed Subject column
qa = pd.read_csv(analysis + '/NeuroimagingQAShapes-Shapes23SummaryResul_DATA_2022-11-07_2025.csv')
qa['Subject'] = 'sub-' + qa['qa_subj_id']

# A subject fails QA when either shapes run 2 or run 3 has a ranking of 3
failed_mask = (qa['qa_s3_ranking'] == 3.0) | (qa['qa_s2_ranking'] == 3.0)
qa_failed = qa[failed_mask]

# Strip the repeat-entry markers from the IDs, then keep unique 'sub-' IDs
failed_ids = qa_failed['qa_subj_id']
for entry_marker in ('--1', '--2', '_2'):
    failed_ids = failed_ids.str.replace(entry_marker, '')
qa_failed_list = list(set('sub-' + failed_ids))

In [None]:
# Display the subject IDs flagged as failing QA
qa_failed_list

In [None]:
# Read in subcortical volumes (merged into the final dataset on 'Subject')
# NOTE(review): 'eTIV' is dropped from the ICV table but is required by the final dropna;
# presumably it is supplied by this file -- confirm.
vols = pd.read_csv(analysis + '/Shapes_Subcortical_Volumes_n=207_2024-03-01.csv')

In [None]:
# Load the scored TSC table, indexed by subject
tsc_full = pd.read_csv(analysis + '/TSC_scored_2024-03-23.csv', index_col=0).set_index('Subject')

# Omit subjects who did not complete TSC (every score column empty),
# then restore Subject as a 'sub-'-prefixed column
tsc = tsc_full.dropna(how='all', axis=0).reset_index()
tsc = tsc.assign(Subject = 'sub-' + tsc['Subject'])
print(len(tsc))

In [None]:
# Load the scored SCARED table, drop subjects with every score column empty,
# and restore Subject as a 'sub-'-prefixed column
scared = (
    pd.read_csv(analysis + '/SCARED_scored_2024-03-23.csv', index_col=0)
    .set_index('Subject')
    .dropna(how='all', axis=0)
    .reset_index()
    .assign(Subject=lambda frame: 'sub-' + frame['Subject'])
)
print(len(scared))

In [None]:
# Import motion data: per-subject mean FD, averaged over shapes runs 2 and 3
# Header names in the motion summary files include the surrounding spaces
fd_columns = (' fdmean_motion_shapes2 ', ' fdmean_motion_shapes3 ')
motion = []

# Iterate over the IDs directly; sublist.iloc[i][0] relied on positional indexing
# of a label-indexed Series, which pandas has deprecated.
for sub in sublist['Subject']:
    try:
        df = pd.read_csv(hpcdata + '/{}/MNINonLinear/Results/Motionstats_summary_allruns.csv'.format(sub), 
                         sep = '\t', header=0)
        # Look the columns up by name instead of by hard-coded position; a file
        # without them raises KeyError, which is reported below like any other failure.
        mean_mot2 = df[fd_columns[0]].iloc[0] # Mean FD motion for shapes 2
        mean_mot3 = df[fd_columns[1]].iloc[0] # Mean FD motion for shapes 3
        mean_both = (mean_mot2 + mean_mot3)/2
        motion.append([sub, mean_both])
    except Exception as e: 
        # Best effort: report the subject and keep going (e.g. no motion summary file)
        print(e)
        print('Error on {}'.format(sub))
mot_df = pd.DataFrame(motion, columns = ['Subject', 'mean_fd']).sort_values(by='Subject', ascending=True).reset_index(drop=True)

In [None]:
from scipy.stats import iqr

# Summary statistics of mean FD across the subjects with motion data
mean_fd = mot_df['mean_fd']
fd_summary = (
    ("Mean: {}", mean_fd.mean()),
    ("Std Dev: {}", mean_fd.std()),
    ("Min: {}", mean_fd.min()),
    ("Max: {}", mean_fd.max()),
)
for template, value in fd_summary:
    print(template.format(value))

**Merge for complete dataset**

In [None]:
# Keep only subjects whose mean FD is strictly below the threshold
mot_thresh = 0.8
below_thresh = mot_df['mean_fd'].lt(mot_thresh)
mot_uthresh = mot_df.loc[below_thresh]

In [None]:
# Build the dataset one table at a time, printing the row count after every step.
# Inner merges (m1, m7) restrict the sample; left merges only add columns.

# Subject list restricted to subjects with RI/demographic data
m1 = sublist.merge(ri_merged, on='Subject', how='inner')
print("m1: n={}".format(len(m1)))

# SCARED
m2 = m1.merge(scared, on='Subject', how='left')
print("m2: n={}".format(len(m2)))

# ASR (no explicit key: joins on every column name the two frames share)
m3 = m2.merge(asr_small, how='left')
print("m3: n={}".format(len(m3)))

# Scan site (shared-column join)
m4 = m3.merge(icv, how='left')
print("m4: n={}".format(len(m4)))

# CD-RISC total (shared-column join)
m5 = m4.merge(risc[['Subject', 'cdirisc_sum']], how='left')
print("m5: n={}".format(len(m5)))

# CTQ (shared-column join)
m6 = m5.merge(ctq_scored, how='left')
print("m6: n={}".format(len(m6)))

# Motion: inner merge, so subjects without sub-threshold motion data are removed
m7 = m6.merge(mot_uthresh, how='inner')
print("m7: n={}".format(len(m7)))

# Subcortical volumes
m8 = m7.merge(vols, on='Subject', how='left')
print("m8: n={}".format(len(m8)))

# TSC
m9 = m8.merge(tsc, on='Subject', how='left')
print("m9: n={}".format(len(m9)))

# RI PTSD total
m10 = m9.merge(ri_small, on='Subject', how='left')
print("m10: n={}".format(len(m10)))

final = m10

print('{} subj remaining'.format(len(final)))

In [None]:
# Manually flagged subject. Guard the append so re-running this cell does not keep
# adding duplicate entries to the list.
if 'sub-A995' not in qa_failed_list:
    qa_failed_list.append('sub-A995')

# Report every QA-failed subject that is still present in the merged dataset
final_subjects = set(final['Subject'])
for failed_sub in qa_failed_list:
    if failed_sub in final_subjects:
        print('{} in final dataset -- remove'.format(failed_sub))

In [None]:
# Drop subjects that failed QA.
# errors='ignore' keeps the cell re-runnable: `final` is overwritten here, so a second
# run (or a subject already removed by an earlier merge) would otherwise raise KeyError.
qa_excluded = ['sub-A647', 'sub-A995']
final = final.set_index('Subject').drop(qa_excluded, axis=0, errors='ignore').reset_index()

# Sanity check: none of the excluded subjects remain
remaining_subjects = final['Subject'].tolist()
assert 'sub-A647' not in remaining_subjects
assert 'sub-A995' not in remaining_subjects

In [None]:
# check which subs were lost
# Work on a copy: `final_check = final` only creates a second name for the same frame,
# so adding the flag column there also added an 'included' column to `final` and to
# the CSV written from it.
final_check = final.copy()
final_check['included'] = 1
# Outer merge against m1 (the sample before the later merges); rows without the flag
# are subjects that did not make it into `final`
check_df = pd.merge(m1, final_check, how='outer', on='Subject')

remaining = check_df[check_df.included != 1]
remaining['Subject']

In [None]:
# A subject is kept only when none of these columns is missing
required_cols = ['Subject',
                 'age_at_scan',
                 'eTIV',
                 'site_bin',
                 'mean_fd',
                 'Total_Events']

final_dropped = final.dropna(subset = required_cols, axis=0)
final_dropped = final_dropped.sort_values(by='Subject').reset_index(drop=True)

# Index by subject so the ID becomes the first column of the CSV
final_dropped = final_dropped.set_index('Subject')
n_remaining = len(final_dropped)
print("{} subjects remaining".format(n_remaining))

# Output name records the final sample size and today's date
outfile = analysis + '/Behav_Dataset_AdulthoodRegr_n={}_{}.csv'.format(n_remaining, today)
final_dropped.to_csv(outfile)
print(outfile)