## DWI_Create_Dataset

In [None]:
# Import packages
import pandas as pd
import numpy as np
from datetime import date
import statsmodels.api as sm
from scipy.stats import zscore
import seaborn as sns
from glob import glob
import copy
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.multitest import fdrcorrection as fdrcorr
from statsmodels.discrete.count_model import ZeroInflatedPoisson, ZeroInflatedNegativeBinomialP
from statsmodels.discrete.discrete_model import NegativeBinomial, Poisson 
# Allow up to 999 rows and 999 columns when a DataFrame is displayed
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# Run date as an ISO string (YYYY-MM-DD); used below to stamp output file names
today = date.today().isoformat()

### Read in data and identify subjects

In [None]:
# Root directory of the QSIPrep diffusion (DWI) data
datapath = '/gpfs/milgram/pi/gee_dylan/candlab/analyses/shapes/dwi/QSIPrep'
# Directory the cleaned RI spreadsheet is read from
newri = '/gpfs/milgram/pi/gee_dylan/lms233/RI_Data/coded_output'
# Sub-directory of `datapath` that most spreadsheets are read from / written to
analysis = '{}/analysis'.format(datapath)

In [None]:
# Identify every subject that has a *_gfaMetrics.csv file in its TractSeg output directory
tractseg_dir = datapath + '/output_data/tractseg_output/'
dwi_sublist = glob(tractseg_dir + '*/*_gfaMetrics.csv')

# The subject ID is the first path component that follows the tractseg_output directory
dwi_subs = [csv_path.replace(tractseg_dir, '').split('/')[0] for csv_path in dwi_sublist]
print("{} subjects have processed DWI data (gfa measures)".format(len(dwi_subs)))

### Import demographic and RI data

In [None]:
## What date were the RI data generated on?

# Import demographic data and rename the raw survey columns to analysis names
demo_raw = pd.read_csv(analysis + '/Demographics_3.9.22.csv',
                       header = 0).rename(columns = {'subj_id':'Subject', 
                                                     'branch_a_sex':'sex', 
                                                     'branch_a_gender':'gender',
                                                     'maca_a_3':'years_education',
                                                     'maca_a_9':'combined_income'})
# .copy() so the recode below writes into an independent frame, not a slice of demo_raw
# (avoids pandas' chained-assignment / SettingWithCopy behaviour)
demo = demo_raw[["Subject", "sex", 'gender', "years_education", 'combined_income', 'asr_age']].copy()
demo['combined_income'] = demo['combined_income'].replace([10, 11], np.nan) #Replace don't know and decline to answer with NaN

# Read in age at scan
aas = pd.read_csv(analysis + '/age_at_scan_2024-03-14.csv', index_col=0)

#Read in diagnostic data
diag = pd.read_csv(datapath + '/../Flux_Analysis/Behavioral/DiagnosticStatus.csv', 
                   header = 0).rename(columns = {'record_id':'Subject', 'cc_group':'diagnostic_group'})

diag_only = diag[['Subject', 'diagnostic_group']]

# Outer merges: keep every subject that appears in any of the three tables
demo_data = pd.merge(demo, diag_only, how = 'outer', on = 'Subject')
demo_data = pd.merge(aas, demo_data, how='outer', on='Subject')

# Sanity check on the *values*: codes 10 and 11 must have been replaced by NaN above
assert not demo_data['combined_income'].isin([10, 11]).any()

In [None]:
# Import RI data
rigendate = '2023-04-03' # Date stamp in the file name of the cleaned RI spreadsheet

ri_all1 = pd.read_csv(newri + '/Cleaned_WIDE_all_endorsements_n=191_{}.csv'.format(rigendate), header = 0).set_index('ucla_a_id')
# Trim the "('endorse_any', ...)" wrapper from each column label and prefix it with 'all_'.
# (The original line read `'all_' + + (...)`; the stray unary plus was applied to a string Index.)
# NOTE(review): str.lstrip/rstrip remove any leading/trailing characters drawn from the given
# character *set*, not a literal prefix -- confirm the resulting labels are the intended ones.
ri_all1.columns = 'all_' + ri_all1.columns.str.lstrip("('endorse_any', ").str.rstrip(")")
ri_all1 = ri_all1.reset_index().rename(columns = {'ucla_a_id':'Subject'})
# NOTE(review): 34 is a positional cut-off; it silently depends on the column order of the CSV
ri_all = ri_all1.iloc[:,0:34].set_index('Subject') # Select only endorsed events, not severity

In [None]:
# Import RI PTSD data, renaming the ID and PTSD-total columns to the names used in this notebook
ri_ptsd_column_names = {'ucla_a_id':'Subject', 'ucla_a_ptsd_p1_rein_31':'ri_ptsd_total'}
ri_ptsd = pd.read_csv(analysis + '/RI_LMS_PTSD_3.13.24.csv')
ri_ptsd = ri_ptsd.rename(columns = ri_ptsd_column_names)

In [None]:
# Code into bins
def recode_ri(df, thde):
    """Add developmental-period event counts to a copy of a wide RI table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per subject with per-age columns labelled '<thde>_<age>'
        (e.g. 'all_0.0', 'all_1.0', ...). Age ranges are selected with label
        slices, so the per-age columns must be contiguous and in ascending order.
    thde : str
        Column-label prefix (e.g. 'all'); it is also used as the suffix of the
        new summary columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with ten extra columns, each the NaN-ignoring row sum of
        one age range. The input frame is left unmodified.
    """
    # (new column stem, first age label, last age label) -- label slices are inclusive
    age_bins = [
        ('Early_Childhood_Em', '0.0', '5.0'),
        ('Mid_Childhood_Em', '6.0', '12.0'),
        ('Adolescence_Em', '13.0', '17.0'),
        ('Adulthood_Em', '18.0', '30.0'),
        # Second binning scheme
        ('Early_Childhood', '0.0', '4.0'),
        ('Late_Childhood', '5.0', '9.0'),
        ('Early_Adolescence', '10.0', '14.0'),
        # NOTE(review): age 18 is counted in both Late_Adolescence (15-18) and
        # Adulthood (18-30); kept as in the original -- confirm this overlap is intended.
        ('Late_Adolescence', '15.0', '18.0'),
        ('Adulthood', '18.0', '30.0'),
        ('Total_Events', '0.0', '999.0'),
    ]

    # Work on a copy so the caller's dataframe is not mutated as a side effect
    ri_summed = df.copy()
    for stem, first_age, last_age in age_bins:
        first_col = "{}_{}".format(thde, first_age)
        last_col = "{}_{}".format(thde, last_age)
        # Always slice the untouched input so previously added sums can never be re-counted
        ri_summed['{}_{}'.format(stem, thde)] = np.nansum(
            df.loc[:, first_col:last_col].astype(float), axis=1)

    return ri_summed

In [None]:
# Bin the RI endorsements, move Subject back into a column and discard the 'all_index' column
ri_binned = recode_ri(ri_all, 'all')
ri_recoded = ri_binned.reset_index().drop(columns=['all_index'])

# Confirm no 777s or 999s were counted by error
assert ri_recoded['Total_Events_all'].max() < 777

In [None]:
# Import the already-scored CTQ
ctq_scored = pd.read_csv(analysis + '/CTQ_scored.csv')

# Import TSC
tsc_full = pd.read_csv(analysis + '/TSC_data_1.9.23.csv').rename(columns = {'subj_id':'Subject'}).set_index('Subject').drop('tsc_complete', axis=1)
tsc_answered = tsc_full.dropna(how='all', axis=0) #Omit subjects who did not complete TSC
# Row-wise sum over every remaining TSC column, ignoring NaNs. assign() builds a new frame
# instead of writing a column into the result of dropna().
tsc = tsc_answered.assign(tsc_sum=np.nansum(tsc_answered, axis=1))
tsc = tsc.reset_index()[["Subject", "tsc_sum"]]

In [None]:
# Import scored ASR data and add a 'Subject' column that duplicates 'subj_id'
asr = pd.read_excel(analysis + '/ASR_Scored_Data_5.9.23.xlsx', header = 0, engine = 'openpyxl')
asr = asr.assign(Subject = asr['subj_id'])

# Keep only rows where both the subject ID and the total-problems T score are present
asr_small = asr.loc[:, ["Subject", "Total_Problems_TScore"]].dropna(axis=0)

In [None]:
# Import ICV and data collection site (the eTIV column of this file is discarded)
icv = pd.read_csv(analysis + "/IntracranialVolumes_ScanSites_2024-03-01.csv").drop('eTIV', axis=1)

# Recode scanner site into binary dummy variable
icv['site_bin'] = icv['site'].replace({'MRRC': 1,
                                       'Cedar_300_New_Haven_CT_US_06519': 1, #300 Cedar and MRRC are the same site
                                       'BIC': 0})

# Recode subject ID to match other dataframes and clean strings.
# A regex anchored at the start removes only the literal 'sub-' prefix; str.lstrip('sub-')
# would strip any leading run of the characters 's', 'u', 'b', '-' and could eat into an ID.
icv['Subject'] = icv['subjectid'].str.replace(r'^sub-', '', regex=True)
icv['Subject'] = icv['Subject'].replace('A616b', 'A616') #Rename; scanner crashed partway through so data sent in two parts

# Read in subcortical volumes
vols = pd.read_csv(analysis + '/Shapes_Subcortical_Volumes_n=207_2024-03-01.csv')
vols['Subject'] = vols['Subject'].str.replace(r'^sub-', '', regex=True)
# Right merge: keep every subject present in the subcortical-volume table
icv_merged = pd.merge(icv, vols, on='Subject', how = 'right').drop(['Unnamed: 0', 'subjectid'], axis=1)

In [None]:
#Merge data together
m1 = pd.merge(demo_data, icv_merged, how = 'right', on='Subject') # Keep every subject present in icv_merged
m2 = pd.merge(m1, ri_recoded, how = 'inner', on='Subject') # Keep subjects with usable RI data AND a row in icv_merged
m3 = pd.merge(m2, asr_small, how = 'left', on='Subject') 
m4 = pd.merge(m3, ctq_scored, how = 'left', on='Subject')
m5 = pd.merge(m4, tsc, how='left', on='Subject')

# IDs of subjects with processed DWI data. The anchored regex removes only the literal 'sub-'
# prefix (str.lstrip('sub-') strips a character set); dtype='object' keeps the .str accessor
# usable even when the list is empty.
dwi_subject_ids = pd.Series(dwi_subs, name='Subject', dtype='object').str.replace(r'^sub-', '', regex=True)
m6 = pd.merge(m5, dwi_subject_ids, how='inner', on='Subject') # Keep only subjects that also have DWI output
# Set name of final merge
bx_fulldf = m6

# Print shape of final dataframe and save to CSV
print("Merged data size: {}".format(bx_fulldf.shape))
bx_fulldf.to_csv(analysis + '/Behav_full_dataset_{}.csv'.format(today), index=False)

In [None]:
# Update subjects with missing data in downloaded files found later

# Values for subject A996 that were missing from the downloaded spreadsheets
a996_corrections = {'sex': 0.0,
                    'age_at_scan': 22.3335621139,
                    'age_at_ri': 22.130521,
                    'diagnostic_group': 2.0,
                    'years_education': 16.0,
                    'combined_income': 9.0}

# Write straight into the main dataframe with .loc instead of editing a filtered copy and
# calling update(). As with DataFrame.update, only columns that already exist are touched.
is_a996 = bx_fulldf['Subject'] == 'A996'
for column, value in a996_corrections.items():
    if column in bx_fulldf.columns:
        bx_fulldf.loc[is_a996, column] = value

# bx_fulldf = bx_fulldf.set_index('Subject').drop(exclude, axis=0).reset_index() # Drop subs with anomalies
print(bx_fulldf.shape)

In [None]:
# Drop subs with problems (Check project documentation for reasons)

bad_subs = ['A258', 'A663', 'A557', 'A593', 'A675', 'A248', 'A619', 'A597', 'A257', 'A641', 'A677', 'A660']

# Report any listed subject that is not in the dataframe (nothing to drop for those)
for bad_sub in bad_subs:
    if bad_sub not in bx_fulldf['Subject'].values:
        print("{} not found in dataframe; nothing to drop".format(bad_sub))

# Drop every listed subject in one pass; errors='ignore' skips IDs that are absent
bx_fulldf = bx_fulldf.set_index('Subject').drop(bad_subs, axis=0, errors='ignore').reset_index()

# Sanity check that subs are not present. This must test the column *values*:
# `x in series` checks the index labels, so the original assertion could never fail.
assert not bx_fulldf['Subject'].isin(bad_subs).any()

print('Final Dataframe size: {}'.format(len(bx_fulldf)))

### Compute tract means and pull DWI data frame

In [None]:
def detect_zerodata(df):
    """Return a float copy of `df` in which every exact 0.0 is replaced by NaN.

    Zeros are treated as a failure to resolve the DTI measure. One
    "Found a 0 in <column>" line is printed per zero, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns can all be cast to float.

    Returns
    -------
    pandas.DataFrame
        Same shape and column labels as `df`, float dtype, default integer index.
    """
    # Positional (NumPy) access, so the result does not depend on df's index labels;
    # the original `col[j]` lookup was label-based and broke on a non-default index.
    values = np.asarray(df.astype(float))
    is_zero = values == 0.0

    # Report each zero, keeping the original column-by-column message order
    for col_idx, colname in enumerate(df.columns):
        for _ in range(int(is_zero[:, col_idx].sum())):
            print("Found a 0 in {}".format(colname))

    return pd.DataFrame(np.where(is_zero, np.nan, values), columns = df.columns)
        

In [None]:
def compute_tract_means(sub_df, metric):
    """Read each subject's TractSeg tractometry spreadsheet and average it per column.

    Parameters
    ----------
    sub_df : sequence of str
        Subject directory names (e.g. 'sub-A123') under tractseg_output.
    metric : str
        Metric tag used in the file name '<sub>_Tractometry_<metric>Metrics.csv'.

    Returns
    -------
    output_df : pandas.DataFrame
        One row per subject: 'Subject' (with the 'sub-' prefix removed) followed by
        the NaN-ignoring mean of every spreadsheet column, zeros excluded.
    voxfa_final : numpy.ndarray
        Shape (n_subjects, n_rows * n_columns). Row i holds subject i's raw
        spreadsheet flattened row by row (zeros are NOT replaced here).

    Raises
    ------
    ValueError
        If `sub_df` is empty.
    """
    if len(sub_df) == 0:
        raise ValueError("sub_df is empty; there are no tractometry spreadsheets to read")

    tract_means = []   # one 1-D array of column means per subject
    voxel_rows = []    # one flattened raw table per subject

    for sub in sub_df:
        # Read in spreadsheet produced by TractSeg with gfa or other metrics
        rawdata = pd.read_csv(datapath + '/output_data/tractseg_output/{}/{}_Tractometry_{}Metrics.csv'.format(sub, sub, metric),
                              header = 0, sep = ';')

        column_names = rawdata.columns
        no_outliersdf = detect_zerodata(rawdata) #Replace any zero data with NaNs
        datameans = np.nanmean(no_outliersdf, axis=0) #Column-wise average over rows, excluding NaNs

        #Sanity check that data and columns are the same size (averaging across correct axis)
        assert datameans.shape[0] == len(column_names)

        tract_means.append(datameans)
        voxel_rows.append(np.asarray(rawdata).ravel())

    # Subject IDs in read order; the anchored regex removes only the literal 'sub-' prefix
    # (str.lstrip('sub-') strips a character set and could eat into an ID)
    sub_sers = pd.Series(list(sub_df), name='Subject', dtype='object').str.replace(r'^sub-', '', regex=True)

    # Format output in dataframe; sizes follow the data instead of a hard-coded 50 columns
    output_df1 = pd.DataFrame(np.vstack(tract_means), columns = column_names)

    # Concatenate subject IDs and mean data
    output_df = pd.concat([sub_sers, output_df1], axis=1)

    # Count the subjects that would be lost if rows with 0 / NaN means were excluded
    output_df_final = output_df.replace(0.0, np.nan).dropna(how='any', axis=0)

    # One row per subject. The original reshaped a (rows, columns, subjects) array straight to
    # (subjects, rows*columns), which interleaved values from different subjects in each row.
    voxfa_final = np.vstack(voxel_rows)
    print('CAUTION IF NUMBER != 0: {} subjects had 0s dropped'.format(len(output_df) - len(output_df_final)))

    # NOTE(review): as in the original, the unfiltered `output_df` is returned and
    # `output_df_final` is only used for the count above -- confirm which one is intended.
    return output_df, voxfa_final

In [None]:
# #Compute tract means
# metric = 'gfa' #gfa, fa0 (this is QA), ad, md, rd

# fa_df, all_rawdata = compute_tract_means(dwi_subs, metric) #Read in subjects with DWI
# print("DWI data shape: {}".format(fa_df.shape))

# #Write raw data to CSV
# fa_df.to_csv(analysis + '/DWI_{}_data_n={}_ZerosExcluded_{}.csv'.format(metric, len(fa_df), today), index=False)

### Remove outliers from RI data, final cleaning and regression

In [None]:
#Identify subjects with values more than 3 standard deviations from mean (raw data)
def remove_outliers(df, columns):
    """List the subjects whose value in any given column exceeds mean + 3 SD.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a 'Subject' column after `reset_index()`.
    columns : sequence of str
        Columns to screen, each against its own mean and standard deviation.

    Returns
    -------
    list
        Unique subject IDs with at least one high outlier (order not guaranteed).
        Values below mean - 3 SD are only reported with a printed message.
    """
    flagged = []
    for col in columns:
        print('computing...')
        center = df[col].mean()
        spread = df[col].std()
        upper_cutoff = center + 3*spread
        lower_cutoff = center - 3*spread

        too_high = df.loc[df[col] > upper_cutoff]
        too_low = df.loc[df[col] < lower_cutoff]

        # High outliers: remember who they are
        if len(too_high) > 0:
            flagged.extend(too_high.reset_index()['Subject'].tolist())

        # Low outliers: report only, one message per affected row
        for _ in range(len(too_low)):
            print('A participant would be exlcuded for having endorsements 3 std below mean')

    return list(set(flagged))

In [None]:
# Exclude subjects that are > 3 standard deviations from mean to reduce outlier impact on stats
# Per-age endorsement columns 'all_0.0' ... 'all_17.0'
columns = ['all_{}.0'.format(age) for age in range(18)]

subs_todrop = remove_outliers(bx_fulldf, columns)
print("{} subjects had outlier data".format(len(subs_todrop)))

# Drop the flagged subjects (Subject becomes the index for this step)
bx_fulldf_dropped = bx_fulldf.set_index('Subject').drop(subs_todrop, axis=0)
print('Adv outlier removed bx df size: {}'.format(bx_fulldf_dropped.shape))

# Also drop subjects missing any of the covariates listed here
required_covariates = ['age_at_scan', 'site_bin', 'years_education']
bx_fulldf_dropped = bx_fulldf_dropped.reset_index().dropna(how='any', axis=0, subset=required_covariates)
print('age, site, edu dropped removed bx df size: {}'.format(bx_fulldf_dropped.shape))

In [None]:
subs_todrop # Display the subject IDs that remove_outliers flagged in the cell above

### Regress motion and covariates from DWI data

Run the Motion Exclusion script here to remove outlier subjects and obtain subject-level motion metrics.


In [None]:
def regress_dwi_qa_covariates(df, dwi_df, metric):
    """Residualise every diffusion column on nuisance covariates with OLS.

    Parameters
    ----------
    df : pandas.DataFrame
        Supplies the regressors 'age_at_scan_z', 'mean_fd_z', 'eTIV_z' and 'site_bin',
        plus the '*_LSCheck' QA columns. Side effect: a column 'mean_<metric>'
        (z-scored row mean of `dwi_df`) is added to it.
    dwi_df : pandas.DataFrame
        One column per tract, rows in the same order as `df`.
    metric : str
        Metric prefix of the tract columns (e.g. 'gfa').

    Returns
    -------
    pandas.DataFrame
        OLS residuals, one column per input column with '_regr' appended to its
        label, on a default integer index. Each model summary is printed.
    """
    # Create empty matrix for results
    regressed_output = np.empty((len(dwi_df), len(dwi_df.columns)))

    # Z-scored mean across all supplied tracts, stored on df and used as a regressor
    mean_col = 'mean_{}'.format(metric)
    df[mean_col] = zscore(np.mean(dwi_df, axis=1))

    # Regressors shared by every tract model
    base_covariates = ['age_at_scan_z', 'mean_fd_z', 'eTIV_z', 'site_bin', mean_col]

    # Tracts whose model additionally includes a tract-specific QA check column
    qa_check_for_tract = {
        '{}_CC_3'.format(metric): 'CC_3_LSCheck',
        '{}_SLF_I_left'.format(metric): 'SLFI_LSCheck',
        '{}_SLF_I_right'.format(metric): 'SLFI_LSCheck',
        '{}_SLF_II_left'.format(metric): 'SLFII_LSCheck',
        '{}_SLF_II_right'.format(metric): 'SLFII_LSCheck',
        '{}_STR_left'.format(metric): 'STR_LSCheck',
        '{}_STR_right'.format(metric): 'STR_LSCheck',
    }

    for i, tract in enumerate(dwi_df.columns):
        # Dependent variable: the i-th tract column as float
        dti_col = dwi_df.iloc[:, i].astype(float)
        assert len(dti_col) == len(dwi_df) # Sanity check to make sure selecting from correct axis

        covariates = list(base_covariates)
        if tract in qa_check_for_tract:
            covariates.append(qa_check_for_tract[tract])

        # Add intercept for OLS regression per https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
        regressors = sm.add_constant(df[covariates])

        # Endog is the tract data; exog is the matrix of regressors
        result = sm.OLS(endog = dti_col, exog=regressors, missing = 'raise').fit()
        regressed_output[:, i] = result.resid # Same column order as the input
        print(result.summary())

    return pd.DataFrame(regressed_output, columns = dwi_df.columns + '_regr')


In [None]:
def regress_behav_covariates(df, behav_df, thde):
    """Residualise each behavioural count column with a zero-inflated Poisson model.

    Parameters
    ----------
    df : pandas.DataFrame
        Supplies the regressors 'age_at_ri_z', 'sex' and 'Adulthood_<thde>'.
    behav_df : pandas.DataFrame
        Count columns to model, rows in the same order as `df`; must not contain
        NaN (the model is built with missing='raise').
    thde : str
        Suffix selecting the 'Adulthood_<thde>' regressor.

    Returns
    -------
    pandas.DataFrame
        `result.resid` of each fitted model, one column per input column with
        '_regr' appended to its label, on a default integer index.

    For every column the model summary and BIC are printed and two diagnostic
    figures are shown (observed values vs residuals, and a residual Q-Q plot).
    """
    regressed_output = np.empty((len(behav_df), len(behav_df.columns)))
    
    # Regressors: z-scored age at RI, sex, and the summed adulthood bin
    regressors = df[['age_at_ri_z', 'sex', 'Adulthood_{}'.format(thde)]]
    
    # Add an intercept column
    regressors = sm.add_constant(regressors)
    
    #Run regression
    for i in range(0, len(behav_df.columns)):
        #Set variables and ensure dtype
        col = behav_df.iloc[:,i].astype(float) #Select ith column and confirm float data
        assert len(col) == len(behav_df) # Sanity check to make sure selecting from correct axis
        
        # Run Model: endog is the behavioural count column; exog is the matrix of regressors
        model1 = sm.ZeroInflatedPoisson(endog = col, exog=regressors, missing = 'raise')
        result1 = model1.fit(maxiter = 10000)
        # NOTE(review): the original comment called these deviance residuals and linked the
        # GLMResults docs; statsmodels documents `resid` for count models as observed minus
        # predicted -- confirm against the installed version.
        regressed_output[:,i] = result1.resid
        print(result1.summary())
        print('\nBIC: {}\n'.format(result1.bic))
        fig, ax = plt.subplots(1,1)
        # x/y passed by keyword: seaborn >= 0.12 no longer accepts them positionally
        sns.regplot(x=col, y=result1.resid, ax=ax)
        plt.show()
        sm.qqplot(result1.resid,fit=True, line="45")
        plt.show()
        
    regressed_df = pd.DataFrame(regressed_output, columns = behav_df.columns + '_regr')
    
    return regressed_df


In [None]:
# Read in DWI data with motion measures
dwi_gendate = '2022-07-12'
newdwi_df = pd.read_csv(analysis + '/DTI_data_motion_params_n=121_{}.csv'.format(dwi_gendate))
newdwi_df['Subject'] = newdwi_df['Subject'].replace('A616b', 'A616')

# Merge behavioral and diffusion data frames to prepare for regressing covariates
# NOTE(review): no `on=` is given, so the join uses every column the two frames share,
# not only 'Subject' -- confirm that is intended.
dwi_bx_df = bx_fulldf_dropped.merge(newdwi_df, how = 'inner') #Lose 8 ppl here for missing demogs
print(dwi_bx_df.shape)

# Z-score the continuous covariates into new '<name>_z' columns
for raw_col in ('age_at_scan', 'age_at_ri', 'mean_fd', 'eTIV'):
    dwi_bx_df[raw_col + '_z'] = zscore(dwi_bx_df[raw_col])

# Store the categorical covariates with pandas 'category' dtype
for factor_col in ('sex', 'years_education', 'site_bin'):
    dwi_bx_df[factor_col] = dwi_bx_df[factor_col].astype('category')

# Write unregressed data to CSV
# dwi_bx_df.to_csv(analysis + '/Binned_Unregressed_DWI_DISTAL_n={}_{}.csv'.format(len(binned_bx_df),today))

In [None]:
# How many subs had cerebellar cutoff?
tract_qa = pd.read_csv(analysis + '/ProcessedDWIDataQC_Tractography_5.8.23.csv', header=0, engine='python').rename(columns = {"Subject ID":"Subject"})
# Remove only the literal 'sub-' prefix (str.lstrip('sub-') strips a character set), then any trailing 'b'
tract_qa['Subject'] = tract_qa['Subject'].str.replace(r'^sub-', '', regex=True).str.rstrip('b')
qa_df_m = pd.merge(dwi_bx_df, tract_qa, on='Subject', how = 'inner')
assert len(qa_df_m) == len(dwi_bx_df)
# If assertion fails, run line below:
# list(set(dwi_bx_df['Subject']) - set(qa_df_m['Subject']))

n_total = len(qa_df_m)
n_cutoff = int((qa_df_m['cb_cutoff'] == 1).sum())
# The message is labelled '%', so report a percentage (the original printed the raw proportion)
pct_cutoff = round(100 * n_cutoff / n_total, 1) if n_total else float('nan')
print("{} subj out of {} had cerebellar cutoff; {}%".format(n_cutoff, n_total, pct_cutoff))
print("{} subj were collected at BIC and {} at MRRC".format(len(qa_df_m[qa_df_m['site_bin'] == 0]), len(qa_df_m[qa_df_m['site_bin'] == 1])))


### Import tract QA data and test for effects

In [None]:
# Tract-level QC review: keep the subject ID and the per-tract *_LSCheck columns
qa = pd.read_csv(analysis + '/DWI_Tract_QC_reviewComplete_07.05_manualedits.csv', header=0, engine='python')
qa_small = qa[["Subject ID", "CC_3_LSCheck", "FPT_LSCheck", "ILF_LSCheck",
               "SLFI_LSCheck", "SLFII_LSCheck", "SLFIII_LSCheck", "STR_LSCheck"]]
qa_small = qa_small.rename(columns = {'Subject ID':'Subject'})
# Strip any trailing 'b', then only the literal 'sub-' prefix. str.lstrip('sub-') would remove
# any leading run of the characters 's', 'u', 'b', '-' and could eat into an ID.
qa_small['Subject'] = qa_small['Subject'].str.rstrip('b').str.replace(r'^sub-', '', regex=True)
qa_df = pd.merge(qa_small, dwi_bx_df, on = 'Subject', how = 'inner')

# Convert the QA checks that are used as regressors to categories
for check_col in ['CC_3_LSCheck', 'SLFI_LSCheck', 'SLFII_LSCheck', 'STR_LSCheck']:
    qa_df[check_col] = qa_df[check_col].astype('category')

print(qa_df.shape)

In [None]:
# See how many subjects had problematic tract segmentations
# (every column after the first, which is 'Subject')
for col in qa_small.columns[1:]:
    print(col, qa_small[col].value_counts(), ' ', sep='\n')

In [None]:
# Model whether tract completion is related to FA

# An intercept is required for the slope to estimate the difference between QA groups;
# without it the coefficient is only tested against zero and is significant by construction.
model = sm.OLS(endog = qa_df['gfa_CC_3'],
               exog=sm.add_constant(qa_df['CC_3_LSCheck'].astype(float))) # Endog is dependent variable; white matter data; Exog is intercept + QA check
result = model.fit()
print(result.summary())
## Previously marked "SIG DIFF" from a fit without an intercept -- re-check against this summary

In [None]:
# model = sm.OLS(endog = qa_df['gfa_FPT_right'], exog=qa_df['FPT_LSCheck']) # Endog is dependent variable; white matter data; Exog is matrix of regressors
# result = model.fit()
# print(result.summary())

# No Sig Diff

In [None]:
# model = sm.OLS(endog = qa_df['gfa_ILF_left'], exog=qa_df['ILF_LSCheck']) # Endog is dependent variable; white matter data; Exog is matrix of regressors
# result = model.fit()
# print(result.summary())

# # No Sig Diff

In [None]:
# An intercept is required for the slope to estimate the difference between QA groups;
# without it the coefficient is only tested against zero and is significant by construction.
model = sm.OLS(endog = qa_df['gfa_SLF_I_right'],
               exog=sm.add_constant(qa_df['SLFI_LSCheck'].astype(float))) # Endog is dependent variable; white matter data; Exog is intercept + QA check
result = model.fit()
print(result.summary())
# Previously marked "Sig Diff" from a fit without an intercept -- re-check against this summary

In [None]:
# An intercept is required for the slope to estimate the difference between QA groups;
# without it the coefficient is only tested against zero and is significant by construction.
model = sm.OLS(endog = qa_df['gfa_SLF_II_left'],
               exog=sm.add_constant(qa_df['SLFII_LSCheck'].astype(float))) # Endog is dependent variable; white matter data; Exog is intercept + QA check
result = model.fit()
print(result.summary())
# Previously marked "Sig Diff" from a fit without an intercept -- re-check against this summary

In [None]:
# model = sm.OLS(endog = qa_df['gfa_SLF_III_left'], exog=qa_df['SLFIII_LSCheck']) # Endog is dependent variable; white matter data; Exog is matrix of regressors
# result = model.fit()
# print(result.summary())

# #No Sig Diff

In [None]:
# An intercept is required for the slope to estimate the difference between QA groups;
# without it the coefficient is only tested against zero and is significant by construction.
model = sm.OLS(endog = qa_df['gfa_STR_left'],
               exog=sm.add_constant(qa_df['STR_LSCheck'].astype(float))) # Endog is dependent variable; white matter data; Exog is intercept + QA check
result = model.fit()
print(result.summary())

## Previously marked "SIG DIFF" from a fit without an intercept -- re-check against this summary

### Final cleaning

In [None]:
# Drop tracts with cutoff issues (cerebellar tracts), for each of the three metrics
cutoff_tracts = ['ICP_left', 'ICP_right', 'SCP_left', 'SCP_right', 'MCP', 'T_OCC_left', 'T_OCC_right']
cutoff_columns = ['{}_{}'.format(measure, tract)
                  for measure in ('gfa', 'qa', 'rd')
                  for tract in cutoff_tracts]
qa_df = qa_df.drop(cutoff_columns, axis=1)

In [None]:
# Regress covariates from diffusion data and scale final data
scaler = StandardScaler()

def _standardize(frame):
    """Return `frame` with every column z-scored by `scaler`, keeping its column labels."""
    return pd.DataFrame(scaler.fit_transform(frame), columns = frame.columns)

# Each metric: take its block of tract columns, residualise it, then standardise the residuals
gfa_tracts = qa_df.loc[:, "gfa_AF_left":"gfa_ST_PREM_right"]
gfa_regressed_df = regress_dwi_qa_covariates(qa_df, gfa_tracts, 'gfa')
gfa_reg_z = _standardize(gfa_regressed_df)

qa_tracts = qa_df.loc[:, "qa_AF_left":"qa_ST_PREM_right"]
qa_regressed_df = regress_dwi_qa_covariates(qa_df, qa_tracts, 'qa')
qa_reg_z = _standardize(qa_regressed_df)

rd_tracts = qa_df.loc[:, "rd_AF_left":"rd_ST_PREM_right"]
rd_regressed_df = regress_dwi_qa_covariates(qa_df, rd_tracts, 'rd')
rd_reg_z = _standardize(rd_regressed_df)

# Regress covariates from adversity data (missing per-age endorsements are treated as 0)
adversity_counts = qa_df.loc[:, "all_0.0":"all_17.0"].replace(np.nan, 0.0)
behav_regressed_df = regress_behav_covariates(qa_df, adversity_counts, thde = 'all')
behav_reg_z = _standardize(behav_regressed_df)

# Concatenate regressed data with subject IDs from input data (frames are aligned on their index)
regressed_parts = [qa_df['Subject'], behav_reg_z, gfa_reg_z, qa_reg_z, rd_reg_z]
all_reg_df = pd.concat(regressed_parts, axis=1)

print("Regressed df shape: {}".format(all_reg_df.shape))

In [None]:
# Attach the regressed, standardised columns to the full dataset, matching rows on Subject
final_reg_df = qa_df.merge(all_reg_df, how = 'inner', on='Subject')
# The merge must neither drop nor duplicate subjects
assert len(final_reg_df) == len(qa_df)

In [None]:
# Spot-check the first rows of one regressed adversity column
final_reg_df['all_12.0_regr'].head()

In [None]:
# Build the output CSV path (sample size and run date are embedded in the name)
filename = '{}/DWI_RI_FullDataset_RegressedCovariates_InclSex_n={}_{}_GFA_QA_RD_ZIPBehavModel_ages0-17_RIAgeRegressed.csv'.format(
    analysis, len(final_reg_df), today)
# final_reg_df.to_csv(filename)

In [None]:
# Show the path built above (the to_csv call that would write it is commented out)
print(filename)