In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import scipy.stats as scp
import pingouin as pg
import nibabel as nib
import os

sns.set(context='talk',style='white')

hbn_folder = '/Users/catcamacho/Library/CloudStorage/Box-Box/CCP/HBN_study'
project_folder = '/Users/catcamacho/Library/CloudStorage/Box-Box/SEAL/stickystates'
ts_folder = os.path.join(hbn_folder, 'proc','group','parcel_timeseries','sub_ts')
data_folder = os.path.join(project_folder, 'DATA','hbn')
out_folder = os.path.join(project_folder, 'ANALYSIS','hbn_brain_states_final', 'k3')

sample_file = os.path.join(hbn_folder,'social_proc_networks','dynamic_connectivity','DATA', 
                           'helper_files','sample_gord.32k_fs_LR.pscalar.nii')
# get parcel and network labels
parcel_labels = nib.load(sample_file).header.get_axis(1).name
network_labels = []
for s in parcel_labels:
    b = s.split('_')
    if len(b)<2:
        network_labels.append(b[0])
    else:
        network_labels.append(b[1])
network_labels = np.array(network_labels)
network_names, network_sizes = np.unique(network_labels, return_counts=True)

# define measures of interest
networks_of_interest = ['Auditory', 'CinguloOperc', 'Default', 'DorsalAttn', 'FrontoParietal',
                        'SMhand', 'SMmouth', 'Salience', 'VentralAttn', 'Visual']
networks_palette = ['#FF00FF','#800080','#FF0000','#00FF00','#FFFF00','#00FFFF','#FF8000',
                    '#000000','#008080','#0000BD']

features_of_interest = ['Positive','Negative','Anger','Happy','Fear','Sad','Excited','Brightness',
                        'SaliencyFract','Sharpness','Vibrance','Loudness','Motion']
timing = np.round(np.arange(0,600,0.8),1)
movie='DM'

n_states = 3

state_palette = ['#05159b', '#cf28cf', '#107070']

# Compute SCARED alpha reliability

In [14]:
# load sample data
subinfo = pd.read_csv(os.path.join(data_folder, 'sampleinfo_movie{0}.csv'.format(movie)), index_col=0)

# load anxiety data
item_info = {'SR_SC':['SCARED_SR_3','SCARED_SR_10','SCARED_SR_26','SCARED_SR_32',
                      'SCARED_SR_39','SCARED_SR_40','SCARED_SR_41'], 
             'PR_SC':['SCARED_P_03','SCARED_P_10','SCARED_P_26','SCARED_P_32',
                      'SCARED_P_39','SCARED_P_40','SCARED_P_41'], 
             'PR_GD':['SCARED_P_05','SCARED_P_07','SCARED_P_14','SCARED_P_21',
                      'SCARED_P_23','SCARED_P_28','SCARED_P_33','SCARED_P_35','SCARED_P_37'], 
             'SR_GD':['SCARED_SR_5','SCARED_SR_7','SCARED_SR_14','SCARED_SR_21',
                      'SCARED_SR_23','SCARED_SR_28','SCARED_SR_33','SCARED_SR_35','SCARED_SR_37'], 
             'all_SR':['SCARED_SR_{0}'.format(a) for a in range(1,42)], 
             'all_PR':['SCARED_P_{0}'.format(str(a).zfill(2)) for a in range(1,42)]}

scared_sr = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_SCARED_SR_20210322.csv'), 
                        header=0, skiprows=[1], index_col='EID').loc[:,item_info['all_SR']]
scared_sr.index = ['sub-{0}'.format(i) for i in scared_sr.index]

scared_pr = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_SCARED_P_20210322.csv'), 
                        header=0, skiprows=[1], index_col='EID').loc[:,item_info['all_PR']]
scared_pr.index = ['sub-{0}'.format(i) for i in scared_pr.index]

#merge datasets together
full_data = subinfo.merge(scared_pr, how='left', left_index=True, right_index=True)
full_data = full_data.merge(scared_sr, how='left', left_index=True, right_index=True)
full_data = full_data.drop_duplicates()
full_data.describe()

Unnamed: 0,age,female,meanFD,righthanded,PPS_score,sample,SCARED_P_01,SCARED_P_02,SCARED_P_03,SCARED_P_04,...,SCARED_SR_32,SCARED_SR_33,SCARED_SR_34,SCARED_SR_35,SCARED_SR_36,SCARED_SR_37,SCARED_SR_38,SCARED_SR_39,SCARED_SR_40,SCARED_SR_41
count,620.0,620.0,620.0,620.0,439.0,620.0,545.0,543.0,545.0,545.0,...,431.0,432.0,432.0,432.0,432.0,431.0,431.0,432.0,432.0,432.0
mean,10.433548,0.404839,0.359507,0.75,9.739749,1.5,0.161468,0.314917,0.686239,0.337615,...,0.839907,0.773148,0.280093,0.657407,0.261574,0.540603,0.259861,0.666667,0.664352,0.668981
std,2.807872,0.491257,0.184016,0.433362,4.254149,0.500404,0.432568,0.551999,0.685119,0.593901,...,0.798186,0.761255,0.599901,0.742005,0.568798,0.693893,0.564236,0.761608,0.762112,0.736305
min,5.061259,0.0,0.054385,0.0,5.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.165183,0.0,0.209152,0.75,6.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.164328,0.0,0.315161,1.0,8.0,1.5,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,12.845824,1.0,0.485037,1.0,13.0,2.0,0.0,1.0,1.0,1.0,...,1.5,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
max,15.974332,1.0,0.90853,1.0,20.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [19]:
 for k in item_info.keys():
     a = pg.cronbach_alpha(full_data.loc[:, item_info[k]])
     print('Cronbach Alpha for {0}: {1}'.format(k, np.round(a[0],2)))

Cronbach Alpha for SR_SC: 0.86
Cronbach Alpha for PR_SC: 0.88
Cronbach Alpha for PR_GD: 0.86
Cronbach Alpha for SR_GD: 0.86
Cronbach Alpha for all_SR: 0.94
Cronbach Alpha for all_PR: 0.91


# Examine differences by diagnosis

In [19]:
# load data and merge datasets

# load sample data
subinfo = pd.read_csv(os.path.join(data_folder, 'sampleinfo_movie{0}.csv'.format(movie)), index_col=0)
state_info_df = pd.read_csv(os.path.join(out_folder, 'movie{0}_states_info_20240312.csv'.format(movie)), index_col=0)
neg_state_info_df = pd.read_csv(os.path.join(out_folder, 'movie{0}_neg_states_activation_info_20240313.csv'.format(movie)), index_col=0)
odds_info_df = pd.read_csv(os.path.join(out_folder, 'movie{0}_sample_oddsratios_20240313.csv'.format(movie)), index_col=0)
neg_odds_info_df = pd.read_csv(os.path.join(out_folder, 'movie{0}_sample_oddsratios_neg_20240313.csv'.format(movie)), index_col=0)

# convert activation data to wide format
mean_netact_df = pd.read_csv(os.path.join(out_folder, 'movie{0}_mean_netact.csv'.format(movie)), index_col=1).loc[:, ['state'] + networks_of_interest]
act = {}
for s in range(4):
    act[s] = mean_netact_df.loc[mean_netact_df['state']==s,:]
    cols = mean_netact_df.columns.to_list()
    cols[1:] = ['s{0}_{1}'.format(s, a) for a in cols[1:]]
    act[s].columns = cols
    act[s] = act[s].drop('state', axis=1)

mean_netact_df = act[0]
for s in range(1,4):
    mean_netact_df = mean_netact_df.merge(act[s],how='left', left_index=True, right_index=True)

# load anxiety data
scared_sr = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_SCARED_SR_20210322.csv'), 
                        header=0, skiprows=[1], index_col='EID').loc[:,['SCARED_SR_SC','SCARED_SR_GD','SCARED_SR_Total']]
scared_sr.index = ['sub-{0}'.format(i) for i in scared_sr.index]

scared_pr = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_SCARED_P_20210322.csv'), 
                        header=0, skiprows=[1], index_col='EID').loc[:,['SCARED_P_SC','SCARED_P_GD','SCARED_P_Total']]
scared_pr.index = ['sub-{0}'.format(i) for i in scared_pr.index]

# load depression data
mfq_sr = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_MFQ_SR_20210322.csv'), 
                        header=0, skiprows=[1], index_col='EID').loc[:,['MFQ_SR_Total']]
mfq_sr.index = ['sub-{0}'.format(i) for i in mfq_sr.index]

mfq_pr = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_MFQ_P_20210322.csv'), 
                        header=0, skiprows=[1], index_col='EID').loc[:,['MFQ_P_Total']]
mfq_pr.index = ['sub-{0}'.format(i) for i in mfq_pr.index]

# load other covariates
adhd = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_SWAN_20210322.csv'), 
                   header=0, skiprows=[1], index_col='EID').loc[:,['SWAN_Avg']]
adhd.index = ['sub-{0}'.format(i) for i in adhd.index]

colnames = ['DX_01_Cat', 'DX_01_Sub','DX_02_Cat', 'DX_02_Sub','DX_03_Cat', 'DX_03_Sub','DX_04_Cat', 
            'DX_04_Sub','DX_05_Cat', 'DX_05_Sub','DX_06_Cat', 'DX_06_Sub','DX_07_Cat', 'DX_07_Sub',
            'DX_08_Cat', 'DX_08_Sub','DX_09_Cat', 'DX_09_Sub','DX_10_Cat', 'DX_10_Sub','DX_01',
            'DX_02','DX_03','DX_04','DX_05','DX_06','DX_07','DX_08','DX_09','DX_01','DX_10']
dx = pd.read_csv(os.path.join(hbn_folder, 'phenotypic_data','9994_ConsensusDx_20210322_nodupes.csv'), 
                   header=0, skiprows=[1], index_col='EID', dtype=str).loc[:, colnames]
dx.index = ['sub-{0}'.format(i) for i in dx.index]

dx.loc[:,'dx_text'] = dx.loc[:, colnames].astype(str).agg(' '.join, axis=1)
dx.loc[:,'dx_text']

sub-NDARAA306NT2    Anxiety Disorders nan Anxiety Disorders nan Ne...
sub-NDARAA504CRN    Neurodevelopmental Disorders Attention-Deficit...
sub-NDARAA536PTU    Neurodevelopmental Disorders Attention-Deficit...
sub-NDARAA947ZG5    Neurodevelopmental Disorders Attention-Deficit...
sub-NDARAA948VFH    Neurodevelopmental Disorders Attention-Deficit...
                                          ...                        
sub-NDARZY101JNB    Neurodevelopmental Disorders Autism Spectrum D...
sub-NDARZZ007YMP    Neurodevelopmental Disorders Autism Spectrum D...
sub-NDARZZ284ZDH    Neurodevelopmental Disorders Autism Spectrum D...
sub-NDARZZ810LVF    Neurodevelopmental Disorders Autism Spectrum D...
sub-NDARZZ830JM7    Obsessive Compulsive and Related Disorders nan...
Name: dx_text, Length: 2226, dtype: object

In [20]:
dx.loc[:,'Dep_dx'] = 0
dx.loc[:,'Anx_dx'] = 0
dx.loc[:,'GD_dx'] = 0
dx.loc[:,'SC_dx'] = 0
dx.loc[:,'ADHD_dx'] = 0
dx.loc[:,'ASD_dx'] = 0
dx.loc[:,'Int_dx'] = 0
dx.loc[:,'Learn_dx'] = 0
dx.loc[:,'Comm_dx'] = 0
dx.loc[:,'Motor_dx'] = 0
dx.loc[:,'Sub_dx'] = 0
dx.loc[:,'Disruptive_dx'] = 0
dx.loc[:,'Trauma_dx'] = 0
dx.loc[:,'Schiz_dx'] = 0
dx.loc[:,'OCD_dx'] = 0
dx.loc[:,'Bipolar_dx'] = 0
dx.loc[:,'Eating_dx'] = 0

for a in dx.index:
    if 'Anxiety' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Anx_dx'] = 1
    if 'Generalized Anxiety' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'GD_dx'] = 1
    if 'Autism' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'ASD_dx'] = 1
    if 'Depressive' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Dep_dx'] = 1
    if 'Intellect' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Int_dx'] = 1
    if 'Learning' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Learn_dx'] = 1
    if 'Motor' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Motor_dx'] = 1
    if 'Communication' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Comm_dx'] = 1
    if 'Attention-Deficit' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'ADHD_dx'] = 1
    if 'Substance' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Sub_dx'] = 1
    if 'Traumatic' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Trauma_dx'] = 1
    if 'Schizo' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Schiz_dx'] = 1
    if 'Bipolar' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Bipolar_dx'] = 1
    if 'Eating' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Eating_dx'] = 1
    if 'Disruptive' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Disruptive_dx'] = 1
    if 'Trauma' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'Trauma_dx'] = 1
    if 'Obsessive Compulsive' in dx.loc[a, 'dx_text']:
        dx.loc[a, 'OCD_dx'] = 1
    if ('Social Anxiety' in dx.loc[a, 'dx_text']) | ('Social Phobia' in dx.loc[a, 'dx_text']):
        dx.loc[a, 'SC_dx'] = 1
finaldx_cols = dx.describe().columns
dx = dx.loc[:, finaldx_cols]
dx.describe()

Unnamed: 0,Dep_dx,Anx_dx,GD_dx,SC_dx,ADHD_dx,ASD_dx,Int_dx,Learn_dx,Comm_dx,Motor_dx,Sub_dx,Disruptive_dx,Trauma_dx,Schiz_dx,OCD_dx,Bipolar_dx,Eating_dx
count,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0
mean,0.137916,0.433064,0.182839,0.126685,0.677448,0.231357,0.052111,0.292453,0.196316,0.11186,0.013477,0.221473,0.071429,0.005391,0.065139,0.007188,0.014825
std,0.344889,0.495611,0.386621,0.332694,0.467558,0.421795,0.222302,0.454991,0.3973,0.315265,0.115332,0.415332,0.257597,0.073241,0.246827,0.084495,0.120878
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
#merge datasets together
full_data = subinfo.merge(scared_pr, how='left', left_index=True, right_index=True)
full_data = full_data.merge(scared_sr, how='left', left_index=True, right_index=True)
full_data = full_data.merge(mfq_pr, how='left', left_index=True, right_index=True)
full_data = full_data.merge(mfq_sr, how='left', left_index=True, right_index=True)
full_data = full_data.merge(adhd, how='left', left_index=True, right_index=True)
full_data = full_data.merge(state_info_df, how='left', left_index=True, right_index=True)
full_data = full_data.merge(mean_netact_df, how='left', left_index=True, right_index=True)
full_data = full_data.merge(odds_info_df, how='left', left_index=True, right_index=True)
full_data = full_data.merge(neg_state_info_df, how='left', left_index=True, right_index=True)
full_data = full_data.merge(neg_odds_info_df, how='left', left_index=True, right_index=True)
full_data = full_data.merge(dx, how='left', left_index=True, right_index=True)
full_data = full_data.drop_duplicates()
full_data.to_csv(os.path.join(out_folder, 'movie{0}_full_dataset.csv'.format(movie)))
full_data.describe().T.to_csv(os.path.join(out_folder, 'movie{0}_full_dataset_descriptives.csv'.format(movie)))
full_data.loc[np.isfinite(full_data['SCARED_SR_GD']),:].describe().T.to_csv(os.path.join(out_folder, 
                                                                                         'movie{0}_GD_dataset_descriptives.csv'.format(movie)))

In [53]:
pct = ['s1_percent', 's2_percent', 's3_percent']
trans = ['ntrans_s1_to_2_all', 'ntrans_s1_to_3_all','ntrans_s2_to_1_all',  
         'ntrans_s2_to_3_all','ntrans_s3_to_1_all', 'ntrans_s3_to_2_all']
odds = ['s1_to_1_all', 's1_to_2_all', 's1_to_3_all', 's2_to_1_all',
        's2_to_2_all', 's2_to_3_all', 's3_to_1_all', 's3_to_2_all',
        's3_to_3_all']
vs = ['age',  'meanFD',  'PPS_score','SWAN_Avg','Anx_dx', 'GD_dx', 'SC_dx'] + pct + trans + odds
cs = ['SCARED_SR_GD', 'SCARED_SR_SC', 'SCARED_SR_Total', 'MFQ_SR_Total']
movie='DM'

results_df = pd.DataFrame()

i=0
for c in ['Anx_dx', 'GD_dx', 'SC_dx']:
    start = i
    full_data = pd.read_csv(os.path.join(out_folder, 'movie{0}_full_dataset.csv'.format(movie)), index_col=0)
    full_data.loc[:,vs] = StandardScaler().fit_transform(full_data.loc[:,vs].to_numpy())
    full_data.loc[:,cs] = PowerTransformer().fit_transform(full_data.loc[:,cs].to_numpy())
    
    for s in pct:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    rej, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p
    
    start = i
    for s in trans:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    _, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p

    start = i
    for s in odds:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    _, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p
results_df.to_csv(os.path.join(out_folder, 'movieBoth_full_GD_diagnosis_analysis_results.csv'))
results_df

Unnamed: 0,movie,DV,IV,IV_coeff,IV_stat,IV_pval,IV_ci,FDRp
0,DM,s1_percent,Anx_dx,-0.096783,-1.837429,0.066147,"[-0.2, 0.01]",0.09922
1,DM,s2_percent,Anx_dx,0.116099,2.328755,0.019872,"[0.02, 0.21]",0.059616
2,DM,s3_percent,Anx_dx,-0.026385,-0.496351,0.619647,"[-0.13, 0.08]",0.619647
3,DM,ntrans_s1_to_2_all,Anx_dx,0.03704,0.725739,0.467999,"[-0.06, 0.14]",0.701998
4,DM,ntrans_s1_to_3_all,Anx_dx,-0.036406,-0.732931,0.4636,"[-0.13, 0.06]",0.701998
5,DM,ntrans_s2_to_1_all,Anx_dx,0.01406,0.267924,0.788758,"[-0.09, 0.12]",0.80106
6,DM,ntrans_s2_to_3_all,Anx_dx,0.087041,1.665785,0.095756,"[-0.02, 0.19]",0.287269
7,DM,ntrans_s3_to_1_all,Anx_dx,-0.012665,-0.251975,0.80106,"[-0.11, 0.09]",0.80106
8,DM,ntrans_s3_to_2_all,Anx_dx,0.087103,1.744855,0.08101,"[-0.01, 0.18]",0.287269
9,DM,s1_to_1_all,Anx_dx,-0.035093,-0.645903,0.518342,"[-0.14, 0.07]",0.643769


In [54]:
results_df.sort_values('FDRp', axis=0)
results_df.loc[results_df['IV_pval']<0.05,:]

Unnamed: 0,movie,DV,IV,IV_coeff,IV_stat,IV_pval,IV_ci,FDRp
1,DM,s2_percent,Anx_dx,0.116099,2.328755,0.019872,"[0.02, 0.21]",0.059616
36,DM,s1_percent,SC_dx,-0.174245,-3.331678,0.000863,"[-0.28, -0.07]",0.00259
37,DM,s2_percent,SC_dx,0.122575,2.452227,0.014197,"[0.02, 0.22]",0.021296
44,DM,ntrans_s3_to_2_all,SC_dx,0.127451,2.556628,0.010569,"[0.03, 0.23]",0.063415
46,DM,s1_to_2_all,SC_dx,-0.159608,-3.030596,0.002441,"[-0.26, -0.06]",0.021966
48,DM,s2_to_1_all,SC_dx,0.103925,1.964019,0.049528,"[0.0, 0.21]",0.222876


In [50]:
vs = ['age',  'meanFD',  'PPS_score','SWAN_Avg', 'Anx_dx', 'GD_dx', 'SC_dx','neg_s1_pct',
       'nonneg_s1_pct', 'neg_s1_prob', 'nonneg_s1_prob', 'neg_s2_pct',
       'nonneg_s2_pct', 'neg_s2_prob', 'nonneg_s2_prob', 'neg_s3_pct',
       'nonneg_s3_pct', 'neg_s3_prob', 'nonneg_s3_prob']

cs = ['SCARED_P_SC', 'SCARED_P_GD', 'SCARED_P_Total',
       'SCARED_SR_SC', 'SCARED_SR_GD', 'SCARED_SR_Total']

vois_neg_pct = ['neg_s1_pct','neg_s2_pct', 'neg_s3_pct']

vois_nonneg_pct = ['nonneg_s1_pct', 'nonneg_s2_pct', 'nonneg_s3_pct']

vois_txodds = ['s1_to_1_neg', 's1_to_2_neg', 's1_to_3_neg',
               's2_to_1_neg', 's2_to_2_neg', 's2_to_3_neg', 
               's3_to_1_neg', 's3_to_2_neg', 's3_to_3_neg']

vois_trans = ['ntrans_s1_to_2_neg','ntrans_s1_to_3_neg','ntrans_s2_to_1_neg',
              'ntrans_s2_to_3_neg','ntrans_s3_to_1_neg','ntrans_s3_to_2_neg']


movie='DM'
results_df = pd.DataFrame()
i=0
for c in ['Anx_dx', 'GD_dx', 'SC_dx']:
    start=i

    full_data = pd.read_csv(os.path.join(out_folder, 'movie{0}_full_dataset.csv'.format(movie)), index_col=0)
    full_data.loc[:,vs] = StandardScaler().fit_transform(full_data.loc[:,vs].to_numpy())
    full_data.loc[:,cs] = PowerTransformer().fit_transform(full_data.loc[:,cs].to_numpy())
    
    # time spent in each state
    for s in vois_neg_pct:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    _, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p
    
    ## transition probabilities
    start = i
    for s in vois_nonneg_pct:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    _, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p
    
    ## transition counts
    start = i
    for s in vois_txodds:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    _, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p
    
    ## transition odds
    start = i
    for s in vois_trans:
        results = smf.glm('{0} ~ {1} + age + female + meanFD + SWAN_Avg'.format(s, c), data=full_data).fit()
        results_df.loc[i, 'movie'] = movie
        results_df.loc[i, 'DV'] = s
        results_df.loc[i, 'IV'] = c
        results_df.loc[i, 'IV_coeff'] = results.params[c]
        results_df.loc[i, 'IV_stat'] = results.tvalues[c]
        results_df.loc[i, 'IV_pval'] = results.pvalues[c]
        ci=results.conf_int()
        results_df.loc[i, 'IV_ci'] = '[{0}, {1}]'.format(round(ci.loc[c,0],2),round(ci.loc[c,1],2))
        results_df.loc[i, 'FDRp'] = np.nan
        i = i + 1
    _, fdr_p = fdrcorrection(results_df['IV_pval'][start:i])
    results_df.iloc[start:i, -1] = fdr_p

results_df.to_csv(os.path.join(out_folder, 'movie{0}_neg_GD_diagnosis_sanalysis_results.csv'.format(movie)))
results_df

Unnamed: 0,movie,DV,IV,IV_coeff,IV_stat,IV_pval,IV_ci,FDRp
0,DM,neg_s1_pct,Anx_dx,-0.018065,-0.348196,0.727693,"[-0.12, 0.08]",0.727693
1,DM,neg_s2_pct,Anx_dx,0.101685,1.943503,0.051955,"[-0.0, 0.2]",0.148670
2,DM,neg_s3_pct,Anx_dx,-0.086377,-1.649169,0.099113,"[-0.19, 0.02]",0.148670
3,DM,nonneg_s1_pct,Anx_dx,-0.133299,-2.549798,0.010779,"[-0.24, -0.03]",0.032336
4,DM,nonneg_s2_pct,Anx_dx,0.099530,2.014455,0.043962,"[0.0, 0.2]",0.065943
...,...,...,...,...,...,...,...,...
58,DM,ntrans_s1_to_3_neg,SC_dx,-0.008564,-0.079361,0.936745,"[-0.22, 0.2]",0.998796
59,DM,ntrans_s2_to_1_neg,SC_dx,-0.000146,-0.001509,0.998796,"[-0.19, 0.19]",0.998796
60,DM,ntrans_s2_to_3_neg,SC_dx,0.157028,1.718708,0.085667,"[-0.02, 0.34]",0.257002
61,DM,ntrans_s3_to_1_neg,SC_dx,-0.105625,-0.962529,0.335784,"[-0.32, 0.11]",0.640028


In [51]:
results_df.sort_values('FDRp', axis=0)
results_df.loc[results_df['IV_pval']<0.05,:]

Unnamed: 0,movie,DV,IV,IV_coeff,IV_stat,IV_pval,IV_ci,FDRp
3,DM,nonneg_s1_pct,Anx_dx,-0.133299,-2.549798,0.010779,"[-0.24, -0.03]",0.032336
4,DM,nonneg_s2_pct,Anx_dx,0.09953,2.014455,0.043962,"[0.0, 0.2]",0.065943
10,DM,s2_to_2_neg,Anx_dx,0.005352,2.140685,0.032299,"[0.0, 0.01]",0.126686
14,DM,s3_to_3_neg,Anx_dx,-0.002937,-2.198294,0.027928,"[-0.01, -0.0]",0.126686
45,DM,nonneg_s1_pct,SC_dx,-0.218322,-4.226049,2.4e-05,"[-0.32, -0.12]",7.1e-05
46,DM,nonneg_s2_pct,SC_dx,0.109151,2.204055,0.02752,"[0.01, 0.21]",0.02752
47,DM,nonneg_s3_pct,SC_dx,0.129594,2.384805,0.017088,"[0.02, 0.24]",0.025632
53,DM,s2_to_3_neg,SC_dx,0.002361,2.469134,0.013544,"[0.0, 0.0]",0.121897
56,DM,s3_to_3_neg,SC_dx,-0.002766,-2.061892,0.039218,"[-0.01, -0.0]",0.176481
62,DM,ntrans_s3_to_2_neg,SC_dx,0.226958,2.528552,0.011453,"[0.05, 0.4]",0.068721
