In [None]:
# Import packages
import os
import copy
import shutil
import numpy as np
import pandas as pd
from glob import glob
import dill
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from netneurotools import cluster 
# from netneurotools import plotting as nnt_plotting
import nibabel as nib
from stepmix.stepmix import StepMix
from stepmix.bootstrap import blrt
from nilearn.maskers import NiftiMasker, NiftiLabelsMasker
from nilearn import plotting, datasets, image
from sklearn.linear_model import RidgeClassifier, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.mixture import BayesianGaussianMixture
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from statistics import mode
from scipy.stats import kruskal, pearsonr, spearmanr, mannwhitneyu, zscore, median_abs_deviation, iqr
from statsmodels.stats.multitest import fdrcorrection as fdr
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
import statannotations as sa
from statannotations.Annotator import Annotator
import statsmodels.formula.api as smf
from pingouin import pairwise_gameshowell, mwu, pairwise_tests, ttest
from datetime import date
from re import match
import starbars
from IPython.display import display

# Date stamp (YYYY-MM-DD) used in the names of files written by this notebook
today = date.today().isoformat()

# Default seaborn colour cycle
sns.set_palette('Paired')

# Let pandas display up to 999 rows / columns before truncating
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 999)

from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the cell output instead of plain text."""
    rendered = Markdown(string)
    display(rendered)

In [None]:
# Set plotting defaults
# White-grid background with a serif (Times New Roman) font; one set_style call is sufficient
sns.set_style('whitegrid', {'font.family': 'serif', 'font.serif': 'Times New Roman'})
# 15-colour reversed nipy_spectral cycle (replaces the 'Paired' palette set in the import cell)
sns.set_palette('nipy_spectral_r', n_colors=15)

In [None]:
# Set paths and variables
# Root locations (cluster-specific absolute paths)
home = '/gpfs/milgram/pi/gee_dylan/candlab/data'
datapath = '/gpfs/milgram/pi/gee_dylan/candlab/analyses/shapes/shapes_phenotyping'
fslpath = '/home/tjk33/project/SHAPES_task_act/out'
suffix = 'bold_8dv_resampled.nii.gz'

# Locations derived from the roots above
hcpdata = f'{home}/mri/hcp_pipeline_preproc/shapes'
taskfiles = f'{home}/behavioral/shapes/task_design_trialwise'
analysis = f'{datapath}/Analysis'
plt_out = f'{analysis}/Figures'

# Behavioural dataset (author's note: 3-08 version includes updated dev stage vars)
behav_csv = f'{analysis}/Behav_Dataset_AdulthoodRegr_n=131_2024-06-21.csv'
bv_df_orig = pd.read_csv(behav_csv)
subjects = bv_df_orig.loc[:, 'Subject'].tolist()

In [None]:
# Cast the demographic covariates in the behavioural frame to pandas categoricals
for covariate in ('sex', 'years_education', 'combined_income'):
    bv_df_orig[covariate] = bv_df_orig[covariate].astype('category')

In [None]:
# Read in regressed FSL cope betas (one CSV per contrast and atlas)
betas_date_mackey = '2024-06-21' #'2023-12-21'#
betas_date_shen = '2024-06-27'
betas_date_dacc = '2024-06-21' #'2024-06-18'
dacc_roi = 'AAL3'
#Need to drop cerebellar nodes? Think they are 341-368


def _load_betas(contrast, atlas, betas_date):
    """Read one regressed cope-beta CSV from the analysis folder, indexed by Subject."""
    csv_name = '/Regressed_{}_{}_FSL_CopeBetas_n=131_{}_NotDemeaned.csv'.format(contrast, atlas, betas_date)
    return pd.read_csv(analysis + csv_name).set_index('Subject')


def _load_dacc_betas(contrast, tag):
    """Read the dACC betas for one contrast and suffix the ROI column with ``tag``."""
    dacc_col = '{}_dACC'.format(dacc_roi)
    dacc_betas = _load_betas(contrast, dacc_col, betas_date_dacc)
    return dacc_betas.rename(columns={dacc_col: '{}_{}'.format(dacc_col, tag)})


## Shen Data
threat_reg_df_shen = _load_betas('ThreatVBaseline', 'Shen368', betas_date_shen)
safety_reg_df_shen = _load_betas('SafetyVBaseline', 'Shen368', betas_date_shen)
tvs_reg_df_shen = _load_betas('ThreatVSafety', 'Shen368', betas_date_shen)

# Mackey & Subcort data
threat_reg_df = _load_betas('ThreatVBaseline', 'Mackey', betas_date_mackey)
safety_reg_df = _load_betas('SafetyVBaseline', 'Mackey', betas_date_mackey)
tvs_reg_df = _load_betas('ThreatVSafety', 'Mackey', betas_date_mackey)

# dACC data (ROI column renamed to <roi>_dACC_thr / _saf / _tvs)
threat_reg_dacc = _load_dacc_betas('ThreatVBaseline', 'thr')
safety_reg_dacc = _load_dacc_betas('SafetyVBaseline', 'saf')
tvs_reg_dacc = _load_dacc_betas('ThreatVSafety', 'tvs')

# Subjects present in the dACC safety file
sublist = safety_reg_dacc.index.tolist()

In [None]:
# Attach the dACC beta to the Mackey/subcortical betas, matching rows on Subject
threat_reg_sd = threat_reg_df.reset_index().merge(threat_reg_dacc.reset_index(), on='Subject')
safety_reg_sd = safety_reg_df.reset_index().merge(safety_reg_dacc.reset_index(), on='Subject')
tvs_reg_sd = tvs_reg_df.reset_index().merge(tvs_reg_dacc.reset_index(), on='Subject')

### Get Dataset Info

In [None]:
# Sample composition; the message below treats sex == 0 as male and sex == 1 as female
n_tot = len(bv_df_orig)
n_m, n_f = (len(bv_df_orig[bv_df_orig['sex'] == sex_code]) for sex_code in (0, 1))

pct_male = round(n_m/n_tot*100, 3)
pct_female = round(n_f/n_tot*100, 3)
print("Participant sample is {}% male and {}% female".format(pct_male, pct_female))

age_at_ri = bv_df_orig['age_at_ri']
print("Mean age is {}, std {}".format(round(age_at_ri.mean(), 3), round(age_at_ri.std(), 3)))

In [None]:
# Keep only behavioural rows for subjects that have threat-vs-safety betas
subjects_with_betas = tvs_reg_df.reset_index()['Subject']
bv_df_merged = pd.merge(subjects_with_betas, bv_df_orig).reset_index()
# Square-root transform of the anxiety problems total
bv_df_merged = bv_df_merged.assign(anx_dep_sx=np.sqrt(bv_df_merged['Anxiety_Problems_Total']))

### Collapse across left and right hemispheres


In [None]:
def collapse_hemispheres(df, cond, roi=None):
    """Average left/right ROI betas into one bilateral column per region.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Subject``, the left/right Shen368 hippocampus and amygdala
        columns, the left/right Mackey area 14m, 32, 25 and 24 columns, and a
        ``'<roi>_dACC_<cond>'`` column.
    cond : str
        Tag appended to every output column name (the notebook passes
        'saf', 'thr' and 'tvs').
    roi : str, optional
        Prefix of the dACC column. When omitted, the notebook-level ``dacc_roi``
        is used, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``Subject`` followed by hipp, amyg, dACC and Mackey 14m, 32, 25, 24
        columns, each suffixed with ``cond``.
    """
    if roi is None:
        roi = dacc_roi

    def bilateral_mean(left_col, right_col):
        # Element-wise mean of the two hemispheres; a NaN on either side propagates
        return np.mean((df[left_col], df[right_col]), axis=0)

    collapsed_df = pd.DataFrame(df['Subject'])
    collapsed_df['hipp_{}'.format(cond)] = bilateral_mean('left_hippocampus_Shen368', 'right_hippocampus_Shen368')
    collapsed_df['amyg_{}'.format(cond)] = bilateral_mean('left_amygdala_Shen368', 'right_amygdala_Shen368')

    # The dACC ROI is a single column, so it is copied over unchanged
    dacc_col = '{}_dACC_{}'.format(roi, cond)
    collapsed_df[dacc_col] = df[dacc_col]

    # Only Mackey areas 14m, 32, 25 and 24 are collapsed; 14rr, 14r, 14c and 11m are left out
    for area in ('14m', '32', '25', '24'):
        collapsed_df['Mackey_{}_{}'.format(area, cond)] = bilateral_mean(
            'Mackey_area{}_left'.format(area), 'Mackey_area{}_right'.format(area))
    return collapsed_df

In [None]:
# Collapse each contrast's left/right ROIs into bilateral averages
safety_coll_df, threat_coll_df, tvs_coll_df = (
    collapse_hemispheres(contrast_df.reset_index(), tag)
    for contrast_df, tag in ((safety_reg_sd, 'saf'), (threat_reg_sd, 'thr'), (tvs_reg_sd, 'tvs'))
)

In [None]:
# Mackey areas 32 and 14m are highly correlated: replace the pair with their mean per contrast
def _average_32_14m(coll_df, cond):
    """Add 'Mackey_32_14m_<cond>' (mean of areas 32 and 14m) and drop the two source columns."""
    area32 = 'Mackey_32_{}'.format(cond)
    area14m = 'Mackey_14m_{}'.format(cond)
    coll_df['Mackey_32_14m_{}'.format(cond)] = (coll_df[area32] + coll_df[area14m])/2
    return coll_df.drop([area32, area14m], axis=1)


safety_coll_df = _average_32_14m(safety_coll_df, 'saf')
threat_coll_df = _average_32_14m(threat_coll_df, 'thr')
tvs_coll_df = _average_32_14m(tvs_coll_df, 'tvs')

In [None]:
# Threat and safety betas side by side (complete cases only), z-scored column-wise
thr_saf_betas = pd.concat(
    [threat_coll_df.set_index('Subject'), safety_coll_df.set_index('Subject')],
    axis=1,
).dropna()
all_coll_df = zscore(thr_saf_betas, axis=0)

# Regressed adversity scores for the four developmental periods, indexed by Subject
adversity_cols = ['Early_Childhood_regr', 'Mid_Childhood_regr', 'Adolescence_regr', 'Adulthood_regr']
adversity_by_subject = bv_df_merged.set_index('Subject')[adversity_cols]

# Adversity + z-scored threat/safety betas
adv_binned_df = pd.concat([adversity_by_subject, all_coll_df], axis=1)

# Adversity + threat-vs-safety betas (not z-scored at this point)
adv_binned_tvs = pd.concat([adversity_by_subject, tvs_coll_df.set_index('Subject')], axis=1)

### Set input data for clustering

NOTE: A large fronto-orbital brightness artifact in many scans makes this region challenging to analyze (Paola mentioned this to me, and it can also be observed in the preprocessed data images). For this reason, we should exclude Mackey parcellations 14rr, 11m, 14c, and 14r. Parcellations 24, 25, 32, and 14m are okay to use.

In [None]:
# Test for outliers (distance from the median, measured in standard deviations)

def get_outliers(df, var, n_sd=3):
    """Return the row positions of outlying values in one column.

    A value is an outlier when it lies more than ``n_sd`` sample standard
    deviations above or below the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to screen.
    var : str
        Name of the column to test.
    n_sd : float, optional
        Half-width of the accepted band in standard deviations (default 3).

    Returns
    -------
    list of int
        Zero-based row positions (not index labels) of the outlying rows.
        NaN values never compare as outliers.
    """
    values = df[var]
    center = values.median()
    half_width = n_sd * values.std()
    is_outlier = ((values > center + half_width) | (values < center - half_width)).to_numpy()
    return np.flatnonzero(is_outlier).tolist()

In [None]:
# Set input data HERE
# Frame that feeds the outlier screening and the LPA below: the four regressed adversity
# scores plus the bilateral threat-vs-safety betas. This is an alias, not a copy.
data_prep = adv_binned_tvs

In [None]:
# Row positions flagged in any column of data_prep (a row may be flagged more than once)
outliers = [
    row_pos
    for col_name in data_prep.columns
    for row_pos in get_outliers(data_prep, col_name)
]

# De-duplicate so each flagged row is dropped once
all_outliers = list(set(outliers))
n_remaining = len(data_prep.reset_index().drop(all_outliers, axis=0))
print('Dropping {} subjects with outlier data; new sample is {} pts'.format(len(all_outliers), n_remaining))

In [None]:
# Drop outliers and set up input data
# all_outliers holds row positions within data_prep. Translate them to Subject IDs so both
# frames are filtered by ID rather than by assuming they share row order and length.
outlier_subjects = data_prep.index[all_outliers]
in_beta_mat_pre = data_prep.drop(index=outlier_subjects)

# Set input data and standardize
in_beta_mat = zscore(in_beta_mat_pre, axis=0)

# Drop the same subjects from the behavioral data
bv_df = bv_df_merged.reset_index()
bv_df = bv_df[~bv_df['Subject'].isin(outlier_subjects)]

### Set up analysis df

In [None]:
# Variance inflation factors, taken as the diagonal of the inverse correlation matrix
# (code from https://stackoverflow.com/questions/42658379/variance-inflation-factor-in-python)
corr_matrix = in_beta_mat.corr().to_numpy()
vifs = pd.DataFrame({'VIF': np.linalg.inv(corr_matrix).diagonal()}, index=in_beta_mat.columns)

# Positions of the variables whose VIF exceeds 5
drop_vifs = np.flatnonzero((vifs['VIF'] > 5).to_numpy()).tolist()

vifs

In [None]:
# Clustering inputs joined with raw behavioural measures (complete cases, indexed by Subject)
behav_cols = ["Subject", "Anxiety_Problems_Total", 'Internalizing_Problems_Total', 'CTQ_Total',
              'Early_Childhood', 'Mid_Childhood', 'Adolescence', 'Adulthood', 'Total_Events']
corr_df = (
    in_beta_mat.reset_index()
    .merge(bv_df_orig[behav_cols], on='Subject', suffixes=(None, '_merged'))
    .set_index('Subject')
    .dropna(axis=0)
)

# Heatmap of the correlations among the clustering inputs only
corr = in_beta_mat.corr()
print(corr.shape)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='coolwarm', annot=True,
            annot_kws={"size": "xx-small"}, ax=ax)
fig.tight_layout()

### LPA Modeling

In [None]:
from stepmix.stepmix import StepMix
# One list per fit statistic, filled in the order the candidate models are fitted
info_crits = {stat: [] for stat in ('n_comps', 'BIC', 'AIC', 'CAIC', 'Avg_LL',
                                    'SS_BIC', 'Entropy', 'Scaled_Entropy')}

# Fit Gaussian mixture (latent profile) models with 1 to 6 classes
for n_classes in range(1, 7):
    model = StepMix(n_components=n_classes, measurement="continuous", init_params='random',
                    verbose=1, random_state=0, n_init=1000)
    model.fit(in_beta_mat)

    # Fit statistics of this solution, evaluated on the data it was fitted to
    fit_stats = {
        'n_comps': n_classes,
        'BIC': model.bic(in_beta_mat),
        'AIC': model.aic(in_beta_mat),
        'CAIC': model.caic(in_beta_mat),
        'Avg_LL': model.score(in_beta_mat),
        'SS_BIC': model.sabic(in_beta_mat),
        'Entropy': model.entropy(in_beta_mat),
        'Scaled_Entropy': model.relative_entropy(in_beta_mat),
    }
    for stat, value in fit_stats.items():
        info_crits[stat].append(value)

    # Save class membership predictions to df
    # model.predict(in_beta_mat)


In [None]:
# Tabulate the information criteria of all candidate models, rounded to 4 decimals
bic_df = pd.DataFrame.from_dict(info_crits).round(decimals=4)

printmd('**Information criteria for hemispheres averaged together**')
bic_df

In [None]:
# Settings of the 3-class solution used for class assignment
lpa_settings = dict(
    n_components=3,
    measurement="continuous",
    init_params='random',
    verbose=1,
    random_state=0,
    n_init=1000,
    n_steps=3,
    correction='BCH',
    assignment='soft',
)
final_model = StepMix(**lpa_settings)
final_model.fit(in_beta_mat)

In [None]:
# Score of the fitted final model on its own training data (the same quantity the
# model-comparison loop stores under 'Avg_LL')
final_model.score(in_beta_mat)

In [None]:
k = 4
k_less1 = 3

# Bootstrapped likelihood ratio test: k_less1 classes (null) against k classes (alternative)
null_model_2 = StepMix(n_components=k_less1, measurement="continuous", init_params = 'random',
                      verbose=1, random_state=0, n_init=1000, 
                      correction='BCH', assignment='soft')

# The alternative model gets its own name so that the 3-class `final_model` fitted above,
# which the class-assignment cells below rely on, is not replaced by a k-class model.
blrt_alt_model = StepMix(n_components=k, measurement="continuous", init_params = 'random',
                         verbose=1, random_state=0, n_init=1000, 
                         correction='BCH', assignment='soft')

blt1 = blrt(null_model_2, blrt_alt_model, X = in_beta_mat, n_repetitions=1000)
# Label the result with the class counts that were actually compared
print("{} classes vs. {} classes: p={}".format(k_less1, k, blt1))

In [None]:
# Re-print the BLRT result, labelled with the class counts held in k and k_less1
print("{} classes vs. {} classes: p={}".format(k, k_less1, blt1))

In [None]:
# Class assignments and per-class membership probabilities from the fitted model
lpa_clusters, cluster_prob = (
    final_model.predict(in_beta_mat),
    final_model.predict_proba(in_beta_mat),
)
cluster_prob_df = pd.DataFrame(data=cluster_prob)

In [None]:
# One row per subject in the clustering input, paired with its predicted class
subject_ids = in_beta_mat.reset_index()[['Subject']].dropna(axis=0)
bgimm_cluster_df = pd.DataFrame(subject_ids).assign(ClusterID=lpa_clusters)

In [None]:
# Number of subjects assigned to each class
print(bgimm_cluster_df.groupby('ClusterID')['Subject'].count())

In [None]:
# Alias for the subject/class table (same DataFrame object, not a copy)
both_hems_clustrs = bgimm_cluster_df 

In [None]:
# Join class labels with the outlier-free behavioural data on their shared columns
summdf = both_hems_clustrs.merge(bv_df)
# Median number of total events per class
median_events_by_class = summdf.groupby('ClusterID')[['Total_Events']].median()
print(median_events_by_class)

### Evaluate Clustering Results

In [None]:
# Class labels + clustering inputs (inner join on the shared columns)
m1 = bgimm_cluster_df.merge(in_beta_mat.reset_index(), how='inner')
# Add the behavioural variables; clashing behavioural columns get a '_bvdf' suffix
m2 = m1.merge(bv_df_orig, how='left', on='Subject', suffixes=(None, '_bvdf'))

# All participants are kept: the dropna on 'Total_Events' is currently disabled
group_df_orig = m2#.dropna(axis=0, subset=['Total_Events'])
group_df_orig['ClusterID'] = group_df_orig['ClusterID'].astype('category')

In [None]:
# Subject-to-class lookup table and the number of subjects per class
clust_beta_df = group_df_orig.loc[:, ['Subject', 'ClusterID']]
counts = clust_beta_df.groupby('ClusterID')['Subject'].count()
counts

In [None]:
def recode_cluster(df, var_name, order):
    """Relabel cluster IDs and strip the ``Class_`` prefix from the new labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the cluster column. It is modified in place.
    var_name : str
        Name of the cluster column.
    order : dict
        Maps each current cluster value to a label of the form ``'Class_<x>'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``var_name`` holding the label text that
        follows ``'Class_'`` (as strings).
    """
    df[var_name] = df[var_name].replace(order)
    # Remove only a leading literal 'Class_'. str.lstrip would treat its argument as a set
    # of characters and could also strip the start of the label itself.
    df[var_name] = df[var_name].str.replace(r'^Class_', '', regex=True)

    return df
        

In [None]:
# Relabel the model's class indices (intended ordering: lowest to highest adversity exposure)
order_dict = {
    0: 'Class_3',
    1: 'Class_2',
    2: 'Class_1',
}
recoded_df = recode_cluster(group_df_orig, 'ClusterID', order_dict)

# Add the z-scored threat/safety betas; clashing neural columns get a 'neural' suffix
group_df = recoded_df.merge(all_coll_df.reset_index(), on='Subject', how='inner',
                            suffixes=(None, 'neural'))
group_df['ClusterID'] = group_df['ClusterID'].astype(int)

# Write the analysis dataset to CSV; the file name records sample size and run date
csv_name = '/Analysis_Dataset_LPA_3class_n={}_{}.csv'.format(len(group_df), today)
group_df_file = analysis + csv_name
group_df.to_csv(group_df_file)
print(group_df_file)

# Subjects per recoded class
counts = group_df.groupby('ClusterID').count().iloc[:, 0]
counts

In [None]:
# Set order and categories for all plots

import itertools
# Sorted list of the class IDs present in group_df
clus_cats = group_df['ClusterID'].value_counts().index.astype(int).sort_values().tolist()

# Every unordered pair of classes, for pairwise annotations
pair_cats = list(itertools.combinations(clus_cats, 2))

# Names used by the plotting cells
order, pairs = clus_cats, pair_cats

In [None]:
# def get_network(df, atlas_defs, network_val):
#     func_net_inds = np.where(atlas_defs.iloc[0,:].to_numpy() == network_val) # Get indices of column
#     func_net = df.iloc[:,func_net_inds[0].tolist()] # Subset columns by those indices
#     return func_net

In [None]:
# # Read in network definitions
# atlas_definitions = pd.read_csv('/gpfs/milgram/pi/gee_dylan/lms233/Shen368/Shen_TenNetwork_368atlas.txt', header=None, sep='\t', index_col=None)
# at_subset = atlas_definitions.iloc[:,0:341]
# at_subset.columns = range(1,342) # Rename columns to account for 0-index

# def compute_net_data(regressed_nets, thde, atlas_subset):
    
#     func_net1 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 1).mean(axis=1), columns = ['net_1_{}'.format(thde)])
#     func_net2 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 2).mean(axis=1), columns = ['net_2_{}'.format(thde)])
#     func_net3 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 3).mean(axis=1), columns = ['net_3_{}'.format(thde)])
#     func_net4 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 4).mean(axis=1), columns = ['net_4_{}'.format(thde)])
#     func_net5 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 5).mean(axis=1), columns = ['net_5_{}'.format(thde)])
#     func_net6 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 6).mean(axis=1), columns = ['net_6_{}'.format(thde)])
#     func_net7 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 7).mean(axis=1), columns = ['net_7_{}'.format(thde)])
#     func_net8 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 8).mean(axis=1), columns = ['net_8_{}'.format(thde)])
#     func_net9 = pd.DataFrame(get_network(regressed_nets, atlas_subset, 9).mean(axis=1), columns = ['net_9_{}'.format(thde)])
#     # func_net10 = get_network(tvs_full, at_subset, 10).mean(axis=1) # Exclude bc all cerebellar nodes
    
#     # Combine dfs into one
#     func_net_df = zscore(pd.concat([func_net1, func_net2,
#                              func_net3, func_net4, func_net5,
#                              func_net6, func_net7, func_net8,
#                              func_net9], axis=1), axis=0)
#     return func_net_df

In [None]:
# # Get network data for each contrast
# thr_func_net = compute_net_data(threat_reg_df_shen, 'thr', at_subset)
# saf_func_net = compute_net_data(safety_reg_df_shen, 'saf', at_subset)
# tvs_func_net = compute_net_data(tvs_reg_df_shen, 'tvs', at_subset)

In [None]:
# # Create network df
# net_df= pd.merge(group_df, tvs_func_net, on='Subject')
# thrsaf_netdf = pd.merge(thr_func_net, saf_func_net, on='Subject')
# all_func_nets_df = pd.merge(net_df, thrsaf_netdf, on='Subject')

# net_dvars = ['net_1_tvs', 'net_2_tvs', 'net_3_tvs',
#              'net_4_tvs', 'net_5_tvs', 'net_6_tvs', 'net_7_tvs',
#              'net_8_tvs', 'net_9_tvs']

# fdr_list = []
# for i, yvar in enumerate(net_dvars):
#     xvar = net_df[['ClusterID', 
#                        'sex', 'asr_age', 
#                        'combined_income', 'years_education']]
#     xvar['ClusterID'] = xvar['ClusterID'].astype('int')
#     xmat = sm.add_constant(xvar)
    
#     net_model = smf.ols("{} ~ C(ClusterID) + sex + asr_age + combined_income + years_education".format(yvar), data=net_df).fit()
    
#     table=sm.stats.anova_lm(net_model, type='3', robust='hc3')
#     display(table)
#     fdr_list.append(table['PR(>F)']['C(ClusterID)'])


In [None]:
# import statannotations as sa
# fig, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize = (12, 8))
# sns.set_palette('Paired')

# from statannotations.Annotator import Annotator


# x = "ClusterID"

# axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]

# dvars = ['net_1_tvs', 'net_2_tvs', 'net_3_tvs',
#          'net_4_tvs', 'net_5_tvs', 'net_6_tvs', 'net_7_tvs',
#          'net_8_tvs', 'net_9_tvs']

# labels = ['Functional Network 1', 'Functional Network 2', 'Functional Network 3',
#           'Functional Network 4', 'Functional Network 5', 'Functional Network 6', 
#           'Functional Network 7', 'Functional Network 8', 'Functional Network 9']

# titles = ['Medial frontal network', 'Frontoparietal network', 'Default mode network',
#           'Motor network', 'Visual I network', 'Visual II network', 
#           'Visual association network', 'Cingulo-opercular network', 'Subcortical network']

# for i in range(0, len(dvars)):
#     y = dvars[i]
#     sns.boxplot(y=y, x=x, ax = axes[i], data=net_df) #order=order, 
#     sns.stripplot(y=y, x=x, data = net_df, edgecolor='black', linewidth = 0.5, ax = axes[i]) #order=order, 
#     axes[i].set_ylabel(labels[i], size=10)
#     axes[i].set_xlabel('Latent Profile', size=10)
#     axes[i].set_title(titles[i], size=15)
#     # axes[i].set_ylim(-2, 4)
 
#     annotator = Annotator(axes[i], pairs, data=net_df, x=x, y=y, order=order)
#     annotator.configure(test='t-test_ind', text_format='star', loc='inside')
#     annotator.apply_and_annotate()

# fig.tight_layout()

In [None]:
# fig, ((ax1, ax2, ax3)) = plt.subplots( 1, 3, figsize = (16, 4))

# from statannotations.Annotator import Annotator
# df= all_func_nets_df
# x = "ClusterID"

# order = clus_cats
# pairs = pair_cats

# axes = [ax1, ax2, ax3, ax4, ax5, ax6, 
#         ax7, ax8, ax9, ax10, ax11, ax12, 
#         ax13, ax14, ax15, ax16, ax17, ax18]

# dvars = [ "net_3_tvs", "net_3_thr", "net_3_saf"]


# labels2 = ['Default Mode Network\nThreat vs. Safety', 
#            'Default Mode Network\nThreat vs. Baseline', 
#            'Default Mode Network\nSafety vs. Baseline']

# thr_subcort_palette = ['#6f89a2', '#a16527', '#784050']
# thr_subcort_palette_point = ['#8fb1d0', '#cf8232', '#965064']

# saf_subcort_palette = ['#c5dcf1', '#f0bd87', '#c096a2']
# saf_subcort_palette_point = ['#d8e7f5', '#f5d3af', '#d5b9c1']

# tvs_subcort_palette = ['#9fc5e8', '#e69138', '#965064']
# tvs_subcort_palette_point = ['#b2d0ec', '#eba75f', '#e58080']

# for i in range(0, len(dvars)):
#     y = dvars[i]
#     if '_thr' in y:
#         plotpalette = thr_subcort_palette
#         plotpointpal = thr_subcort_palette_point
#     elif '_saf' in y:
#         plotpalette = saf_subcort_palette
#         plotpointpal = saf_subcort_palette_point
#     elif '_tvs' in y:
#         plotpalette = tvs_subcort_palette
#         plotpointpal = tvs_subcort_palette_point
#     else:
#         print ('Error! Could not identify dvar palette to use')
        
#     sns.boxplot(data=df, x=x, y=y,  ax = axes[i], palette=plotpalette) #order=order,
#     sns.stripplot(y=y, x=x,  data = df, edgecolor='black',  linewidth = 0.5, ax = axes[i], palette=plotpointpal) #order=order,
#     axes[i].set_ylabel(labels2[i], size=14)
#     axes[i].set_xlabel('Latent Profile', size=14)
#     # axes[i].set_ylim(-2, 7)
    
#     annotator = Annotator(axes[i], pairs, data=df, x=x, y=y, order=order)
#     annotator.configure(test='t-test_ind', text_format='star', loc='inside')
#     annotator.apply_and_annotate()

# fig.tight_layout()

# plt.savefig(analysis + '/Figures/LCDiffs_FuncNetworks_{}.png'.format(today), dpi=300, transparent=True)

In [None]:
# # Anovas comparing differences
# from statsmodels.formula.api import ols

# dvars = ['net_3_tvs', 'net_3_thr', 'net_3_saf']

# for i in range(0, len(dvars)):
#     printmd('**{}**'.format(dvars[i]))
#     df = pairwise_tests(data = all_func_nets_df, between = 'ClusterID', dv = dvars[i], parametric=True, subject = 'Subject', padjust='fdr_bh', effsize='cohen', return_desc=True)
#     display(df)

### Plot points to examine clustering

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Reduce the clustering inputs to 3 principal components ...
pca_model = PCA(n_components=3, random_state=0)
pca = pca_model.fit_transform(in_beta_mat)

# ... then embed those components in 2-D with t-SNE
tsne_model = TSNE(n_components=2, init='random', random_state=0, n_iter=3000)
tsne_fit = tsne_model.fit_transform(pca)

In [None]:
embed_df = pd.DataFrame(tsne_fit, columns = ['Dimension 1', 'Dimension 2'])
# The rows of tsne_fit follow in_beta_mat, whose index holds the Subject IDs. Attach the class
# label by Subject rather than by row position, because group_df has been through inner
# merges and is not guaranteed to share in_beta_mat's row order or length.
embed_df['Subject'] = in_beta_mat.index.to_numpy()
embed_df = embed_df.merge(group_df[['Subject', 'ClusterID']], on='Subject', how='left')

sns.scatterplot(x = 'Dimension 1', y = 'Dimension 2', hue = 'ClusterID', data = embed_df)

### Plot hippocampal, amygdalar, and adversity data

**Mackey segmentations** \
Mackey Area 14c: Majority Frontal Orbital Cortex and Subcallosal Cortex \
Mackey Area 24: Majority Frontal Medial Cortex and Subcallosal Cortex \
Mackey Area 25: Majority Subcallosal Cortex and Frontal Orbital Cortex \
\
**PCA components**\
Mackey_14r_14rr_11m_PCA: Majority Frontal Pole, Frontal Orbital Cortex, and Frontal Medial Cortex \
Mackey_14m_32_PCA: Majority Subcallosal Cortex, Anterior Cingulate Gyrus, and Paracingulate Gyrus

Mackey Area 11m: Majority Frontal Pole, Frontal Medial Cortex, and Frontal Orbital Cortex \
Mackey Area 14r: Majority Frontal Orbital Cortex, Subcallosal Cortex, and Frontal Medial Cortex \
Mackey Area 14rr: Majority Frontal Orbital Cortex, Frontal Pole, and Frontal Medial Cortex \
Mackey Area 14m: Majority Frontal Medial Cortex, Subcallosal Cortex, Paracingulate Gyrus, and Frontal Pole \
Mackey Area 32: Majority Anterior Cingulate Gyrus, Paracingulate Gyrus, and Subcallosal Cortex

In [None]:
# 3 x 6 grid: one row per contrast, one column per region of interest
fig, axs = plt.subplots(3, 6, figsize=(20, 10))
axes = axs.ravel().tolist()
(ax1, ax2, ax3, ax4, ax5, ax6,
 ax7, ax8, ax9, ax10, ax11, ax12,
 ax13, ax14, ax15, ax16, ax17, ax18) = axes

from statannotations.Annotator import Annotator
df = group_df
x = "ClusterID"

order = clus_cats
pairs = pair_cats

# Column stems and display names of the regions, in plotting order
region_cols = ['hipp', 'amyg', 'Mackey_24', 'Mackey_25', 'Mackey_32_14m', '{}_dACC'.format(dacc_roi)]
region_names = ['Hippocampus', 'Amygdala', 'vmPFC Area 24', 'vmPFC Area 25',
                'vmPFC Areas 32 & 14m', '{} dACC'.format(dacc_roi)]
# (column suffix, display name) for each contrast, in row order
contrasts = [('tvs', 'Threat vs. Safety'),
             ('thr', 'Threat vs. Baseline'),
             ('saf', 'Safety vs. Baseline')]

dvars = ['{}_{}'.format(stem, tag) for tag, _ in contrasts for stem in region_cols]
labels2 = ['{}\n{}'.format(region, contrast_name)
           for _, contrast_name in contrasts for region in region_names]

# Box fill colours and the lighter shades used for the individual points
plotpalette = ['#9fc5e8', '#e69138', '#965064']
plotpointpal = ['#b2d0ec', '#eba75f', '#e58080']

for panel, y, panel_label in zip(axes, dvars, labels2):
    sns.boxplot(data=df, x=x, y=y, ax=panel, palette=plotpalette)
    sns.stripplot(data=df, x=x, y=y, ax=panel, palette=plotpointpal,
                  edgecolor='black', linewidth=0.5)
    panel.set_ylabel(panel_label, size=14)
    panel.set_xlabel('Latent Profile', size=14)

    # Independent-samples t-tests between every pair of classes, drawn as stars
    annotator = Annotator(panel, pairs, data=df, x=x, y=y, order=order)
    annotator.configure(test='t-test_ind', text_format='star', loc='inside')
    annotator.apply_and_annotate()

fig.tight_layout()

plt.savefig(analysis + '/Figures/LCDiffs_PFC_{}.png'.format(today), dpi=300, transparent=True)

In [None]:
# Echo the file name (without directory) of the figure saved by the plotting cell above
print('LCDiffs_PFC_{}.png'.format(today))

In [None]:
 # Convert to long format
# One long-format frame per region: a row per Subject x condition (saf / thr)
def _melt_thr_saf(prefix, value_name):
    """Melt the '<prefix>_saf' and '<prefix>_thr' columns of group_df into long format.

    The result has columns Subject, ClusterID, thr_vs_saf ('saf' or 'thr') and ``value_name``.
    """
    wide = group_df[['Subject', 'ClusterID', '{}_saf'.format(prefix), '{}_thr'.format(prefix)]]
    long_df = pd.melt(wide, id_vars=['Subject', 'ClusterID'],
                      var_name='thr_vs_saf', value_name=value_name)
    # The condition tag is the text after the last underscore, however many underscores
    # the region prefix itself contains
    long_df['thr_vs_saf'] = long_df['thr_vs_saf'].str.rsplit('_', n=1).str[-1]
    return long_df


# Hipp
hipp_long = _melt_thr_saf('hipp', 'hipp_act')
# Amyg
amyg_long = _melt_thr_saf('amyg', 'amyg_act')
# vmPFC 25
m25_long = _melt_thr_saf('Mackey_25', 'm25_act')
# vmPFC 24
m24_long = _melt_thr_saf('Mackey_24', 'm24_act')
# vmPFC 32 + 14m
m32_long = _melt_thr_saf('Mackey_32_14m', 'm32_act')
# dACC
dacc_long = _melt_thr_saf('{}_dACC'.format(dacc_roi), 'dacc_act')

# Merge data: inner-join all regions on subject, class and condition
merge_keys = ['Subject', 'ClusterID', 'thr_vs_saf']
all_long = hipp_long
for region_long in (amyg_long, m25_long, m24_long, m32_long, dacc_long):
    all_long = pd.merge(all_long, region_long, how='inner', on=merge_keys)

In [None]:
# Within each class, test whether the threat-vs-safety response of each region differs
# from zero (two-sided one-sample t-tests, FDR-corrected across classes per region)

dvars = ['hipp', 'amyg', 'Mackey_25', 'Mackey_24', 'Mackey_32_14m', '{}_dACC'.format(dacc_roi)]

print('*** ONE SAMPLE T_TESTS ***')
for dvar in dvars:
    tvs_col = '{}_tvs'.format(dvar)

    # One result row per class, collected first and concatenated once
    cluster_rows = []
    for cluster in clus_cats:
        cluster_vals = group_df.loc[group_df['ClusterID'] == cluster, tvs_col]
        row = ttest(x=cluster_vals, y=0, alternative="two-sided")
        row['mean (A)'] = cluster_vals.mean()
        row['std (A)'] = cluster_vals.std()
        cluster_rows.append(row)
    res_df = pd.concat(cluster_rows, axis=0)

    printmd('**{}**'.format(dvar))
    # Row labels follow the classes actually present rather than a fixed three-class list
    res_df.index = ['Cluster {}'.format(cluster) for cluster in clus_cats]
    res_df['pFDR'] = fdr(res_df['p-val'])[1]
    res_df = res_df.round(3)
    res_df_reordered = res_df[['mean (A)', 'std (A)', 'T', 'dof', 'p-val', 'pFDR', 'cohen-d']]
    display(res_df_reordered)


In [None]:
# Between-class comparisons (Games-Howell) for every region in every contrast
dvars = ['{}_{}'.format(stem, tag)
         for tag in ('thr', 'saf', 'tvs')
         for stem in ('hipp', 'amyg', 'Mackey_24', 'Mackey_25', 'Mackey_32_14m',
                      '{}_dACC'.format(dacc_roi))]

per_dvar_tables = []
for dvar in dvars:
    df = pairwise_gameshowell(data=group_df, between='ClusterID', dv=dvar,
                              effsize='cohen').round(3)
    # pairwise_tests is run only for its descriptive columns (per-group standard deviations)
    std_df = pairwise_tests(data=group_df, between='ClusterID', dv=dvar,
                            parametric=True, return_desc=True)
    df['Group'] = dvar
    printmd('**{}**'.format(dvar))
    # Append the std columns row-for-row to the Games-Howell table
    df = pd.concat([df, std_df[['std(A)', 'std(B)']]], axis=1)
    display(df)
    per_dvar_tables.append(df)

# All per-variable tables stacked into one frame
res = pd.concat(per_dvar_tables, axis=0)

In [None]:
# Plot adversity exposure by latent profile, one panel per developmental period
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
ax1, ax2, ax3, ax4 = axes
sns.set_palette('Paired')

x = "ClusterID"

order = clus_cats
pairs = pair_cats

# Columns drawn on the y axis of each panel
dvars = ["Early_Childhood_regr",
         "Mid_Childhood_regr",
         "Adolescence_regr",
         "Adulthood_regr"]

# Display labels (not used by the loop below)
adv_labels = ['Adversity Exposure in\nEarly Childhood ',
              'Adversity Exposure in\nMiddle Childhood',
              'Adversity Exposure\nin Adolescence',
              'Adversity Exposure\nin Adulthood',
              'Cumulative Exposure\nto Developmental Adversity',
              'Cumulative Exposure\nto Adversity']

# Columns used for the printed median and range, in the same order as `dvars`
pairwise_dvars = ["Early_Childhood",
                  "Mid_Childhood",
                  "Adolescence",
                  "Adulthood"]

adv_colors = ['#9fc5e8', '#e69138', '#965064']
adv_pointcols = ['#b2d0ec', '#eba75f', '#bc647e']

for i, (y, y_raw) in enumerate(zip(dvars, pairwise_dvars)):
    print(y)
    # All rows of group_df are used; no filtering is applied to the dependent variable
    df = group_df

    # Per-class descriptives: median and range are taken from `y_raw`, mean and SD from `y`
    for clus_id in (1, 2, 3):
        clus_df = df[df.ClusterID == clus_id]
        print('*** Class {} Exposure Median: {}; Mean: {}, Std Dev: {}, range = {}-{} ***\n'.format(
            clus_id, clus_df[y_raw].median(), clus_df[y].mean(), clus_df[y].std(),
            clus_df[y_raw].min(), clus_df[y_raw].max()))

    # Pass the same `order` to both layers and to the Annotator so boxes,
    # points and significance brackets are guaranteed to line up
    sns.boxplot(data=df, x=x, y=y, order=order, ax=axes[i], palette=adv_colors)
    sns.stripplot(y=y, x=x, data=df, order=order, edgecolor='black', linewidth=0.5, ax=axes[i], jitter=True,
                  palette=adv_pointcols)

    axes[i].set_ylabel('Number of Exposures', size=14)
    axes[i].set_xlabel('Latent Profile', size=14)
    axes[i].set_ylim(-2, 7)
    annotator = Annotator(axes[i], pairs, data=df, x=x, y=y, order=order)
    annotator.configure(test='Mann-Whitney', text_format='star', loc='inside')
    annotator.apply_and_annotate()

fig.tight_layout()
plt.savefig(analysis + '/Figures/LCDiffs_AdversityExp_{}.png'.format(today), dpi=300, transparent=True)

In [None]:
# Three independent, initially empty lists
p_1, p_2, p_3 = [], [], []

def get_desc_stats(data_df, vars_list, categories=None):
    """Summarise the median and range of each variable within each cluster.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a 'ClusterID' column and every column named in `vars_list`.
    vars_list : list of str
        Columns to summarise.
    categories : iterable, optional
        ClusterID values to summarise, one output row each. Defaults to the
        notebook-level `clus_cats`.

    Returns
    -------
    numpy.ndarray
        Object array of shape (len(categories), 2 * len(vars_list)). For the
        j-th variable, column 2*j holds the median and column 2*j + 1 holds
        the range as a "min-max" string with min and max rounded to integers.

    Raises
    ------
    AssertionError
        If selecting a category does not return fewer rows than `data_df`.
    """
    if categories is None:
        categories = clus_cats
    categories = list(categories)

    # Size the output from the inputs rather than assuming 3 clusters x 4 variables
    summary = np.zeros((len(categories), 2 * len(vars_list)), dtype='object')

    for i, cat in enumerate(categories):
        df = data_df[data_df['ClusterID'] == cat]
        # Sanity check against the frame that was actually passed in
        assert len(df) < len(data_df), 'cluster filter did not subset the data'

        for j, var in enumerate(vars_list):
            loc = 2 * j  # each variable occupies a (median, range) column pair
            var_min, var_max = round(df[var].min()), round(df[var].max())

            summary[i, loc] = df[var].median()
            summary[i, loc + 1] = "{}-{}".format(var_min, var_max)

    return summary

In [None]:
adv_list = ['Early_Childhood', 'Mid_Childhood', 'Adolescence', 'Adulthood']
adv_summary = get_desc_stats(group_df, adv_list)
# One (Median, Range) column pair per adversity variable
summary_cols = ['Median', 'Range'] * 4
pd.DataFrame(adv_summary, columns=summary_cols)

In [None]:
# Non-parametric pairwise comparisons between profiles: early childhood adversity exposure
ec_ptests = pairwise_tests(
    data=group_df,
    dv='Early_Childhood_regr',
    between='ClusterID',
    subject='Subject',
    parametric=False,
    padjust='fdr_bh',
    effsize='cohen',
    return_desc=True,
)
ec_ptests = ec_ptests.round(3)
ec_ptests['Group'] = 'Early Childhood'

In [None]:
# Non-parametric pairwise comparisons between profiles: middle childhood adversity exposure
mc_ptests = pairwise_tests(
    data=group_df,
    dv='Mid_Childhood_regr',
    between='ClusterID',
    subject='Subject',
    parametric=False,
    padjust='fdr_bh',
    effsize='cohen',
    return_desc=True,
)
mc_ptests = mc_ptests.round(3)
mc_ptests['Group'] = 'Middle Childhood'

In [None]:
# Non-parametric pairwise comparisons between profiles: adolescent adversity exposure
adol_ptests = pairwise_tests(
    data=group_df,
    dv='Adolescence_regr',
    between='ClusterID',
    subject='Subject',
    parametric=False,
    padjust='fdr_bh',
    effsize='cohen',
    return_desc=True,
)
adol_ptests = adol_ptests.round(3)
adol_ptests['Group'] = 'Adolescence'

In [None]:
# Non-parametric pairwise comparisons between profiles: adulthood adversity exposure
adult_ptests = pairwise_tests(
    data=group_df,
    dv='Adulthood_regr',
    between='ClusterID',
    subject='Subject',
    parametric=False,
    padjust='fdr_bh',
    effsize='cohen',
    return_desc=True,
)
adult_ptests = adult_ptests.round(3)
adult_ptests['Group'] = 'Adulthood'

In [None]:
# Assemble the adversity and *_tvs pairwise comparisons into a single table
unused_cols = ['Contrast', 'alternative', 'p-adjust', 'Paired', 'Parametric']
adv_table = pd.concat([ec_ptests, mc_ptests, adol_ptests, adult_ptests], axis=0)
adv_table = adv_table.drop(unused_cols, axis=1)
# Harmonise column names with the Games-Howell output held in `res`
adv_table = adv_table.rename(columns={'U-val': 'Statistic', 'p-unc': 'pval'})

# Keep only the rows of `res` whose 'Group' label ends in 'tvs'
is_tvs = res['Group'].str.endswith('tvs')
tvs_table = res[is_tvs].rename(columns={'T': 'Statistic'})

both_table = pd.concat([adv_table, tvs_table], axis=0)
both_table['Comparison'] = both_table['A'].astype(str) + ' vs. ' + both_table['B'].astype(str)

table_cols = ["Group", 'Comparison', "mean(A)", "std(A)", "mean(B)", "std(B)", 'Statistic', 'pval', 'p-corr', 'cohen']
both_table = both_table.drop(['A', 'B'], axis=1)[table_cols]
both_table.pivot_table(columns=['Group', 'Comparison'], sort=False).round(3)

### Examine group differences in symptoms

In [None]:
# Symptom data: keep participants with complete values on the listed columns.
# .copy() makes symptom_df an independent frame, so the transformed-score columns
# added in later cells are never written into a view of group_df.
# NOTE(review): the symptom models below use `asr_age` as the age covariate, while
# this filter checks `age_at_ri` -- confirm which column is intended, since rows
# with missing `asr_age` would be dropped when the models are fitted.
symptom_df = group_df.dropna(subset = ['age_at_ri', 'sex', 'combined_income', 'years_education', 'total_scared']).copy()
print('{} subjects have symptom data'.format(len(symptom_df)))

In [None]:
# Model associations with anxiety symptoms
symp_pvals = []

# Set dependent variable: square-root transform of the total score.
# `assign` builds a new frame instead of writing a column into symptom_df in
# place, which avoids chained-assignment warnings.
symptom_df = symptom_df.assign(scared_total_tranf=np.sqrt(symptom_df['total_scared'] + 1))
yvar = 'scared_total_tranf'

fig, ax = plt.subplots(1, 1, figsize=(3, 3))
sns.histplot(symptom_df[yvar], ax=ax, bins=30)
plt.show()

# Fit model
scared_model = smf.ols("scared_total_tranf ~ C(ClusterID) + sex + asr_age + combined_income + years_education ", data=symptom_df).fit()
display(scared_model.summary())
# Type III ANOVA with an HC3 robust covariance. The keyword is `typ` and takes an
# int; an unrecognised keyword such as `type` is ignored and the default Type I
# table is returned instead.
# HC3 rationale: https://cran.r-project.org/web/packages/sandwich/vignettes/sandwich.pdf,
# "...which arrive at the conclusion that HC3 provides the best performance in small
# samples as it gives less weight to influential observations."
table = sm.stats.anova_lm(scared_model, typ=3, robust='hc3')
symp_pvals.append([yvar, table["PR(>F)"]['C(ClusterID)']])
display(table)

# Plot results
order = clus_cats
pairs = pair_cats

# Print median and mean of the untransformed score for each class
for clus_id in (1, 2, 3):
    clus_scores = symptom_df.loc[symptom_df.ClusterID == clus_id, 'total_scared']
    print('*** Class {} Symptoms Median: {}; Mean: {} ***\n'.format(clus_id, clus_scores.median(), clus_scores.mean()))

fig, ax = plt.subplots(1, 1, figsize = (6, 3))
anx_boxpal = ['#9fc5e8', '#e69138', '#965064']
anx_boxpalpoint = ['#b2d0ec', '#eba75f', '#bc647e']

# Same `order` for both layers and the Annotator so boxes, points and brackets line up
sns.boxplot(x='ClusterID', y=yvar, data=symptom_df, order=order, ax=ax, palette=anx_boxpal)
sns.stripplot(y=yvar, x='ClusterID', data=symptom_df, edgecolor='black', order=order, linewidth=0.5, ax=ax,
              palette=anx_boxpalpoint)
ax.set_ylabel('Anxiety Symptoms (SCAARED)')
ax.set_xlabel('Latent Profile')
ax.set_ylim(0,16)

annotator = Annotator(ax, pairs, data=symptom_df, x='ClusterID', y=yvar, order=order)
annotator.configure(test='Mann-Whitney', text_format='star', loc='inside')
annotator.apply_and_annotate()
plt.tight_layout()
plt.savefig(analysis + '/Figures/LCDiffs_Anxiety_{}.png'.format(today), dpi=300, transparent=True)
plt.show()

# Games-Howell pairwise comparisons on the transformed score
pairwise_gameshowell(data=symptom_df, between='ClusterID', dv=yvar, effsize='cohen').round(3)

In [None]:
# Model associations with total TSC symptoms

# Dependent variable: log(x + 1) transform of the total score.
# `assign` builds a new frame instead of writing a column into symptom_df in place.
symptom_df = symptom_df.assign(tsc_tranf=np.log(symptom_df['tsc_total'] + 1))
yvar = 'tsc_tranf'
print("Mean: {}".format(symptom_df[yvar].mean()))
print("Variance: {}".format(symptom_df[yvar].var()))

# Histogram of distribution
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
sns.histplot(symptom_df[yvar], ax=ax, bins=30)
plt.show()

# Fit model
tsc_model = smf.ols("tsc_tranf ~ C(ClusterID) + sex + asr_age + combined_income + years_education ", data=symptom_df).fit()
# Type III ANOVA with an HC3 robust covariance. The keyword is `typ` and takes an
# int; an unrecognised keyword such as `type` is ignored and the default Type I
# table is returned instead.
table = sm.stats.anova_lm(tsc_model, typ=3, robust='hc3')
symp_pvals.append([yvar, table["PR(>F)"]['C(ClusterID)']])
display(table)

# Print median and mean of the untransformed score for each class
for clus_id in (1, 2, 3):
    clus_scores = symptom_df.loc[symptom_df.ClusterID == clus_id, 'tsc_total']
    print('*** Class {} Symptoms Median: {}; Mean: {} ***\n'.format(clus_id, clus_scores.median(), clus_scores.mean()))

order = clus_cats
pairs = pair_cats


fig, ax = plt.subplots(1, 1, figsize = (6, 3))

# Same `order` for both layers and the Annotator so boxes, points and brackets line up
sns.boxplot(x='ClusterID', y='tsc_total', data=symptom_df, order=order, ax=ax, palette="blend:#f6b26b,#df6161")
sns.stripplot(y='tsc_total', x='ClusterID', data=symptom_df, edgecolor='black', order=order, linewidth=0.5, ax=ax,
              palette="blend:#f6b26b,#df6161")
# The grouping column is named explicitly rather than read from a global `x` set in another cell
annotator = Annotator(ax=ax, pairs=pairs, data=symptom_df, x='ClusterID', y='tsc_total', order=order)
annotator.configure(test='Mann-Whitney', text_format='star', loc='inside')
annotator.apply_and_annotate()
plt.show()

# Games-Howell pairwise comparisons on the transformed score
pairwise_gameshowell(data=symptom_df, dv=yvar, between='ClusterID')

In [None]:
# Model associations with externalizing problems
print(len(symptom_df))

# Dependent variable: log(x + 1) transform of the total score.
# `assign` builds a new frame instead of writing a column into symptom_df in place.
symptom_df = symptom_df.assign(ext_prob_tranf=np.log(symptom_df['Externalizing_Problems_Total'] + 1))
yvar = 'ext_prob_tranf'
print("Mean: {}".format(symptom_df[yvar].mean()))
print("Variance: {}".format(symptom_df[yvar].var()))

# Histogram of distribution
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
sns.histplot(symptom_df[yvar], ax=ax, bins=30)
plt.show()

# Fit model
ext_model = smf.ols("ext_prob_tranf ~ C(ClusterID) + sex + asr_age + combined_income + years_education ", data=symptom_df).fit()
# Type III ANOVA with an HC3 robust covariance. The keyword is `typ` and takes an
# int; an unrecognised keyword such as `type` is ignored and the default Type I
# table is returned instead.
table = sm.stats.anova_lm(ext_model, typ=3, robust='hc3')
symp_pvals.append([yvar, table["PR(>F)"]['C(ClusterID)']])
display(ext_model.summary())
display(table)

# Print median and mean of the untransformed score for each class.
# Only classes 1-3 are reported, matching the other symptom cells.
for clus_id in (1, 2, 3):
    clus_scores = symptom_df.loc[symptom_df.ClusterID == clus_id, 'Externalizing_Problems_Total']
    print('*** Class {} Symptoms Median: {}; Mean: {} ***\n'.format(clus_id, clus_scores.median(), clus_scores.mean()))

order = clus_cats
pairs = pair_cats


fig, ax = plt.subplots(1, 1, figsize = (6, 3))

# Same `order` for both layers and the Annotator so boxes, points and brackets line up
sns.boxplot(x='ClusterID', y='Externalizing_Problems_Total', data=symptom_df, order=order, ax=ax, palette="blend:#f6b26b,#df6161")
sns.stripplot(y='Externalizing_Problems_Total', x='ClusterID', data=symptom_df, edgecolor='black', order=order, linewidth=0.5, ax=ax,
              palette="blend:#f6b26b,#df6161")
# The grouping column is named explicitly rather than read from a global `x` set in another cell
annotator = Annotator(ax=ax, pairs=pairs, data=symptom_df, x='ClusterID', y='Externalizing_Problems_Total', order=order)
annotator.configure(test='Mann-Whitney', text_format='star', loc='inside')
annotator.apply_and_annotate()
plt.show()

# Games-Howell pairwise comparisons on the transformed score
pairwise_gameshowell(data=symptom_df, dv=yvar, between='ClusterID')

In [None]:
printmd('**Symptom medians**')
median_cols = ['ClusterID', 'diagnostic_group', 'Anxiety_Problems_Total', 'total_scared', 'tsc_total', 'Externalizing_Problems_Total']
# numeric_only=True: a non-numeric column is skipped rather than raising TypeError
# on pandas >= 2.0.
# NOTE(review): presumably `diagnostic_group` is a label column -- confirm its dtype.
group_df[median_cols].groupby('ClusterID').median(numeric_only=True)

In [None]:
# FDR correction across the ClusterID p-values collected from the symptom models
fdr_table = pd.DataFrame(symp_pvals, columns=['measure', 'pvalue'])
rejected, pvals_adj = fdr(fdr_table['pvalue'])
fdr_table['fdr_passed'] = rejected
fdr_table['fdr_pval'] = pvals_adj
fdr_table

In [None]:
# Correlation matrix of the symptom scales, computed on complete cases only
symp_cols = ["Internalizing_Problems_Total", "Externalizing_Problems_Total", 'Anxiety_Problems_Total',
             'Total_Problems_Total', 'tsc_anxiety', 'ri_ptsd_total']
symp_complete = symptom_df.loc[:, symp_cols].dropna()
symp_corr = symp_complete.corr()
sns.heatmap(symp_corr)

### Test group diffs in GSR data

In [None]:
# Load the GSR table and average every column within each subject
gsr_path = analysis + '/GSR_data_2024-04-15.csv'
gsr = pd.read_csv(gsr_path, index_col=0)
gsr_avg = gsr.groupby('Subject').mean()

In [None]:
# Row-level GSR data joined to the group table (subjects present in both)
gsr_group_df = gsr.merge(group_df, on='Subject', how='inner').reset_index(drop=True)
# Subject-averaged GSR data joined to the group table
gsr_avg_df = gsr_avg.reset_index().merge(group_df, on='Subject')

In [None]:
# Preview the first rows of the subject-averaged GSR table
gsr_avg_df.head(n=5)

In [None]:
# Mixed model of Threat on latent profile, with rows grouped by Subject
gsr_data = gsr_group_df.dropna(axis=0, subset=['ClusterID', 'age_at_scan', 'sex', 'combined_income', 'Threat']).reset_index(drop=True)
# ClusterID is wrapped in C() so the profile label enters as a categorical factor
# (as in the OLS models elsewhere in this notebook) instead of a numeric slope
# across arbitrary label values. The summary reports one coefficient per
# non-reference profile.
mod = sm.MixedLM.from_formula("Threat ~ C(ClusterID) + Run_x + age_at_scan + sex + combined_income ",
                              groups="Subject", data=gsr_data)
aresults = mod.fit()
print(aresults.summary())

In [None]:
# OLS on the subject-averaged Threat values, followed by an ANOVA table
gsr_model = smf.ols("Threat ~ C(ClusterID) + sex + asr_age + combined_income + years_education ", data=gsr_avg_df).fit()
# Type III ANOVA with an HC3 robust covariance. The keyword is `typ` and takes an
# int; an unrecognised keyword such as `type` is ignored and the default Type I
# table is returned instead.
table = sm.stats.anova_lm(gsr_model, typ=3, robust='hc3')
display(table)


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 3))

# Arguments shared by the box layer and the overlaid points
threat_kws = dict(x='ClusterID', y='Threat', data=gsr_data, ax=ax, palette="blend:#f6b26b,#df6161")
sns.boxplot(**threat_kws)
sns.stripplot(edgecolor='black', order=order, linewidth=0.5, **threat_kws)
plt.show()

In [None]:
# Non-parametric, FDR-corrected pairwise comparisons of Threat between profiles.
# NOTE(review): gsr_data is the row-level merge (before per-subject averaging), so
# a subject may contribute several rows -- confirm whether gsr_avg_df was intended.
threat_pairs = pairwise_tests(
    data=gsr_data,
    dv='Threat',
    between='ClusterID',
    subject='Subject',
    parametric=False,
    padjust='fdr_bh',
    effsize='cohen',
    return_desc=True,
)
threat_pairs.round(3)

In [None]:
# OLS on the subject-averaged Safety values, followed by an ANOVA table
gsr_model = smf.ols("Safety ~ C(ClusterID) + sex + asr_age + combined_income + years_education ", data=gsr_avg_df).fit()
# Type III ANOVA with an HC3 robust covariance. The keyword is `typ` and takes an
# int; an unrecognised keyword such as `type` is ignored and the default Type I
# table is returned instead.
table = sm.stats.anova_lm(gsr_model, typ=3, robust='hc3')
display(table)


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 3))

# Arguments shared by the box layer and the overlaid points
safety_kws = dict(x='ClusterID', y='Safety', data=gsr_data, ax=ax, palette="blend:#f6b26b,#df6161")
sns.boxplot(**safety_kws)
sns.stripplot(edgecolor='black', order=order, linewidth=0.5, **safety_kws)
plt.show()

In [None]:
# Non-parametric, FDR-corrected pairwise comparisons of Safety between profiles.
# NOTE(review): gsr_data is the row-level merge (before per-subject averaging), so
# a subject may contribute several rows -- confirm whether gsr_avg_df was intended.
safety_pairs = pairwise_tests(
    data=gsr_data,
    dv='Safety',
    between='ClusterID',
    subject='Subject',
    parametric=False,
    padjust='fdr_bh',
    effsize='cohen',
    return_desc=True,
)
safety_pairs.round(3)

## Check for differences in counterbalancing

In [None]:
# Import counterbalancing info from three record sources and combine them
cb1 = pd.read_csv(analysis + '/ShapesCounterbalancing_REDCap_7.10.24.csv').rename(columns = {'shapes_version':'redcap_record'})
cb1['Subject'] = 'sub-' + cb1['record_id']

cb2 = pd.read_csv(analysis + '/Shapes task versions (counterbalance) - Assignment.csv').drop(['Scan date', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5'], axis=1).rename(columns = {'Version':'gdoc_record'})
cb2['Subject'] = 'sub-' + cb2['ID']

cb3 = pd.read_csv(analysis + '/Shapes_Counterbalancing_fromEprime_2024-07-10.csv').drop('Unnamed: 0', axis=1).rename(columns = {'Counterbalancing':'eprime_file_record'})

# Outer merges keep every subject that appears in at least one source
cb_m1 = pd.merge(cb1, cb2, on='Subject', how = 'outer')
cb_m2 = pd.merge(cb_m1, cb3, on='Subject', how = 'outer')
cb = cb_m2.drop(['record_id', 'ID'], axis=1).set_index('Subject').dropna(how = 'all', axis=0)

# Final version = gdoc record, overridden by the REDCap record where present, then
# by the E-Prime record where present. Built as a new Series and assigned once:
# calling `.update()` on `cb['Final_Version']` is a chained in-place modification,
# which does not reach `cb` when pandas Copy-on-Write is active.
final_version = cb['gdoc_record']
for source_col in ('redcap_record', 'eprime_file_record'):
    final_version = final_version.where(cb[source_col].isna(), cb[source_col])
cb['Final_Version'] = final_version

cb_path = analysis + '/Shapes_Combined_Counterbalancing_n={}_{}.csv'.format(len(cb), today)
cb.to_csv(cb_path)
print(cb_path)

In [None]:
# Attach the final counterbalancing version to every participant in group_df
cb_df = pd.merge(cb.reset_index()[['Subject', 'Final_Version']], group_df, on='Subject', how = 'right')
cb_df['Final_Version'] = cb_df['Final_Version'].astype(float)
# nunique() ignores NaN, so participants left without a version by the right
# merge are not counted as an extra counterbalancing version.
print("{} unique counterbalancing versions".format(cb_df['Final_Version'].nunique()))

In [None]:
# Participants with no counterbalancing version after the merge
missing_version = cb_df['Final_Version'].isna()
nan_df = cb_df[missing_version]
print('{} participants are missing counterbalancing data'.format(len(nan_df)))

In [None]:
from pingouin import anova

# Do LPA variables differ as a function of counterbalancing?
dvs = ['Early_Childhood_regr', 'Mid_Childhood_regr', 'Adolescence_regr','Adulthood_regr', 'hipp_tvs',
       'amyg_tvs', 'AAL3_dACC_tvs','Mackey_25_tvs', 'Mackey_24_tvs', 'Mackey_32_14m_tvs']

aov_first_rows = []
for yvar in dvs:
    printmd("**{}**".format(yvar))
    aov_mod = anova(data=cb_df, dv=yvar, between='Final_Version', ss_type='2', detailed=True)
    # Keep only the first row of each table for the FDR correction
    aov_first_rows.append(aov_mod.head(1))
    display(aov_mod)

res_df = pd.concat(aov_first_rows, axis=0)

In [None]:
# FDR correction across the uncorrected p-values collected in res_df
fdr_reject, fdr_pvals = fdr(res_df['p-unc'])
res_df['FDR Passed'] = fdr_reject
res_df['FDR p-val'] = fdr_pvals
res_df

In [None]:
from pingouin import chi2_independence, pairwise_corr

# Chi-square test of independence: counterbalancing version x latent profile
chi2_result = chi2_independence(cb_df, x='Final_Version', y='ClusterID')
expected, observed, stats = chi2_result
stats

### Check correlations between covariates and dvs for clinical models

In [None]:
# Spearman correlations between every symptom measure and every covariate
symptom_measures = ['scared_total_tranf', 'tsc_tranf', 'Externalizing_Problems_Total']
covariate_cols = ['asr_age', 'combined_income', 'years_education']
corr_vars = [symptom_measures, covariate_cols]

# Cast these two covariates to float before correlating
for col in ('combined_income', 'years_education'):
    symptom_df[col] = symptom_df[col].astype(float)

pairwise_corr(data=symptom_df, columns=corr_vars, alternative='two-sided', method='spearman', padjust='fdr_bh')

In [None]:
# Spearman correlations between every covariate and every LPA variable
covariate_cols = ['asr_age', 'combined_income', 'years_education']
lpa_cols = ['Early_Childhood_regr', 'Mid_Childhood_regr', 'Adolescence_regr', 'Adulthood_regr',
            'hipp_tvs', 'amyg_tvs', 'AAL3_dACC_tvs', 'Mackey_25_tvs', 'Mackey_24_tvs', 'Mackey_32_14m_tvs']
corr_vars = [covariate_cols, lpa_cols]

# Cast these two covariates to float before correlating
for col in ('combined_income', 'years_education'):
    group_df[col] = group_df[col].astype(float)

lpa_corr = pairwise_corr(data=group_df, columns=corr_vars, alternative='two-sided', method='spearman', padjust='fdr_bh')
# Smallest corrected p-values first
lpa_corr.sort_values(by='p-corr', ascending=True)

In [None]:
# Do any of the following covariates differ as a function of latent profile?
covars = ['asr_age', 'combined_income', 'years_education']

covar_first_rows = []
for yvar in covars:
    printmd("**{}**".format(yvar))
    aov_mod = anova(data=symptom_df, dv=yvar, between='ClusterID', ss_type='2', detailed=True)
    # Keep only the first row of each table for the FDR correction
    covar_first_rows.append(aov_mod.head(1))
    display(aov_mod)

res_df = pd.concat(covar_first_rows, axis=0)

In [None]:
# FDR correction across the uncorrected p-values collected in res_df
fdr_reject, fdr_pvals = fdr(res_df['p-unc'])
res_df['FDR Passed'] = fdr_reject
res_df['FDR p-val'] = fdr_pvals
res_df

In [None]:
# Does sex differ between latent profiles?
# The test reads group_df directly: cb_df is group_df right-merged with the
# counterbalancing records, so using it here would tie this test to an unrelated
# cell and would repeat participants if a Subject occurred more than once in
# those records.
expected, observed, stats = chi2_independence(group_df, x='sex', y='ClusterID')
stats

In [None]:
# Do dvs differ by sex?
# NOTE(review): 'Externalizing_Problems_Total' is the untransformed score, whereas
# the other two entries are transformed columns -- confirm whether
# 'ext_prob_tranf' was intended here.
dvs = ['scared_total_tranf', 'tsc_tranf', 'Externalizing_Problems_Total']

sex_first_rows = []
for yvar in dvs:
    printmd("**{}**".format(yvar))
    sex_mod = pairwise_tests(data=symptom_df, dv=yvar, between='sex')
    sex_first_rows.append(sex_mod.iloc[0:1, :])
    display(sex_mod)

# res_df is rebuilt from this cell's tests only, instead of being appended to the
# ANOVA rows left in res_df by earlier cells.
res_df = pd.concat(sex_first_rows, axis=0)

In [None]:
# Columns entering the correlation heatmap, cast to float
heatmap_cols = ['Early_Childhood_regr', 'Mid_Childhood_regr', 'Adolescence_regr', 'Adulthood_regr',
                'hipp_tvs', 'amyg_tvs', 'AAL3_dACC_tvs', 'Mackey_25_tvs', 'Mackey_24_tvs',
                'Mackey_32_14m_tvs', 'asr_age', 'combined_income', 'years_education',
                'scared_total_tranf', 'tsc_tranf', 'Externalizing_Problems_Total']
corr_df = symptom_df[heatmap_cols].astype(float)

# Full display names for the symptom and covariate columns
display_names = {
    'scared_total_tranf': 'SCAARED Anxiety Symptoms',
    'tsc_tranf': 'TSC-40 Trauma-Related Symptoms',
    'Externalizing_Problems_Total': 'ASR Externalizing Problems',
    'combined_income': 'Combined Family Income',
    'asr_age': 'Age at Symptom Questionnaires',
    'years_education': 'Years of Education',
}
corr_df = corr_df.rename(columns=display_names)

# Substring substitutions applied, in this order, to the remaining column names
label_subs = [
    ('_', ' '),
    ('tvs', 'Activation (Threat vs. Safety)'),
    ('regr', 'Adversity'),
    ('Mackey', 'vmPFC Area'),
    ('AAL3', ''),
    ('hipp', 'Hippocampus'),
    ('amyg', 'Amygdala'),
    ('Mid', 'Middle'),
]
pretty_cols = corr_df.columns
for old, new in label_subs:
    pretty_cols = pretty_cols.str.replace(old, new)
corr_df.columns = pretty_cols

corr_df_img = corr_df.corr(method='spearman').round(3)

fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr_df_img, vmin=-1, vmax=1, annot=True, annot_kws={'fontsize': 8})