### Organize GSR Data

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import nibabel as nib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import AnovaRM, anova_lm
from scipy.stats import ttest_rel
from scipy.stats.mstats import winsorize
import seaborn as sns
from copy import deepcopy
from datetime import date
from sklearn.preprocessing import StandardScaler
from scipy.stats import jarque_bera, zscore, boxcox
from pingouin import pairwise_tests

# Date stamp (YYYY-MM-DD) used in the names of the files this notebook writes
today = date.today().isoformat()

# Let pandas display up to 999 rows/columns and use a larger default font in figures
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 999)
plt.rcParams['font.size'] = 14

In [None]:
# Directory holding the per-subject GSR stats files, and the phenotyping analysis folder
gsrpath = '/gpfs/milgram/project/gee_dylan/candlab/scripts/shapes/gsr/pspm'
datapath = '/gpfs/milgram/pi/gee_dylan/candlab/analyses/shapes/shapes_phenotyping'
analysis = datapath + '/Analysis'

# Behavioral dataset and the GSR QA rating sheet
bv_data = pd.read_csv(analysis + '/Behav_Dataset_n=117_2023-03-08.csv') 
gsrqa = pd.read_csv(analysis + '/GSRQAShapes-GSRratings_Labels_9.25.23.csv')

# Build a 'sub-<ID>' key: remove the '--1', '--2' and '-P' markers from the raw QA ID, in that order
qa_ids = gsrqa['Subject ID']
for marker in ('--1', '--2', '-P'):
    qa_ids = qa_ids.str.replace(marker, '')
gsrqa['Subject'] = 'sub-' + qa_ids

# Subjects present in the behavioral dataset
sublist = bv_data['Subject'].tolist()

In [None]:
# Report the number of QA rows read in and the number of distinct subject keys among them
n_qa_rows = len(gsrqa['Subject'])
n_qa_unique = len(set(gsrqa['Subject']))
print(n_qa_rows, 'subjects included')
print(n_qa_unique, 'subjects (de-duplicated)')

In [None]:
# Identify subjects whose QA entries disagree on the overall rating
unique_subjects = list(set(gsrqa['Subject']))  # built once, not on every loop iteration

problem_qa = []
for subj in unique_subjects:
    dset = gsrqa[gsrqa['Subject'] == subj]  # all QA entries for this subject
    # More than one distinct overall rating means the entries conflict
    if len(set(dset['Overall GSR QA Rating'])) > 1:
        problem_qa.append(subj)

In [None]:
# Get count of number of QA entries for each subject, as a frame with columns 'Subject' and 'Count'.
# rename_axis / reset_index(name=...) name both columns explicitly, so the result does not depend on how
# the installed pandas version labels the output of value_counts().
num_entries = gsrqa['Subject'].value_counts().rename_axis('Subject').reset_index(name='Count')

In [None]:
# Filter entries and keep only one raw 'Subject ID' per subject. Some IDs have more than two entries.
id_prefix = 'sub-'
subj_ids_filtering = []
for subj in list(set(gsrqa['Subject'])):  # unique subject keys, built once
    sub_dset = gsrqa[gsrqa['Subject'] == subj].reset_index(drop=True)  # all QA rows for this subject
    entries = num_entries[num_entries['Subject'] == subj]  # row holding this subject's entry count
    first_id = sub_dset['Subject ID'][0]

    if entries['Count'].item() == 1:
        # Only one rating exists: keep it
        subj_ids_filtering.append(first_id)
    elif len(set(sub_dset['Overall GSR QA Rating'])) == 1:
        # Several entries that all agree on the overall rating: keep the first
        subj_ids_filtering.append(first_id)
    else:
        # Remove the literal 'sub-' prefix. str.lstrip('sub-') would strip any run of the characters
        # 's', 'u', 'b', '-' and so could also eat the start of the ID itself.
        bare_id = subj[len(id_prefix):] if subj.startswith(id_prefix) else subj
        if first_id == bare_id:
            # The first entry's raw ID carries no extra marker: keep that record
            subj_ids_filtering.append(first_id)
        else:
            # Conflicting ratings with no unmarked first entry: follow up manually
            print('ERROR on ', subj)

In [None]:
# Records picked by hand for the subjects reported with 'ERROR' by the filtering loop
manual_updates = ['A992--1', 'A661--1', 'A556--2']

# Automatic selections followed by the manual ones
subj_ids_fjoined = [*subj_ids_filtering, *manual_updates]

# Guard on the expected number of retained records
assert len(subj_ids_fjoined) == 178

In [None]:
# Keep only the QA rows whose raw 'Subject ID' was selected (inner join on the shared column)
selected_ids = pd.DataFrame({'Subject ID': subj_ids_fjoined})
gsr_qa_dset = gsrqa.merge(selected_ids, how='inner')

In [None]:
# Every run-3 stats file found in the GSR directory
hasgsrlist = glob(gsrpath + '/*_run3*_6.txt')

# Subject ID = text before the first '_' of the file name. Taking the file name (everything after the
# last '/') avoids repeating the directory as a second hard-coded string that could drift from gsrpath.
hasgsr = [path.rsplit('/', 1)[-1].split('_')[0] for path in hasgsrlist]

In [None]:
# Drop duplicate IDs; sorting keeps the order identical across kernel restarts
# (plain set ordering of strings can change from one interpreter run to the next)
hasgsr = sorted(set(hasgsr))
print(hasgsr[0:5], '..., n =', len(hasgsr))
# These IDs come from the stats files on disk, not from the QA sheet
print(len(hasgsr), 'subjects have GSR data files')

In [None]:
# Flag every subject for whom a GSR stats file was found
subs_gsr = pd.DataFrame({'Subject': hasgsr})
subs_gsr['Subject'] = 'sub-' + subs_gsr['Subject']
subs_gsr['HasGSR'] = 1

# Outer join keeps subjects that appear in either the behavioral dataset or the GSR file list
comb_df1 = pd.merge(bv_data['Subject'], subs_gsr, how='outer', on='Subject').sort_values('HasGSR')
# Attach the selected QA record (if any) to each subject
comb_df = comb_df1.merge(gsr_qa_dset, how='left', on='Subject')

# Subjects with GSR files, then those rated 'Pass' with no missing value in any column
hasgsrdf = comb_df.loc[comb_df['HasGSR'] == 1]
rated_pass = hasgsrdf['Overall GSR QA Rating'] == 'Pass'  # 'Qualified Pass' is deliberately not included
usablegsrdf = hasgsrdf.loc[rated_pass].dropna(axis=0, how='any')

newsubsgsr = usablegsrdf['Subject'].tolist()
n_without_gsr = (comb_df['HasGSR'] != 1).sum()
print('{} subject are missing GSR data'.format(n_without_gsr))
print('{} out of {} subjects have usable data'.format(len(usablegsrdf), len(hasgsrdf)))

**Inclusion:**
43 subjects if only "Pass" ratings are included;
59 subjects if both "Pass" and "Qualified Pass" ratings are included.

In [None]:
# One row per usable subject and six value columns for each of the three runs
n_subs = len(newsubsgsr)
gsr_datatrain = np.ones((n_subs, 6))
gsr_data1 = np.ones((n_subs, 6))
gsr_data2 = np.ones((n_subs, 6))

id_prefix = 'sub-'
for i, sub in enumerate(newsubsgsr):
    # File names use the bare ID. Remove the literal 'sub-' prefix only: str.lstrip('sub-') strips any
    # run of the characters 's', 'u', 'b', '-' and would also eat the start of an ID beginning with them.
    sub_id = sub[len(id_prefix):] if sub.startswith(id_prefix) else sub

    # header=1: column names are read from the second line of each tab-separated file
    # Training run (run1 file, early/late split); the asserts guard the expected column layout
    runtrain = pd.read_csv(gsrpath + '/{}_run1_earlylate_stats_case2_6.txt'.format(sub_id), sep='\t', header=1)
    assert runtrain.columns[2] == 'Stimulus_Aminus_early recon'
    assert runtrain.columns[3] == 'Stimulus_Aminus_late recon'

    # First testing run (run2 file)
    run1 = pd.read_csv(gsrpath + '/{}_run2_stats_case2_6.txt'.format(sub_id), sep='\t', header=1)
    assert run1.columns[1] == 'Stimulus_Aminus recon'
    assert run1.columns[2] == 'Stimulus_Bminus recon'

    # Second testing run (run3 file)
    run2 = pd.read_csv(gsrpath + '/{}_run3_stats_case2_6.txt'.format(sub_id), sep='\t', header=1)
    assert run2.columns[1] == 'Stimulus_Aminus recon'
    assert run2.columns[2] == 'Stimulus_Bminus recon'

    gsr_datatrain[i, :] = runtrain.iloc[:, 0:6]  # Drop last column of NaNs
    # NOTE(review): these slices request up to 10 columns while each target row holds 6 values;
    # presumably the testing-run files have exactly six columns — TODO confirm the intended width.
    gsr_data1[i, :] = run1.iloc[:, 0:10]
    gsr_data2[i, :] = run2.iloc[:, 0:10]


In [None]:
# Organize results: wrap each run's array in a DataFrame indexed by subject, move the subject into a
# 'Subject' column, drop columns that are entirely NaN, and label the run.

# Training Run
gsr_traindf = pd.DataFrame(gsr_datatrain, index=newsubsgsr, columns=runtrain.columns[0:6])
gsr_traindf = gsr_traindf.reset_index().dropna(axis=1, how='all').rename(columns={'index': 'Subject'})
gsr_traindf['Run'] = 'Training'

# Testing Run 1
gsr_df1 = pd.DataFrame(gsr_data1, index=newsubsgsr, columns=run1.columns[0:10])
gsr_df1 = gsr_df1.reset_index().dropna(axis=1, how='all').rename(columns={'index': 'Subject'})
gsr_df1['Run'] = 'Run1'

# Testing Run 2
gsr_df2 = pd.DataFrame(gsr_data2, index=newsubsgsr, columns=run2.columns[0:10])
gsr_df2 = gsr_df2.reset_index().dropna(axis=1, how='all').rename(columns={'index': 'Subject'})
gsr_df2['Run'] = 'Run2'

# Readable names for the testing-run columns (keys absent from the frame are simply ignored by rename)
testing_names = {
    'Stimulus_Aminus_early recon': 'Early_Threat',
    'Stimulus_Bminus_early recon': 'Early_Safety',
    'Stimulus_Aminus_late recon': 'Late_Threat',
    'Stimulus_Bminus_late recon': 'Late_Safety',
    'Stimulus_Aminus recon': 'Threat',
    'Stimulus_Bminus recon': 'Safety',
}
# Stack both testing runs (two rows per subject)
gsr_df = pd.concat([gsr_df1, gsr_df2], axis=0).rename(columns=testing_names)

# Readable names for the training-run columns
training_names = {
    'Stimulus_Aplus_early recon': 'Early_Threat_Reinforced',
    'Stimulus_Aminus_early recon': 'Early_Threat',
    'Stimulus_Bminus_early recon': 'Early_Safety',
    'Stimulus_Aplus_late recon': 'Late_Threat_Reinforced',
    'Stimulus_Aminus_late recon': 'Late_Threat',
    'Stimulus_Bminus_late recon': 'Late_Safety',
}
training_df = gsr_traindf.rename(columns=training_names)

In [None]:
# Melt and reshape data to long format: one row per subject and task condition
training_conditions = ['Early_Threat', 'Early_Safety', 'Late_Threat', 'Late_Safety']
reshaped_training = training_df.melt(
    id_vars=['Subject', 'Run'],
    value_vars=training_conditions,
    var_name='Task Condition',
    value_name='Reconstructed Response Value',
).sort_values(by='Subject')

# Display the largest response value in the long-format data
reshaped_training['Reconstructed Response Value'].max()

### Check Training Data

In [None]:
def find_outliers(data, column, n_sd=3):
    """Return the subjects whose value in `column` lies more than `n_sd` SDs from the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `column` and a 'Subject' column.
    column : str
        Name of the numeric column to screen.
    n_sd : float, default 3
        Number of standard deviations (pandas default, i.e. sample SD) above or below the mean
        beyond which a value counts as an outlier.

    Returns
    -------
    list
        The 'Subject' value of every outlying row, in row order. A subject with several outlying
        rows is listed once per row. Missing values are never flagged.

    Also prints how many outlying rows were found.
    """
    values = data[column]
    val_mean = values.mean()
    val_std = values.std()
    upper = val_mean + n_sd * val_std
    lower = val_mean - n_sd * val_std

    # Positional boolean mask, so duplicated index labels in `data` cannot cause misalignment
    is_outlier = ((values > upper) | (values < lower)).to_numpy()
    outliers = data.loc[is_outlier, 'Subject'].tolist()

    print(len(outliers), 'outliers were found')
    return outliers

In [None]:
# One histogram per training condition, side by side
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15,4))

panel_columns = ('Early_Threat', 'Early_Safety', 'Late_Threat', 'Late_Safety')
for panel, condition in zip((ax1, ax2, ax3, ax4), panel_columns):
    sns.histplot(training_df[condition], ax = panel)

fig.tight_layout()

In [None]:
# Find outliers separately for each training condition
i_outliers, j_outliers, k_outliers, l_outliers = [
    find_outliers(training_df, condition)
    for condition in ('Early_Threat', 'Early_Safety', 'Late_Threat', 'Late_Safety')
]

# Union of the subjects flagged in any condition
total_outliers = list(set(i_outliers + j_outliers + k_outliers + l_outliers))
print(len(total_outliers), 'total outliers')
print(total_outliers)

# Alternative left disabled: winsorize the value columns of a deep copy of training_df at the 10% tails
# (scipy.stats.mstats.winsorize, limits=[.1, .1]) instead of dropping subjects.

# Drop every subject flagged as an outlier
is_flagged = training_df.Subject.isin(total_outliers)
training_clean_df = training_df[~is_flagged]
assert len(training_clean_df) == len(training_df) - len(total_outliers) # Ensure outliers were dropped

print('Stats conducted now with {} subs instead of {}'.format(len(training_clean_df), len(training_df)))

In [None]:
# Long format of the outlier-free training data: one row per subject and task condition
reshaped_training = training_clean_df.melt(
    id_vars=['Subject', 'Run'],
    value_vars=['Early_Threat', 'Early_Safety', 'Late_Threat', 'Late_Safety'],
    var_name='Task Condition',
    value_name='Reconstructed Response Value',
).sort_values(by='Subject')

# Split e.g. 'Early_Threat' into Timing = 'Early Phase' and Condition = 'Threat', stored as categoricals
condition_parts = reshaped_training['Task Condition'].str.split('_', expand=True)
reshaped_training['Timing'] = (condition_parts.iloc[:, 0] + ' Phase').astype('category')
reshaped_training['Condition'] = condition_parts.iloc[:, 1].astype('category')

# Normalize SCR responses: z-score across all rows of the long-format frame
raw_response = reshaped_training['Reconstructed Response Value'].astype('float')
reshaped_training['Reconstructed Response Value'] = zscore(raw_response)

In [None]:
# Box plot of the normalized training responses by phase and condition, saved as a dated PNG
fig, ax2 = plt.subplots(1, 1, figsize = (7, 5))

sns.boxplot(data = reshaped_training,
            x = 'Timing',
            y = 'Reconstructed Response Value',
            hue = 'Condition',
            order = ['Early Phase', 'Late Phase'],
            palette = {'Safety': '#63a7e6', 'Threat': 'red'},
            ax = ax2)

ax2.legend(loc='upper right')
ax2.set_ylim(-3, 6)
ax2.set_xlabel('Acquisition Run')
fig.tight_layout()
fig.savefig(analysis + "/Figures/AcquisitionPhase_GSRPlots_{}.png".format(today), dpi=300, transparent=True)

In [None]:
# Join the long-format training data with the behavioral dataset and drop rows missing any covariate,
# to prepare for mixed effects modeling
required_covariates = ['asr_age', 'sex', 'combined_income', 'years_education', 'diagnostic_group']
t_df = (
    reshaped_training
    .merge(bv_data, on='Subject', how='inner')
    .dropna(subset = required_covariates)
    .reset_index()
)

# Distinct subjects after vs. before the merge and missing-data filter
n_analyzed = len(t_df['Subject'].value_counts())
n_before = len(reshaped_training['Subject'].value_counts())
print('Analysis conducted with {} subjects, {} dropped due to missing data'.format(n_analyzed, n_before - n_analyzed))

# Factorize the categorical variables, z-score age, and add a formula-friendly alias for the response
t_df['task_condition'] = t_df['Task Condition'].astype('category')
for covariate in ('sex', 'combined_income', 'years_education', 'diagnostic_group'):
    t_df[covariate] = t_df[covariate].astype('category')
t_df['asr_age'] = zscore(t_df['asr_age'].astype('float'))
t_df['reconstructed_value']= t_df['Reconstructed Response Value']

In [None]:
# Fit mixed effects models

# Plot distribution of the response variable
sns.histplot(t_df['reconstructed_value'])
plt.show()

# Omnibus model: fixed effects for condition, timing and covariates; subjects are the grouping factor
mod_t = smf.mixedlm("reconstructed_value ~ Condition + Timing +asr_age + sex + combined_income + years_education", 
                groups="Subject", data= t_df)
t_results = mod_t.fit()
print(t_results.summary())

# Pairwise tests.
# Every subject contributes both Threat and Safety values in both phases, so Timing AND Condition are
# within-subject factors. Passing Condition as `between` would treat the two conditions as independent
# groups of subjects; listing both under `within` uses the repeated-measures structure instead.
pairwise_tests(data = t_df, dv = 'reconstructed_value', within = ['Timing', 'Condition'], subject = 'Subject',
              parametric = True, marginal = True, padjust = 'fdr_bh', effsize = 'cohen', return_desc=True).round(3)

### Check Testing Data

In [None]:
# Histograms of the two testing-phase conditions, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,4))

for panel, condition in zip((ax1, ax2), ('Threat', 'Safety')):
    sns.histplot(gsr_df[condition], ax = panel)

fig.tight_layout()

In [None]:
# Identify outliers in the Testing data

a_outliers = find_outliers(gsr_df, 'Threat')
b_outliers = find_outliers(gsr_df, 'Safety')

# Union of the subjects flagged in either condition
total_outliers_test = list(set(a_outliers + b_outliers)) 
# gsr_df stacks both testing runs, so count distinct subjects rather than rows
print(len(total_outliers_test), 'total outliers out of', gsr_df['Subject'].nunique(), 'subjects')
total_outliers_test

In [None]:
# Alternative left disabled: winsorize the 'Threat' and 'Safety' columns of a deep copy of gsr_df at the
# 10% tails (scipy.stats.mstats.winsorize, limits=[.1, .1]) instead of dropping subjects.

# Drop every row belonging to a subject flagged as an outlier
gsr_clean_df = gsr_df[~gsr_df.Subject.isin(total_outliers_test)]
# Count distinct subjects directly: this prints a whole number and does not rely on rows-per-subject
print('Testing phase analyses conducted with {} subjects'.format(gsr_clean_df['Subject'].nunique()))

assert len(gsr_clean_df) == len(gsr_df) - len(total_outliers_test)*2 #Assert subs across both runs (*2) are dropped

In [None]:
# Join the cleaned testing and training frames on subject (outer join) and save them to a dated CSV
gsr_csv_path = analysis + '/GSR_data_{}.csv'.format(today)
train_test_gsr = gsr_clean_df.merge(training_clean_df, how='outer', on='Subject')
train_test_gsr.to_csv(gsr_csv_path)
print(gsr_csv_path)

In [None]:
# Reshape the cleaned testing data to long format: one row per subject, run and task condition
reshaped_df = gsr_clean_df.melt(
    id_vars=['Subject', 'Run'],
    value_vars=['Threat', 'Safety'],
    var_name='Task Condition',
    value_name='Reconstructed Response Value',
).sort_values(by='Subject')

# Categorical copy of the condition under a formula-friendly name
reshaped_df['Task_Condition'] = reshaped_df['Task Condition'].astype('category')

# z-score the responses across all rows
raw_response = reshaped_df['Reconstructed Response Value'].astype('float')
reshaped_df['Reconstructed Response Value'] = zscore(raw_response)
# The alias is z-scored again from the already normalized column
reshaped_df['reconstructed_value'] = zscore(reshaped_df['Reconstructed Response Value'].astype('float'))

In [None]:
# Box plot of the normalized testing responses by run and condition, saved as a dated PNG

fig, ax2 = plt.subplots(1, 1, figsize = (7, 5))

# Relabel the runs for display; the new labels stay in reshaped_df for the cells that follow
run_labels = reshaped_df['Run']
for old_label, new_label in (('Run1', 'First Run'), ('Run2', 'Second Run')):
    run_labels = run_labels.str.replace(old_label, new_label)
reshaped_df['Run'] = run_labels

sns.boxplot(data = reshaped_df,
            x = 'Run',
            y = 'Reconstructed Response Value',
            hue = 'Task Condition',
            order = ['First Run', 'Second Run'],
            palette = {'Safety': '#63a7e6', 'Threat': 'red'},
            ax = ax2)

ax2.legend(loc='upper right')
ax2.set_ylim(-3, 6)
ax2.set_xlabel('Testing Runs')
fig.tight_layout()
fig.savefig(analysis + "/Figures/TestingPhase_GSRPlot_{}.png".format(today), dpi=300, transparent=True)

In [None]:
# Join the long-format testing data with the behavioral dataset and drop rows missing any covariate,
# to prepare for mixed effects modeling
model_covariates = ['asr_age', 'sex', 'combined_income', 'years_education']
m_df = pd.merge(reshaped_df, bv_data, on='Subject', how='inner').dropna(subset = model_covariates).reset_index()
m_df['task_condition'] = m_df['Task Condition']
# NOTE(review): unlike the training-phase frame (t_df), the covariates are not cast to category and
# asr_age is not z-scored here — confirm whether the two models are meant to differ in this way.

# The dropped count must be measured against the testing data (reshaped_df), not the training data
n_analyzed = m_df['Subject'].nunique()
n_before = reshaped_df['Subject'].nunique()
print('Analysis conducted with {} subjects, {} dropped due to missing data'.format(n_analyzed, n_before - n_analyzed))

In [None]:
# Fit mixed effects models

# Omnibus model: fixed effects for condition, run and covariates; subjects are the grouping factor
mod = sm.MixedLM.from_formula("reconstructed_value ~ Task_Condition + Run + asr_age + sex + combined_income + years_education", 
                groups="Subject", data= m_df)
aresults = mod.fit()
print(aresults.summary())

# Pairwise tests.
# Every subject contributes both Threat and Safety values in both runs, so Run AND Task_Condition are
# within-subject factors. Passing Task_Condition as `between` would treat the two conditions as
# independent groups of subjects; listing both under `within` uses the repeated-measures structure.
pairwise_tests(data = m_df, dv = 'reconstructed_value', within = ['Run', 'Task_Condition'], subject = 'Subject',
              parametric = True, marginal = True, padjust = 'fdr_bh', effsize = 'cohen', return_desc=True).round(3)