In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from collections import Counter
import statsmodels.stats.multitest

In [2]:
# Dataset whose result folder is read below; swap the two lines to analyse the other one.
dataset = 'MPI_LEMON'
# dataset = 'ABIDE'

# Collect the non-missing entries of the 'Region' column of this dataset's
# Global_SigDiff_Node_persistence table.
sig_nodes_path = f'../OutputFiles/PosCorr/{dataset}/Global_SigDiff_Node_persistence.tsv'
temp = pd.read_csv(sig_nodes_path, sep='\t')
sig_nodes = temp['Region'].dropna().tolist()
print(len(sig_nodes))

# Load the atlas-wide table of cognitive terms and keep only the rows whose
# 'rois' value is one of the regions collected above.
# Note: despite its name, `df1_sorted` is a filtered subset of `df1`, not a sorted copy.
df1 = pd.read_csv("../Data/Cognitive_terms_SchaeferAtlas.csv")
roi_is_significant = df1['rois'].isin(sig_nodes)
df1_sorted = df1.loc[roi_is_significant]
len(df1_sorted)

108


108

In [3]:
def StatisticalTest(c,df1,rows):
    null = {key : [] for key,value in c.items()}
    z_score, p_value = {}, {}
    
    for i in range(1000):
      tmp = df1.sample(n=rows)
      words = ",".join([each for each in tmp['list of cognitive terms'] if type(each) == str])
      words = [each.strip() for each in words.split(",")]
      tmp_freq = Counter(words)
      # print(tmp_freq)
      for key,value in c.items():
        if key in tmp_freq.keys():
          null[key].append(tmp_freq[key])
        if key not in tmp_freq.keys():
          null[key].append(0)
    
    for key,value in c.items():
      mean = np.mean(null[key])
      std = np.std(null[key])
      # if(std != 0):
      z = (value - mean)/std
      z_score[key] = z
      p = scipy.stats.norm.sf(abs(z))*2
      p_value[key] = p

    fdr = {}
    fdr_p = statsmodels.stats.multitest.multipletests(list(p_value.values()), alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)
    count = 0
    for each in p_value.keys():
      fdr[each] = fdr_p[1][count]
      count += 1
    # print('length of fdr', len(fdr))

    final = pd.DataFrame()
    final['term'] = pd.Series(c.keys())
    # print('Length of final', len(final))
    final_freq, final_z_score, final_p_val, final_fdr_corrected = list(), list(), list(), list()
    for each in c.keys():
        final_freq.append(c[each])
        if (each in z_score):
            # print('yes', each)
            final_z_score.append(z_score[each])
            final_p_val.append(p_value[each])
            final_fdr_corrected.append(fdr[each])
    
    final['freq'], final['z_score'], final['p_val'], final['fdr_corrected'] = final_freq, final_z_score, final_p_val, final_fdr_corrected
    sig_terms = final[final['fdr_corrected'] < 0.05]
    return sig_terms

In [4]:
# Network labels; each one is matched as a literal substring of the 'RSN' column.
rsns = [ 'Visual', 'Somato Motor', 'Dorsal Attention', 'Salient Ventral Attention', 'Limbic', 'Control', 'Default']
# Terms dropped from each network's observed term counts before testing.
# Entries are matched verbatim against the term strings, so their spelling is left
# exactly as written; repeated entries are harmless.
to_remove = ["oddball","ptsd","motor sma","temporal","v5","sts","psts","pre supplementary","pareital","ofc","network dmn","negative neutral","mt","mm",
             "mci","loop","fusiform face","face ffa","extrastriate","autonomic","vpmfc","vmpfc","vlpfc","v5","suppressed","supplementary","sts","stream",
             "stimulation tms","sii","resting state","psts","pre supplementary","ppc","pfc","pcc","parietal","ofc","neutral","network dmn","mtg","mt",
             "mpfc","mci","loop","ifg","fusiform face","functional connectivity","face ffa","extrastriate","electrical","dmn","default mode",
             "default network","cortex supplementary","cortex mpfc","connectivity","broca","acc","sma","supplementary motor","connectivity",
             "dorsal attention","ifg","mpfc","pfc","alzheimar","alzheimer"]

outpath = '../OutputFiles/PosCorr/NeurosynthAnalysis/'

# The null distributions are built from random row samples.  Seeding NumPy's
# global generator (which pandas' `sample` draws from when no random_state is
# given) makes re-running this cell reproduce the same result files.
np.random.seed(42)

for rsn in rsns:
    # Literal (non-regex) match; rows with a missing 'RSN' value count as non-matching
    # instead of producing a NaN mask that cannot be used for indexing.
    in_network = df1_sorted['RSN'].str.contains(rsn, regex=False, na=False)
    df = df1_sorted[in_network]
    rows = df.shape[0]
    print(f'Number of nodes with sig diff in "{rsn}" is', rows)
    if rows != 0:
        # Flatten the comma-separated term lists of this network's nodes into one list.
        rois = ",".join([each for each in df['list of cognitive terms'] if isinstance(each, str)])
        rois = [each.strip() for each in rois.split(",")]

        # Observed frequency of every term; the empty string is an artefact of splitting.
        c = dict(Counter(rois))
        c.pop('', None)
        # These counts are reported before the `to_remove` terms are dropped.
        print('Number of unique different terms:', len(c), ', Tolal terms:', sum(c.values()))

        for each in to_remove:
            c.pop(each, None)

        # Compare against samples of the same number of rows drawn from the full table.
        sig_terms = StatisticalTest(c,df1,rows)
        print('Number of sig different terms after FDR correction:', len(sig_terms))
        sig_terms.to_csv(outpath + f'{dataset}_{rsn}_SignificantcognitiveTerms_FDRcorrected.txt', sep = "\t", index = False)
    print(f'Done for {rsn}', '-'*50)

Number of nodes with sig diff in "Visual" is 12
Number of unique different terms: 142 , Tolal terms: 230
Number of sig different terms after FDR correction: 53
Done for Visual --------------------------------------------------
Number of nodes with sig diff in "Somato Motor" is 22
Number of unique different terms: 226 , Tolal terms: 705
Number of sig different terms after FDR correction: 74
Done for Somato Motor --------------------------------------------------
Number of nodes with sig diff in "Dorsal Attention" is 17
Number of unique different terms: 260 , Tolal terms: 735
Number of sig different terms after FDR correction: 106
Done for Dorsal Attention --------------------------------------------------
Number of nodes with sig diff in "Salient Ventral Attention" is 11
Number of unique different terms: 181 , Tolal terms: 359
Number of sig different terms after FDR correction: 37
Done for Salient Ventral Attention --------------------------------------------------
Number of nodes with 

In [5]:
# Echo the output directory used for the per-network result files.
# NOTE(review): the saved output of this cell spells the folder differently from the
# `outpath` assignment in the previous cell, so it looks stale — re-run to confirm.
outpath

'../OutputFiles/PosCorr/NeurosynthAnalaysis/'