In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from collections import Counter
import statsmodels.stats.multitest

In [2]:
# dataset = 'MPI_LEMON'
dataset = 'ABIDE'

temp = pd.read_csv(f'../OutputFiles_Javaplex/PosCorr/{dataset}/Global_SigDiff_Node_persistence.tsv', sep ='\t')

sig_nodes = temp['Region'].dropna().tolist()
print(len(sig_nodes), sig_nodes[0])

df1 = pd.read_csv("../PersistentHomology_fMRIdata/Data/Updated_Cognitive_terms_SchaeferAtlas.csv")
df1_sorted = df1[df1['rois'].isin(sig_nodes)]
print(len(df1_sorted), np.unique(df1_sorted['RSN'], return_counts = True))
# df1_sorted

27 7Networks_LH_Vis_5
27 (array(['Control', 'Default', 'Dorsal Attention', 'Limbic',
       'Salient Ventral Attention', 'Somato Motor', 'Visual'],
      dtype=object), array([4, 7, 1, 3, 1, 9, 2], dtype=int64))


In [3]:
def StatisticalTest(c,df1,rows):
    null = {key : [] for key,value in c.items()}
    z_score, p_value = {}, {}
    
    for i in range(1000):
      tmp = df1.sample(n=rows)
      words = ",".join([each for each in tmp['list of cognitive terms'] if type(each) == str])
      words = [each.strip() for each in words.split(",")]
      tmp_freq = Counter(words)
      # print(tmp_freq)
      for key,value in c.items():
        if key in tmp_freq.keys():
          null[key].append(tmp_freq[key])
        if key not in tmp_freq.keys():
          null[key].append(0)
    
    for key,value in c.items():
      mean = np.mean(null[key])
      std = np.std(null[key])
      # if(std != 0):
      z = (value - mean)/std
      z_score[key] = z
      p = scipy.stats.norm.sf(abs(z))*2
      p_value[key] = p
    print('Number of p-values with < 0.05 : ',len([each for each in p_value.values() if each < 0.05]))
    fdr = {}
    fdr_p = statsmodels.stats.multitest.multipletests(list(p_value.values()), alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)
    count = 0
    for each in p_value.keys():
      fdr[each] = fdr_p[1][count]
      count += 1
    # print('length of fdr', len(fdr))

    final = pd.DataFrame()
    final['term'] = pd.Series(c.keys())
    # print('Length of final', len(final))
    final_freq, final_z_score, final_p_val, final_fdr_corrected = list(), list(), list(), list()
    for each in c.keys():
        final_freq.append(c[each])
        if (each in z_score):
            # print('yes', each)
            final_z_score.append(z_score[each])
            final_p_val.append(p_value[each])
            final_fdr_corrected.append(fdr[each])
    
    final['freq'], final['z_score'], final['p_val'], final['fdr_corrected'] = final_freq, final_z_score, final_p_val, final_fdr_corrected
    sig_terms = final[final['fdr_corrected'] < 0.05]
    return sig_terms

In [4]:
rsns = [ 'Visual', 'Somato Motor', 'Dorsal Attention', 'Salient Ventral Attention', 'Limbic', 'Control', 'Default']
to_remove = ["oddball","ptsd","motor sma","temporal","v5","sts","psts","pre supplementary","pareital","ofc","network dmn","negative neutral","mt","mm",
             "mci","loop","fusiform face","face ffa","extrastriate","autonomic","vpmfc","vmpfc","vlpfc","v5","suppressed","supplementary","sts","stream",
             "stimulation tms","sii","resting state","psts","pre supplementary","ppc","pfc","pcc","parietal","ofc","neutral","network dmn","mtg","mt",
             "mpfc","mci","loop","ifg","fusiform face","functional connectivity","face ffa","extrastriate","electrical","dmn","default mode",
             "default network","cortex supplementary","cortex mpfc","connectivity","broca","acc","sma","supplementary motor","connectivity",
             "dorsal attention","ifg","mpfc","pfc","alzheimar","alzheimer"]

outpath = '../OutputFiles_Javaplex/PosCorr/NeurosynthAnalaysis/'
for rsn in rsns:
    df = df1_sorted[df1_sorted['RSN'].str.contains(rsn)]
    # print(len(df))
    rows = df.shape[0]
    if rows != 0:
        print(f'Number of nodes with sig diff in "{rsn}" is', rows)#, df_updated['list of cognitive terms'])
        rois = ",".join([each for each in df['list of cognitive terms'] if type(each) == str])
        # print(type(rois))
        rois = [each.strip() for each in rois.split(",")]
        # print(rois)
        print('Number of different terms:', len(rois))
        
        c = Counter(rois)
        # print(len(c))
        c = dict(c)
        if ('' in c) : c.pop('')
        print('Number of unique different terms:', len(c), ', Tolal terms:', sum(c.values()))
        
        for each in to_remove:
          try:
            c.pop(each)
          except:
            continue
         
        print('Number of unique different terms after removing from to_remove:', len(c), ', Tolal terms:', sum(c.values()))     
        sig_terms = StatisticalTest(c,df1,rows)
        print('Number of sig different terms after FDR correction:', len(sig_terms))
        sig_terms.to_csv(outpath + f'{dataset}_{rsn}_SignificantcognitiveTerms_FDRcorrected.txt', sep = "\t", index = False)
    else:
        print(f'Number of nodes with sig diff in "{rsn}" is', rows)
    print(f'Done for {rsn}', '-'*50)
    # break

Number of nodes with sig diff in "Visual" is 2
Number of different terms: 14
Number of unique different terms: 14 , Tolal terms: 14
Number of unique different terms after removing from to_remove: 14 , Tolal terms: 14
Number of p-values with < 0.05 :  12
Number of sig different terms after FDR correction: 12
Done for Visual --------------------------------------------------
Number of nodes with sig diff in "Somato Motor" is 9
Number of different terms: 205
Number of unique different terms: 102 , Tolal terms: 205
Number of unique different terms after removing from to_remove: 93 , Tolal terms: 186
Number of p-values with < 0.05 :  39
Number of sig different terms after FDR correction: 28
Done for Somato Motor --------------------------------------------------
Number of nodes with sig diff in "Dorsal Attention" is 1
Number of different terms: 10
Number of unique different terms: 10 , Tolal terms: 10
Number of unique different terms after removing from to_remove: 9 , Tolal terms: 9
Number 

In [5]:
outpath

'../OutputFiles_Javaplex/PosCorr/NeurosynthAnalaysis/'

In [6]:
# rsns = [ 'Visual', 'Somato Motor', 'Dorsal Attention', 'Salient Ventral Attention', 'Limbic', 'Control', 'Default']
cog_term_details = {'Resting state networks': [],'Number of unique terms': [],'Terms with frequency': []}
for rsn in rsns:
    try:
        sig_terms_infile = pd.read_csv(outpath + f"{dataset}_{rsn}_SignificantcognitiveTerms_FDRcorrected.txt", sep = "\t")
        sig_terms_infile[['freq', 'term']].to_csv(outpath + f"{dataset}_{rsn}_SignificantcognitiveTerms_FDRcorrected.csv", index = None, header = ['weight', 'word'])
        terms, freq = sig_terms_infile['term'], sig_terms_infile['freq']
        ll = [f'{terms[i]} ({freq[i]})' for i in range(len(sig_terms_infile))]
        cog_term_details['Resting state networks'].append(rsn)
        cog_term_details['Number of unique terms'].append(len(ll))
        cog_term_details['Terms with frequency'].append(', '.join(ll))
        print(rsn, len(ll))
    except:
        print('No file for', rsn,'\n')
        cog_term_details['Resting state networks'].append(rsn)
        cog_term_details['Number of unique terms'].append(0)
        cog_term_details['Terms with frequency'].append('No terms')
    # break
pd.DataFrame(cog_term_details).to_csv(outpath + f"{dataset}_SignificantcognitiveTermsWithFrequenct.csv", index = None)

Visual 12
Somato Motor 28
Dorsal Attention 9
Salient Ventral Attention 4
Limbic 21
Control 61
Default 45


In [7]:
for rsn in rsns:
    df = df1_sorted[df1_sorted['RSN'].str.contains(rsn)]
    rows = df.shape[0]
    if rows != 0:
        print(f'Number of nodes with sig diff in "{rsn}" is', rows)

Number of nodes with sig diff in "Visual" is 2
Number of nodes with sig diff in "Somato Motor" is 9
Number of nodes with sig diff in "Dorsal Attention" is 1
Number of nodes with sig diff in "Salient Ventral Attention" is 1
Number of nodes with sig diff in "Limbic" is 3
Number of nodes with sig diff in "Control" is 4
Number of nodes with sig diff in "Default" is 7


In [8]:
sig_terms_infile[['freq', 'term']]

Unnamed: 0,freq,term
0,2,junction
1,3,temporoparietal junction
2,2,temporoparietal
3,2,parietal junction
4,4,mentalizing
5,1,moral
6,2,mental states
7,3,social
8,1,posterior temporal
9,1,read


##### Copy paste all the f"{dataset}_{rsn}_SignificantcognitiveTerms_FDRcorrected.csv" files in the "MargedRelatedTerms" folder. 
##### Then manually marge the related terms, for creating the wordcloud and ST.

### Single file with words and frequency after marging similar terms (for ST)

In [7]:
import pandas as pd

outpath = '../OutputFiles_Javaplex/PosCorr/NeurosynthAnalaysis/'
rsns = [ 'Visual', 'Somato Motor', 'Dorsal Attention', 'Salient Ventral Attention', 'Limbic', 'Control', 'Default']
pathMT = outpath + 'MargedRelatedTerms/'
# dataset = 'MPI_LEMON'
dataset = 'ABIDE'
dataset, pathMT

('ABIDE',
 '../OutputFiles_Javaplex/PosCorr/NeurosynthAnalaysis/MargedRelatedTerms/')

In [8]:
outdf_ST = {'RSNs':[],'Count':[], 'Terms':[]}
for rsn in rsns:
    outdf_ST['RSNs'].append(rsn)
    try:
        dfMT = pd.read_csv(pathMT + f"{dataset}_{rsn}_SignificantcognitiveTerms_FDRcorrected.csv")
        outdf_ST['Count'].append(dfMT['weight'].count())
        outdf_ST['Terms'].append([f'{t} ({c})' for t,c in zip(dfMT['word'],dfMT['weight'])])
    except:
        outdf_ST['Count'].append(0)
        outdf_ST['Terms'].append('No terms')
    print('Done for ',rsn, outdf_ST['Count'])
    # break
pd.DataFrame(outdf_ST).to_csv(pathMT+ f"{dataset}_SignificantcognitiveTermsWithFrequenct.txt", sep = '\t', index=None)

Done for  Visual [12]
Done for  Somato Motor [12, 25]
Done for  Dorsal Attention [12, 25, 9]
Done for  Salient Ventral Attention [12, 25, 9, 4]
Done for  Limbic [12, 25, 9, 4, 21]
Done for  Control [12, 25, 9, 4, 21, 59]
Done for  Default [12, 25, 9, 4, 21, 59, 41]


In [6]:
pd.DataFrame(outdf_ST)

Unnamed: 0,RSNs,Count,Terms
0,Visual,50,"[parahippocampal (3), objects (8), visual (7),..."
1,Somato Motor,66,"[music (3), heschl (2), musicians (3), pain (7..."
2,Dorsal Attention,97,"[visual word (2), word form (3), faces (2), fu..."
3,Salient Ventral Attention,35,"[motor (7), mirror (3), somatosensory cortices..."
4,Limbic,18,"[orbitofrontal (1), orbitofrontal cortex (1), ..."
5,Control,53,"[retrieval (6), parietal cortex (6), posterior..."
6,Default,104,"[anterior temporal (4), person (4), temporal p..."


In [13]:
outdf_ST

{'RSNs': ['Visual',
  'Somato Motor',
  'Dorsal Attention',
  'Salient Ventral Attention',
  'Limbic',
  'Control',
  'Default'],
 'Count': [9, 0, 70, 0, 0, 46, 0, 0, 0, 67, 0],
 'Terms': ['No terms',
  'No terms',
  'No terms',
  'No terms',
  'No terms',
  'No terms',
  'No terms']}

In [None]:
Number of nodes with sig diff in "Visual" is 12
Number of different terms: 230
Number of unique different terms: 142 , Tolal terms: 230
Number of unique different terms after removing from to_remove: 133 , Tolal terms: 211
Number of p-values with < 0.05 :  64
Number of sig different terms after FDR correction: 55
Done for Visual --------------------------------------------------
Number of nodes with sig diff in "Somato Motor" is 22
Number of different terms: 705
Number of unique different terms: 226 , Tolal terms: 705
Number of unique different terms after removing from to_remove: 210 , Tolal terms: 661
Number of p-values with < 0.05 :  80
Number of sig different terms after FDR correction: 71
Done for Somato Motor --------------------------------------------------
Number of nodes with sig diff in "Dorsal Attention" is 17
Number of different terms: 735
Number of unique different terms: 260 , Tolal terms: 735
Number of unique different terms after removing from to_remove: 245 , Tolal terms: 696
Number of p-values with < 0.05 :  134
Number of sig different terms after FDR correction: 106
Done for Dorsal Attention --------------------------------------------------
Number of nodes with sig diff in "Salient Ventral Attention" is 11
Number of different terms: 359
Number of unique different terms: 181 , Tolal terms: 359
Number of unique different terms after removing from to_remove: 165 , Tolal terms: 323
Number of p-values with < 0.05 :  56
Number of sig different terms after FDR correction: 38
Done for Salient Ventral Attention --------------------------------------------------
Number of nodes with sig diff in "Limbic" is 2
Number of different terms: 26
Number of unique different terms: 24 , Tolal terms: 26
Number of unique different terms after removing from to_remove: 20 , Tolal terms: 21
Number of p-values with < 0.05 :  18
Number of sig different terms after FDR correction: 18
Done for Limbic --------------------------------------------------
Number of nodes with sig diff in "Control" is 15
Number of different terms: 315
Number of unique different terms: 175 , Tolal terms: 315
Number of unique different terms after removing from to_remove: 166 , Tolal terms: 301
Number of p-values with < 0.05 :  71
Number of sig different terms after FDR correction: 56
Done for Control --------------------------------------------------
Number of nodes with sig diff in "Default" is 29
Number of different terms: 861
Number of unique different terms: 318 , Tolal terms: 861
Number of unique different terms after removing from to_remove: 294 , Tolal terms: 774
Number of p-values with < 0.05 :  136
Number of sig different terms after FDR correction: 111
Done for Default --------------------------------------------------