In [2]:
import numpy as np
import pandas as pd
import os, re, time, itertools, ast
import pickle as pkl
import scipy.stats as stats
import statsmodels.stats.multitest as multitest
from collections import Counter

## Frequency and Persistence of nodes present in a loop/hole

In [2]:
def edges_to_cycle(list_of_edges):
    unique_nodes = list(set(num for pair in list_of_edges for num in pair))
    return unique_nodes

def freq_list_oneBar(LIST):  ## For only one list of nodes present in a hole
    number_of_cycles = len(LIST)
    unique_elements, counts = np.unique(LIST, return_counts=True)
    nodefreq = dict(zip(unique_elements, counts))
    return nodefreq

def Node_freq_and_persist_dict(cycles, NNodes):
    total_persist_len = cycles['Death'].max() - cycles['Birth'].min()
    NodeFreq = {m: 0 for m in range(NNodes)}
    Nodepersist = {m: 0 for m in range(NNodes)}
    for l in range(len(cycles)):
        edges, birth, death = cycles['1DHole'][l], cycles['Birth'][l], cycles['Death'][l]
        edgelist = ast.literal_eval(edges.replace('-', ''))
        cycle = edges_to_cycle(edgelist)
        cycles.loc[l, 'Cycles'] = str(cycle)
        nodefrequencies = freq_list_oneBar(cycle)
        for node in nodefrequencies.keys():
            if node in NodeFreq.keys():
                NodeFreq[node] += nodefrequencies[node]
            if node in Nodepersist.keys():
                Nodepersist[node] += (death - birth) 
            else:
                print('Invalid node name:', node)
    if len(cycles) != 0:
        updated_Nodepersist = {k: v/total_persist_len for k,v in Nodepersist.items()}
    else:
        updated_Nodepersist = Nodepersist
        total_persist_len = 0
    return NodeFreq, updated_Nodepersist, total_persist_len

In [3]:
# dataset = 'MPI_LEMON'
dataset = 'ABIDE'
NNodes = 200

path_file = f'../Data/{dataset}/FCM_DistMat/'
files_list = os.listdir(path_file)
print(dataset, len(files_list))

ABIDE 1


In [5]:
t0 = time.time()

inpath = f'../OutputFiles/PosCorr/{dataset}/Holes_1D_Javaplex/'

Freqoutdict = {'Nodes': [n for n in range(NNodes)]}
Persistoutdict = {'Nodes': [n for n in range(NNodes)]}
are_same_persist_lngts = []
for i in range(len(files_list)):
    t2 = time.time()
    SubID = files_list[i].split('.')[0].split('_')[-1]    
    cycle_details = pd.read_csv(inpath + f'Sub_{SubID}.txt',sep ='\t')    
    Nodefreq, Nodepersistence, total_persist_len = Node_freq_and_persist_dict(cycle_details, NNodes)
    Freqoutdict[SubID] = Nodefreq.values()
    Persistoutdict[SubID] = Nodepersistence.values()
    print(f'[{i}] Done for {SubID}', round(time.time() - t2, 4))
    # break

Freqfilename = f"Global_Node_frequency.csv"
Persistfilename = f"Global_Node_persistence.csv"

FreqOutdf, PersistOutdf = pd.DataFrame(Freqoutdict), pd.DataFrame(Persistoutdict)
outpath = f'../OutputFiles/PosCorr/{dataset}/'
FreqOutdf.to_csv(outpath + Freqfilename, index = None)
PersistOutdf.to_csv(outpath + Persistfilename, index = None)

print(outpath + Persistfilename, PersistOutdf.shape, FreqOutdf.shape)
print('Done for ', dataset, time.time() - t0)
# PersistOutdf

[0] Done for 51491 0.106
[1] Done for 50642 0.1392
[2] Done for 50646 0.1192
[3] Done for 50647 0.1319
[4] Done for 50649 0.1368
[5] Done for 50656 0.1342
[6] Done for 50659 0.1224
[7] Done for 50664 0.1344
[8] Done for 50665 0.1345
[9] Done for 50668 0.1108
[10] Done for 50772 0.1406
[11] Done for 50773 0.1177
[12] Done for 50774 0.0872
[13] Done for 50775 0.1208
[14] Done for 50776 0.1257
[15] Done for 50777 0.1287
[16] Done for 50778 0.1086
[17] Done for 50779 0.1254
[18] Done for 50780 0.1305
[19] Done for 50781 0.1203
[20] Done for 50782 0.1178
[21] Done for 50783 0.115
[22] Done for 50784 0.1316
[23] Done for 50785 0.1178
[24] Done for 50786 0.1121
[25] Done for 50787 0.1136
[26] Done for 50788 0.1267
[27] Done for 50789 0.1023
[28] Done for 50790 0.1159
[29] Done for 50791 0.1635
[30] Done for 50792 0.1494
[31] Done for 50793 0.2486
[32] Done for 50794 0.1424
[33] Done for 50795 0.1256
[34] Done for 50796 0.1184
[35] Done for 50797 0.1252
[36] Done for 50798 0.1429
[37] Done for

# Statistical test

In [6]:
#------------------------------------------------
# Function to find mean an standard error 
#------------------------------------------------

def Node_Groupdiff(group1,group2,Infile,g1name,g2name,NNodes):
    p_val = []
    Gr1,Gr2 = [[],[]],[[],[]]
    for i in range(NNodes):
        data1,data2 = [],[]
        for sub1 in group1:
            data1.append(Infile[sub1][i])
        Gr1[0].append(np.mean(data1))
        Gr1[1].append(stats.sem(data1))
        
        for sub2 in group2:
            data2.append(Infile[sub2][i])
        Gr2[0].append(np.mean(data2))
        Gr2[1].append(stats.sem(data2))
        p_val.append(stats.ttest_ind(data1,data2,equal_var=False)[1])
        
    correction = multitest.multipletests(p_val, alpha=0.05, method = 'fdr_bh')
    fdr = correction[0]
    p_v_corrected = correction[1]

    nodes = [i for i in range(NNodes)]
    Gdiff = {'Nodes': nodes, 'p_values':p_val, 'fdr_corrected_p_val':p_v_corrected, f'{g1name}_mean': Gr1[0], f'{g1name}_sd':Gr1[1], f'{g2name}_mean':Gr2[0], f'{g2name}_sd':Gr2[1]}
    return pd.DataFrame(Gdiff)

In [4]:
if dataset == 'MPI_LEMON':
    Detailsfile = pd.read_csv('../Data/MPI_LEMON/MPILemon_Subject_details.csv')
    Young = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'Young','Subject'])))
    Elder = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'Elderly','Subject'])))
    All_subs = Young + Elder
    Group1, Group2 = 'Young', 'Elderly'
    GR1, GR2 = Young, Elder

elif dataset == 'ABIDE':
    Detailsfile = pd.read_csv('../Data/ABIDE/ABIDE_Subject_details.csv')
    ASD = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'ASD','Subject identifier'])))
    Healthy = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'HC','Subject identifier'])))
    All_subs = ASD + Healthy
    Group1, Group2 = 'ASD', 'Healthy'
    GR1, GR2 = ASD, Healthy

print(Group1, len(GR1), '\t', Group2, len(GR2), '\tTotal:', len(All_subs))
print(GR1[0], GR2[0])

ASD 395 	 Healthy 425 	Total: 820
50601 50551


In [8]:
t0 = time.time()
outpath = f'../OutputFiles/PosCorr/{dataset}/'

Freqfilename = f"Global_Node_frequency.csv"
Persistfilename = f"Global_Node_persistence.csv"

Infile_1 = pd.read_csv(outpath + Freqfilename)
Infile_2 = pd.read_csv(outpath + Persistfilename)
print(NNodes, Infile_1.shape)

print(Group1, len(GR1), '\t', Group2, len(GR2), '\tTotal:', len(GR1+GR2))

df_freq = Node_Groupdiff(GR1, GR2, Infile_1, Group1, Group2, NNodes)
df_persist = Node_Groupdiff(GR1, GR2, Infile_2, Group1, Group2, NNodes)

df_freq = pd.DataFrame(df_freq)
df_persist = pd.DataFrame(df_persist)
df_freq.to_csv(outpath + f"p_values_{Freqfilename.replace('csv','txt')}",sep = '\t', index=None)
df_persist.to_csv(outpath + f"p_values_{Persistfilename.replace('csv','txt')}",sep = '\t', index=None)

print(outpath + f"p_values_{Persistfilename.replace('csv','txt')}", f'\t {df_persist.shape}\t Time:', time.time() - t0)
print('Done')

200 (200, 821)
ASD 395 	 Healthy 425 	Total: 820
../OutputFiles/PosCorr/ABIDE/p_values_Global_Node_persistence.txt 	 (200, 7)	 Time: 5.9798829555511475
Done


## Number of significantly different ROIs and Intersection with NIBS

In [13]:
NIBS_file = pd.read_csv('../Data/NIBS_Identified_ROIs.csv')
nibs = NIBS_file.loc[NIBS_file[dataset] == True, 'Node_ID'].tolist()
print(dataset, len(nibs), nibs)
print(Group1, len(GR1), '\t', Group2, len(GR2), '\tTotal:', len(GR1+GR2),'\n' )

outpath = f'../OutputFiles/PosCorr/{dataset}/'
files = ['persistence', 'frequency']

for f in files: 
    df = pd.read_csv(outpath + f'p_values_Global_Node_{f}.txt', sep = '\t')
    filtered_values = df.loc[df['fdr_corrected_p_val'] < 0.05, 'Nodes']
    nodesno = set(nibs).intersection(set(filtered_values))
    print("FDR-corrected:", len(filtered_values), '\t Coincide with NIBS:', len(nodesno),F'\t Node {f} ')

ABIDE 31 [19, 20, 23, 25, 27, 28, 40, 41, 42, 50, 53, 69, 86, 90, 92, 94, 124, 125, 128, 130, 144, 146, 150, 157, 171, 173, 174, 175, 189, 194, 195]
ASD 395 	 Healthy 425 	Total: 820 

FDR-corrected: 27 	 Coincide with NIBS: 6 	 Node persistence 
FDR-corrected: 35 	 Coincide with NIBS: 7 	 Node frequency 
