In [1]:
import pandas as pd
import gudhi as gd
import gudhi.representations
import os, re, time
import numpy as np
import pickle as pkl

In [2]:
def replace_infinity_by_2(barcode):
    for bars in barcode:
        for bar in bars:
            if bar[1] == np.inf:
                bar[1]=np.sqrt(2)
    return barcode

# Data

In [3]:
Detailsfile = pd.read_csv('../Data/MPI_LEMON/MPILemon_Subject_details.csv')
#print(Detailsfile)

Young = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'Young','Subject'])))
Elder = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'Elderly','Subject'])))
All_subs = Young + Elder

Group1, Group2 = 'Young', 'Elderly'
GR1, GR2 = Young, Elder
print(Group1, len(GR1), '\t', Group2, len(GR2), '\tTotal:', len(All_subs))
print(type(GR1[0]), GR1[0])

Young 153 	 Elderly 72 	Total: 225
<class 'str'> 32302


In [3]:
Detailsfile = pd.read_csv('../Data/ABIDE/ABIDE_Subject_details.csv')

ASD = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'ASD','Subject identifier'])))
Healthy = list(map(str, list(Detailsfile.loc[Detailsfile['Cohort'] == 'HC','Subject identifier'])))
All_subs = ASD + Healthy
    
Group1, Group2 = 'ASD', 'Healthy'
GR1, GR2 = ASD, Healthy
# GR1.remove('KKI_50806')
print(Group1, len(GR1), '\t', Group2, len(GR2), '\tTotal:', len(All_subs))
print(GR1[0], GR2[0])
# print(All_subs[30:])

ASD 395 	 Healthy 425 	Total: 820
50601 50551


# Persistent Homology Computation

In [4]:
# path_file = '../Data/MPI_LEMON/FCM_DistMat/'
path_file = '../Data/ABIDE/FCM_DistMat/' 
files_list = os.listdir(path_file)
len(files_list)

820

In [5]:
dgms_list = list()
barcode0_list = list()
barcode1_list = list()
barcode2_list = list()
t0 = time.time()

for i in range(len(files_list)):
    t1 = time.time()
    SubID = files_list[i].split('.')[0].split('_')[-1]
    #print(i,SubID)
    #break
    DisMat = pd.read_csv(path_file + files_list[i], header = None, sep = ',').values
    thres = np.sqrt(2)
    skeleton = gd.RipsComplex(distance_matrix = DisMat, max_edge_length=thres, sparse=None)
    Rips_simplex_tree = skeleton.create_simplex_tree(max_dimension=3)
    BarCodes_Rips = Rips_simplex_tree.persistence()
    dgms_list.append({SubID: np.array([list(bars[1]) for bars in BarCodes_Rips])})
    barcode0_list.append({SubID: Rips_simplex_tree.persistence_intervals_in_dimension(0)})
    barcode1_list.append({SubID: Rips_simplex_tree.persistence_intervals_in_dimension(1)})
    barcode2_list.append({SubID: Rips_simplex_tree.persistence_intervals_in_dimension(2)})
    print('Done for ',i,SubID,'  Number of 1Dholes = ', len(barcode1_list[i][SubID]), '  ',time.time() - t1)
    # if i == 5:
    # break
print('DONE')
print(time.time() - t0)

Done for  0 51491   Number of 1Dholes =  168    5.222912073135376
Done for  1 50642   Number of 1Dholes =  210    3.382326126098633
Done for  2 50646   Number of 1Dholes =  151    7.413985013961792
Done for  3 50647   Number of 1Dholes =  184    5.603994369506836
Done for  4 50649   Number of 1Dholes =  161    3.729884624481201
Done for  5 50656   Number of 1Dholes =  202    6.447138786315918
Done for  6 50659   Number of 1Dholes =  162    7.771590709686279
Done for  7 50664   Number of 1Dholes =  193    5.177280426025391
Done for  8 50665   Number of 1Dholes =  165    6.678848743438721
Done for  9 50668   Number of 1Dholes =  171    6.935305833816528
Done for  10 50772   Number of 1Dholes =  207    3.99279522895813
Done for  11 50773   Number of 1Dholes =  171    8.122189283370972
Done for  12 50774   Number of 1Dholes =  150    5.1294355392456055
Done for  13 50775   Number of 1Dholes =  194    10.159194946289062
Done for  14 50776   Number of 1Dholes =  182    7.634879112243652
Done

In [6]:
# outpath = '../OutputFiles/Positive_weight/MPI_LEMON/'
outpath = '../OutputFiles/Positive_weight/ABIDE/'
37241220474243
Done for  428 50236   Number of 1Dholes =  222    7.036773920059204
Done for  429 50237   Number of 1Dholes =  187    4.6462156772613525
Done for  430 50238   Number of 1Dholes =  181    3.62833738
with open(outpath +'dgms_list.pkl','wb') as f:
    pkl.dump(dgms_list, f)
    
with open(outpath +'barcode0_list.pkl','wb') as f:
    pkl.dump(barcode0_list, f)
    
with open(outpath +'barcode1_list.pkl','wb') as f:
    pkl.dump(barcode1_list, f)
    
with open(outpath +'barcode2_list.pkl','wb') as f:
    pkl.dump(barcode2_list, f)
print('Done')

Done


In [11]:
# outpath = '../OutputFiles/Positive_weight/MPI_LEMON/'
outpath = '../OutputFiles/Positive_weight/ABIDE/'
with open(outpath +'dgms_list.pkl','rb') as f:
    dgms_list = pkl.load(f)
    
with open(outpath +'barcode0_list.pkl','rb') as f:
    barcode0_list = pkl.load(f)
    
with open(outpath +'barcode1_list.pkl','rb') as f:
    barcode1_list = pkl.load(f)
    
with open(outpath +'barcode2_list.pkl','rb') as f:
    barcode2_list = pkl.load(f)

print('Opened input files')

Opened input files


In [12]:
len(dgms_list), len(barcode0_list), len(barcode1_list), len(barcode2_list)
# len(barcode2_list[4]['32305'])

(820, 820, 820, 820)

In [13]:
# len(barcode2_list[0]['32301'])

In [14]:
Group1_dgms_list, Group2_dgms_list = list(), list()
Group1_barcode0_list, Group2_barcode0_list = list(), list()
Group1_barcode1_list, Group2_barcode1_list = list(), list()
Group1_barcode2_list, Group2_barcode2_list = list(), list()
G1_ID_list, G2_ID_list = list(), list()

for i in range(len(files_list)):
    t1 = time.time()
    SubID = files_list[i].split('.')[0].split('_')[-1]
    # print(i,SubID,type(SubID))
    if SubID in GR1:
        Group1_dgms_list.append(dgms_list[i][SubID])
        Group1_barcode0_list.append(barcode0_list[i][SubID])
        Group1_barcode1_list.append(barcode1_list[i][SubID])
        Group1_barcode2_list.append(barcode2_list[i][SubID])
        G1_ID_list.append(SubID)
        #print(f'Done for {Group1} ',i,SubID,time.time() - t1)
    elif SubID in GR2:
        Group2_dgms_list.append(dgms_list[i][SubID])
        Group2_barcode0_list.append(barcode0_list[i][SubID])
        Group2_barcode1_list.append(barcode1_list[i][SubID])
        Group2_barcode2_list.append(barcode2_list[i][SubID])
        G2_ID_list.append(SubID)
        #print(f'Done for {Group2}',i,SubID,time.time() - t1)
    # if i == 5:
        # break
print('DONE')
print(Group1, len(Group1_barcode0_list), '\t', Group2, len(Group2_barcode0_list))

DONE
ASD 395 	 Healthy 425


In [15]:
with open(outpath +f'{Group1}_dgms_list.pkl','wb') as f:
    pkl.dump(Group1_dgms_list, f)
    
with open(outpath +f'{Group1}_barcode0_list.pkl','wb') as f:
    pkl.dump(Group1_barcode0_list, f)
    
with open(outpath +f'{Group1}_barcode1_list.pkl','wb') as f:
    pkl.dump(Group1_barcode1_list, f)
    
with open(outpath +f'{Group1}_barcode2_list.pkl','wb') as f:
    pkl.dump(Group1_barcode2_list, f)

with open(outpath +f'{Group2}_dgms_list.pkl','wb') as f:
    pkl.dump(Group2_dgms_list, f)
    
with open(outpath +f'{Group2}_barcode0_list.pkl','wb') as f:
    pkl.dump(Group2_barcode0_list, f)
    
with open(outpath +f'{Group2}_barcode1_list.pkl','wb') as f:
    pkl.dump(Group2_barcode1_list, f)
    
with open(outpath +f'{Group2}_barcode2_list.pkl','wb') as f:
    pkl.dump(Group2_barcode2_list, f)

print('Done')

Done


# Persistence Landscape

In [16]:
LS = gd.representations.Landscape(num_landscapes=1)
Group1_LS = LS.fit_transform(Group1_barcode1_list)
Group2_LS = LS.fit_transform(Group2_barcode1_list)

Group1_L1_norm = np.linalg.norm(Group1_LS, 1, axis=1)
Group1_L2_norm = np.linalg.norm(Group1_LS, axis=1)
Group2_L1_norm = np.linalg.norm(Group2_LS, 1, axis=1)
Group2_L2_norm = np.linalg.norm(Group2_LS, axis=1)

In [17]:
with open(outpath +f'{Group1}_L1_norm_1dim.pkl','wb') as f:
    pkl.dump(Group1_L1_norm, f)
    
with open(outpath +f'{Group1}_L2_norm_1dim.pkl','wb') as f:
    pkl.dump(Group1_L2_norm, f)

with open(outpath +f'{Group2}_L1_norm_1dim.pkl','wb') as f:
    pkl.dump(Group2_L1_norm, f)
    
with open(outpath +f'{Group2}_L2_norm_1dim.pkl','wb') as f:
    pkl.dump(Group2_L2_norm, f)


# Persistence Entropy

In [18]:
Group1_dgms_list = replace_infinity_by_2(Group1_dgms_list)
Group2_dgms_list = replace_infinity_by_2(Group2_dgms_list)

PE = gd.representations.Entropy()
Group1_pe_dim = PE.fit_transform(Group1_dgms_list)
Group2_pe_dim = PE.fit_transform(Group2_dgms_list)

In [19]:
# Group1_dgms_list

In [20]:
with open(outpath +f'{Group1}_persistence_entropy.pkl','wb') as f:
    pkl.dump(Group1_pe_dim, f)

with open(outpath +f'{Group2}_persistence_entropy.pkl','wb') as f:
    pkl.dump(Group2_pe_dim, f)

In [21]:
Group1_Global_data, Group2_Global_data = pd.DataFrame(), pd.DataFrame()

Group1_Global_data['SubID'] = G1_ID_list
Group1_Global_data['L1_norm'] = Group1_L1_norm
Group1_Global_data['L2_norm'] = Group1_L2_norm
Group1_Global_data['pe_dim'] = Group1_pe_dim 

Group2_Global_data['SubID'] = G2_ID_list
Group2_Global_data['L1_norm'] = Group2_L1_norm
Group2_Global_data['L2_norm'] = Group2_L2_norm
Group2_Global_data['pe_dim'] = Group2_pe_dim

print(Group1,'\n', Group1_Global_data[['L1_norm', 'L2_norm', 'pe_dim']].mean())
print(Group2,'\n', Group2_Global_data[['L1_norm', 'L2_norm', 'pe_dim']].mean())
Group1_Global_data.to_csv(outpath + f'{Group1}_L1L2PE.txt', sep = '\t', index=False)
Group2_Global_data.to_csv(outpath + f'{Group2}_L1L2PE.txt', sep = '\t', index=False)

ASD 
 L1_norm    3.027599
L2_norm    0.509080
pe_dim     5.534529
dtype: float64
Healthy 
 L1_norm    3.428484
L2_norm    0.544434
pe_dim     5.515724
dtype: float64


In [22]:
Group1, len(Group1_Global_data), Group2, len(Group2_Global_data), 

('ASD', 395, 'Healthy', 425)

In [23]:
Group2_Global_data#, Group1_Global_data

Unnamed: 0,SubID,L1_norm,L2_norm,pe_dim
0,51491,3.530659,0.536122,5.509056
1,50656,3.639333,0.568448,5.592295
2,50659,5.459258,0.760068,5.559813
3,50664,3.380321,0.595569,5.571076
4,50665,4.665912,0.702905,5.519631
...,...,...,...,...
420,50574,4.902363,0.759227,5.552485
421,50575,3.973737,0.634904,5.596767
422,50576,3.845485,0.630360,5.514820
423,50577,3.878031,0.596013,5.599884


In [24]:
len(G2_ID_list)

425

In [19]:
    # DisMat = pd.read_csv(path_file + files_list[i], header = None, sep = ',').values
    # thres = np.sqrt(2)
    # CorrMat = pd.read_csv(path_file+'../FCM_CorrMat/' + files_list[i], header = None, sep = ',').values
    # CorrMat[CorrMat < 0] = 0
    # nonzero_count_corrmat = np.count_nonzero(CorrMat)
    # DisMat[DisMat > thres] = thres
    # nonzero_count = np.count_nonzero(DisMat)
    # count_x = np.sum(DisMat == thres)
    # print(20000 - nonzero_count_corrmat/2 ==  count_x/2)
    # if not 20000 - nonzero_count_corrmat/2 ==  count_x/2:
    #     print(SubID, 20000 - nonzero_count_corrmat/2,  count_x/2)
    #     break

Number of occurrences of 2: 2


In [22]:
2800/60

46.666666666666664