In [48]:
import numpy as np
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import time

In [99]:
# Function to compute Node Distance Distribution
def ndd(x):
    result = x.copy()
    total = np.sum(x[2])
    ndd = x[2]/total
    result[2] = ndd
    return result

# Function to generate Transition Matrix
def transition_matrix(x, metaboliteList, walk=1):
    g = x.copy()
    result_data = []  # Use a list to store results for better performance

    if walk == 1:
        for i in metaboliteList:
            row_i = g[g[0] == i]  # Filter rows for metabolite `i`
            row_sum = row_i[2].sum()  # Compute the sum of column 2
            row_sum = row_sum if row_sum != 0 else 1  # Handle zero-sum rows

            # Iterate through rows and compute normalized values
            for _, row in row_i.iterrows():
                val = row[2] / row_sum
                result_data.append([i, row[1], val])  # Store results as a list

        # Convert the accumulated results into a DataFrame at once
        result = pd.DataFrame(result_data, columns=[0, 1, 2])

    elif walk == 2:
        # Prepare walk_distances
        walk_distances = []

        g_dict = g.groupby(0)[[1, 2]].apply(lambda df: df.values.tolist()).to_dict()

        # Compute walk distances
        for i in metaboliteList:
            row_i = g_dict.get(i, [])
            for j, val1 in row_i:
                row_j = g_dict.get(j, [])
                for k, val2 in row_j:
                    walk_distances.append([i, k, val1 + val2])

        # Convert walk_distances to a DataFrame
        walk_distances = pd.DataFrame(walk_distances, columns=[0, 1, 2])

        # Normalize walk distances and calculate results
        for i in metaboliteList:
            row_i = walk_distances[walk_distances[0] == i]  # Filter rows for metabolite `i`
            row_sum = row_i[2].sum()  # Compute the sum of column 2
            row_sum = row_sum if row_sum != 0 else 1  # Handle zero-sum rows

            # Iterate through rows and compute normalized values
            for _, row in row_i.iterrows():
                val = row[2] / row_sum
                result_data.append([i, row[1], val])  # Store results as a list

        # Convert the accumulated results into a DataFrame at once
        result = pd.DataFrame(result_data, columns=[0, 1, 2])

    else:
        print('This function is limited to walk = 1 or 2')

    return result

# # Function to compute Kullback-Leibler Divergence
# def D(p, q):
#     result = 0
#     for i in range(len(p)):
#         if q[i] != 0:
#             result += p[i]*np.log(p[i]/q[i])
#     return result

# # Function to compute Jensen-Shannon Divergence
# def J(p, q):
#     m = (p+q)/2
#     DPM = D(p, m)
#     DQM = D(q, m)
#     return np.abs(DPM/2 + DQM/2)

# # Function for Distance Measure
# def M(Gp, Gq, metaboliteList):
#     result = []
#     for i in metaboliteList:
#         Gp_i = Gp[Gp[0]==i][2].to_numpy()
#         Gq_i = Gq[Gq[0]==i][2].to_numpy()
#         result.append(np.sqrt(J(Gp_i, Gq_i))/(2*np.sqrt(np.log(2))))
#     return np.mean(result)

In [5]:
data = []
data_name = []
for i in glob('/content/drive/MyDrive/Mendeley/running Met2Graph/Met2MetGraph/meanSumT2DM/MetGraphs/*.ncol'):
    data.append(pd.read_table(i, sep=' ', header=None))
    data_name.append(i.split('MetGraphs/')[-1].split('.')[0])

In [6]:
len(data)

8555

In [7]:
metaboliteList = []
metaboliteListAll = []

for i in data[0][0]:
    metaboliteList.append(i)
    metaboliteListAll.append(i)
for i in data[0][1]:
    metaboliteListAll.append(i)

metaboliteList = np.unique(metaboliteList)
metaboliteListAll = np.unique(metaboliteListAll)

In [98]:
start = time.time()

tm1 = transition_matrix(data[0], metaboliteList)

end = time.time()
print(f'Time TM1: {end-start}')

start = time.time()

tm2 = transition_matrix(data[0], metaboliteList, 2)

end = time.time()
print(f'Time TM2: {end-start}')

Time TM1: 4.3722851276397705


  g_dict = g.groupby(0).apply(lambda df: df.values.tolist()).to_dict()


Time TM2: 28.067373514175415


In [100]:
start = time.time()

tm1 = transition_matrix(data[0], metaboliteList)

end = time.time()
print(f'Time TM1: {end-start}')

start = time.time()

tm2 = transition_matrix(data[0], metaboliteList, 2)

end = time.time()
print(f'Time TM2: {end-start}')

Time TM1: 3.9815664291381836
Time TM2: 25.86453652381897


In [None]:
for i in tqdm(range(len(data))):
    ndd_i = ndd(data[i])
    ndd_i.to_csv(f'../NDD/{data_name[i]}.ncol', sep=' ', index=False, header=None)

100%|███████████████████████████████████████| 8555/8555 [01:37<00:00, 87.36it/s]


In [None]:
for i in tqdm(range(len(data))):
    tm_1 = transition_matrix(data[i], metaboliteList)
    tm_1.to_csv(f'../TransitionMatrix1/{data_name[i]}.ncol', sep=' ', index=False, header=None)

  1%|▏                                      | 53/8555 [03:06<8:25:25,  3.57s/it]

In [None]:
for i in tqdm(range(len(data))):
    tm_2 = transition_matrix(data[i], metaboliteList, 2)
    tm_2.to_csv(f'../TransitionMatrix2/{data_name[i]}.ncol', sep=' ', index=False, header=None)