In [1]:
import numpy as np
import pandas as pd
import os
import shutil
from glob import glob
from tqdm import tqdm

In [2]:
# Function to compute Kullback-Leibler Divergence
def D(p, q):
    result = 0
    for i in range(len(p)):
        if q[i] != 0:
            result += p[i]*np.log(p[i]/q[i])
    return result

# Function to compute Jensen-Shannon Divergence
def J(p, q):
    m = (p+q)/2
    DPM = D(p, m)
    DQM = D(q, m)
    return np.abs(DPM/2 + DQM/2)

# Function for Distance Measure
def M(Gp, Gq, metaboliteList):
    result = []
    for i in metaboliteList:
        Gp_i = Gp[Gp[0]==i][2].to_numpy()
        Gq_i = Gq[Gq[0]==i][2].to_numpy()
        result.append(np.sqrt(J(Gp_i, Gq_i))/(2*np.sqrt(np.log(2))))
    return np.mean(result)

## Data Preparation

In [3]:
reference_dir = '/content/drive/MyDrive/Mendeley/running Met2Graph/Filter_NonT2DM_to_Healthy/100_200_references_for_Netpro2vec'

thresholds = [
    # '1_80',
    # '2_60',
    '2_80'
]
N = [
    # 100,
    200,
    # 300,
    # 400
]
targets = [
    # 'meanSum',
    'minSum',
    # 'minMax'
]
formats = [
    'NDD',
    'TransitionMatrix1',
    'TransitionMatrix2',
]

for f in tqdm(formats):
    for target in targets:
        for t in thresholds:
            for n in N:
                reference = pd.read_excel(os.path.join(reference_dir, f'{n}_{target}_sample_selection_{t}.xlsx'))
                src = f'/content/drive/MyDrive/Mendeley/running Met2Graph/Met2MetGraph/{target}T2DM/8555_ndd_transition/{f}'
                dest = f'/content/drive/MyDrive/Mendeley/running Met2Graph/Met2MetGraph/{target}T2DM/{n}_ndd_transition/{f}'

                if os.path.exists(dest) != True:
                    os.mkdir(dest)

                for i in reference.index:
                    row_i = reference.loc[i]
                    gtex = row_i['gtex']
                    filename = f'{target}_{gtex}.ncol'
                    shutil.copy(os.path.join(src, filename), os.path.join(dest, filename))

100%|██████████| 3/3 [06:47<00:00, 135.87s/it]


## Run Distance Matrix

In [5]:
N = [
    # 100,
    200,
    # 300,
    # 400
]
targets = [
    # 'meanSum',
    'minSum',
    # 'minMax'
]

for target in target:
    for n in N:
        main = f'/content/drive/MyDrive/Mendeley/running Met2Graph/Met2MetGraph/{target}T2DM/{n}_ndd_transition'
        ndd_dir = os.path.join(main, 'NDD')
        tm1_dir = os.path.join(main, 'TransitionMatrix1')
        tm2_dir = os.path.join(main, 'TransitionMatrix2')
        out_dir = main

        sample_filename = [] # calling gtex filename
        sample_name = [] # calling gtex name
        for i in os.listdir(ndd_dir):
            sample_filename.append(i)
            sample_name.append(i.split('_')[-1].split('.')[0])

        metaboliteList = []
        for i in pd.read_table(os.path.join(ndd_dir, sample_filename[0]), sep=' ', header=None)[0]:
            metaboliteList.append(i)
        metaboliteList = np.unique(metaboliteList)

        distance_matrix = {}
        for i in sample_name:
            distance_matrix[i] = [0]*len(sample_name)

        distance_matrix = pd.DataFrame(distance_matrix, index=sample_name)

        for i in range(len(sample_filename)):
            for j in tqdm(range(len(sample_filename))):
                if i == j:
                    continue
                else:
                    filename_i = sample_filename[i]
                    filename_j = sample_filename[j]
                    ndd_i = pd.read_table(os.path.join(ndd_dir, filename_i), sep=' ', header=None)
                    tm1_i = pd.read_table(os.path.join(tm1_dir, filename_i), sep=' ', header=None)
                    tm2_i = pd.read_table(os.path.join(tm2_dir, filename_i), sep=' ', header=None)
                    ndd_j = pd.read_table(os.path.join(ndd_dir, filename_j), sep=' ', header=None)
                    tm1_j = pd.read_table(os.path.join(tm1_dir, filename_j), sep=' ', header=None)
                    tm2_j = pd.read_table(os.path.join(tm2_dir, filename_j), sep=' ', header=None)
                    M1 = M(ndd_i, ndd_j, metaboliteList)
                    M2 = M(tm1_i, tm1_j, metaboliteList)
                    M3 = M(tm2_i, tm2_j, metaboliteList)
                    D3 = M1 + M2 + M3
                    distance_matrix.at[sample_name[i], sample_name[j]] = D3

        distance_matrix.to_excel(os.path.join(main, f'distance_matrix_{n}.xlsx'))

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Mendeley/running Met2Graph/Met2MetGraph/mT2DM/200_ndd_transition/NDD'

## Trial

In [None]:
ndd_dir = '../100_ndd_transition/NDD'
tm1_dir = '../100_ndd_transition/TransitionMatrix1'
tm2_dir = '../100_ndd_transition/TransitionMatrix2'
out_dir = '../100_ndd_transition/DistanceMatrix'

if os.path.exists(out_dir) == False:
    os.mkdir(out_dir)

In [None]:
sample_filename = [] # calling gtex filename
sample_name = [] # calling gtex name
for i in os.listdir(ndd_dir):
    sample_filename.append(i)
    sample_name.append(i.split('_')[-1].split('.')[0])

In [None]:
metaboliteList = []

for i in pd.read_table(os.path.join(ndd_dir, sample_filename[0]), sep=' ', header=None)[0]:
    metaboliteList.append(i)

metaboliteList = np.unique(metaboliteList)

In [None]:
distance_matrix = {}
for i in sample_name:
    distance_matrix[i] = [0]*len(sample_name)

distance_matrix = pd.DataFrame(distance_matrix, index=sample_name)

In [None]:
for i in range(len(sample_filename)):
    for j in tqdm(range(len(sample_filename))):
        if i == j:
            continue
        else:
            filename_i = sample_filename[i]
            filename_j = sample_filename[j]
            ndd_i = pd.read_table(os.path.join(ndd_dir, filename_i), sep=' ', header=None)
            tm1_i = pd.read_table(os.path.join(tm1_dir, filename_i), sep=' ', header=None)
            tm2_i = pd.read_table(os.path.join(tm2_dir, filename_i), sep=' ', header=None)
            ndd_j = pd.read_table(os.path.join(ndd_dir, filename_j), sep=' ', header=None)
            tm1_j = pd.read_table(os.path.join(tm1_dir, filename_j), sep=' ', header=None)
            tm2_j = pd.read_table(os.path.join(tm2_dir, filename_j), sep=' ', header=None)
            M1 = M(ndd_i, ndd_j, metaboliteList)
            M2 = M(tm1_i, tm1_j, metaboliteList)
            M3 = M(tm2_i, tm2_j, metaboliteList)
            D3 = M1 + M2 + M3
            distance_matrix.at[sample_name[i], sample_name[j]] = D3

100%|█████████████████████████████████████████| 100/100 [27:53<00:00, 16.74s/it]
100%|█████████████████████████████████████████| 100/100 [27:53<00:00, 16.74s/it]
100%|█████████████████████████████████████████| 100/100 [27:51<00:00, 16.71s/it]
100%|█████████████████████████████████████████| 100/100 [27:51<00:00, 16.71s/it]
100%|█████████████████████████████████████████| 100/100 [27:51<00:00, 16.71s/it]
100%|█████████████████████████████████████████| 100/100 [27:50<00:00, 16.71s/it]
100%|█████████████████████████████████████████| 100/100 [27:51<00:00, 16.71s/it]
100%|█████████████████████████████████████████| 100/100 [27:52<00:00, 16.72s/it]
100%|█████████████████████████████████████████| 100/100 [27:51<00:00, 16.72s/it]
100%|█████████████████████████████████████████| 100/100 [27:51<00:00, 16.71s/it]
100%|█████████████████████████████████████████| 100/100 [27:55<00:00, 16.75s/it]
100%|█████████████████████████████████████████| 100/100 [27:54<00:00, 16.75s/it]
100%|███████████████████████

In [None]:
distance_matrix.to_excel('../100_ndd_transition/distance_matrix.xlsx')