In [2]:
import pandas as pd
import numpy as np
# Display settings: print numpy arrays in full instead of summarizing long ones,
# and let pandas render every row and every column instead of truncating the view.
np.set_printoptions(threshold = np.inf)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
from collections import Counter

In [72]:
def read_original_data(dataset_name = 'nr', base_dir = '/home/chujunyi/1_MFDF/1_original_data/'):
    """Load the original interaction list of one dataset from its text file.

    The file read is
    ``<base_dir>/<dataset_name upper-cased>/<dataset_name>_admat_dgc_mat_2_line.txt``.
    Every non-empty line is split on tab characters and returned as a tuple.

    Parameters
    ----------
    dataset_name : str
        Dataset key. It is upper-cased for the sub-directory name and used
        verbatim as the file-name prefix.
    base_dir : str
        Directory that contains one sub-directory per dataset. Defaults to the
        location originally hard-coded in this notebook.

    Returns
    -------
    list of tuple of str
        One tuple per non-empty line, in file order.
        NOTE(review): presumably each tuple is (drug_id, target_id) -- confirm
        against the file format.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    import os

    path = os.path.join(base_dir, dataset_name.upper(),
                        dataset_name + '_admat_dgc_mat_2_line.txt')
    with open(path, 'r') as f:
        # Drop the line terminator, then skip empty lines: a blank (e.g. trailing)
        # line would otherwise become a 1-tuple ('',), inflate the count and make
        # the rows ragged.
        stripped = (line.rstrip('\n') for line in f)
        data = [tuple(line.split('\t')) for line in stripped if line]
    print('# DTI = ', len(data))

    return data

In [59]:
def read_updated_data(dataset_name = 'nr', base_dir = '/home/chujunyi/2_Program/2_output_file/1_construct_dataset/3_drop_repeat_smiles_from2folder/'):
    """Load the updated interaction list of one dataset from its CSV file.

    The file read is
    ``<base_dir>/v2_<dataset_name>_updated_drug_smiles_ids_drop_repeated.csv``.
    Only the ``drug_id`` and ``hsa_id`` columns are used.

    Parameters
    ----------
    dataset_name : str
        Dataset key, inserted verbatim into the file name (case matters).
    base_dir : str
        Directory holding the CSV files. Defaults to the location originally
        hard-coded in this notebook.

    Returns
    -------
    list of tuple
        One ``(drug_id, hsa_id)`` tuple per CSV row, in file order.

    Raises
    ------
    KeyError
        If the CSV lacks a ``drug_id`` or ``hsa_id`` column.
    """
    import os

    path = os.path.join(base_dir,
                        'v2_' + dataset_name + '_updated_drug_smiles_ids_drop_repeated.csv')
    table = pd.read_csv(path)
    # itertuples(index=False, name=None) yields plain tuples directly, avoiding
    # the per-row Series that DataFrame.apply(tuple, axis=1) has to construct.
    data = list(table[['drug_id', 'hsa_id']].itertuples(index=False, name=None))
    print('# DTI = ', len(data))
    return data

In [112]:
def calc_data(data):
    """Print summary statistics of a drug-target interaction (DTI) list.

    ``data`` is treated as an edge list: element 0 of every entry is the drug
    and element 1 is the target. A bipartite graph is built from it and the
    following are printed: number of drugs / targets / interactions, whether
    the graph is connected, the bipartite density, the mean number of
    interactions per drug (Dd) and per target (Dt), and the percentage of
    drugs and of targets that have exactly one interaction.

    Parameters
    ----------
    data : sequence of (drug, target) pairs
        The interactions; duplicates are expected to be absent (see the
        density check below).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``data`` is empty.
    AssertionError
        If the graph is not bipartite, if an identifier appears both as a
        drug and as a target, or if ``data`` contains duplicate pairs.
    """
    if len(data) == 0:
        raise ValueError('calc_data() needs at least one (drug, target) pair')

    # Take node labels straight from the pairs. Going through np.array() would
    # coerce every identifier to a string, so non-string identifiers would no
    # longer match the endpoints added by add_edges_from() below.
    drugs = {pair[0] for pair in data}
    targets = {pair[1] for pair in data}
    nd, nt, ndti = len(drugs), len(targets), len(data)
    print('# drugs = {}, # targets = {}, # DTI = {}'.format(nd, nt, ndti))

    bg = nx.Graph() # bg = bipartite graph
    bg.add_nodes_from(drugs, bipartite = 'drug')
    bg.add_nodes_from(targets, bipartite = 'target')
    bg.add_edges_from(data)
    assert nx.is_bipartite(bg), 'interaction graph is not bipartite'
    print('nx.is_connected(bg): ', nx.is_connected(bg))

    # Recover both node sets from the graph itself; a label used for both a drug
    # and a target collapses into one node and is caught by the size checks.
    target_nodes = {node for node, attrs in bg.nodes(data=True) if attrs['bipartite'] == 'target'}
    drug_nodes = set(bg) - target_nodes
    assert len(target_nodes) == len(targets), 'some identifier is used as both drug and target'
    assert len(drug_nodes) == len(drugs), 'some identifier is used as both drug and target'

    density = bipartite.density(bg, drug_nodes)
    # nx.Graph stores each edge once, so this only holds without duplicate pairs.
    assert density == ndti / (nd * nt), 'data contains duplicate (drug, target) pairs'

    # bipartite.degrees() returns the degrees of the *other* node set first and
    # of the set passed in second, hence (targets, drugs) here.
    target_degrees, drug_degrees = bipartite.degrees(bg, drug_nodes)
    Dd_1 = sum(1 for _, degree in drug_degrees if degree == 1) / nd
    Dt_1 = sum(1 for _, degree in target_degrees if degree == 1) / nt

    print('Density (%) = {:.2f}'.format(density * 100))
    print('Dd = {:.2f}, Dt = {:.2f}'.format(ndti / nd, ndti / nt))
    print('[Dd = 1] (%) = {:.2f}, [Dt = 1] (%) = {:.2f}'.format(Dd_1 * 100, Dt_1 * 100))
    return

In [110]:
def show_stat(o_dataset_name = 'nr', u_dataset_name = 'nr'):
    """Print the statistics of the original and the updated version of a dataset.

    For each version a header line is printed, the interaction list is loaded
    (``read_original_data`` / ``read_updated_data``) and passed to ``calc_data``.

    Parameters
    ----------
    o_dataset_name : str
        Dataset key handed to ``read_original_data``.
    u_dataset_name : str
        Dataset key handed to ``read_updated_data``.

    Returns
    -------
    None
    """
    print('\n==========' + str.upper(o_dataset_name) + '   Original dataset info:')
    o_data = read_original_data(dataset_name = o_dataset_name)
    calc_data(o_data)
    # Label the second section with the name of the dataset that is actually
    # loaded for it (previously the original dataset's name was printed here).
    print('\n==========' + str.upper(u_dataset_name) + '   Updated dataset info:')
    u_data = read_updated_data(dataset_name = u_dataset_name)
    calc_data(u_data)
    return

In [113]:
# Report original vs. updated statistics for each dataset, in this order.
# Each pair is (key of the original dataset, key of the updated dataset).
for original_key, updated_key in (('nr', 'nr'), ('gpcr', 'GPCR'), ('ic', 'IC'), ('e', 'E')):
    show_stat(o_dataset_name = original_key, u_dataset_name = updated_key)


# DTI =  90
# drugs = 54, # targets = 26, # DTI = 90
nx.is_connected(bg):  False
Density (%) = 6.41
Dd = 1.67, Dt = 3.46
[Dd = 1] (%) = 72.22, [Dt = 1] (%) = 30.77

# DTI =  886
# drugs = 541, # targets = 33, # DTI = 886
nx.is_connected(bg):  False
Density (%) = 4.96
Dd = 1.64, Dt = 26.85
[Dd = 1] (%) = 65.99, [Dt = 1] (%) = 18.18

# DTI =  635
# drugs = 223, # targets = 95, # DTI = 635
nx.is_connected(bg):  False
Density (%) = 3.00
Dd = 2.85, Dt = 6.68
[Dd = 1] (%) = 47.53, [Dt = 1] (%) = 35.79

# DTI =  5383
# drugs = 1680, # targets = 156, # DTI = 5383
nx.is_connected(bg):  False
Density (%) = 2.05
Dd = 3.20, Dt = 34.51
[Dd = 1] (%) = 46.13, [Dt = 1] (%) = 14.74

# DTI =  1476
# drugs = 210, # targets = 204, # DTI = 1476
nx.is_connected(bg):  False
Density (%) = 3.45
Dd = 7.03, Dt = 7.24
[Dd = 1] (%) = 38.57, [Dt = 1] (%) = 11.27

# DTI =  6385
# drugs = 765, # targets = 238, # DTI = 6385
nx.is_connected(bg):  False
Density (%) = 3.51
Dd = 8.35, Dt = 26.83
[Dd = 1] (%) = 21.70, [Dt