In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import random
from tqdm import tqdm

path = 'data/'

In [2]:
def read_files(filename, merge=False):
    if merge == True:
        df = pd.read_csv(path + filename, dtype={'t':str})
    else:    
        df = pd.read_csv(path + filename)
    print('read_files| read file {} with shape {}'.format(filename, df.shape))
    print()
    return df

def get_df_by_relation(df_m, list_r):
    df = df_m.loc[df_m['r'].isin(list_r)]
    print('get_df_by_relation| df: {}'.format(df.shape))
    return df

def get_list_kinase(filename):
    df = pd.read_csv(path + filename)
    df = df.rename(columns = {'uniprot':'id'})

    lst= list(df['id'].unique())
    print('get_list_kinase| df: {}, list: {}'.format(len(lst), df.shape))
    print()
    
    return lst, df
   
def filter_merge_by_head_N_relation(df_m, list_h, list_r):
    df_sel = df_m.loc[(df_m['h'].isin(list_h)) & (df_m['r'].isin(list_r))]
    print('filter_merge_by_head_N_relation| df_sel: {}'.format(df_sel.shape))
    
def df_to_dict(df):
    d = dict()
    for i in df.itertuples():
        
        h_id = i[1]
        t_id = i[2]
        
        if h_id not in d:
            d[h_id] = set()
        d[h_id].add(t_id)
        
        if t_id not in d:
            d[t_id] = set()
        d[t_id].add(h_id)
    print('df_to_dict| d:{}'.format(len(d)))
    return d

def randomly_sample_df(df, rnd_state = 1, split_ratio = 3):
    split_number = int(df.shape[0] / split_ratio)
    
    df_short = df[['h_id', 't_id', 'r_id']]
    d = df_to_dict(df_short)
    
    idx_list = df_short.index
    idx = random.choices(idx_list, k = split_number)
    
    selected_idx = list()
    
    for i in tqdm(range(len(idx))):
        
        res = df_short.iloc[i, [0, 1]].to_numpy()
        h_id = res[0]
        t_id = res[1]
        
        if (len(d[h_id]) > 1) and (len(d[t_id]) > 1) :
            try:
                d[h_id].remove(t_id)
                d[t_id].remove(h_id)
                selected_idx.append(i)
            except KeyError:
                print(idx, h_id, t_id, d[h_id], d[t_id])
                
    print('randomly_sample_df| delete dictionary.')        
    del(d)
    
    print('randomly_sample_df| selected_idx: {}, split_number: {}'.format(len(selected_idx), split_number))
    
    df_sample = df.iloc[selected_idx]
    return df_sample
    
def split_df(df,df_sample):
    sample_index = df_sample.index
    df_rest = df.loc[~df.index.isin(sample_index)]
    print('split_df| df_rest: {}, df_sample: {}, df: {}'.format(df_rest.shape, df_sample.shape, df.shape))
    return df_rest

In [3]:
def split_process(edge_file, node_file, relation_file, light_file, dark_file, train_file, test_file):
    df_merge      = read_files(edge_file, merge=True)
    df_nodes      = read_files(node_file)
    df_relations  = read_files(relation_file)

    print('split_process| df_merge: {}, df_nodes: {}, df_relations: {}'.format(df_merge.shape, df_nodes.shape, df_relations.shape))

    list_light, df_light = get_list_kinase(light_file)
    list_dark, df_dark = get_list_kinase(dark_file)

    df_light_pathway = filter_merge_by_head_N_relation(df_merge, list_light, ['hasPathway'])
    df_dark_pathway = filter_merge_by_head_N_relation(df_merge, list_dark, ['hasPathway'])

    df_hasPathway = get_df_by_relation(df_merge, ['hasPathway'])

    df_sample = randomly_sample_df(df_hasPathway)
    df_rest = split_df(df_merge, df_sample)

    print('split_process| saving...')
    df_sample.to_csv(path + test_file, index=False)
    df_rest.to_csv(path + train_file, index=False)
    print('split_process| saving done.')
    
    return df_sample, df_rest, df_merge


In [4]:
def get_edges_by_relation(df, r):
    return df[df['r'] == r]

def get_column_uniq(df, r):
    return set(df[r].unique())

def check_nodes_appears_in_both_splits(df_m, df_s):
    df_m_hasPathway = get_edges_by_relation(df_m, 'hasPathway')
    set_m_pathways = get_column_uniq(df_m_hasPathway, 't')
    set_m_proteins = get_column_uniq(df_m_hasPathway, 'h')

    df_s_hasPathway = get_edges_by_relation(df_s, 'hasPathway')
    set_s_pathways = get_column_uniq(df_s_hasPathway, 't')
    set_s_proteins = get_column_uniq(df_s_hasPathway, 'h')
    
    print('check_nodes_appears_in_both_splits| set_m_pathways: {}, set_s_pathways: {}'.format(len(set_m_pathways), len(set_s_pathways)))
    print('check_nodes_appears_in_both_splits| number of common nodes: {}'.format(len(set_m_pathways & set_s_pathways)))
    print()
    
    print('check_nodes_appears_in_both_splits| set_m_proteins: {}, set_s_proteins: {}'.format(len(set_m_proteins), len(set_s_proteins)))
    print('check_nodes_appears_in_both_splits| number of common nodes: {}'.format(len(set_m_proteins & set_s_proteins)))
    # df_s['r'].unique()

### For Original Graph, Merge

In [6]:
df_s, df_rest,df_m = split_process('df_merge_cc.csv', 'df_nodes_cc.csv', 'df_relations.csv', 'light_kinase.csv', 'dark_kinase.csv', 'df_merge_cc_train.csv','df_merge_cc_test.csv')
check_nodes_appears_in_both_splits(df_m, df_s)

read_files| read file df_merge_cc.csv with shape (2886875, 6)

read_files| read file df_nodes_cc.csv with shape (212557, 2)

read_files| read file df_relations.csv with shape (7, 2)

split_process| df_merge: (2886875, 6), df_nodes: (212557, 2), df_relations: (7, 2)
get_list_kinase| df: 385, list: (385, 3)

get_list_kinase| df: 160, list: (160, 3)

filter_merge_by_head_N_relation| df_sel: (1725, 6)
filter_merge_by_head_N_relation| df_sel: (136, 6)
get_df_by_relation| df: (170965, 6)


  2%|▏         | 1173/56988 [00:00<00:09, 5884.71it/s]

df_to_dict| d:58351


100%|██████████| 56988/56988 [00:09<00:00, 5866.87it/s]


randomly_sample_df| delete dictionary.
randomly_sample_df| selected_idx: 35759, split_number: 56988
split_df| df_rest: (2851116, 6), df_sample: (35759, 6), df: (2886875, 6)
split_process| saving...
split_process| saving done.
check_nodes_appears_in_both_splits| set_m_pathways: 1584, set_s_pathways: 892
check_nodes_appears_in_both_splits| number of common nodes: 892

check_nodes_appears_in_both_splits| set_m_proteins: 56767, set_s_proteins: 9470
check_nodes_appears_in_both_splits| number of common nodes: 9470


In [7]:
df_rest.shape

(2851116, 6)

In [8]:
def load_graph(df):
    
    print('load_graph| df : {}'.format(df.shape))
    df_edges = df[['h_id', 't_id','r_id']]
    edges = [tuple(x) for x in df_edges.values]
    print('load_graph| edges: {}'.format(len(edges)))

    GRAPH = nx.Graph()   # or DiGraph, MultiGraph, MultiDiGraph, etc
    GRAPH.add_weighted_edges_from(edges)
    return GRAPH

def Connected_component(G):
    connected_components_list_len = [len(c) for c in sorted(nx.connected_components(G), key=len, reverse=True)]
    print('__check_connectivity| number of connected components: {}'.format(len(connected_components_list_len)))
    cc_g = list( nx.connected_components(G) )
    total_from_cc = np.sum(connected_components_list_len) # IT WAS OKAY
    print('__check_connectivity| number of total node in data based on the Networkx\'s Graph {}'.format(total_from_cc) )
    return cc_g, connected_components_list_len
    

In [9]:
g_train = load_graph(df_rest)
cc_g, connected_components_list_len = Connected_component(g_train)

load_graph| df : (2851116, 6)
load_graph| edges: 2851116
__check_connectivity| number of connected components: 1
__check_connectivity| number of total node in data based on the Networkx's Graph 212557


### check reachable Nodes cause the 196430

In [None]:
df_merge_cc_train = pd.read_csv('data/df_merge_cc_train.csv')
df_merge_cc_test = pd.read_csv('data/df_merge_cc_test.csv')
df_nodes_cc = pd.read_csv('data/df_nodes_cc.csv')
df_nodes = pd.read_csv('data/df_nodes.csv') 
print('df_merge_cc_train: {}, df_merge_cc_test: {}, df_nodes_cc: {}, df_nodes: {}'.format(df_merge_cc_train.shape, df_merge_cc_test.shape, df_nodes_cc.shape, df_nodes.shape))

In [None]:
{type(i) for i in df_merge_cc_train['t'] }

In [None]:
h_lst = list(df_merge_cc_train['h'].unique())
t_lst = list(df_merge_cc_train['t'].unique())

nodes_train = list(set(h_lst + t_lst))
set_nodes_train = set(nodes_train)
print('nodes_train: {}'.format(len(nodes_train)))

In [None]:
h_lst2 = list(df_merge_cc_test['h'].unique())
t_lst2 = list(df_merge_cc_test['t'].unique())
t_lst2 = [str(i) for i in t_lst2]
print({type(i) for i in t_lst2})

nodes_test = list(set(h_lst2+ t_lst2))
print('nodes_test: {}'.format(len(nodes_test)))

In [None]:
c = 0
for i in nodes_test:
    if i not in set_nodes_train:
        c+=1
c

In [None]:
nodes_cc = list(df_nodes_cc['name'])
set_nodes_cc = set(nodes_cc)
len(nodes_cc), len(set_nodes_cc)

In [None]:
for i in nodes_test:
    if i not in set_nodes_cc:
        c+=1
c

In [None]:
nodes = list(df_nodes['name'])
set_nodes = set(nodes)
len(nodes), len(set_nodes)

In [None]:
for i in nodes_test:
    if i not in set_nodes:
        c+=1
c

In [None]:
212557 - 196431

In [None]:
d_train = dict()
for i in df_merge_cc_train.itertuples():
    h = i[1]
    t = i[2]
    
    if h not in d_train:
        d_train[h] = 0
    d_train[h] += 1
    
    if t not in d_train:
        d_train[t] = 0
    d_train[t] +=1
len(d_train)

In [None]:
c = 0
for d in d_train:
    if d_train[d] == 4:
        c+=1
c 

In [None]:
st = set()
for d in d_train:
    st.add(d_train[d])
np.std(list(st)), np.mean(list(st))