In [3]:
# Read DRKG data as pandas data frame
# takes around 9 seconds
import pandas as pd
# header=None: with the default header=0 pandas turns the first line into column
# names, so a file without a header row silently loses its first triplet.
# NOTE(review): assumes this TSV has no header row (as the upstream DRKG dump
# appears to) -- the guard below covers the other case.
drkg_df = pd.read_csv('data/drkg/drkg_no_hetionet.tsv', sep='\t', header=None)
# Every field of a real triplet row contains '::' (the loops below split on it).
# If the first row does not, it is a header line and must not become a triplet.
if len(drkg_df) and not all('::' in str(value) for value in drkg_df.iloc[0]):
    drkg_df = drkg_df.iloc[1:]
triplets = drkg_df.values.tolist()
# print(drkg_df)

In [32]:
# Lower-cased relation code (second '::'-separated field of an edge string,
# spaces replaced by '_') -> readable relation name.
# Built once at definition time instead of on every parse_edge call, because
# parse_edge runs once per triplet.
# NOTE: the spelling 'diognostic_biomarker' is kept as-is; it ends up in edge-type
# names, so changing it would rename schema objects.
_EDGE_CODE_TO_NAME = {
    'ddi-interactor-in': 'ddi_interactor',
    'x-atc': 'belongs2ATC', 'a+': 'agonism_activation',
    'a-': 'antagonism_blocking', 'b': 'binding', 'c': 'inhibit_cell_growth', 'd': 'drug_targets',
    'e+': 'increase_expression_production', 'e-': 'decrease_expression_production',
    'e': 'affect_expression_production', 'g': 'promote_progression', 'h': 'same_protein_or_complex',
    'i': 'signaling_pathway', 'j': 'role_in_pathogenesis', 'k': 'metabolism_pharmacokinetics',
    'l': 'improper_regulation', 'md': 'diognostic_biomarker', 'mp': 'disease_progress_biomarker',
    'n': 'inhibit', 'o': 'transport_channel', 'pa': 'alleviate_reduce', 'pr': 'prevent_suppress',
    'q': 'production_by_cell_population',
    'rg': 'regulate', 'sa': 'sideEffect_adverseEvent', 't': 'treatment', 'te': 'possible_therapeutic_effect',
    'u': 'causal_mutation', 'ud': 'mutations_affecting_disease_course', 'v+': 'activate',
    'w': 'enhance_response', 'x': 'overexpression',
    'y': 'polymorphisms_alter_risk', 'z': 'enzyme_activity', 'ptmod': 'post_translational_modification',
    'humgenhumgen:gene:gene': 'hum_gen_hum_gen', 'virgenhumgen:gene:gene': 'vir_gen_hum_gen',
    'drugvirgen:compound:gene': 'drug_vir_gen',
    'drughumgen:compound:gene': 'drug_hum_gen', 'activation': 'activate', 'target': 'gene_target',
}


def parse_edge(e, src_type, tgt_type):
    """Turn a '::'-separated edge string into an edge-type name and its provider.

    Args:
        e: edge string; field 0 is used as the data provider and field 1 as the
            relation code (e.g. 'GNBR::T::Compound:Disease').
        src_type: source node type, appended to the name.
        tgt_type: target node type, appended to the name.

    Returns:
        dict with 'name' ('<provider>_<relation>_<src_type>2<tgt_type>') and
        'data_provider' (field 0 of ``e``). Relation codes that are not in the
        lookup table are used as-is (lower-cased, spaces replaced by '_').

    Raises:
        IndexError: if ``e`` contains no '::' separator.
    """
    edge_arr = e.split('::')
    provider = edge_arr[0]
    edge_type = edge_arr[1].lower().replace(' ', '_')
    # Fall back to the normalised code itself when there is no readable alias.
    relation = _EDGE_CODE_TO_NAME.get(edge_type, edge_type)
    return {'name': f'{provider}_{relation}_{src_type}2{tgt_type}', 'data_provider': provider}


# Process Schema info from DRKG (takes around 16 seconds)

In [52]:
# Schema bookkeeping filled while scanning the DRKG triplets:
#   edge_types_counts: edge-type name -> number of triplets seen with that name
#   edge_types:        '<src_type>::<tgt_type>' -> {edge-type name -> {} (attributes)}
#   node_types:        node type -> {} (attributes)
edge_types_counts = {}
edge_types = {}
node_types = {}
for tri in triplets:
    src, edge, tgt = tri

    # Node strings are split on '::'; field 0 is the type, field 1 the id.
    src_arr = src.split('::')
    tgt_arr = tgt.split('::')
    src_type = src_arr[0].replace(' ', '_')
    tgt_type = tgt_arr[0].replace(' ', '_')
    src_id = str(src_arr[1])
    tgt_id = str(tgt_arr[1])

    # Register both endpoint types (source first, then target).
    node_types.setdefault(src_type, {})
    node_types.setdefault(tgt_type, {})

    # Register the edge type under its (source type, target type) pair.
    type_edge = f'{src_type}::{tgt_type}'
    edges_of_pair = edge_types.setdefault(type_edge, {})
    edge_data = parse_edge(edge, src_type, tgt_type)
    edges_of_pair.setdefault(edge_data['name'], {})

    # Tally how often each edge-type name occurs.
    edge_types_counts[edge_data['name']] = edge_types_counts.get(edge_data['name'], 0) + 1

In [53]:
# Show how many triplets were counted for each generated edge-type name
edge_types_counts

{'bioarx_hum_gen_hum_gen_Gene2Gene': 58093,
 'bioarx_vir_gen_hum_gen_Gene2Gene': 535,
 'bioarx_drug_vir_gen_Compound2Gene': 1165,
 'bioarx_drug_hum_gen_Compound2Gene': 24501,
 'bioarx_covid2_acc_host_gene_Disease2Gene': 332,
 'bioarx_coronavirus_ass_host_gene_Disease2Gene': 129,
 'DGIDB_inhibitor_Gene2Compound': 5971,
 'DGIDB_antagonist_Gene2Compound': 3006,
 'DGIDB_other_Gene2Compound': 11070,
 'DGIDB_agonist_Gene2Compound': 3012,
 'DGIDB_binder_Gene2Compound': 143,
 'DGIDB_modulator_Gene2Compound': 243,
 'DGIDB_blocker_Gene2Compound': 979,
 'DGIDB_channel_blocker_Gene2Compound': 352,
 'DGIDB_antibody_Gene2Compound': 188,
 'DGIDB_positive_allosteric_modulator_Gene2Compound': 618,
 'DGIDB_allosteric_modulator_Gene2Compound': 317,
 'DGIDB_activator_Gene2Compound': 316,
 'DGIDB_partial_agonist_Gene2Compound': 75,
 'DRUGBANK_belongs2ATC_Compound2Atc': 15750,
 'DRUGBANK_ddi_interactor_Compound2Compound': 1379271,
 'DRUGBANK_gene_target_Compound2Gene': 19158,
 'DRUGBANK_enzyme_Compound2Gene

In [41]:
import numbers


def py2TigerType(o):
    """Return the attribute type name ('INT', 'FLOAT' or 'STRING') for a value.

    The decision is based on the value's actual type rather than on the text of
    ``str(type(o))``, so a class whose name merely contains "int" or "float"
    (e.g. ``Point``) is no longer mistaken for a number.

    Args:
        o: any Python value.

    Returns:
        'INT' for integral numbers, 'FLOAT' for other real numbers, and
        'STRING' for everything else (including bool, None, str, list, dict).
    """
    # bool is a subclass of int; keep reporting it as STRING as before.
    if isinstance(o, bool):
        return 'STRING'
    if isinstance(o, numbers.Integral):
        return 'INT'
    if isinstance(o, numbers.Real):
        return 'FLOAT'
    return 'STRING'

In [20]:
# parse hetionet data to create schema
# takes around 21 seconds
import json
# Open through a context manager so the file handle is closed as soon as parsing
# finishes, and pin the encoding instead of relying on the platform default.
with open('data/hetionet/hetionet-v1.0.json/hetionet-v1.0.json', encoding='utf-8') as hetionet_file:
    hetionet = json.load(hetionet_file)

In [42]:
# Record, per node kind, the attribute names found on hetionet nodes together with
# the type name derived from each value. Later nodes of the same kind overwrite
# the recorded type of an attribute.
for node in hetionet['nodes']:
    kind = node['kind'].replace(' ', '_')
    kind_attrs = node_types.setdefault(kind, {})
    for prop in node:
        # 'kind' and 'identifier' are not recorded as attributes.
        if prop == 'kind' or prop == 'identifier':
            continue
        # Exact match: a substring test would also send keys such as 'metadata'
        # down this branch and flatten node['data'] for them.
        if prop == 'data':
            # Entries of the nested 'data' mapping become attributes directly.
            for d in node['data']:
                kind_attrs[d] = py2TigerType(node['data'][d])
        else:
            kind_attrs[prop] = py2TigerType(node[prop])

print(json.dumps(node_types))

{"Gene": {"name": "STRING", "description": "STRING", "source": "STRING", "license": "STRING", "url": "STRING", "chromosome": "STRING"}, "Compound": {"name": "STRING", "license": "STRING", "source": "STRING", "inchikey": "STRING", "inchi": "STRING", "url": "STRING"}, "Disease": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Atc": {}, "Tax": {}, "Molecular_Function": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Side_Effect": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Biological_Process": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Pathway": {"name": "STRING", "license": "STRING", "source": "STRING", "url": "STRING"}, "Anatomy": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING", "mesh_id": "STRING", "bto_id": "STRING"}, "Cellular_Component": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Symptom":

In [40]:
# Scratch check: shows the string that str(type(...)) produces for a float value
str(type(12.1))

"<class 'float'>"

In [43]:
# takes around 8 seconds
# Register every hetionet edge kind under its (source type, target type) pair and
# record the attribute names / type names found on those edges.
for edge in hetionet['edges']:
    # Element 0 of source_id / target_id is used as the node type.
    src_type = edge['source_id'][0].replace(' ', '_')
    tgt_type = edge['target_id'][0].replace(' ', '_')
    kind = 'HETIONET_' + edge['kind'] + f'_{src_type}2{tgt_type}'
    type_edge = f'{src_type}::{tgt_type}'

    kind_attrs = edge_types.setdefault(type_edge, {}).setdefault(kind, {})

    for prop in edge:
        # These keys are not recorded as attributes.
        if prop in {'kind', 'identifier', 'source_id', 'target_id'}:
            continue
        # Exact match: a substring test would also send keys such as 'metadata'
        # down this branch and flatten edge['data'] for them.
        if prop == 'data':
            # Entries of the nested 'data' mapping become attributes directly.
            for d in edge['data']:
                kind_attrs[d] = py2TigerType(edge['data'][d])
        else:
            kind_attrs[prop] = py2TigerType(edge[prop])

print(json.dumps(edge_types))

{"Gene::Gene": {"bioarx_hum_gen_hum_gen_Gene2Gene": {}, "bioarx_vir_gen_hum_gen_Gene2Gene": {}, "GNBR_activate_Gene2Gene": {}, "GNBR_production_by_cell_population_Gene2Gene": {}, "GNBR_regulate_Gene2Gene": {}, "GNBR_binding_Gene2Gene": {}, "GNBR_signaling_pathway_Gene2Gene": {}, "GNBR_increase_expression_production_Gene2Gene": {}, "GNBR_same_protein_or_complex_Gene2Gene": {}, "GNBR_enhance_response_Gene2Gene": {}, "GNBR_affect_expression_production_Gene2Gene": {}, "INTACT_physical_association_Gene2Gene": {}, "INTACT_association_Gene2Gene": {}, "INTACT_colocalization_Gene2Gene": {}, "INTACT_dephosphorylation_reaction_Gene2Gene": {}, "INTACT_cleavage_reaction_Gene2Gene": {}, "INTACT_direct_interaction_Gene2Gene": {}, "INTACT_phosphorylation_reaction_Gene2Gene": {}, "INTACT_adp_ribosylation_reaction_Gene2Gene": {}, "INTACT_ubiquitination_reaction_Gene2Gene": {}, "INTACT_protein_cleavage_Gene2Gene": {}, "STRING_reaction_Gene2Gene": {}, "STRING_catalysis_Gene2Gene": {}, "STRING_activate_Gen

In [57]:
import numpy as np
# Count the DGIdb interaction rows that have both a gene id and a drug id.
dgidb_df = pd.read_csv('data/dgidb/interactions.tsv', sep='\t')
has_gene_id = dgidb_df['entrez_id'].notnull()
has_drug_id = dgidb_df['drug_concept_id'].notnull()
np.sum(has_gene_id & has_drug_id)

58690

In [None]:
# create schema
# One CREATE VERTEX statement per node type, followed by one CREATE DIRECTED EDGE
# statement per edge name of every '<source>::<target>' type pair.
schema_lines = [
    f'CREATE VERTEX {vertex_type} (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"\n'
    for vertex_type in node_types
]
for endpoints, edges_of_pair in edge_types.items():
    source_name, target_name = endpoints.split('::')
    for edge_name in edges_of_pair:
        schema_lines.append(
            f'CREATE DIRECTED EDGE {edge_name} (FROM {source_name}, TO {target_name}, data_provider STRING)\n'
        )
schema = ''.join(schema_lines)
# print(schema)

# Connect to the TigerGraph instance (local by default, or TigerGraph Cloud)

In [20]:
import os

import pyTigerGraph as tg
# Connection settings are read from environment variables so that credentials are
# not stored in the notebook. The defaults target a local instance.
#   TG_HOST      e.g. https://derman.i.tgcloud.io for a cloud instance
#   TG_GRAPH     graph name
#   TG_USERNAME  user name
#   TG_PASSWORD  password
#   TG_SECRET    secret used to request an API token (only needed for the
#                token-based connection that is commented out below)
host = os.environ.get('TG_HOST', 'http://localhost')
secret = os.environ.get('TG_SECRET', '')
graph_name = os.environ.get('TG_GRAPH', 'derman')
user_name = os.environ.get('TG_USERNAME', 'tigergraph')
password = os.environ.get('TG_PASSWORD', 'tigergraph')
# no need auth on local instance
# token = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password).getToken(secret, "1000000")[0]
# conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password, apiToken=token)
conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password)
conn.echo()

'Hello GSQL'

# Execute gsql to create schema in tgcloud

In [21]:
# Send the generated CREATE VERTEX / CREATE DIRECTED EDGE statements to the server,
# prefixed with 'use global', and print the server's response
print(conn.gsql('use global\n' + schema))
# print(conn.gsql('USE GLOBAL\n DROP ALL')) # delete all the data

Successfully created vertex types: [Gene].
Successfully created vertex types: [Compound].
Successfully created vertex types: [Disease].
Successfully created vertex types: [Atc].
Successfully created vertex types: [Tax].
Successfully created vertex types: [Biological_Process].
Successfully created vertex types: [Symptom].
Successfully created vertex types: [Anatomy].
Successfully created vertex types: [Molecular_Function].
Successfully created vertex types: [Pharmacologic_Class].
Successfully created vertex types: [Cellular_Component].
Successfully created vertex types: [Pathway].
Successfully created vertex types: [Side_Effect].
Successfully created edge types: [hum_gen_hum_gen_Gene2Gene].
Successfully created edge types: [vir_gen_hum_gen_Gene2Gene].
Successfully created edge types: [activate_Gene2Gene].
Successfully created edge types: [production_by_cell_population_Gene2Gene].
Successfully created edge types: [regulate_Gene2Gene].
Successfully created edge types: [binding_Gene2Gene].

# Prepare edge bulks for bulk insert

In [23]:
# Takes around 60 seconds.
# Group all triplets for bulk insertion:
#   edge_bulks[src_type][edge_name][tgt_type] -> list of
#   (src_id, tgt_id, {"data_provider": ...}) tuples
edge_bulks = {}
for tri in triplets:
    [src, edge, tgt] = tri
    # Node strings are split on '::'; field 0 is the type, field 1 the id.
    src_arr = src.split('::')
    src_type = src_arr[0].replace(' ', '_')
    src_id = str(src_arr[1])
    tgt_arr = tgt.split('::')
    tgt_type = tgt_arr[0].replace(' ', '_')
    tgt_id = str(tgt_arr[1])

    # use this to check if there is any empty!
    # if len(src_id) < 1 or len(tgt_id) < 1:
    #     print (tri)

    # parse_edge needs both endpoint types: they are part of the edge-type name,
    # and the name must equal the one generated for the schema.
    edge_data = parse_edge(edge, src_type, tgt_type)
    edge_name = edge_data['name']

    # below block is used with pyTigerGraph
    bulk = edge_bulks.setdefault(src_type, {}).setdefault(edge_name, {}).setdefault(tgt_type, [])
    bulk.append((src_id, tgt_id, {"data_provider": edge_data['data_provider']}))
    # print(src_type, src_id, edge, tgt_type, tgt_id)
    

# Insert the edge bulks chunk by chunk

- Takes around 4 MINUTES! Be patient. (In my local TigerGraph, it takes around 60 seconds!)
- The original data contains 29 Genes without IDs; I deleted them. See https://github.com/gnn4dr/DRKG/issues/32
- The fixed drkg.tsv is [here](https://drive.google.com/file/d/1Hc1mMEyh_4p6qHm4VLK2isb-E51himCA/view?usp=sharing)


In [25]:
# Firstly, create schema in your graph.
# use edge_bulks to insert BIG data chunk by chunk
CHUNK_SIZE = 100000
cnt = 0
prev_cnt_stop = 0 # if somehow an error occurs, set this variable to not start from zero 
for src_type in edge_bulks:
    for edge in edge_bulks[src_type]:
        for tgt_type in edge_bulks[src_type][edge]:
            edges = edge_bulks[src_type][edge][tgt_type]
            for i in range(0, len(edges), CHUNK_SIZE):
                if cnt >= prev_cnt_stop: 
                    print(cnt)
                    conn.upsertEdges(src_type, edge, tgt_type, edges[i:i + CHUNK_SIZE])
                cnt += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
