In [1]:
# Read the DRKG triplets into a pandas DataFrame (takes around 9 seconds).
# The published DRKG drkg.tsv has no header row, so header=None is required:
# without it pandas consumes the first triplet as column names and that edge
# is silently missing from everything built below.
import pandas as pd
drkg_df = pd.read_csv('data/drkg/drkg.tsv', sep='\t', header=None,
                      names=['source', 'relation', 'target'])
# One [source, relation, target] list per row; the later cells iterate over these.
triplets = drkg_df.values.tolist()

In [2]:
# Lower-cased, space-to-underscore relation code -> readable edge name.
# Built once at definition time: parse_edge() is called for every triplet, so
# rebuilding this dict inside the function was pure overhead.
# NOTE: the values end up as edge type names in the generated schema, so the
# existing spellings ('diognostic', 'downregulatation', 'resemblence') are kept
# byte-for-byte; correcting them would rename edge types in the graph.
_EDGE_NAME_BY_CODE = {
    'ddi-interactor-in': 'ddi_interactor',
    'x-atc': 'belongs2ATC',
    'a+': 'agonism_activation',
    'a-': 'antagonism_blocking',
    'b': 'binding',
    'c': 'inhibit_cell_growth',
    'd': 'drug_targets',
    'e+': 'increase_expression_production',
    'e-': 'decrease_expression_production',
    'e': 'affect_expression_production',
    'g': 'promote_progression',
    'h': 'same_protein_or_complex',
    'i': 'signaling_pathway',
    'j': 'role_in_pathogenesis',
    'k': 'metabolism_pharmacokinetics',
    'l': 'improper_regulation',
    'md': 'diognostic_biomarker',
    'mp': 'disease_progress_biomarker',
    'n': 'inhibit',
    'o': 'transport_channel',
    'pa': 'alleviate_reduce',
    'pr': 'prevent_suppress',
    'q': 'production_by_cell_population',
    'rg': 'regulate',
    'sa': 'sideEffect_adverseEvent',
    't': 'treatment',
    'te': 'possible_therapeutic_effect',
    'u': 'causal_mutation',
    'ud': 'mutations_affecting_disease_course',
    'v+': 'activate',
    'w': 'enhance_response',
    'x': 'overexpression',
    'y': 'polymorphisms_alter_risk',
    'z': 'enzyme_activity',
    'adg': 'downregulatation',
    'aeg': 'expression',
    'aug': 'upregulation',
    'cbg': 'binding',
    'ccse': 'cause',
    'cdg': 'downregulation',
    'cpd': 'palliation',
    'crc': 'resemblence',
    'ctd': 'treatment',
    'cug': 'upregulation',
    'dag': 'association',
    'ddg': 'downregulation',
    'dla': 'localization',
    'dps': 'presents',
    'drd': 'resemblence',
    'dug': 'upregulation',
    'gcg': 'covariation',
    'gig': 'interaction',
    'gpbp': 'participation',
    'gpcp': 'participation',
    'gpmf': 'participation',
    'gpgw': 'participation',
    'gpcc': 'participation',
    'gppw': 'participation',
    'gr>g': 'regulation',
    'pcic': 'inclusion',
    'ptmod': 'post_translational_modification',
    'humgenhumgen:gene:gene': 'hum_gen_hum_gen',
    'virgenhumgen:gene:gene': 'vir_gen_hum_gen',
    'drugvirgen:compound:gene': 'drug_vir_gen',
    'drughumgen:compound:gene': 'drug_hum_gen',
    'activation': 'activate',
    'target': 'gene_target',
}


def parse_edge(e):
    """Split a relation string into a readable edge name and its data provider.

    The string is split on '::'. The first field is returned as the data
    provider. The second field is lower-cased, its spaces are replaced with
    underscores, and it is then translated through _EDGE_NAME_BY_CODE; codes
    without a mapping are returned in that normalised form.

    Args:
        e: relation string of the form '<provider>::<code>[::<anything>]'.

    Returns:
        dict with keys 'name' (edge name) and 'data_provider'.

    Raises:
        ValueError: if `e` contains no '::' separator.
    """
    edge_arr = e.split('::')
    if len(edge_arr) < 2:
        raise ValueError(
            f"malformed relation {e!r}: expected '<provider>::<code>[::...]'")
    edge_type = edge_arr[1].lower().replace(' ', '_')
    return {'name': _EDGE_NAME_BY_CODE.get(edge_type, edge_type),
            'data_provider': edge_arr[0]}


# Create Schema (takes around 70 seconds)

In [19]:
# Derive the GSQL schema from the triplets: one vertex type per node type and
# one directed edge type per (edge name, source type, target type) combination.
edge_types = {}  # '<src_type>::<tgt_type>' -> list of edge names, in first-seen order
node_types = {}  # node type -> True; the dict serves as an insertion-ordered set
edge_name_cache = {}  # raw relation string -> parsed edge name
for src, edge, tgt in triplets:
    # Only the type prefix (text before '::') matters for the schema.
    src_type = src.split('::')[0].replace(' ', '_')
    tgt_type = tgt.split('::')[0].replace(' ', '_')
    node_types[src_type] = True
    node_types[tgt_type] = True

    # parse_edge() depends only on the relation string, so parse each distinct
    # relation once instead of once per triplet.
    edge_name = edge_name_cache.get(edge)
    if edge_name is None:
        edge_name = edge_name_cache[edge] = parse_edge(edge)['name']

    # Record the edge name for this (source type, target type) pair.
    names_for_pair = edge_types.setdefault(f'{src_type}::{tgt_type}', [])
    if edge_name not in names_for_pair:
        names_for_pair.append(edge_name)


# Collect the statements in a list and join once rather than growing a string.
schema_lines = [
    f'CREATE VERTEX {node_type} (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"\n'
    for node_type in node_types
]
for endpoints, edge_names in edge_types.items():
    source_name, target_name = endpoints.split('::')
    schema_lines.extend(
        f'CREATE DIRECTED EDGE {edge_name}_{source_name}2{target_name} (FROM {source_name}, TO {target_name}, data_provider STRING)\n'
        for edge_name in edge_names
    )
schema = ''.join(schema_lines)
# print(schema)

CREATE VERTEX Gene (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Compound (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Disease (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Atc (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Tax (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Biological_Process (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Symptom (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Anatomy (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Molecular_Function (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Pharmacologic_Class (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Cellular_Component (PRIMARY_ID Id STRING) With primary_id_as_attribute="true"
CREATE VERTEX Pathway (PRIMARY_ID Id STRING) With primary_id_as_attribute=

# Connect to the TigerGraph instance (local by default, or TigerGraph Cloud)

In [20]:
import os

import pyTigerGraph as tg

# Connection settings are read from the environment so that credentials are
# not stored in the notebook. The defaults are the values this notebook used
# for a local TigerGraph instance.
# SECURITY: a real secret used to be hardcoded here; treat it as leaked and
# revoke it, then supply a new one through TG_SECRET.
host = os.environ.get('TG_HOST', 'http://localhost')  # e.g. 'https://derman.i.tgcloud.io' for the cloud instance
secret = os.environ.get('TG_SECRET', '')  # only used by the token-based connection below
graph_name = os.environ.get('TG_GRAPH', 'derman')
user_name = os.environ.get('TG_USER', 'tigergraph')
password = os.environ.get('TG_PASSWORD', 'tigergraph')
# A local instance needs no token. For token-based auth use these two lines instead:
# token = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password).getToken(secret, "1000000")[0]
# conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password, apiToken=token)
conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password)
# Connectivity check; the cell output shows the server's reply.
conn.echo()

'Hello GSQL'

# Execute GSQL to create the schema in TigerGraph

In [21]:
# Send the generated CREATE statements, prefixed with 'use global', and print
# whatever conn.gsql() returns.
create_schema_cmd = 'use global\n' + schema
print(conn.gsql(create_schema_cmd))
# To delete all the data, run this instead:
# print(conn.gsql('USE GLOBAL\n DROP ALL'))

Successfully created vertex types: [Gene].
Successfully created vertex types: [Compound].
Successfully created vertex types: [Disease].
Successfully created vertex types: [Atc].
Successfully created vertex types: [Tax].
Successfully created vertex types: [Biological_Process].
Successfully created vertex types: [Symptom].
Successfully created vertex types: [Anatomy].
Successfully created vertex types: [Molecular_Function].
Successfully created vertex types: [Pharmacologic_Class].
Successfully created vertex types: [Cellular_Component].
Successfully created vertex types: [Pathway].
Successfully created vertex types: [Side_Effect].
Successfully created edge types: [hum_gen_hum_gen_Gene2Gene].
Successfully created edge types: [vir_gen_hum_gen_Gene2Gene].
Successfully created edge types: [activate_Gene2Gene].
Successfully created edge types: [production_by_cell_population_Gene2Gene].
Successfully created edge types: [regulate_Gene2Gene].
Successfully created edge types: [binding_Gene2Gene].

# Prepare edge bulks for bulk insert

In [23]:
# Takes around 60 seconds.
# Group every edge for bulk insert with pyTigerGraph:
#   edge_bulks[src_type][edge_name][tgt_type] -> [(src_id, tgt_id, {"data_provider": ...}), ...]
edge_bulks = {}
parsed_edge_cache = {}  # raw relation string -> parse_edge() result
for src, edge, tgt in triplets:
    src_arr = src.split('::')
    src_type = src_arr[0].replace(' ', '_')
    src_id = str(src_arr[1])
    tgt_arr = tgt.split('::')
    tgt_type = tgt_arr[0].replace(' ', '_')
    tgt_id = str(tgt_arr[1])

    # use this to check if there is any empty id!
    # if len(src_id) < 1 or len(tgt_id) < 1:
    #     print((src, edge, tgt))

    # parse_edge() depends only on the relation string, so parse each distinct
    # relation once instead of once per triplet.
    edge_data = parsed_edge_cache.get(edge)
    if edge_data is None:
        edge_data = parsed_edge_cache[edge] = parse_edge(edge)
    edge_name = f'{edge_data["name"]}_{src_type}2{tgt_type}'

    # setdefault creates each nesting level the first time it is needed.
    bulk = (edge_bulks
            .setdefault(src_type, {})
            .setdefault(edge_name, {})
            .setdefault(tgt_type, []))
    # A fresh attribute dict per edge, exactly as before.
    bulk.append((src_id, tgt_id, {"data_provider": edge_data['data_provider']}))
    

# Insert the edge bulks chunk by chunk

- Takes around 4 MINUTES! Be patient. (In my local TigerGraph, it takes around 60 seconds!)
- Original data has 29 Genes without ids. I deleted them. See https://github.com/gnn4dr/DRKG/issues/32
- The fixed drkg.tsv is [here](https://drive.google.com/file/d/1Hc1mMEyh_4p6qHm4VLK2isb-E51himCA/view?usp=sharing)


In [25]:
# Prerequisite: the schema must already exist in the graph.
# Upload edge_bulks in slices of at most CHUNK_SIZE edges per upsertEdges call.
CHUNK_SIZE = 100000
prev_cnt_stop = 0  # after an error, set this to the last printed number to resume from that chunk
cnt = 0  # running chunk index across all (source type, edge, target type) groups
for source_label, per_edge in edge_bulks.items():
    for edge_label, per_target in per_edge.items():
        for target_label, bulk in per_target.items():
            for offset in range(0, len(bulk), CHUNK_SIZE):
                # Chunks before prev_cnt_stop are skipped but still counted,
                # so the numbering stays the same between runs.
                if cnt >= prev_cnt_stop:
                    print(cnt)
                    conn.upsertEdges(source_label, edge_label, target_label,
                                     bulk[offset:offset + CHUNK_SIZE])
                cnt += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145


In [26]:
# Load the DGIdb genes table (tab-separated); it is the input for updating
# the genes data.
import pandas as pd
dgidb_genes_path = 'data/dgidb/genes.tsv'
dgidb_genes = pd.read_csv(dgidb_genes_path, sep='\t')
dgidb_genes

Unnamed: 0,gene_claim_name,gene_name,entrez_id,gene_claim_source
0,ENSG00000135776,ABCB10,23456.0,Ensembl
1,ENSG00000259277,,,Ensembl
2,KHK,KHK,3795.0,Pharos
3,CX3CL1,CX3CL1,6376.0,CIViC
4,ENSG00000213150,,,Ensembl
...,...,...,...,...
99246,ENSG00000284575,MIR4793,100616112.0,Ensembl
99247,ENSG00000177173,NAP1L4P1,728589.0,Ensembl
99248,ENSG00000147180,ZNF711,7552.0,Ensembl
99249,ENSG00000114554,PLXNA1,5361.0,Ensembl
