The full data set is inside the data.7z archive.

In [1]:
# Load the DRKG triplets (source, relation, target) from the TSV file.
# Takes around 9 seconds.
import pandas as pd

# NOTE(review): read_csv treats the first row as a header by default; if the TSV
# has no header line, that first triplet ends up as column names -- confirm.
drkg_df = pd.read_csv('data/drkg/drkg_no_hetionet_dgidb.tsv', delimiter='\t')
# One [source, relation, target] list per data row.
triplets = drkg_df.to_numpy().tolist()

In [2]:
# Lower-cased DRKG relation codes -> readable names used inside edge type identifiers.
# Built once here instead of on every parse_edge() call (it is called per triplet).
# NOTE: 'diognostic_biomarker' is misspelled, but it becomes part of an edge type
# name in the generated schema, so it is intentionally left unchanged.
_EDGE_CODE2NAME = {
    'ddi-interactor-in': 'ddi_interactor',
    'x-atc': 'belongs2ATC',
    'a+': 'agonism_activation',
    'a-': 'antagonism_blocking',
    'b': 'binding',
    'c': 'inhibit_cell_growth',
    'd': 'drug_targets',
    'e+': 'increase_expression_production',
    'e-': 'decrease_expression_production',
    'e': 'affect_expression_production',
    'g': 'promote_progression',
    'h': 'same_protein_or_complex',
    'i': 'signaling_pathway',
    'j': 'role_in_pathogenesis',
    'k': 'metabolism_pharmacokinetics',
    'l': 'improper_regulation',
    'md': 'diognostic_biomarker',
    'mp': 'disease_progress_biomarker',
    'n': 'inhibit',
    'o': 'transport_channel',
    'pa': 'alleviate_reduce',
    'pr': 'prevent_suppress',
    'q': 'production_by_cell_population',
    'rg': 'regulate',
    'sa': 'sideEffect_adverseEvent',
    't': 'treatment',
    'te': 'possible_therapeutic_effect',
    'u': 'causal_mutation',
    'ud': 'mutations_affecting_disease_course',
    'v+': 'activate',
    'w': 'enhance_response',
    'x': 'overexpression',
    'y': 'polymorphisms_alter_risk',
    'z': 'enzyme_activity',
    'ptmod': 'post_translational_modification',
    'humgenhumgen:gene:gene': 'hum_gen_hum_gen',
    'virgenhumgen:gene:gene': 'vir_gen_hum_gen',
    'drugvirgen:compound:gene': 'drug_vir_gen',
    'drughumgen:compound:gene': 'drug_hum_gen',
    'activation': 'activate',
    'target': 'gene_target',
}


def parse_edge(e, src_type, tgt_type):
    """Build an edge type identifier from a '::'-separated relation string.

    Args:
        e: relation string; the part before the first '::' is used as a prefix
            and the second part as the relation code.
        src_type: source node type, embedded in the returned name.
        tgt_type: target node type, embedded in the returned name.

    Returns:
        '<prefix>_<relation>_<src_type>2<tgt_type>', where <relation> is the
        readable name of the lower-cased code (spaces replaced by '_') when one
        is known, and the normalised code itself otherwise.

    Raises:
        IndexError: if `e` contains no '::' separator.
    """
    edge_arr = e.split('::')
    edge_type = edge_arr[1].lower().replace(' ', '_')
    # Fall back to the normalised code when there is no readable name for it.
    relation = _EDGE_CODE2NAME.get(edge_type, edge_type)
    return f'{edge_arr[0]}_{relation}_{src_type}2{tgt_type}'

In [3]:
# src_type -> edge_type -> tgt_type -> list of (src_id, tgt_id, attributes) tuples
edge_bulks = {}
# node_type -> list of (id, attributes) tuples
node_bulks = {}


def insert2edge_bulks(src_type, edge_type, tgt_type, src_id, tgt_id, attr):
    """Append one (src_id, tgt_id, attr) tuple to edge_bulks.

    The nested dictionaries for src_type / edge_type / tgt_type are created on
    first use.
    """
    per_edge_type = edge_bulks.setdefault(src_type, {})
    per_target_type = per_edge_type.setdefault(edge_type, {})
    per_target_type.setdefault(tgt_type, []).append((src_id, tgt_id, attr))


def insert2node_bulks(node_type, id, attr):
    """Append one (id, attr) tuple to the node_bulks list of `node_type`."""
    node_bulks.setdefault(node_type, []).append((id, attr))
    

# Process Schema info from DRKG (takes around 45 seconds)

In [4]:
# 'SrcType::TgtType' -> edge type name -> {attribute name: TigerGraph type}
edge_types = {}
# node type name -> {attribute name: TigerGraph type}
node_types = {}
for src, edge, tgt in triplets:
    # Entities look like '<type>::<id>'; spaces in the type become underscores.
    src_parts = src.split('::')
    tgt_parts = tgt.split('::')
    src_type, src_id = src_parts[0].replace(' ', '_'), str(src_parts[1])
    tgt_type, tgt_id = tgt_parts[0].replace(' ', '_'), str(tgt_parts[1])

    # Register both endpoint types; no attributes are recorded for them here.
    node_types.setdefault(src_type, {})
    node_types.setdefault(tgt_type, {})

    # Register the edge type under its (source type, target type) pair.
    type_edge = f'{src_type}::{tgt_type}'
    pair_schemas = edge_types.setdefault(type_edge, {})
    edge = parse_edge(edge, src_type, tgt_type)
    pair_schemas.setdefault(edge, {})

    # insert to edge bulks (no need to insert node bulks since node will be created automatically)
    insert2edge_bulks(src_type, edge, tgt_type, src_id, tgt_id, {})

In [5]:
import numbers


def py2TigerType(o):
    """Map a Python value to a TigerGraph attribute type name.

    Args:
        o: any value.

    Returns:
        'BOOL' for booleans, 'INT' for integers, 'FLOAT' for other real
        numbers and 'STRING' for everything else (str, list, dict, None, ...).

    The check is made on the value itself rather than on the text of its type
    name, so an unrelated type whose name merely contains "int", "float" or
    "bool" (for example a class called ``Point``) is reported as 'STRING'.
    """
    # bool is a subclass of int, so it has to be tested first. The name check
    # additionally accepts NumPy booleans without importing NumPy here.
    if isinstance(o, bool) or type(o).__name__ in ('bool', 'bool_'):
        return 'BOOL'
    if isinstance(o, numbers.Integral):
        return 'INT'
    if isinstance(o, numbers.Real):
        return 'FLOAT'
    return 'STRING'

In [6]:
# parse hetionet data to create schema
# takes around 21 seconds
import json

# The context manager closes the file even if parsing fails; the encoding is
# given explicitly so the result does not depend on the platform default.
with open('data/hetionet/hetionet-v1.0.json/hetionet-v1.0.json', encoding='utf-8') as hetionet_file:
    hetionet = json.load(hetionet_file)

In [7]:
# parse hetionet nodes
# For every node: record its attribute names/types in node_types and queue the
# node (identifier + attribute values) in node_bulks.
for node in hetionet['nodes']:
    kind = node['kind'].replace(' ', '_')
    kind_schema = node_types.setdefault(kind, {})

    attr = {}
    for prop, value in node.items():
        if prop in ('kind', 'identifier'):
            continue
        # The nested 'data' dict is flattened into individual attributes; any
        # other property is an attribute itself. An exact comparison is used so
        # that a property whose name merely contains "data" is not mistaken
        # for the nested dict.
        fields = value if prop == 'data' else {prop: value}
        for field_name, field_value in fields.items():
            kind_schema[field_name] = py2TigerType(field_value)
            if isinstance(field_value, list):
                # Lists are stored as one comma-separated string; str() keeps
                # the join from failing on non-string items.
                attr[field_name] = ','.join(map(str, field_value))
            else:
                attr[field_name] = field_value
    insert2node_bulks(kind, node['identifier'], attr)

In [8]:
# parse hetionet edges
# takes around 58 seconds
# For every edge: record its attribute names/types in edge_types and queue the
# edge (endpoint ids + attribute values) in edge_bulks.
for edge in hetionet['edges']:
    # source_id / target_id are indexed as [node type, node id].
    src_type = edge['source_id'][0].replace(' ', '_')
    tgt_type = edge['target_id'][0].replace(' ', '_')
    kind = 'HETIONET_' + edge['kind'] + f'_{src_type}2{tgt_type}'
    type_edge = f'{src_type}::{tgt_type}'
    kind_schema = edge_types.setdefault(type_edge, {}).setdefault(kind, {})

    attr = {}
    for prop, value in edge.items():
        if prop in {'kind', 'identifier', 'source_id', 'target_id'}:
            continue
        # The nested 'data' dict is flattened into individual attributes; any
        # other property is an attribute itself. An exact comparison is used so
        # that a property whose name merely contains "data" is not mistaken
        # for the nested dict.
        fields = value if prop == 'data' else {prop: value}
        for field_name, field_value in fields.items():
            kind_schema[field_name] = py2TigerType(field_value)
            if isinstance(field_value, list):
                # Lists are stored as one comma-separated string; str() keeps
                # the join from failing on non-string items.
                attr[field_name] = ','.join(map(str, field_value))
            else:
                attr[field_name] = field_value

    insert2edge_bulks(src_type, kind, tgt_type, edge['source_id'][1], edge['target_id'][1], attr)

In [9]:
# Add node types from DGIDB manually, takes around 13 seconds
# setdefault keeps this cell from failing when 'Gene' / 'Compound' have not
# been registered by an earlier cell.
node_types.setdefault('Gene', {})['claim_name'] = 'STRING'
compound_schema = node_types.setdefault('Compound', {})
compound_schema['drug_claim_name'] = 'STRING'
compound_schema['drug_claim_primary_name'] = 'STRING'
compound_schema['drug_name'] = 'STRING'

dgidb_df = pd.read_csv('data/dgidb/interactions.tsv', sep='\t')

# Attributes carried by every DGIDB interaction edge type.
DGIDB_EDGE_SCHEMA = {
    'interaction_claim_source': 'STRING',
    'interaction_types': 'STRING',
    'interaction_group_score': 'FLOAT',
    'PMIDs': 'STRING',
}

# add edge types and properties
# (this used to be stored in a notebook-level variable called `dir`, which hid
# the builtin dir() in every later cell)
compound_gene_schemas = edge_types.setdefault('Compound::Gene', {})

# Only rows that have a gene id, a drug id and an interaction type are usable.
dgidb_df = dgidb_df[dgidb_df['entrez_id'].notnull()
                    & dgidb_df['drug_concept_id'].notnull()
                    & dgidb_df['interaction_types'].notnull()]
for _, row in dgidb_df.iterrows():
    edge_name = 'DGIDB_' + row['interaction_types'].replace(' ', '_').replace(
        ',', '_') + '_Compound2Gene'
    compound_gene_schemas.setdefault(edge_name, {}).update(DGIDB_EDGE_SCHEMA)

    # NOTE(review): this also turns a missing interaction_group_score into '',
    # although the schema declares it FLOAT -- confirm the loader accepts that.
    row = row.fillna('')
    edge = {
        'interaction_claim_source': row['interaction_claim_source'],
        'interaction_types': row['interaction_types'],
        'interaction_group_score': row['interaction_group_score'],
        'PMIDs': row['PMIDs']
    }
    # The compound id is the part after the first ':' of drug_concept_id.
    comp_id = row['drug_concept_id'].split(':')[1]
    gene_id = str(int(row['entrez_id']))
    insert2edge_bulks('Compound', edge_name, 'Gene', comp_id, gene_id, edge)

    comp_attr = {
        'drug_claim_name': row['drug_claim_name'],
        'drug_claim_primary_name': row['drug_claim_primary_name'],
        'drug_name': row['drug_name']
    }
    gene_attr = {'name': row['gene_name'], 'claim_name': row['gene_claim_name']}
    insert2node_bulks('Compound', comp_id, comp_attr)
    insert2node_bulks('Gene', gene_id, gene_attr)

# np.sum(dgidb_df['entrez_id'].notnull() & dgidb_df['drug_concept_id'].notnull())

In [11]:
# Serialise the collected node schema to a JSON string; being the last
# expression of the cell, the string is shown as the cell output.
json.dumps(node_types)

'{"Gene": {"name": "STRING", "description": "STRING", "source": "STRING", "license": "STRING", "url": "STRING", "chromosome": "STRING", "claim_name": "STRING"}, "Compound": {"name": "STRING", "license": "STRING", "source": "STRING", "inchikey": "STRING", "inchi": "STRING", "url": "STRING", "drug_claim_name": "STRING", "drug_claim_primary_name": "STRING", "drug_name": "STRING"}, "Disease": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Atc": {}, "Tax": {}, "Molecular_Function": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Side_Effect": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Biological_Process": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING"}, "Pathway": {"name": "STRING", "license": "STRING", "source": "STRING", "url": "STRING"}, "Anatomy": {"name": "STRING", "source": "STRING", "license": "STRING", "url": "STRING", "mesh_id": "STRING", "bto_id": "STRING

In [10]:
def attr_dict2schema(d):
    """Render an attribute dict as a schema fragment.

    Each item becomes ', <key> <value>', so the result can be appended directly
    after the preceding column of a CREATE statement. An empty dict yields ''.
    """
    if len(d) == 0:
        return ''
    return ''.join(f', {attr_name} {attr_type}' for attr_name, attr_type in d.items())
# create schema
# One CREATE VERTEX statement per node type, followed by one
# CREATE DIRECTED EDGE statement per edge type; each statement ends with '\n'.
statements = []
for node_type, node_attrs in node_types.items():
    statements.append(
        f'CREATE VERTEX {node_type} (PRIMARY_ID Id STRING{attr_dict2schema(node_attrs)}) With primary_id_as_attribute="true"\n')
for endpoints, edges_between in edge_types.items():
    # Keys look like '<source type>::<target type>'.
    source_name, target_name = endpoints.split('::')
    for edge_name, edge_attrs in edges_between.items():
        statements.append(
            f'CREATE DIRECTED EDGE {edge_name} (FROM {source_name}, TO {target_name}{attr_dict2schema(edge_attrs)})\n')
schema = ''.join(statements)
# print(schema)

# Connect to TigerGraph Cloud instance

In [16]:
import os
from getpass import getpass

import pyTigerGraph as tg

# Connection settings are read from environment variables so that credentials
# are not stored in the notebook. Non-secret settings keep their previous
# values as defaults; the secret and the password are prompted for when the
# corresponding variable is not set.
host = os.environ.get('TG_HOST', 'https://derman.i.tgcloud.io')
# host = 'http://localhost'
graph_name = os.environ.get('TG_GRAPHNAME', 'derman')
user_name = os.environ.get('TG_USERNAME', 'tigergraph')
secret = os.environ.get('TG_SECRET') or getpass('TigerGraph secret: ')
password = os.environ.get('TG_PASSWORD') or getpass('TigerGraph password: ')

# A first connection is used only to obtain an API token from the secret;
# the working connection is then created with that token.
# no need auth on local instance
token = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password).getToken(secret, "1000000")[0]
conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password, apiToken=token)
# conn = tg.TigerGraphConnection(host=host, graphname=graph_name, username=user_name, password=password)
conn.echo()

'Hello GSQL'

# Execute gsql to create schema in tgcloud

In [13]:
# Send 'use global' followed by the generated schema statements to the server
# and print its response.
print(conn.gsql('use global\n' + schema))
# print(conn.gsql('USE GLOBAL\n DROP ALL')) # delete all the data

Successfully created vertex types: [Gene].
Successfully created vertex types: [Compound].
Successfully created vertex types: [Disease].
Successfully created vertex types: [Atc].
Successfully created vertex types: [Tax].
Successfully created vertex types: [Molecular_Function].
Successfully created vertex types: [Side_Effect].
Successfully created vertex types: [Biological_Process].
Successfully created vertex types: [Pathway].
Successfully created vertex types: [Anatomy].
Successfully created vertex types: [Cellular_Component].
Successfully created vertex types: [Symptom].
Successfully created vertex types: [Pharmacologic_Class].
Successfully created edge types: [bioarx_hum_gen_hum_gen_Gene2Gene].
Successfully created edge types: [bioarx_vir_gen_hum_gen_Gene2Gene].
Successfully created edge types: [GNBR_activate_Gene2Gene].
Successfully created edge types: [GNBR_production_by_cell_population_Gene2Gene].
Successfully created edge types: [GNBR_regulate_Gene2Gene].
Successfully created edg

# Insert/Update the edge bulks chunk by chunk

- Takes around 16 minutes, so be patient. (On my local TigerGraph instance it takes around 60 seconds.)
- The original data has 29 Gene entries without IDs; I deleted them. See https://github.com/gnn4dr/DRKG/issues/32
- The fixed drkg.tsv is [here](https://drive.google.com/file/d/1Hc1mMEyh_4p6qHm4VLK2isb-E51himCA/view?usp=sharing)


In [17]:
# Firstly, CREATE SCHEMA in your graph.
# Upload everything collected in edge_bulks, CHUNK_SIZE edges per request.
CHUNK_SIZE = 100000
prev_cnt_stop = 0 # if somehow an error occurs, set this variable to not start from zero 
cnt = 0  # running index of the current chunk across all edge types
for src_type, edges_by_name in edge_bulks.items():
    for edge, edges_by_target in edges_by_name.items():
        for tgt_type, edges in edges_by_target.items():
            for i in range(0, len(edges), CHUNK_SIZE):
                # Chunks numbered below prev_cnt_stop are skipped.
                if cnt >= prev_cnt_stop:
                    print(cnt)
                    chunk = edges[i:i + CHUNK_SIZE]
                    conn.upsertEdges(src_type, edge, tgt_type, chunk)
                cnt += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158


In [18]:
# Upload everything collected in node_bulks, CHUNK_SIZE nodes per request.
prev_cnt_stop = 0 # if somehow an error occurs, set this variable to not start from zero 
cnt = 0  # running index of the current chunk across all node types
for node_type, nodes in node_bulks.items():
    for i in range(0, len(nodes), CHUNK_SIZE):
        # Chunks numbered below prev_cnt_stop are skipped.
        if cnt >= prev_cnt_stop:
            print(cnt)
            chunk = nodes[i:i + CHUNK_SIZE]
            conn.upsertVertices(node_type, chunk)
        cnt += 1

0
1
2
3
4
5
6
7
8
9
10
