## **A notebook to process knowledge graphs and import them into graph DB**

### **1. Handling PrimeKG**
### data source: https://github.com/mims-harvard/PrimeKG

**1.1 Check csv KG using pandas**

In [1]:
import pandas as pd

df = pd.read_csv("/Users/zongchang/Desktop/科研/DT-MLM/PrimeKG/kg.csv", low_memory=False)
df.head() 

Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


In [2]:
df.shape

(8100498, 12)

In [126]:
# a minimum subgraph for test, filtered by particular disease
df_mini = df.loc[(df.x_name == 'hepatocellular carcinoma') | (df.y_name == 'hepatocellular carcinoma') | (df.x_name == 'diffuse large B-cell lymphoma') | (df.y_name == 'diffuse large B-cell lymphoma') | (df.x_name == 'RPS6KB1') | (df.y_name == 'RPS6KB1') ]

**1.2 Create nodes, relationships, and graph using DataFrame**

In [3]:
# map original labels to neo4j labels
node_label_dict = {'gene/protein': 'GeneProtein', 'anatomy': 'Anatomy', 'biological_process': 'BiologicalProcess', 'cellular_component': 'CellularComponent', 'disease': 'Disease', 'drug': 'Drug', 'effect/phenotype': 'EffectPhenotype', 'exposure': 'Exposure', 'molecular_function': 'MolecularFunction', 'pathway':'Pathway'}
link_lable_dict = {'associated with': 'AssociatedWith', 'carrier': 'Carrier', 'contraindication':'Contraindication', 'enzyme': 'Enzyme', 'expression absent': 'ExpressionAbsent', 'expression present': 'ExpressionPresent', 'indication': 'Indication', 'interacts with': 'InteractsWith', 'linked to': 'LinkedTo', 'off-label use': 'OffLabelUse', 'parent-child': 'ParentChild', 'phenotype absent': 'PhenotypeAbsent', 'phenotype present': 'PhenotypePresent', 'ppi': 'ProteinProtein', 'side effect': 'SideEffect', 'synergistic interaction': 'SynergisticInteraction', 'target': 'Target', 'transporter': 'Transporter'}

In [4]:
from py2neo import Graph, Node, Relationship, Subgraph

In [5]:
# a = Node("Gene", id='9796', name='PHYHIP', source='NCBI')
# b = Node("Gene", id='56992', name='KIF15', source='NCBI')
# protein_protein = Relationship.type("ppi")
# ab = protein_protein(a, b)
node_dict = {}
link_dict = {}
count = 0
for index, row in df.iterrows():
    x = Node(node_label_dict[row["x_type"]], id=row["x_id"], name=row["x_name"], source=row["x_source"])
    y = Node(node_label_dict[row["y_type"]], id=row["y_id"], name=row["y_name"], source=row["y_source"])
    if row["x_id"] not in node_dict:
        node_dict[row["x_id"]] = x
    if row["y_id"] not in node_dict:
        node_dict[row["y_id"]] = y
    if (row["x_id"] + "_" + row["y_id"] not in link_dict) and (row["y_id"] + "_" + row["x_id"] not in link_dict):
        r = Relationship.type(link_lable_dict[row["display_relation"]])
        triple = r(node_dict[row["x_id"]], node_dict[row["y_id"]], name=row["relation"])
        link_dict[row["x_id"] + "_" + row["y_id"]] = triple
    count += 1
    if count % 100000 == 0:
        print("Processed: " + str(count) + " triples.")

Processed: 100000 triples.
Processed: 200000 triples.
Processed: 300000 triples.
Processed: 400000 triples.
Processed: 500000 triples.
Processed: 600000 triples.
Processed: 700000 triples.
Processed: 800000 triples.
Processed: 900000 triples.
Processed: 1000000 triples.
Processed: 1100000 triples.
Processed: 1200000 triples.
Processed: 1300000 triples.
Processed: 1400000 triples.
Processed: 1500000 triples.
Processed: 1600000 triples.
Processed: 1700000 triples.
Processed: 1800000 triples.
Processed: 1900000 triples.
Processed: 2000000 triples.
Processed: 2100000 triples.
Processed: 2200000 triples.
Processed: 2300000 triples.
Processed: 2400000 triples.
Processed: 2500000 triples.
Processed: 2600000 triples.
Processed: 2700000 triples.
Processed: 2800000 triples.
Processed: 2900000 triples.
Processed: 3000000 triples.
Processed: 3100000 triples.
Processed: 3200000 triples.
Processed: 3300000 triples.
Processed: 3400000 triples.
Processed: 3500000 triples.
Processed: 3600000 triples.
P

In [6]:
print(len(node_dict.keys()))
print(len(link_dict.keys()))

90067
4037851


**1.3 Add nodes and relationships to graph in batches**

In [7]:
import pdb
# may change to update graph with smaller batches to avoid waiting too long
graph = Graph('bolt://localhost:7687', auth=('neo4j', 'zongc0725'))
node_list = [v for k,v in node_dict.items()]
link_list = [v for k,v in link_dict.items()]

In [8]:
# add nodes in batch 
count = 0
sub_node_list = []
sub_link_list = []
for item in node_list:
    sub_node_list.append(item)
    count += 1
    if count % 10000 == 0:
        subgraph = Subgraph(sub_node_list)
        tx = graph.begin() 
        tx.create(subgraph)
        graph.commit(tx)
        sub_node_list = []
        print("Added " + str(count) + " nodes")
subgraph = Subgraph(sub_node_list, sub_link_list)
tx = graph.begin() 
tx.create(subgraph)
graph.commit(tx)

Added 10000 nodes
Added 20000 nodes
Added 30000 nodes
Added 40000 nodes
Added 50000 nodes
Added 60000 nodes
Added 70000 nodes
Added 80000 nodes
Added 90000 nodes


In [9]:
len(graph.nodes)

90067

In [10]:
# add relationships in batch 
count = 0
sub_link_list = []
for item in link_list:
    sub_link_list.append(item)
    count += 1
    if count % 10000 == 0:
        subgraph = Subgraph(relationships = sub_link_list)
        tx = graph.begin() 
        tx.create(subgraph)
        graph.commit(tx)
        sub_link_list = []
        print("Added " + str(count) + " links")
subgraph = Subgraph(relationships = sub_link_list)
tx = graph.begin() 
tx.create(subgraph)
graph.commit(tx)

Added 10000 links
Added 20000 links
Added 30000 links
Added 40000 links
Added 50000 links
Added 60000 links
Added 70000 links
Added 80000 links
Added 90000 links
Added 100000 links
Added 110000 links
Added 120000 links
Added 130000 links
Added 140000 links
Added 150000 links
Added 160000 links
Added 170000 links
Added 180000 links
Added 190000 links
Added 200000 links
Added 210000 links
Added 220000 links
Added 230000 links
Added 240000 links
Added 250000 links
Added 260000 links
Added 270000 links
Added 280000 links
Added 290000 links
Added 300000 links
Added 310000 links
Added 320000 links
Added 330000 links
Added 340000 links
Added 350000 links
Added 360000 links
Added 370000 links
Added 380000 links
Added 390000 links
Added 400000 links
Added 410000 links
Added 420000 links
Added 430000 links
Added 440000 links
Added 450000 links
Added 460000 links
Added 470000 links
Added 480000 links
Added 490000 links
Added 500000 links
Added 510000 links
Added 520000 links
Added 530000 links
Ad

In [11]:
len(graph.relationships)

4037851

**1.4 Test: given a disease name, show all its related genes**

In [12]:
# cypher = "MATCH (n:Disease)-[r1:AssociatedWith]->(m:GeneProtein)-[r2:ProteinProtein]->(o:GeneProtein) WHERE n.name='hepatocellular carcinoma' RETURN n, r1, m, r2, o LIMIT 20"
# cypher = "MATCH (n:Disease)-[r1:AssociatedWith]->(m:GeneProtein)-[r2:AssociatedWith]->(o:Disease)-[r3*..3]->(p:GeneProtein) WHERE n.name='hepatocellular carcinoma' RETURN n, r1, m, r2, o, r3, p LIMIT 10"
# cypher = "MATCH (n:Disease)-[r1:AssociatedWith]->(m:GeneProtein)-[r2*..3]->(o:GeneProtein)-[r3]->(p:Drug) WHERE n.name='hepatocellular carcinoma' RETURN n, r1, m, r2, o, r3, p LIMIT 10"
cypher = "match p=(n:Disease)-[r *10]-() where toLower(n.name)=toLower('hepatocellular carcinoma') return p limit 20"
res = graph.run(cypher).data()

In [13]:
res[0]['p'].relationships[0].start_node.labels

:Exposure