In [2]:
#import libraries and create connection to database through driver
import os

import pandas as pd
from neo4j import GraphDatabase, basic_auth
#connection settings are read from the environment so credentials do not have to live in the notebook;
#the fallbacks are the values this notebook originally hardcoded, so a local setup keeps working unchanged
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

driver = GraphDatabase.driver(uri=NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
#a single session object; the later cells issue their queries through this global name
session = driver.session()

In [7]:
#uniqueness constraint on node_id for Gene nodes so the same id cannot be stored twice
gene_constraint = "CREATE CONSTRAINT ON (n:Gene) ASSERT n.node_id is unique"
session.run(gene_constraint)

<neo4j.work.result.Result at 0x7f8eb1f65e10>

In [8]:
#uniqueness constraint on node_id for Anatomy nodes
anatomy_constraint = "CREATE CONSTRAINT ON (n:Anatomy) ASSERT n.node_id is unique"
session.run(anatomy_constraint)

<neo4j.work.result.Result at 0x7f8eb5006630>

In [9]:
#uniqueness constraint on node_id for Disease nodes
disease_constraint = "CREATE CONSTRAINT ON (n:Disease) ASSERT n.node_id is unique"
session.run(disease_constraint)

<neo4j.work.result.Result at 0x7f8eb50067b8>

In [10]:
#uniqueness constraint on node_id for Compound nodes
compound_constraint = "CREATE CONSTRAINT ON (n:Compound) ASSERT n.node_id is unique"
session.run(compound_constraint)

<neo4j.work.result.Result at 0x7f8eb50891d0>

In [3]:
#both inputs are tab-separated text files; each one is parsed into its own dataframe
nodes = pd.read_csv("nodes.tsv", delimiter="\t")
edges = pd.read_csv("edges.tsv", delimiter="\t")

In [54]:
edges.shape  #(row count, column count) of the edges dataframe

(1292203, 4)

In [53]:
nodes.shape  #(row count, column count) of the nodes dataframe

(23036, 3)

In [6]:
#list the distinct values of the 'kind' column to see which node kinds exist
pd.unique(nodes['kind'])

array(['Anatomy', 'Compound', 'Disease', 'Gene'], dtype=object)

In [56]:
#shape of the subset of nodes whose kind is Gene
nodes.loc[nodes['kind'] == 'Gene'].shape

(20945, 3)

In [57]:
#shape of the subset of nodes whose kind is Anatomy
nodes.loc[nodes['kind'] == 'Anatomy'].shape

(402, 3)

In [58]:
#shape of the subset of nodes whose kind is Disease
nodes.loc[nodes['kind'] == 'Disease'].shape

(137, 3)

In [59]:
#shape of the subset of nodes whose kind is Compound
nodes.loc[nodes['kind'] == 'Compound'].shape

(1552, 3)

In [4]:
#one dataframe per node kind, selected with a boolean mask on the 'kind' column
gene_nodes = nodes.loc[nodes['kind'].eq("Gene")]
anatomy_nodes = nodes.loc[nodes['kind'].eq("Anatomy")]
disease_nodes = nodes.loc[nodes['kind'].eq("Disease")]
compound_nodes = nodes.loc[nodes['kind'].eq("Compound")]

In [12]:
#turn each per-kind dataframe into a plain list of rows (one inner list per node)
gene_node_list, anatomy_node_list, disease_node_list, compound_node_list = (
    frame.to_numpy().tolist()
    for frame in (gene_nodes, anatomy_nodes, disease_nodes, compound_nodes)
)

In [13]:
#gather the four per-kind row lists so they can be processed in one loop
total_list = [
    gene_node_list,
    anatomy_node_list,
    disease_node_list,
    compound_node_list,
]

In [21]:
#create one graph node for every row of every per-kind list
#each row is indexed as row[0] -> node_id, row[1] -> name, row[2] -> node label (kind)
for kind_rows in total_list:
    for row in kind_rows:
        #a label cannot be sent as a query parameter, so it is backtick-quoted
        #(embedded backticks doubled) and spliced into the statement text
        label = str(row[2]).replace("`", "``")
        #node_id and name are sent as parameters instead of being concatenated into the Cypher text:
        #a quotation mark inside a name used to break the statement, which can no longer happen
        session.run(
            "CREATE (n:`" + label + "` {node_id: $node_id, name: $name})",
            {"node_id": str(row[0]), "name": str(row[1])},
        )

In [43]:
#list the distinct metaedge codes; these are the inputs the relation lookup function below has to handle
pd.unique(edges['metaedge'])

array(['GiG', 'CrC', 'DdG', 'DlA', 'CtD', 'CbG', 'CuG', 'DrD', 'DaG',
       'CpD', 'AdG', 'AuG', 'GcG', 'Gr>G', 'CdG', 'DuG', 'AeG'],
      dtype=object)

In [44]:
#distinct relation names present in the edges dataframe
#NOTE(review): the 'relation' column is assigned by a cell further down (via extract_relation);
#presumably that cell has to run before this one on a fresh kernel - confirm it is not already a column of edges.tsv
edges['relation'].unique()

array(['INTERACTS', 'RESEMBLES', 'DOWNREGULATES', 'LOCALIZES', 'TREATS',
       'BINDS', 'UPREGULATES', 'ASSOCIATES', 'PALLIATES', 'COVARIES',
       'REGULATES', 'EXPRESSES'], dtype=object)

In [5]:
#lookup table from the relation code (second character of a metaedge) to the relation name
_RELATION_BY_CODE = {
    "i": "INTERACTS",
    "e": "EXPRESSES",
    "d": "DOWNREGULATES",
    "u": "UPREGULATES",
    "c": "COVARIES",
    "a": "ASSOCIATES",
    "b": "BINDS",
    "r": "RESEMBLES",
    "l": "LOCALIZES",
    "t": "TREATS",
    "p": "PALLIATES",
}


def extract_relation(metaedge):
    """Translate a metaedge abbreviation such as ``"GiG"`` into its relation name.

    The relation is encoded by the character at index 1 of the abbreviation
    (for example ``"i"`` -> ``"INTERACTS"``); the two-character code ``"r>"``
    (as in ``"Gr>G"``) means ``"REGULATES"``.

    Parameters
    ----------
    metaedge : str
        Metaedge abbreviation.

    Returns
    -------
    str
        The relation name, or ``""`` when the code is not recognised or the
        string is too short to contain a code (previously an ``IndexError``).
    """
    #"r>" has to be tested before the one-letter table, otherwise it would be read as "r" (RESEMBLES)
    if metaedge[1:3] == "r>":
        return "REGULATES"
    #slicing (instead of indexing) yields "" for strings shorter than two characters
    return _RELATION_BY_CODE.get(metaedge[1:2], "")

#store the relation name derived from each metaedge in its own column so later cells can filter on it
edges['relation'] = edges['metaedge'].map(extract_relation)

In [6]:
#split the edges dataframe by relation; each subset is kept as a dataframe and as a list of rows
edges_palliate = edges.loc[edges['relation'].eq('PALLIATES')]
edges_palliate_list = edges_palliate.to_numpy().tolist()

edges_interact = edges.loc[edges['relation'].eq('INTERACTS')]
edges_interact_list = edges_interact.to_numpy().tolist()

edges_treat = edges.loc[edges['relation'].eq('TREATS')]
edges_treat_list = edges_treat.to_numpy().tolist()

edges_localize = edges.loc[edges['relation'].eq('LOCALIZES')]
edges_localize_list = edges_localize.to_numpy().tolist()

edges_covary = edges.loc[edges['relation'].eq('COVARIES')]
edges_covary_list = edges_covary.to_numpy().tolist()

In [7]:
#same split for the next five relations: a dataframe subset plus its list-of-rows form
edges_downregulate = edges.loc[edges['relation'].eq('DOWNREGULATES')]
edges_downregulate_list = edges_downregulate.to_numpy().tolist()

edges_upregulate = edges.loc[edges['relation'].eq('UPREGULATES')]
edges_upregulate_list = edges_upregulate.to_numpy().tolist()

edges_regulate = edges.loc[edges['relation'].eq('REGULATES')]
edges_regulate_list = edges_regulate.to_numpy().tolist()

edges_resemble = edges.loc[edges['relation'].eq('RESEMBLES')]
edges_resemble_list = edges_resemble.to_numpy().tolist()

edges_bind = edges.loc[edges['relation'].eq('BINDS')]
edges_bind_list = edges_bind.to_numpy().tolist()

In [14]:
#ASSOCIATES subset of the edges, as a dataframe and as a list of rows
edges_associate = edges.loc[edges['relation'].eq('ASSOCIATES')]
edges_associate_list = edges_associate.to_numpy().tolist()

In [8]:
#helper used by the cells below to send a batch of Cypher statements
def execute_transactions(node_commands):
    """Run every Cypher statement in ``node_commands`` on the global ``session``, in order.

    ``node_commands`` is any iterable of query strings; nothing is returned.
    """
    for statement in node_commands:
        session.run(statement)

In [43]:
#create the PALLIATES relationships (Compound -> Disease)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
palliate_query = (
    'MATCH (a:Compound), (b:Disease) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:PALLIATES]->(b)'
)

for row in edges_palliate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(palliate_query, {"source": str(row[0]), "target": str(row[2])})


In [17]:
#create the INTERACTS relationships (Gene -> Gene)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
interact_query = (
    'MATCH (a:Gene), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:INTERACTS]->(b)'
)

for row in edges_interact_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(interact_query, {"source": str(row[0]), "target": str(row[2])})


In [41]:
#create the TREATS relationships (Compound -> Disease)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
treat_query = (
    'MATCH (a:Compound), (b:Disease) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:TREATS]->(b)'
)

for row in edges_treat_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(treat_query, {"source": str(row[0]), "target": str(row[2])})

In [42]:
#create the LOCALIZES relationships (Disease -> Anatomy)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
localize_query = (
    'MATCH (a:Disease), (b:Anatomy) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:LOCALIZES]->(b)'
)

for row in edges_localize_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(localize_query, {"source": str(row[0]), "target": str(row[2])})

In [13]:
#create the COVARIES relationships (Gene -> Gene)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
covary_query = (
    'MATCH (a:Gene), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:COVARIES]->(b)'
)

for row in edges_covary_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(covary_query, {"source": str(row[0]), "target": str(row[2])})

In [74]:
#create the DOWNREGULATES relationships whose start node is a Disease (Disease -> Gene)
#the whole DOWNREGULATES list is scanned; a row whose source id is not the node_id of a Disease node
#matches nothing and therefore creates no relationship
#node ids are sent as query parameters instead of being concatenated into the Cypher text
downregulate_query = (
    'MATCH (a:Disease), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:DOWNREGULATES]->(b)'
)

for row in edges_downregulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(downregulate_query, {"source": str(row[0]), "target": str(row[2])})

In [9]:
#create the DOWNREGULATES relationships whose start node is an Anatomy (Anatomy -> Gene)
#the whole DOWNREGULATES list is scanned; a row whose source id is not the node_id of an Anatomy node
#matches nothing and therefore creates no relationship
#node ids are sent as query parameters instead of being concatenated into the Cypher text
downregulate_query = (
    'MATCH (a:Anatomy), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:DOWNREGULATES]->(b)'
)

for row in edges_downregulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(downregulate_query, {"source": str(row[0]), "target": str(row[2])})

In [11]:
#create the DOWNREGULATES relationships whose start node is a Compound (Compound -> Gene)
#the whole DOWNREGULATES list is scanned; a row whose source id is not the node_id of a Compound node
#matches nothing and therefore creates no relationship
#node ids are sent as query parameters instead of being concatenated into the Cypher text
downregulate_query = (
    'MATCH (a:Compound), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:DOWNREGULATES]->(b)'
)

for row in edges_downregulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(downregulate_query, {"source": str(row[0]), "target": str(row[2])})

In [72]:
#create the UPREGULATES relationships whose start node is a Disease (Disease -> Gene)
#the whole UPREGULATES list is scanned; a row whose source id is not the node_id of a Disease node
#matches nothing and therefore creates no relationship
#node ids are sent as query parameters instead of being concatenated into the Cypher text
upregulate_query = (
    'MATCH (a:Disease), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:UPREGULATES]->(b)'
)

for row in edges_upregulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(upregulate_query, {"source": str(row[0]), "target": str(row[2])})

In [73]:
#create the UPREGULATES relationships whose start node is an Anatomy (Anatomy -> Gene)
#the whole UPREGULATES list is scanned; a row whose source id is not the node_id of an Anatomy node
#matches nothing and therefore creates no relationship
#node ids are sent as query parameters instead of being concatenated into the Cypher text
upregulate_query = (
    'MATCH (a:Anatomy), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:UPREGULATES]->(b)'
)

for row in edges_upregulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(upregulate_query, {"source": str(row[0]), "target": str(row[2])})

In [12]:
#create the UPREGULATES relationships whose start node is a Compound (Compound -> Gene)
#the whole UPREGULATES list is scanned; a row whose source id is not the node_id of a Compound node
#matches nothing and therefore creates no relationship
#node ids are sent as query parameters instead of being concatenated into the Cypher text
upregulate_query = (
    'MATCH (a:Compound), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:UPREGULATES]->(b)'
)

for row in edges_upregulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(upregulate_query, {"source": str(row[0]), "target": str(row[2])})

In [71]:
#create the RESEMBLES relationships between Compound nodes (Compound -> Compound)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
resemble_query = (
    'MATCH (a:Compound), (b:Compound) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:RESEMBLES]->(b)'
)

for row in edges_resemble_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(resemble_query, {"source": str(row[0]), "target": str(row[2])})

In [None]:
#create the REGULATES relationships (Gene -> Gene)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
regulate_query = (
    'MATCH (a:Gene), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:REGULATES]->(b)'
)

for row in edges_regulate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(regulate_query, {"source": str(row[0]), "target": str(row[2])})

In [70]:
#create the BINDS relationships (Compound -> Gene)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
bind_query = (
    'MATCH (a:Compound), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:BINDS]->(b)'
)

for row in edges_bind_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(bind_query, {"source": str(row[0]), "target": str(row[2])})

In [15]:
#create the ASSOCIATES relationships (Disease -> Gene)
#node ids are sent as query parameters instead of being concatenated into the Cypher text,
#so one statement is reused for every row and characters inside an id cannot alter the query
associate_query = (
    'MATCH (a:Disease), (b:Gene) '
    'WHERE a.node_id = $source AND b.node_id = $target '
    'CREATE (a)-[r:ASSOCIATES]->(b)'
)

for row in edges_associate_list:
    #row[0] is the id of the start node, row[2] the id of the end node
    session.run(associate_query, {"source": str(row[0]), "target": str(row[2])})

In [None]:
#query for #1, to be run directly in Neo4j (id: Disease::DOID:1781)
// d is bound by the first pattern, so the second pattern refers to it by variable only;
// the backslash line continuation was dropped because it is not part of Cypher syntax
match (c:Compound)-[relatedTo]->(d:Disease {name: "thyroid cancer"}),
      (d)-[r:LOCALIZES]->(a:Anatomy)
return c.name, type(relatedTo), d.name, type(r), a.name

In [38]:
#Cypher text for question #1, assembled from adjacent string pieces so it can be sent from Python
query_for_q1 = (
    'match (c:Compound)-[relatedTo]->(d:Disease {name: "thyroid cancer"}),'
    '(d:Disease {name: "thyroid cancer"})-[r:LOCALIZES]->(a:Anatomy) '
    'return c.name, type(relatedTo), d.name ,type(r), a.name'
)

In [40]:
query_for_q1  #display the query string exactly as it will be sent

'match (c:Compound)-[relatedTo]->(d:Disease {name: "thyroid cancer"}),(d:Disease {name: "thyroid cancer"})-[r:LOCALIZES]->(a:Anatomy) return c.name, type(relatedTo), d.name ,type(r), a.name'

In [37]:
#run the question #1 query and collect every returned record into a list
#(60 records came back for thyroid cancer, id: Disease::DOID:1781)
result = [record for record in session.run(query_for_q1)]
result

[<Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='anterior vena cava'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='saliva-secreting gland'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='neck'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='hyoid bone'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='lymph node'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='subclavian vein'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='parathyroid gland'>,
 <Record c.name='Sorafenib' type(relatedTo)='TREATS' d.name='thyroid cancer' type(r)='LOCALIZES' a.name='ultimobranchial body'>,
 <Record c.n

In [46]:
disease_nodes.tail(10)  #last 10 rows of the Disease subset

Unnamed: 0,id,name,kind
2081,Disease::DOID:9008,psoriatic arthritis,Disease
2082,Disease::DOID:9074,systemic lupus erythematosus,Disease
2083,Disease::DOID:9206,Barrett's esophagus,Disease
2084,Disease::DOID:9296,cleft lip,Disease
2085,Disease::DOID:9352,type 2 diabetes mellitus,Disease
2086,Disease::DOID:9744,type 1 diabetes mellitus,Disease
2087,Disease::DOID:9835,refractive error,Disease
2088,Disease::DOID:986,alopecia areata,Disease
2089,Disease::DOID:9917,pleural cancer,Disease
2090,Disease::DOID:9970,obesity,Disease


In [48]:
edges.shape  #(row count, column count) of the edges dataframe

(1292203, 4)

In [49]:
compound_nodes.head(10)  #first 10 rows of the Compound subset

Unnamed: 0,id,name,kind
402,Compound::DB00014,Goserelin,Compound
403,Compound::DB00035,Desmopressin,Compound
404,Compound::DB00050,Cetrorelix,Compound
405,Compound::DB00091,Cyclosporine,Compound
406,Compound::DB00093,Felypressin,Compound
407,Compound::DB00104,Octreotide,Compound
408,Compound::DB00115,Cyanocobalamin,Compound
409,Compound::DB00116,Tetrahydrofolic acid,Compound
410,Compound::DB00117,L-Histidine,Compound
411,Compound::DB00118,S-Adenosylmethionine,Compound


In [None]:
#attempted query for #2
MATCH (c1:Compound {name: "Sorafenib"}),
      (c2:Compound),
      (d:Disease {name:"thyroid cancer"}),
      (g:Gene)
WHERE (c1)-[:RESEMBLES]-(c2)
  AND ((c1)-[:DOWNREGULATES]->(g) OR (c2)-[:UPREGULATES]->(g))
RETURN (c2.name)

In [None]:
// [:TREATS] needs the leading colon: written as [TREATS] it is a relationship variable
// named TREATS and matches a relationship of any type between c1 and d
match (c1:Compound {name: "Sorafenib"})-[:RESEMBLES]-(c2:Compound),
      (c2)-[relatedTo]->(g:Gene),
      (c1)-[:TREATS]-(d:Disease {name:"thyroid cancer"})
return c2