In [1]:
# https://python.langchain.com/v0.1/docs/use_cases/graph/constructing/

In [1]:
%%bash
pip install --upgrade --quiet langchain langchain-neo4j langchain-openai langgraph

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
%%bash
export JAVA_HOME=./java/jdk-21.0.5
./opt/neo4j-community-5.26.0/bin/neo4j-admin dbms set-initial-password password
./opt/neo4j-community-5.26.0/bin/neo4j start

Changed password for user 'neo4j'. IMPORTANT: this change will only take effect if performed before the database is started for the first time.
Directories in use:
home:         /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0
config:       /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/conf
logs:         /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/logs
plugins:      /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/plugins
import:       /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/import
data:         /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/data
certificates: /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/certificates
licenses:     /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-community-5.26.0/licenses
run:          /datasets/soc-20241121132750/Hackathon/GraphRAG/opt/neo4j-

In [14]:
# Import modules.
from neo4j import GraphDatabase
import pandas as pd
import json
import time

In [5]:
# Connect to Neo4j DB.
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

In [6]:
# Load JSON data.
with open('./drug_interaction_graph.json') as f:
    data = json.load(f)

with open('./filtered_drug_list.csv') as f:
    filtered = pd.read_csv(f, header=None)

In [7]:
temp = {}

for i in filtered[0]:
    temp[i] = True

for i in ["Pravastatin", "Lovastatin", "Simvastatin", "Atorvastatin", "Fluvastatin", "Rosuvastatin", "Pitavastatin"]:
    temp[i] = True

filtered = temp

In [8]:
for i in data["nodes"][:10]:
    print(i)

{'label': 'Lepirudin', 'id': 'Lepirudin'}
{'label': 'Apixaban', 'id': 'Apixaban'}
{'label': 'Dabigatran etexilate', 'id': 'Dabigatran etexilate'}
{'label': 'Dasatinib', 'id': 'Dasatinib'}
{'label': 'Deferasirox', 'id': 'Deferasirox'}
{'label': 'Ursodeoxycholic acid', 'id': 'Ursodeoxycholic acid'}
{'label': 'Glycochenodeoxycholic Acid', 'id': 'Glycochenodeoxycholic Acid'}
{'label': 'Cholic Acid', 'id': 'Cholic Acid'}
{'label': 'Glycocholic acid', 'id': 'Glycocholic acid'}
{'label': 'Deoxycholic acid', 'id': 'Deoxycholic acid'}


In [9]:
for i in data["links"][:10]:
    print(i)

{'description': 'Apixaban may increase the anticoagulant activities of Lepirudin.', 'id': '0', 'source': 'Lepirudin', 'target': 'Apixaban'}
{'description': 'Dabigatran etexilate may increase the anticoagulant activities of Lepirudin.', 'id': '1', 'source': 'Lepirudin', 'target': 'Dabigatran etexilate'}
{'description': 'The risk or severity of bleeding and hemorrhage can be increased when Dasatinib is combined with Lepirudin.', 'id': '2', 'source': 'Lepirudin', 'target': 'Dasatinib'}
{'description': 'The risk or severity of gastrointestinal bleeding can be increased when Lepirudin is combined with Deferasirox.', 'id': '3', 'source': 'Lepirudin', 'target': 'Deferasirox'}
{'description': 'The risk or severity of bleeding and bruising can be increased when Lepirudin is combined with Ursodeoxycholic acid.', 'id': '4', 'source': 'Lepirudin', 'target': 'Ursodeoxycholic acid'}
{'description': 'The risk or severity of bleeding and bruising can be increased when Lepirudin is combined with Glycoc

In [10]:
clean_data = {
    "nodes": [],
    "links": [],
}

def cleaning(raw_data):
    start_time = time.time()
    for i, node in enumerate(data['nodes']):
        if node["label"] not in filtered: continue
        clean_data["nodes"].append(node)
    
    for i, link in enumerate(data['links']):
        if link["target"] not in filtered or link["source"] not in filtered: continue
        clean_data["links"].append(link)
        
        if i % 50000 == 0 and i != 0:
            current_time = time.time()
            print(f"Created {i + 1} links, used {current_time - start_time} seconds.")

cleaning(data)

Created 450001 links, used 0.1029202938079834 seconds.


In [11]:
len(data["nodes"]), len(data["links"]), len(clean_data["nodes"]), len(clean_data["links"])

(16582, 2839610, 471, 74784)

In [12]:
# Function to create nodes and relationships
def create_graph(tx, data):
    start_time = time.time()
    for i, node in enumerate(data['nodes']):
        if node["label"] not in filtered: continue
        
        tx.run("CREATE (n:Node {id: $id, label: $label})", id=node['id'], label=node['label'])
    
    for i, link in enumerate(data['links']):
        if link["target"] not in filtered or link["source"] not in filtered: continue
        
        tx.run("""
        MATCH (a:Node {id: $source}), (b:Node {id: $target})
        CREATE (a)-[:RELATIONSHIP {description: $description}]->(b)
        """, source=link['source'], target=link['target'], description=link['description'])
        
        if i % 1000 == 0 and i != 0:
            current_time = time.time()
            print(f"Created {i + 1} links, used {current_time - start_time} seconds.")

In [13]:
# Insert data into Neo4j.
with driver.session() as session:
    session.execute_write(create_graph, clean_data)

driver.close()

Created 1001 links, used 47.253438234329224 seconds.
Created 2001 links, used 78.05210161209106 seconds.
Created 3001 links, used 109.42906284332275 seconds.
Created 4001 links, used 139.51378798484802 seconds.
Created 5001 links, used 170.41807389259338 seconds.
Created 6001 links, used 200.89821529388428 seconds.
Created 7001 links, used 231.37676239013672 seconds.
Created 8001 links, used 261.3435318470001 seconds.
Created 9001 links, used 291.7904486656189 seconds.
Created 10001 links, used 321.69237422943115 seconds.
Created 11001 links, used 351.78085494041443 seconds.
Created 12001 links, used 382.70699191093445 seconds.
Created 13001 links, used 412.89570593833923 seconds.
Created 14001 links, used 443.2705523967743 seconds.
Created 15001 links, used 473.3867337703705 seconds.
Created 16001 links, used 504.01982378959656 seconds.
Created 17001 links, used 534.0837721824646 seconds.
Created 18001 links, used 564.3774394989014 seconds.
Created 19001 links, used 594.6533403396606 

<Record n=<Node element_id='4:1f17da69-1a9c-425d-a392-112358e1506f:0' labels=frozenset({'Node'}) properties={'id': 'Lepirudin', 'label': 'Lepirudin'}> r=<Relationship element_id='5:1f17da69-1a9c-425d-a392-112358e1506f:0' nodes=(<Node element_id='4:1f17da69-1a9c-425d-a392-112358e1506f:0' labels=frozenset({'Node'}) properties={'id': 'Lepirudin', 'label': 'Lepirudin'}>, <Node element_id='4:1f17da69-1a9c-425d-a392-112358e1506f:1' labels=frozenset({'Node'}) properties={'id': 'Apixaban', 'label': 'Apixaban'}>) type='RELATIONSHIP' properties={'description': 'Apixaban may increase the anticoagulant activities of Lepirudin.'}> m=<Node element_id='4:1f17da69-1a9c-425d-a392-112358e1506f:1' labels=frozenset({'Node'}) properties={'id': 'Apixaban', 'label': 'Apixaban'}>>
