In [10]:
import json
import logging
import os
from typing import Dict, List

from neo4j import GraphDatabase

class Neo4jLoader:
    """Bulk-load disease/symptom records from a JSON file into Neo4j.

    Each record is expected to carry the keys referenced by the Cypher in
    ``_process_batch``: ``disease_id``, ``disease_name`` and an optional
    ``symptoms`` list whose items provide ``symptom_cui``, ``synonyms`` and
    ``positive_count``.

    The loader can be used as a context manager so the driver is always
    closed::

        with Neo4jLoader(uri, username, password) as loader:
            loader.load_data("records.json")
    """

    def __init__(self, uri: str, username: str, password: str):
        """Create the Neo4j driver and set up INFO-level logging.

        Args:
            uri: Connection URI handed to ``GraphDatabase.driver``.
            username: User name for basic authentication.
            password: Password for basic authentication.
        """
        self.driver = GraphDatabase.driver(uri, auth=(username, password))
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

    def __enter__(self):
        """Return the loader itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def close(self):
        """Close the driver connection."""
        self.driver.close()

    def load_data(self, json_file_path: str, batch_size: int = 100):
        """Read a JSON array of records and import it into Neo4j in batches.

        Args:
            json_file_path: Path of a UTF-8 JSON file whose top-level value is
                a list of disease records.
            batch_size: Number of records sent per Cypher statement.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            TypeError: If the JSON document is not a list.
            Exception: Any error raised while reading the file or talking to
                the database is logged and re-raised unchanged.

        Note:
            The import accumulates relationship weights (``ON MATCH`` adds
            ``positive_count`` to the stored weight), so loading the same file
            twice increases the weights again.
        """
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently loads nothing, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        try:
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(json_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except Exception as e:
            self.logger.error(f"Error reading JSON file: {e}")
            raise

        # Validate before touching the database so a malformed file does not
        # leave constraints created but nothing loaded.
        if not isinstance(data, list):
            raise TypeError(
                f"Expected a JSON array of records in {json_file_path!r}, "
                f"got {type(data).__name__}"
            )

        # Create constraints for unique IDs
        self._create_constraints()

        # Load data in batches
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            self._process_batch(batch)
            self.logger.info(f"Processed {i + len(batch)} records")

    def _create_constraints(self):
        """Create the uniqueness constraints on ``Disease.id`` and ``Symptom.cui``.

        Raises:
            Exception: Any driver error is logged and re-raised.
        """
        statements = (
            "CREATE CONSTRAINT IF NOT EXISTS FOR (d:Disease) REQUIRE d.id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (s:Symptom) REQUIRE s.cui IS UNIQUE",
        )
        with self.driver.session() as session:
            try:
                for statement in statements:
                    # consume() drains the result so a failure while executing
                    # the statement is raised here, inside the try block.
                    session.run(statement).consume()
                self.logger.info("Created constraints successfully")
            except Exception as e:
                self.logger.error(f"Error creating constraints: {e}")
                raise

    def _process_batch(self, batch: List[Dict]):
        """Merge one batch of disease records, their symptoms and the links between them.

        Args:
            batch: Records passed to the query as the ``$batch`` parameter.

        Raises:
            Exception: Any driver error is logged and re-raised.
        """
        if not batch:
            return

        # The coalesce() calls in the ON MATCH clauses guard against null
        # propagation: in Cypher `x + null` is null, and SET prop = null
        # removes the property, so a record without positive_count/synonyms
        # would otherwise erase the value accumulated so far.
        query = """
        UNWIND $batch AS disease
        MERGE (d:Disease {id: disease.disease_id})
        ON CREATE SET d.name = disease.disease_name
        WITH d, disease.symptoms AS symptoms
        UNWIND CASE WHEN symptoms IS NOT NULL AND size(symptoms) > 0 THEN symptoms ELSE [null] END AS symptom
        WITH d, symptom
        WHERE symptom IS NOT NULL
        MERGE (s:Symptom {cui: symptom.symptom_cui})
        ON CREATE SET s.name = symptom.synonyms[0],
                     s.synonyms = symptom.synonyms
        ON MATCH SET s.synonyms =
            CASE
                WHEN s.synonyms IS NULL THEN symptom.synonyms
                ELSE s.synonyms + [x IN coalesce(symptom.synonyms, []) WHERE NOT x IN s.synonyms]
            END
        MERGE (d)-[r:HAS_SYMPTOM]->(s)
        ON CREATE SET r.weight = symptom.positive_count
        ON MATCH SET r.weight = coalesce(r.weight, 0) + coalesce(symptom.positive_count, 0)
        """

        with self.driver.session() as session:
            try:
                # Drain the result so execution errors for this batch are
                # raised (and logged) here instead of being deferred.
                session.run(query, batch=batch).consume()
            except Exception as e:
                self.logger.error(f"Error processing batch: {e}")
                raise

    def verify_data(self) -> Dict[str, int]:
        """Count the loaded nodes and relationships.

        Returns:
            A dict with the keys ``"diseases"``, ``"symptoms"`` and
            ``"relationships"`` holding the respective counts.
        """
        count_queries = {
            "diseases": "MATCH (d:Disease) RETURN COUNT(d) as count",
            "symptoms": "MATCH (s:Symptom) RETURN COUNT(s) as count",
            "relationships": "MATCH ()-[r:HAS_SYMPTOM]->() RETURN COUNT(r) as count",
        }
        with self.driver.session() as session:
            return {
                label: session.run(cypher).single()["count"]
                for label, cypher in count_queries.items()
            }

In [11]:

# Connection parameters - the Bolt endpoint is reached on port 8687.
# Each setting can be overridden through an environment variable so real
# credentials do not have to be stored in the notebook; the literals are only
# fallbacks for the local development instance.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:8687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "neo4j_pass5")  # Replace with your password
json_file_path = "sparse_kg_data.json"  # Replace with your file path

In [12]:


# Initialize the loader (creates the Neo4j driver from the connection parameters)
loader = Neo4jLoader(uri=uri, username=username, password=password)

ERROR:neo4j.io:Failed to write data to connection IPv4Address(('localhost', 8687)) (ResolvedIPv6Address(('::1', 8687, 0, 0)))


In [13]:

# Import every record from the JSON file; progress is reported through INFO logs.
loader.load_data(json_file_path=json_file_path)


INFO:__main__:Created constraints successfully
INFO:__main__:Processed 100 records
INFO:__main__:Processed 200 records
INFO:__main__:Processed 300 records
INFO:__main__:Processed 400 records
INFO:__main__:Processed 500 records
INFO:__main__:Processed 600 records
INFO:__main__:Processed 700 records
INFO:__main__:Processed 800 records
INFO:__main__:Processed 900 records
INFO:__main__:Processed 1000 records
INFO:__main__:Processed 1100 records
INFO:__main__:Processed 1200 records
INFO:__main__:Processed 1300 records
INFO:__main__:Processed 1400 records
INFO:__main__:Processed 1500 records
INFO:__main__:Processed 1600 records
INFO:__main__:Processed 1700 records
INFO:__main__:Processed 1800 records
INFO:__main__:Processed 1900 records
INFO:__main__:Processed 2000 records
INFO:__main__:Processed 2100 records
INFO:__main__:Processed 2200 records
INFO:__main__:Processed 2300 records
INFO:__main__:Processed 2400 records
INFO:__main__:Processed 2500 records
INFO:__main__:Processed 2600 records


In [14]:

# Query node/relationship counts and report them as pretty-printed JSON.
stats = loader.verify_data()
print(
    "Data loading completed successfully!",
    f"Statistics:\n{json.dumps(stats, indent=2)}",
    sep="\n",
)

Data loading completed successfully!
Statistics:
{
  "diseases": 13145,
  "symptoms": 1478,
  "relationships": 21363
}


In [17]:
# Open a standalone driver through the neo4j:// URI scheme for index management.
uri = "neo4j://localhost:8687"
# Credentials honour the NEO4J_USERNAME / NEO4J_PASSWORD environment variables
# so they are not duplicated as literals; the fallbacks match the local
# development instance.
auth = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "neo4j_pass5"),
)

driver = GraphDatabase.driver(uri, auth=auth)
# Check the connection now rather than on the first query.
driver.verify_connectivity()

In [18]:
with driver.session() as session:
    # IF NOT EXISTS keeps this cell re-runnable: without it the statement fails
    # once symptomIndex has been created. consume() drains the result so any
    # server-side error is raised inside this cell.
    session.run(
        "CREATE FULLTEXT INDEX symptomIndex IF NOT EXISTS "
        "FOR (s:Symptom) ON EACH [s.name]"
    ).consume()