# Phase 3.3: Neo4j Graph Database Population (CORRECTED)
# Buddhist RAG System - Clear Light of Bliss

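This notebook loads the Layer 1 document structure (Book → Chapter → Paragraph) and the Layer 2 semantic concept relationships into Neo4j. This version has been corrected to match the actual JSON file structures.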

## Step 1: Install Neo4j Python Driver

In [20]:
!pip install neo4j



## Step 2: Import Libraries

In [21]:
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List, Optional

from neo4j import GraphDatabase

# Reached only if every import in this cell succeeded.
print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## Step 3: Configuration

In [22]:
# Neo4j connection settings.
# Each value can be overridden with an environment variable of the same name,
# so real credentials never have to be written into the notebook. The
# fallbacks are the local-development values this notebook has always used.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://127.0.0.1:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

# Input files: the JSON checkpoints read by the later loading cell.
CHECKPOINT_DIR = Path.home() / "Documents" / "gesha_la_rag" / "checkpoints"
LAYER1_FILE = CHECKPOINT_DIR / "06_document_structure_layer1.json"
LAYER2_FILE = CHECKPOINT_DIR / "07_semantic_relationships.json"
NORMALIZATION_FILE = CHECKPOINT_DIR / "04b_normalization_map.json"

# Fail fast, and name every missing input at once instead of only the first.
missing_files = [
    path for path in (LAYER1_FILE, LAYER2_FILE, NORMALIZATION_FILE)
    if not path.exists()
]
if missing_files:
    raise FileNotFoundError(
        "Required file not found: " + ", ".join(str(path) for path in missing_files)
    )

print("✓ Configuration set")

✓ Configuration set


## Step 4: Connect to Neo4j

In [23]:
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs each query in its own session.

    Instances can be used as context managers so the driver is closed even
    when the body raises::

        with Neo4jConnection(uri, user, password) as conn:
            conn.execute_query("RETURN 1")
    """

    def __init__(self, uri: str, user: str, password: str):
        """Create the underlying driver for ``uri`` with user/password auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self) -> "Neo4jConnection":
        """Return this connection for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the driver on exit; exceptions from the body still propagate."""
        self.close()

    def close(self):
        """Close the driver if one is set."""
        if self.driver:
            self.driver.close()

    def test_connection(self):
        """Run ``RETURN 1`` and return True when the server answers with 1.

        Any error raised by the driver (for example when the server cannot be
        reached) propagates to the caller.
        """
        with self.driver.session() as session:
            result = session.run("RETURN 1 as test")
            return result.single()["test"] == 1

    def execute_query(self, query: str, parameters: Optional[Dict] = None):
        """Execute a Cypher query and return its records as a list.

        Args:
            query: Cypher text to run.
            parameters: Query parameters; ``None`` is sent as an empty mapping.

        Returns:
            A list of the records produced by the query.
        """
        with self.driver.session() as session:
            result = session.run(query, parameters or {})
            # Materialise the records while the session is still open.
            return list(result)
        
# Open the connection object that the later cells use, then report the
# outcome of the round-trip check.
neo4j_conn = Neo4jConnection(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

print(
    "✓ Successfully connected to Neo4j!"
    if neo4j_conn.test_connection()
    else "✗ Connection failed"
)

✓ Successfully connected to Neo4j!


## Step 5: Create Schema

In [24]:
def create_schema(conn):
    """Create the uniqueness constraints and lookup indexes used by the graph.

    Each statement is executed on its own. A failure is printed rather than
    raised so the remaining statements still run, and the closing line reports
    how many statements failed instead of always claiming success.

    Args:
        conn: Object exposing ``execute_query(query)``.
    """
    schema_queries = [
        "CREATE CONSTRAINT book_id IF NOT EXISTS FOR (b:Book) REQUIRE b.book_id IS UNIQUE",
        "CREATE CONSTRAINT chapter_id IF NOT EXISTS FOR (c:Chapter) REQUIRE (c.book_id, c.chapter_index) IS UNIQUE",
        "CREATE CONSTRAINT paragraph_id IF NOT EXISTS FOR (p:Paragraph) REQUIRE p.paragraph_id IS UNIQUE",
        "CREATE CONSTRAINT concept_canonical IF NOT EXISTS FOR (c:Concept) REQUIRE c.canonical_form IS UNIQUE",
        "CREATE INDEX paragraph_chapter IF NOT EXISTS FOR (p:Paragraph) ON (p.chapter_index)",
        "CREATE INDEX paragraph_citation IF NOT EXISTS FOR (p:Paragraph) ON (p.citation)",
    ]

    print("Creating schema...")
    failed = 0
    for query in schema_queries:
        # The third token of "CREATE CONSTRAINT|INDEX <name> ..." is the name.
        name = query.split()[2]
        try:
            conn.execute_query(query)
            print(f"  ✓ {name}")
        except Exception as e:
            if "already exists" in str(e).lower():
                print(f"  ⊙ {name} (exists)")
            else:
                failed += 1
                print(f"  ✗ Error: {e}")

    # Only claim success when every statement actually succeeded.
    if failed:
        print(f"✗ Schema finished with {failed} error(s)")
    else:
        print("✓ Schema complete")

# Apply the constraints and indexes to the connected database.
create_schema(neo4j_conn)

Creating schema...
  ✓ book_id
  ✓ chapter_id
  ✓ paragraph_id
  ✓ concept_canonical
  ✓ paragraph_chapter
  ✓ paragraph_citation
✓ Schema complete


## Step 6: Load Data Files

In [25]:
# Layer 1: document structure
print("Loading Layer 1...")
layer1_data = json.loads(LAYER1_FILE.read_text(encoding='utf-8'))
print(f"  ✓ {layer1_data['total_chapters']} chapters")
print(f"  ✓ {layer1_data['total_paragraphs']} paragraphs")

# Layer 2: semantic relationships between concepts
print("\nLoading Layer 2...")
layer2_data = json.loads(LAYER2_FILE.read_text(encoding='utf-8'))
print(f"  ✓ {len(layer2_data['relationships'])} relationships")
print(f"  ✓ {layer2_data['metadata']['unique_concepts']} concepts")

# Normalization map
print("\nLoading normalization...")
normalization_map = json.loads(NORMALIZATION_FILE.read_text(encoding='utf-8'))
print(f"  ✓ {normalization_map['metadata']['total_concepts']} canonical concepts")

Loading Layer 1...
  ✓ 33 chapters
  ✓ 3449 paragraphs

Loading Layer 2...
  ✓ 683 relationships
  ✓ 50 concepts

Loading normalization...
  ✓ 76 canonical concepts


## Step 7: Populate Layer 1 - Document Structure

In [26]:
def populate_layer1(conn, data):
    """Write the document-structure layer: one Book, its Chapters, their Paragraphs.

    All nodes and relationships are written with ``MERGE``, so running this
    again against an already-populated database updates the existing nodes
    instead of violating the ``paragraph_id`` uniqueness constraint.

    Args:
        conn: Object exposing ``execute_query(query, parameters)``.
        data: Parsed Layer 1 JSON. Keys read here: ``book_id``, ``book_title``,
            ``total_chapters``, ``total_paragraphs`` and ``chapters``. Each
            chapter supplies ``chapter_index`` and ``paragraphs`` (optionally
            ``chapter_title``). Each paragraph supplies ``paragraph_id``,
            ``citation``, ``text`` and ``paragraph_index`` (optionally
            ``structural_role``, ``section_index`` and ``heading_level``).
    """
    print("\n" + "="*70)
    print("POPULATING LAYER 1")
    print("="*70)

    # Create Book
    print("\n1. Creating Book node...")
    book_query = """
    MERGE (b:Book {book_id: $book_id})
    SET b.title = $title,
        b.total_chapters = $total_chapters,
        b.total_paragraphs = $total_paragraphs
    RETURN b
    """
    conn.execute_query(book_query, {
        'book_id': data['book_id'],
        'title': data['book_title'],
        'total_chapters': data['total_chapters'],
        'total_paragraphs': data['total_paragraphs']
    })
    print(f"  ✓ Created: {data['book_title']}")

    # Create Chapters
    print(f"\n2. Creating {data['total_chapters']} chapters...")
    chapter_query = """
    MATCH (b:Book {book_id: $book_id})
    MERGE (c:Chapter {book_id: $book_id, chapter_index: $chapter_index})
    SET c.title = $title,
        c.paragraph_count = $paragraph_count
    MERGE (b)-[:HAS_CHAPTER]->(c)
    RETURN c
    """

    # Progress counts the chapters actually processed, so the message stays
    # correct whatever numbering scheme chapter_index uses.
    for chapter_count, chapter in enumerate(data['chapters'], start=1):
        conn.execute_query(chapter_query, {
            'book_id': data['book_id'],
            'chapter_index': chapter['chapter_index'],
            'title': chapter.get('chapter_title', f"Chapter {chapter['chapter_index']}"),
            'paragraph_count': len(chapter['paragraphs'])
        })
        if chapter_count % 10 == 0:
            print(f"  ... {chapter_count} chapters")

    print("  ✓ All chapters created")

    # Create Paragraphs
    print(f"\n3. Creating {data['total_paragraphs']} paragraphs (1-2 min)...")
    # MERGE on the unique paragraph_id keeps a re-run from failing; setting a
    # property to null leaves it absent.
    paragraph_query = """
    MATCH (c:Chapter {book_id: $book_id, chapter_index: $chapter_index})
    MERGE (p:Paragraph {paragraph_id: $paragraph_id})
    SET p.citation = $citation,
        p.text = $text,
        p.chapter_index = $chapter_index,
        p.paragraph_index = $paragraph_index,
        p.structural_role = $structural_role,
        p.section_index = $section_index,
        p.heading_level = $heading_level
    MERGE (c)-[:HAS_PARAGRAPH]->(p)
    RETURN p
    """

    total = 0
    start = time.time()

    for chapter in data['chapters']:
        for para in chapter['paragraphs']:
            conn.execute_query(paragraph_query, {
                'book_id': data['book_id'],
                'chapter_index': chapter['chapter_index'],
                'paragraph_id': para['paragraph_id'],
                'citation': para['citation'],
                'text': para['text'],
                'paragraph_index': para['paragraph_index'],
                'structural_role': para.get('structural_role', 'BODY'),
                'section_index': para.get('section_index'),
                'heading_level': para.get('heading_level')
            })

            total += 1
            if total % 500 == 0:
                # Guard against a zero interval on clocks with coarse resolution.
                so_far = time.time() - start
                rate = total / so_far if so_far > 0 else float('inf')
                print(f"  ... {total}/{data['total_paragraphs']} ({rate:.0f}/sec)")

    elapsed = time.time() - start
    print(f"  ✓ All paragraphs created in {elapsed:.1f}s")
    print("\n✓ Layer 1 complete!")

# Write the Layer 1 data loaded from LAYER1_FILE into Neo4j.
populate_layer1(neo4j_conn, layer1_data)


POPULATING LAYER 1

1. Creating Book node...


  ✓ Created: Clear Light of Bliss

2. Creating 33 chapters...
  ... 10 chapters
  ... 20 chapters
  ... 30 chapters
  ✓ All chapters created

3. Creating 3449 paragraphs (1-2 min)...
  ... 500/3449 (72/sec)
  ... 1000/3449 (76/sec)
  ... 1500/3449 (77/sec)
  ... 2000/3449 (79/sec)
  ... 2500/3449 (81/sec)
  ... 3000/3449 (83/sec)
  ✓ All paragraphs created in 40.5s

✓ Layer 1 complete!


## Step 8: Populate Layer 2 - Semantic Concepts

In [27]:
def _relationship_type(relation):
    """Turn a free-text relation into a name that is safe to embed in Cypher.

    The text is upper-cased and every non-word character (spaces, hyphens,
    quotes, backticks, ...) becomes an underscore, so the result can be placed
    between backticks in a query without being able to break out of them.

    Raises:
        ValueError: If ``relation`` is empty.
    """
    rel_type = re.sub(r'\W', '_', relation.upper())
    if not rel_type:
        raise ValueError("Relationship has an empty 'relation' value")
    return rel_type


def populate_layer2(conn, data):
    """Write the semantic layer: Concept nodes and the relationships between them.

    Concepts are merged on ``canonical_form``. Relationships are written with
    ``CREATE``, so running this against a database that already holds them
    adds duplicates. A relationship whose subject or object concept does not
    exist cannot be created; it is reported as skipped rather than counted.

    Args:
        conn: Object exposing ``execute_query(query, parameters)`` that returns
            the list of records produced by the query.
        data: Parsed Layer 2 JSON. Keys read here: ``metadata.unique_concepts``,
            ``concept_inventory`` (concept name -> mention count) and
            ``relationships``. Each relationship supplies ``subject``,
            ``object``, ``relation``, ``relation_type`` and a ``source`` mapping
            with ``paragraph_id``, ``chapter_index`` and ``citation``.

    Raises:
        ValueError: If a relationship has an empty ``relation``.
    """
    print("\n" + "="*70)
    print("POPULATING LAYER 2")
    print("="*70)

    # Create Concepts
    # The count is read from the ``data`` argument, not from a notebook global.
    print(f"\n1. Creating {data['metadata']['unique_concepts']} concepts...")
    concept_query = """
    MERGE (c:Concept {canonical_form: $canonical_form})
    SET c.display_name = $display_name,
        c.mention_count = $mention_count
    RETURN c
    """

    for concept_name, count in data['concept_inventory'].items():
        display_name = concept_name.replace('_', ' ').title()
        conn.execute_query(concept_query, {
            'canonical_form': concept_name,
            'display_name': display_name,
            'mention_count': count
        })

    print("  ✓ All concepts created")

    # Create Relationships
    total_relationships = len(data['relationships'])
    print(f"\n2. Creating {total_relationships} relationships...")

    counts = {}
    skipped = 0
    start = time.time()

    for processed, rel in enumerate(data['relationships'], start=1):
        rel_type = _relationship_type(rel['relation'])

        # A relationship type cannot be passed as a query parameter, so the
        # sanitised name is embedded in the text, quoted with backticks.
        rel_query = f"""
        MATCH (subj:Concept {{canonical_form: $subject}})
        MATCH (obj:Concept {{canonical_form: $object}})
        CREATE (subj)-[r:`{rel_type}` {{
            source_paragraph_id: $source_paragraph_id,
            source_chapter: $source_chapter,
            relation_type: $relation_type,
            source_citation: $source_citation
        }}]->(obj)
        RETURN r
        """

        created = conn.execute_query(rel_query, {
            'subject': rel['subject'],
            'object': rel['object'],
            'source_paragraph_id': rel['source']['paragraph_id'],
            'source_chapter': rel['source']['chapter_index'],
            'relation_type': rel['relation_type'],
            'source_citation': rel['source']['citation']
        })

        # No returned record means one of the MATCH clauses found no concept,
        # so nothing was written for this relationship.
        if created:
            counts[rel_type] = counts.get(rel_type, 0) + 1
        else:
            skipped += 1

        if processed % 100 == 0:
            # Guard against a zero interval on clocks with coarse resolution.
            so_far = time.time() - start
            rate = processed / so_far if so_far > 0 else float('inf')
            print(f"  ... {processed}/{total_relationships} ({rate:.0f}/sec)")

    elapsed = time.time() - start
    if skipped:
        written = total_relationships - skipped
        print(f"  ✓ {written} of {total_relationships} relationships created in {elapsed:.1f}s")
        print(f"  ⚠ {skipped} relationship(s) skipped (subject or object concept not found)")
    else:
        print(f"  ✓ All relationships created in {elapsed:.1f}s")

    print("\n  Relationship types:")
    for rel_type, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        print(f"    - {rel_type}: {count}")

    print("\n✓ Layer 2 complete!")

# Write the Layer 2 data loaded from LAYER2_FILE into Neo4j.
populate_layer2(neo4j_conn, layer2_data)


POPULATING LAYER 2

1. Creating 50 concepts...
  ✓ All concepts created

2. Creating 683 relationships...
  ... 100/683 (39/sec)
  ... 200/683 (53/sec)
  ... 300/683 (59/sec)
  ... 400/683 (63/sec)
  ... 500/683 (66/sec)
  ... 600/683 (68/sec)
  ✓ All relationships created in 9.8s

  Relationship types:
    - OF: 265
    - AND: 124
    - ON: 32
    - DEPENDS_UPON: 27
    - MIXING_WITH: 26
    - DISSOLVE_WITHIN: 24
    - KNOWN_AS: 23
    - MEDITATING_ON: 22
    - WITH: 16
    - FREE_FROM: 15
    - EMPTY_OF: 13
    - ARISE_FROM: 11
    - DISSOLVES_INTO: 9
    - FOCUSED_ON: 7
    - EXPLAINED_IN: 6
    - MOUNTED_UPON: 6
    - ARISES_FROM: 6
    - ENGAGING_IN: 5
    - ASSOCIATED_WITH: 5
    - DEPENDING_UPON: 5
    - RELYING_UPON: 5
    - INSEPARABLE_FROM: 5
    - THROUGH: 3
    - BEFORE: 3
    - ATTAINED_IN: 3
    - WITHIN: 2
    - INDICATIVE_OF: 2
    - FOR: 2
    - ACCUSTOMED_TO: 2
    - DURING: 2
    - INCLUDED_WITHIN: 2
    - FROM: 2
    - DISSOLVING_INTO: 1
    - OR: 1
    - UPON: 1



## Step 9: Validation Queries

In [28]:
separator = "=" * 70
print(f"\n{separator}\nVALIDATION\n{separator}")

# 1. Number of nodes, grouped by each node's first label
node_count_query = """
MATCH (n)
RETURN labels(n)[0] as label, count(n) as count
ORDER BY count DESC
"""
print("\n1. Node counts:")
for row in neo4j_conn.execute_query(node_count_query):
    print("   {}: {}".format(row['label'], row['count']))

# 2. Number of relationships per type, ten most frequent only
relationship_count_query = """
MATCH ()-[r]->()
RETURN type(r) as type, count(r) as count
ORDER BY count DESC
LIMIT 10
"""
print("\n2. Relationship counts (top 10):")
for row in neo4j_conn.execute_query(relationship_count_query):
    print("   {}: {}".format(row['type'], row['count']))


VALIDATION

1. Node counts:
   Paragraph: 3449
   Concept: 50
   Chapter: 33
   Book: 1

2. Relationship counts (top 10):
   HAS_PARAGRAPH: 3449
   OF: 265
   AND: 124
   HAS_CHAPTER: 33
   ON: 32
   DEPENDS_UPON: 27
   MIXING_WITH: 26
   DISSOLVE_WITHIN: 24
   KNOWN_AS: 23
   MEDITATING_ON: 22


In [29]:
# Provenance test: follow each relationship back to the paragraph it was
# extracted from, then to the chapter that owns that paragraph.
# The chapter is reached through HAS_PARAGRAPH rather than by comparing
# chapter_index, because chapters are only unique per (book_id, chapter_index);
# an index-only join would also match same-numbered chapters of another book.
print("\n3. Example relationships with provenance:")
result = neo4j_conn.execute_query("""
MATCH (c1:Concept {canonical_form: $concept})-[r]->(c2:Concept)
MATCH (ch:Chapter)-[:HAS_PARAGRAPH]->(p:Paragraph {paragraph_id: r.source_paragraph_id})
RETURN c1.canonical_form as subject,
       type(r) as relation,
       c2.canonical_form as object,
       ch.title as chapter,
       p.citation as citation,
       substring(p.text, 0, 100) as excerpt
LIMIT 3
""", {'concept': 'clear_light'})
for record in result:
    print(f"\n   {record['subject']} --[{record['relation']}]--> {record['object']}")
    print(f"   Source: {record['citation']}")
    print(f"   Chapter: {record['chapter']}")
    print(f"   Text: {record['excerpt']}...")


3. Example relationships with provenance:

   clear_light --[KNOWN_AS]--> mahamudra
   Source: CLB.7.§1.p40
   Chapter: Introduction and Preliminaries
   Text: The second stage of causal-time Mahamudra is the Mahamudra that is theunion of the two truths: the c...

   clear_light --[OF]--> completion_stage
   Source: CLB.7.§6.p104
   Chapter: Introduction and Preliminaries
   Text: This is the brief meditation onbringing death into the path to the Truth Body. It functions principa...

   clear_light --[OF]--> sleep
   Source: CLB.8.§10.p86
   Chapter: Channels, Winds and Drops
   Text: At death we need to be able to practise three special methods:bringing theclear light of death into ...


In [30]:
# Ten concepts with the highest stored mention counts
top_concepts_query = """
MATCH (c:Concept)
RETURN c.display_name as concept, c.mention_count as mentions
ORDER BY mentions DESC
LIMIT 10
"""
print("\n4. Top 10 concepts:")
result = neo4j_conn.execute_query(top_concepts_query)
for rank, row in enumerate(result, start=1):
    print("   {}. {}: {}".format(rank, row['concept'], row['mentions']))


4. Top 10 concepts:
   1. Mind: 618
   2. Meditation: 350
   3. Clear Light: 244
   4. Emptiness: 226
   5. Illusory Body: 200
   6. Central Channel: 197
   7. Secret Mantra: 136
   8. Mahamudra: 127
   9. Inner Fire: 118
   10. Sleep: 118


## Step 10: Summary

In [31]:
print("\n" + "="*70)
print("✓ NEO4J POPULATION COMPLETE!")
print("="*70)
print("\nYour Buddhist RAG graph database is ready!")
print("  • Book → Chapter → Paragraph structure")
print("  • Concept nodes with semantic relationships")
print("  • Cross-layer provenance linking")
# Report the URI that was actually configured instead of a second hard-coded
# copy that could drift from the configuration cell.
print(f"\nConnection: {NEO4J_URI}")

# Release the driver now that every query has run.
neo4j_conn.close()
print("\n✓ Connection closed")


✓ NEO4J POPULATION COMPLETE!

Your Buddhist RAG graph database is ready!
  • Book → Chapter → Paragraph structure
  • Concept nodes with semantic relationships
  • Cross-layer provenance linking

Connection: neo4j://127.0.0.1:7687

✓ Connection closed
