# Auto-Graph-RAG Modular Architecture Demo

This notebook demonstrates the new modular architecture of Auto-Graph-RAG, showing how each component can be used independently or composed together.

## Key Benefits
- **Independent Testing**: Test each component with known inputs
- **Reusable Assets**: Reuse expensive operations (schemas, datasets, models)
- **Flexible Composition**: Mix and match modules or skip steps
- **Better Debugging**: Isolate issues to specific components
- **Incremental Development**: Work on parts without full pipeline

In [None]:
# RUN IF IN COLAB
# check if the "auto-graph-rag" repo exists - if so, delete it recursively
# import os
# import shutil
# if os.path.exists("auto-graph-rag"):
#     shutil.rmtree("auto-graph-rag")

# !git clone https://github.com/benjaminwfriedman/auto-graph-rag.git
# !cd auto-graph-rag && git branch
# !cd auto-graph-rag && git pull
# !cd auto-graph-rag && pip install -e .


# Restart Runtime if in Colab❗

## Setup and Imports

In [None]:
import sys
import os
import shutil
from pathlib import Path

# Detect the runtime environment, choose the matching workspace location and
# make the package sources importable.
try:
    # Imported only as a probe: the module is importable inside Google Colab.
    import google.colab  # noqa: F401
except ImportError:
    in_colab = False
else:
    in_colab = True

if in_colab:
    # Hard-coded Colab paths -- presumably the repository was cloned into
    # /content by the optional setup cell; adjust if it lives elsewhere.
    work_dir = Path("/content/auto-graph-rag/modular_demo_workspace")
    sys.path.append("/content/auto-graph-rag/src")
    print("🔍 Detected Google Colab environment")
else:
    work_dir = Path("./modular_demo_workspace")
    # Only extend sys.path when a local checkout sits next to the notebook.
    local_src = Path("./auto-graph-rag/src")
    if local_src.exists():
        sys.path.append(str(local_src))
    print("🔍 Detected local environment")

# Actually create the working directory: the following cells write the
# database, schema, dataset and model underneath it.
work_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import os
import json
import networkx as nx
from pathlib import Path
import tempfile
import shutil
from dotenv import load_dotenv
load_dotenv()

# Import modular components
from auto_graph_rag.modules import (
    GraphBuilder,
    GraphExplorer,
    DataGenerator,
    ModelTrainer,
    QueryExecutor
)

# Original interface for comparison
from auto_graph_rag import GraphRAG

# Confirm the imports worked and list the modular components by class name.
modular_components = (GraphBuilder, GraphExplorer, DataGenerator, ModelTrainer, QueryExecutor)

print("✅ Imports successful!")
print("Available modules:")
print("\n".join(f"  - {component.__name__}" for component in modular_components))

## Environment Setup

Let's set up our working directory and check for required API keys.

In [None]:
# Set OPENAI_API_KEY and HF_TOKEN.
# Replace the placeholders below with your keys, or leave them untouched to
# keep whatever is already in the environment (e.g. values loaded from a
# .env file by load_dotenv()).
OPENAI_API_KEY = "YOUR_OPENAI_API_KEY"
HF_TOKEN = "YOUR_HF_TOKEN"

for _var_name, _value in (("OPENAI_API_KEY", OPENAI_API_KEY), ("HF_TOKEN", HF_TOKEN)):
    # An unedited "YOUR_..." placeholder is not a credential: exporting it
    # would clobber a real key and make the availability check in the next
    # cell report a bogus key as present.
    if _value and not _value.startswith("YOUR_"):
        os.environ[_var_name] = _value

In [None]:
# Report where the demo reads and writes its files.
print(f"📁 Working directory: {work_dir.absolute()}")

# Credential check: the OpenAI key is mandatory for the full demo, the
# HuggingFace token is only reported.
has_openai = bool(os.environ.get("OPENAI_API_KEY"))
has_hf = bool(os.environ.get("HF_TOKEN"))

openai_status = '✅ Available' if has_openai else '❌ Missing'
hf_status = '✅ Available' if has_hf else '❌ Missing (optional for some models)'
print(f"🔑 OpenAI API Key: {openai_status}")
print(f"🤗 HuggingFace Token: {hf_status}")

# Stop right here when the mandatory key is absent.
if not has_openai:
    print("\n❌ Error: OPENAI_API_KEY is required for this demo")
    print("Please set it with: os.environ['OPENAI_API_KEY'] = 'your-key-here'")
    raise ValueError("OPENAI_API_KEY environment variable is required")

## Step 1: Graph Builder Module

The `GraphBuilder` creates Kuzu databases from NetworkX graphs or raw data. This can be used standalone without any other components.

In [None]:
# Build the small company graph used throughout the demo
def create_sample_graph():
    """Return a directed NetworkX graph of employees, departments and projects.

    Every node carries a ``type`` attribute ("Employee", "Department" or
    "Project") plus its own properties; every edge carries a ``type``
    attribute ("BELONGS_TO", "WORKED_ON" or "OWNED_BY") plus optional
    relationship properties.
    """
    company = nx.DiGraph()

    # Node tables, grouped by node type: (node id, properties).
    nodes_by_type = {
        "Employee": [
            ("emp1", {"name": "Alice Johnson", "department": "Engineering", "salary": 120000, "level": "Senior"}),
            ("emp2", {"name": "Bob Smith", "department": "Engineering", "salary": 95000, "level": "Junior"}),
            ("emp3", {"name": "Carol White", "department": "Marketing", "salary": 85000, "level": "Mid"}),
            ("emp4", {"name": "David Brown", "department": "Sales", "salary": 90000, "level": "Senior"}),
        ],
        "Department": [
            ("dept1", {"name": "Engineering", "budget": 2000000, "head_count": 25}),
            ("dept2", {"name": "Marketing", "budget": 800000, "head_count": 10}),
            ("dept3", {"name": "Sales", "budget": 1200000, "head_count": 15}),
        ],
        "Project": [
            ("proj1", {"name": "Alpha", "budget": 500000, "status": "Active"}),
            ("proj2", {"name": "Beta", "budget": 300000, "status": "Planning"}),
        ],
    }
    for node_type, records in nodes_by_type.items():
        for node_id, properties in records:
            company.add_node(node_id, type=node_type, **properties)

    # Relationship table: (source, target, relationship type, properties).
    relationships = [
        ("emp1", "dept1", "BELONGS_TO", {"since": "2020-01-15"}),
        ("emp2", "dept1", "BELONGS_TO", {"since": "2023-03-01"}),
        ("emp3", "dept2", "BELONGS_TO", {"since": "2021-06-15"}),
        ("emp4", "dept3", "BELONGS_TO", {"since": "2019-09-01"}),
        ("emp1", "proj1", "WORKED_ON", {"hours": 320, "role": "Lead"}),
        ("emp2", "proj1", "WORKED_ON", {"hours": 480, "role": "Developer"}),
        ("emp3", "proj2", "WORKED_ON", {"hours": 200, "role": "Marketing Lead"}),
        ("proj1", "dept1", "OWNED_BY", {}),
        ("proj2", "dept2", "OWNED_BY", {}),
    ]
    for source, target, relationship, properties in relationships:
        company.add_edge(source, target, type=relationship, **properties)

    return company

# Instantiate the demo graph and summarise its contents
sample_graph = create_sample_graph()

# Distinct "type" values found on the nodes and on the edges.
node_types = {attrs['type'] for _, attrs in sample_graph.nodes(data=True)}
edge_types = {attrs['type'] for _, _, attrs in sample_graph.edges(data=True)}

print("📊 Created sample graph:")
print(f"  - Nodes: {sample_graph.number_of_nodes()}")
print(f"  - Edges: {sample_graph.number_of_edges()}")
print(f"  - Node types: {node_types}")
print(f"  - Edge types: {edge_types}")

In [None]:
# Set up the GraphBuilder and turn the sample graph into a database
builder = GraphBuilder()

# Print whatever the module reports about itself.
print("🔧 GraphBuilder Info:")
for info_key, info_value in builder.get_info().items():
    print(f"  {info_key}: {info_value}")

print("\n📦 Building graph database...")

# Map every node / (source, target) pair to the label stored in its
# "type" attribute.
node_labels = {}
for node_id, node_attrs in sample_graph.nodes(data=True):
    node_labels[node_id] = node_attrs["type"]

edge_labels = {}
for source, target, edge_attrs in sample_graph.edges(data=True):
    edge_labels[(source, target)] = edge_attrs["type"]

# The database lives inside the demo workspace.
db_path = work_dir / "company_db"
stats = builder.build_from_networkx(
    graph=sample_graph,
    db_path=db_path,
    graph_name="company",
    node_labels=node_labels,
    edge_labels=edge_labels,
)

print("\n✅ Graph database created!")
print("📊 Statistics:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

## Step 2: Graph Explorer Module

The `GraphExplorer` analyzes an existing Kuzu database to understand its schema using LLM analysis. It can work with any Kuzu database, regardless of how it was created.

In [None]:
# Set up the GraphExplorer (OpenAI-backed)
explorer = GraphExplorer(llm_provider="openai", llm_model="gpt-4")

# Print whatever the module reports about itself.
print("🔧 GraphExplorer Info:")
for info_key, info_value in explorer.get_info().items():
    print(f"  {info_key}: {info_value}")

print("\n🕵️ Exploring graph schema...")

# Analyse the database built in Step 1 and persist the schema as JSON.
schema_path = work_dir / "company_schema.json"
schema = explorer.explore_from_db(
    db_path=db_path,
    max_samples=15,
    save_schema_to=schema_path,
)

print("\n✅ Schema exploration complete!")

# High-level overview of what was discovered.
discovered_nodes = schema['nodes']
discovered_edges = schema['edges']
print("\n📋 Discovered Schema:")
print(f"  Summary: {schema['summary']}")
print(f"  Node Types: {list(discovered_nodes.keys())}")
print(f"  Edge Types: {list(discovered_edges.keys())}")

# Drill into a single node type, when the explorer reported it.
if 'Employee' in discovered_nodes:
    emp_info = discovered_nodes['Employee']
    print("\n👤 Employee Node Details:")
    print(f"  Description: {emp_info['description']}")
    print(f"  Properties: {emp_info['properties']}")
    print(f"  Example: {emp_info['example_values']}")

## Step 3: Data Generator Module

The `DataGenerator` creates training datasets from graph schemas. It can work with any schema JSON file, regardless of how it was created.

In [None]:
# Set up the DataGenerator (OpenAI-backed)
generator = DataGenerator(llm_provider="openai", llm_model="gpt-4")

# Re-declare the asset locations so this cell can run on its own against
# a schema and database produced earlier.
schema_path = work_dir / "company_schema.json"
db_path = work_dir / "company_db"

# Print whatever the module reports about itself.
print("🔧 DataGenerator Info:")
for info_key, info_value in generator.get_info().items():
    print(f"  {info_key}: {info_value}")

print("\n📝 Generating training data...")

# Share of examples requested per complexity level.
complexity_mix = {
    1: 0.3,  # Simple lookups
    2: 0.3,  # Filtered queries
    3: 0.2,  # Relationships
    4: 0.2,  # Aggregations
}

# Produce the dataset from the saved schema and write it out as JSONL.
dataset_path = work_dir / "company_dataset.jsonl"
dataset = generator.generate_from_schema(
    schema_path=schema_path,
    num_examples=100,  # Small number for demo
    output_path=dataset_path,
    complexity_distribution=complexity_mix,
    db_path=db_path,  # For validation
)

print("\n✅ Training data generated!")

# Headline numbers.
print("\n📊 Dataset Statistics:")
print(f"  Total examples: {len(dataset)}")

# Tally how many examples fall into each complexity level and intent;
# entries lacking a field are counted under 0 / 'unknown'.
complexity_counts = {}
intent_counts = {}
for record in dataset:
    level = record.get('complexity', 0)
    purpose = record.get('intent', 'unknown')
    complexity_counts.setdefault(level, 0)
    complexity_counts[level] += 1
    intent_counts.setdefault(purpose, 0)
    intent_counts[purpose] += 1

print(f"  Complexity distribution: {complexity_counts}")
print(f"  Intent distribution: {intent_counts}")

# Preview the first three examples (Cypher text cut to 80 characters).
print("\n💡 Sample Training Examples:")
preview = dataset[:3]
for position, example in enumerate(preview, start=1):
    cypher_preview = example.get('cypher', 'N/A')[:80]
    print(f"\n  Example {position}:")
    print(f"    Question: {example.get('question', 'N/A')}")
    print(f"    Cypher: {cypher_preview}...")
    print(f"    Complexity: {example.get('complexity', 'N/A')}")
    print(f"    Intent: {example.get('intent', 'N/A')}")

## Step 4: Model Trainer Module

The `ModelTrainer` fine-tunes language models on training datasets. It can work with any dataset file, regardless of how it was created.

**Note**: Model training is computationally expensive and time-consuming. The cell below runs the actual fine-tuning (the resulting model is used in Step 5), so expect it to take a while.

In [None]:
# Set up the ModelTrainer
trainer = ModelTrainer()

# Print whatever the module reports about itself.
print("🔧 ModelTrainer Info:")
for info_key, info_value in trainer.get_info().items():
    print(f"  {info_key}: {info_value}")

# Collect the training settings in one place so they can be displayed
# before the run starts.
print("\n⚙️ Training Configuration:")
dataset_path = work_dir / "company_dataset.jsonl"
model_output_dir = work_dir / "company_model"
training_config = {
    "dataset_path": dataset_path,
    "base_model": "meta-llama/Llama-3.2-1B-Instruct",
    "epochs": 15,
    "learning_rate": 5e-4,
    "batch_size": 4,
    "lora_rank": 8,
    "output_dir": str(model_output_dir),
}

for setting, setting_value in training_config.items():
    print(f"  {setting}: {setting_value}")

print("\n🎯 Model Training:")

# Fine-tune the base model on the generated dataset file.
model = trainer.train_from_file(
    dataset_path=dataset_path,
    model_name=training_config["base_model"],
    output_dir=model_output_dir,
    epochs=training_config["epochs"],
    learning_rate=training_config["learning_rate"],
    batch_size=training_config["batch_size"],
    lora_rank=training_config["lora_rank"],
)



## Step 5: Query Executor Module

The `QueryExecutor` executes natural language queries using fine-tuned models and graph databases. It can work with any trained model and database combination.

**This step uses the actual trained model from Step 4 and the database from Step 1, demonstrating how assets can be reused across modules.**

In [None]:
# Set up the QueryExecutor
executor = QueryExecutor()

# Assets produced by earlier steps of the demo.
model_path = work_dir / "company_model"  # From Step 4 ModelTrainer
db_path = work_dir / "company_db"        # From Step 1 GraphBuilder

# Print whatever the module reports about itself.
print("🔧 QueryExecutor Info:")
for info_key, info_value in executor.get_info().items():
    print(f"  {info_key}: {info_value}")

print("\n🎯 Using Assets from Previous Steps:")
print(f"  Model: {model_path} (from Step 4)")
print(f"  Database: {db_path} (from Step 1)")

# Natural-language questions to run against the graph.
test_questions = [
    "Who are all the employees?",
    "Which employees work in Engineering?",
    "What projects has Alice Johnson worked on?",
    "What is the average salary by department?",
    "Which departments have the highest budgets?",
]

print("\n❓ Test Questions:")
for number, question in enumerate(test_questions, start=1):
    print(f"  {number}. {question}")

print("\n🤖 Query Execution with Trained Model:")
for number, question in enumerate(test_questions, start=1):
    print(f"\n  Query {number}: {question}")

    result = executor.query_with_model(
        question=question,
        model_path=Path(model_path),
        db_path=Path(db_path),
        return_cypher=True,
        format_results=True,
    )

    # Failed query: report the error and move on to the next question.
    if not result['success']:
        print(f"    ❌ Error: {result['error']}")
        continue

    print(f"    ✅ Generated Cypher: {result['cypher']}")
    print(f"    📊 Results: {result['count']} rows")

    rows = result['results'] if 'results' in result else None
    if not rows:
        continue

    # Show at most two rows, then summarise how many were left out.
    for row_number, row in enumerate(rows[:2], start=1):
        print(f"      Row {row_number}: {row}")
    if len(rows) > 2:
        print(f"      ... and {len(rows) - 2} more rows")