## Step 1: Clone the Repository

In [None]:
!git clone https://github.com/ab-2109/HMRAG.git
%cd HMRAG

## Step 1b: Patch Retrieval Files (Fix embedding dimension)
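The GitHub repo hardcodes `embedding_dim=1024`, but `nomic-embed-text` produces 768-dimensional embeddings. This cell patches the affected files in place; it also updates the deprecated Ollama import and removes any leftover `lightrag_workdir`.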

In [None]:
# =============================================================================
# CRITICAL FIX: Patch retrieval files to use correct embedding dimension (768)
# nomic-embed-text outputs 768-dim vectors, NOT 1024
# =============================================================================
import os
import shutil


def replace_in_file(filepath, old, new):
    """Replace every occurrence of ``old`` with ``new`` inside ``filepath``.

    Args:
        filepath: Path of the text file to patch.
        old: Substring to look for.
        new: Replacement substring.

    Returns:
        'patched' if ``old`` was found and the file was rewritten,
        'current' if ``old`` is absent but ``new`` is already present,
        'unknown' if neither substring occurs in the file,
        'missing' if the file does not exist.
    """
    if not os.path.exists(filepath):
        return 'missing'
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    if old in content:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content.replace(old, new))
        return 'patched'
    return 'current' if new in content else 'unknown'


files_to_patch = [
    'retrieval/vector_retrieval.py',
    'retrieval/graph_retrieval.py',
]

# Files whose embedding dimension could not be confirmed as 768; used to
# avoid reporting success when a patch did not actually happen.
unpatched = []

for filepath in files_to_patch:
    status = replace_in_file(filepath, 'embedding_dim=1024', 'embedding_dim=768')
    if status == 'patched':
        print(f"✓ Patched {filepath}: embedding_dim 1024 → 768")
    elif status == 'current':
        print(f"✓ {filepath} already correct (embedding_dim=768)")
    elif status == 'unknown':
        print(f"⚠️ {filepath}: embedding_dim not found, check manually")
        unpatched.append(filepath)
    else:
        print(f"❌ {filepath} not found!")
        unpatched.append(filepath)

# Also fix the deprecated Ollama import in web_retrieval.py and summary_agent.py.
# These are best-effort: a missing file or an already-updated import is not
# reported, only an actual rewrite is.
web_retrieval_path = 'retrieval/web_retrieval.py'
summary_path = 'agents/summary_agent.py'
for filepath in (web_retrieval_path, summary_path):
    status = replace_in_file(
        filepath,
        'from langchain_community.llms.ollama import Ollama',
        'from langchain_ollama import OllamaLLM as Ollama',
    )
    if status == 'patched':
        print(f"✓ Patched {filepath}: Updated Ollama import")

# Clean up any leftover working directory from previous runs
shutil.rmtree('./lightrag_workdir', ignore_errors=True)
print("\n✓ Cleaned lightrag_workdir")

if unpatched:
    print(f"⚠️ Patching incomplete, check these files manually: {', '.join(unpatched)}")
else:
    print("✓ All patches applied successfully!")

## Step 2: Install Dependencies

In [None]:
# Install required packages
# Handle dependency conflicts by installing core packages first
import os

print("Installing dependencies (this may take a few minutes)...")

# Install numpy first to avoid conflicts
!pip install -q numpy==1.26.4

# Install core dependencies with compatible versions
!pip install -q --no-deps lightrag-hku
!pip install -q langchain langchain-community langchain-core langchain-ollama
!pip install -q transformers torch tqdm ollama
!pip install -q google-search-results
!pip install -q networkx aiohttp tenacity tiktoken

# Install optional vision dependencies if needed
# Uncomment if you need vision model support (Qwen2.5-VL)
# !pip install -q qwen_vl_utils opencv-python

print("✓ Dependencies installed successfully!")
print("Note: Some dependency warnings are normal and won't affect functionality.")

## Step 3: Install and Setup Ollama (Required for LLM)
Note: Ollama is required for this system. On Colab, follow these steps to install it.

In [None]:
print("Installing zstd (required for Ollama)...")
!sudo apt-get update -qq
!sudo apt-get install -y -qq zstd

print("Installing Ollama...")
!curl -fsSL https://ollama.com/install.sh | sh

# Verify installation
import subprocess
import time
import os

# Check if ollama is installed# Install Ollama on Colab

try:
    result = subprocess.run(['which', 'ollama'], capture_output=True, text=True)
    ollama_path = result.stdout.strip()
    
    if not ollama_path:
        print("⚠️ Ollama not found in PATH. Trying common locations...")
        if os.path.exists('/usr/local/bin/ollama'):
            ollama_path = '/usr/local/bin/ollama'
        elif os.path.exists('/usr/bin/ollama'):
            ollama_path = '/usr/bin/ollama'
        else:
            raise FileNotFoundError("Ollama binary not found")
    
    print(f"✓ Ollama found at: {ollama_path}")
    
    # Start Ollama server in background
    print("Starting Ollama server...")
    ollama_process = subprocess.Popen(
        [ollama_path, 'serve'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    time.sleep(10)  # Wait for server to start
    
    # Pull required models
    print("Pulling qwen2.5:7b model (this may take 5-10 minutes)...")
    !{ollama_path} pull qwen2.5:7b
    
    print("Pulling nomic-embed-text model...")
    !{ollama_path} pull nomic-embed-text
    
    print("✓ Ollama setup complete!")
    
except Exception as e:
    print(f"❌ Error setting up Ollama: {e}")
    print("You may need to restart the runtime and try again.")

## Step 4: Setup API Keys

In [None]:
# Load the SerpAPI key into SERPAPI_API_KEY.
# Lookup order: Colab secret manager -> SERPAPI_API_KEY environment variable
# -> the fallback value below.
import os

# Store secrets in Colab's secret manager (left sidebar -> Key icon).
# Or set the key directly here (not recommended for production).
SERPAPI_FALLBACK_KEY = "your-serpapi-key-here"

SERPAPI_API_KEY = None
try:
    # Imported inside the try so the cell also runs outside Colab.
    from google.colab import userdata
    SERPAPI_API_KEY = userdata.get('SERPAPI_API_KEY')
except Exception as e:
    # Missing secret, access not granted, or not running on Colab.
    print(f"⚠️ Could not read SERPAPI_API_KEY from Colab secrets ({type(e).__name__}: {e})")

if not SERPAPI_API_KEY:
    SERPAPI_API_KEY = os.environ.get('SERPAPI_API_KEY') or SERPAPI_FALLBACK_KEY

if SERPAPI_API_KEY == "your-serpapi-key-here":
    # Do not report success while the placeholder is still in use.
    print("⚠️ SERPAPI_API_KEY is still the placeholder value; set a real key before running inference.")
else:
    # Also export the key so shell commands can read it from the environment.
    os.environ['SERPAPI_API_KEY'] = SERPAPI_API_KEY
    print("API keys configured!")

print("Note: OpenAI API key is not required. System uses Ollama for LLM inference.")

In [None]:
# Verify Ollama is running (run this before inference if you get connection errors)
import subprocess
import time

try:
    # Check if Ollama server is responding
    result = subprocess.run(['curl', '-s', 'http://localhost:11434/api/tags'], 
                          capture_output=True, text=True, timeout=5)
    
    if result.returncode == 0:
        print("✓ Ollama server is running!")
    else:
        raise Exception("Ollama not responding")
        
except Exception as e:
    print("⚠️ Ollama server not running. Restarting...")
    
    # Find ollama binary
    result = subprocess.run(['which', 'ollama'], capture_output=True, text=True)
    ollama_path = result.stdout.strip()
    
    if not ollama_path:
        if os.path.exists('/usr/local/bin/ollama'):
            ollama_path = '/usr/local/bin/ollama'
        elif os.path.exists('/usr/bin/ollama'):
            ollama_path = '/usr/bin/ollama'
    
    # Kill any existing ollama processes
    subprocess.run(['pkill', 'ollama'], stderr=subprocess.DEVNULL)
    time.sleep(2)
    
    # Start Ollama server in background
    ollama_process = subprocess.Popen(
        [ollama_path, 'serve'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    
    print("Waiting for Ollama to start...")
    time.sleep(10)
    
    # Verify it's running
    result = subprocess.run(['curl', '-s', 'http://localhost:11434/api/tags'], 
                          capture_output=True, text=True, timeout=5)
    
    if result.returncode == 0:
        print("✓ Ollama server restarted successfully!")
    else:
        print("❌ Failed to start Ollama server. You may need to restart the runtime.")
        
# Show available models
print("\nAvailable models:")
!ollama list

## Step 5: Download ScienceQA Dataset

In [None]:
# Download the ScienceQA dataset into ./dataset/ScienceQA and report which of
# the expected data files are present.
#
# The cell navigates with relative `%cd` calls (HMRAG -> dataset -> ScienceQA
# and back), so the order of the statements below matters.
import os

# Make sure we're in the HMRAG directory
# (main.py is used as the marker; the fallback path is the Colab default)
if not os.path.exists('main.py'):
    %cd /content/HMRAG

print("Current directory:", os.getcwd())

# Create dataset directory if it doesn't exist
if not os.path.exists('dataset'):
    os.makedirs('dataset')

%cd dataset

# Clone the ScienceQA repository
# NOTE: the download script only runs right after a fresh clone. If the
# ScienceQA directory already exists, nothing is downloaded again.
if not os.path.exists('ScienceQA'):
    print("Cloning ScienceQA repository...")
    !git clone https://github.com/lupantech/ScienceQA
    
    # Re-check, because a failed `!git clone` does not raise an exception
    if os.path.exists('ScienceQA'):
        print("✓ Repository cloned successfully")
        %cd ScienceQA
        
        # Download the actual data
        if os.path.exists('tools/download.sh'):
            print("Downloading dataset files (this may take several minutes)...")
            !bash tools/download.sh
        else:
            print("Warning: download.sh not found, trying alternative...")
            # Only creates an empty data directory; no files are fetched here
            !mkdir -p data
            # You may need to manually download the dataset
            
        # Back from ScienceQA to the dataset directory
        %cd ..
    else:
        print("❌ Failed to clone repository")
else:
    print("✓ ScienceQA directory already exists")

# Go back to HMRAG directory
%cd ..

# Verify the dataset structure and files
print("\n=== Checking dataset structure ===")
print("HMRAG directory contents:")
!ls -la

print("\nDataset directory contents:")
if os.path.exists('dataset/ScienceQA'):
    !ls -la dataset/ScienceQA/
    
    print("\nData directory contents:")
    if os.path.exists('dataset/ScienceQA/data'):
        !ls -la dataset/ScienceQA/data/
        
        # Check for required files
        # (only reports presence; a missing file does not stop the cell)
        required_files = ['problems.json', 'pid_splits.json']
        for file in required_files:
            file_path = f'dataset/ScienceQA/data/{file}'
            if os.path.exists(file_path):
                print(f"✓ Found: {file}")
            else:
                print(f"❌ Missing: {file}")
    else:
        print("❌ data directory not found!")
else:
    print("❌ ScienceQA directory not found!")

print("\n=== Setup complete. Check for any missing files above ===.")

## Step 6: Create Required Directories

In [None]:
# Create output and working directories
!mkdir -p outputs
!mkdir -p lightrag_workdir

# Check dataset structure
!ls -la dataset/

## Step 7: Run Inference (Small Test)
Start with a small test run (5 examples)

In [None]:
# Run on a small subset first to test

# CRITICAL: Clean up previous LightRAG working directory to avoid dimension mismatch errors
# If previous runs (failed or successful) used different dimensions, the database will be corrupted.
!rm -rf ./lightrag_workdir

!python3 main.py \
    --data_root ./dataset/ScienceQA/data \
    --image_root ./dataset/ScienceQA/images \
    --output_root ./outputs \
    --caption_file ./dataset/ScienceQA/data/captions.json \
    --working_dir ./lightrag_workdir \
    --serpapi_api_key "$SERPAPI_API_KEY" \
    --test_split test \
    --test_number 5 \
    --shot_number 0 \
    --label test_run \
    --save_every 5

## Step 8: Run Full Inference
After testing, run on the full dataset

In [None]:
# Clean up previous LightRAG working directory to avoid dimension mismatch errors
# (Old runs might have created a DB with default 1024 dims, which conflicts with nomic-embed-text's 768 dims)
!rm -rf ./lightrag_workdir

# Full inference run
!python3 main.py \
    --data_root ./dataset/ScienceQA/data \
    --image_root ./dataset/ScienceQA/images \
    --output_root ./outputs \
    --caption_file ./dataset/ScienceQA/data/captions.json \
    --working_dir ./lightrag_workdir \
    --serpapi_api_key "$SERPAPI_API_KEY" \
    --test_split test \
    --shot_number 2 \
    --label full_run \
    --save_every 50 \
    --use_caption

## Step 9: View Results

In [None]:
# View output files
!ls -lh outputs/

# Load and display results
import json

with open('outputs/test_run_test.json', 'r') as f:
    results = json.load(f)

print(f"Total results: {len(results)}")
print("\nSample results:")
for qid, answer in list(results.items())[:5]:
    print(f"Question ID: {qid}, Answer: {answer}")

## Alternative: Use OpenAI Models Directly
If the Ollama setup is difficult, modify the agents to use the OpenAI API directly.

## Download Results to Local Machine

In [None]:
# Download results
from google.colab import files

# Download the results file
files.download('outputs/test_run_test.json')

# Or zip and download all outputs
!zip -r outputs.zip outputs/
files.download('outputs.zip')