# RAGvix Scratch Notebook

Exploration and testing notebook for RAGvix components.

In [None]:
# Basic imports
import sys
from pathlib import Path

# Make the project importable from this notebook.
# The notebook is assumed to live one level below the project root, so the
# root is the parent of the current working directory.
project_root = Path().resolve().parent

# The cells below import `src.ragvix...`, which requires the project root
# itself on sys.path; `<root>/src` is kept as well so a plain `ragvix...`
# import keeps working. Entries are inserted in this order so that
# `<root>/src` ends up first, and any stale copy is removed first so that
# re-running the cell does not pile up duplicates.
for import_dir in (project_root, project_root / "src"):
    import_dir_str = str(import_dir)
    if import_dir_str in sys.path:
        sys.path.remove(import_dir_str)
    sys.path.insert(0, import_dir_str)

print(f"Project root: {project_root}")

In [None]:
# RAGvix imports
from src.ragvix.config import settings
from src.ragvix.utils.logging import get_logger
from src.ragvix.utils.io import read_jsonl, write_jsonl

# Obtain a logger from the project's `get_logger` helper, keyed by this
# module's `__name__`, and emit a message marking that setup has completed.
logger = get_logger(__name__)
logger.info("RAGvix notebook initialized")

## Data Exploration

In [None]:
# Check data directories
# Print one line per configured data directory, each pairing a display
# label with the corresponding path attribute on `settings`.
print("Data directories:")
for name, path in [
    ("Raw", settings.raw_dir),
    ("Interim", settings.interim_dir),
    ("Processed", settings.processed_dir),
    ("Index", settings.index_dir),
]:
    # Status marker: check mark if the path exists on disk, cross otherwise.
    # (The previous literals were mis-decoded UTF-8 and rendered as garbage.)
    exists = "✅" if path.exists() else "❌"
    print(f"  {exists} {name}: {path}")

In [None]:
# Load the ingested paper metadata, when present, and preview one record.
metadata_file = settings.raw_dir / "metadata.jsonl"

if not metadata_file.exists():
    print("No metadata file found. Run ingest first.")
else:
    papers = read_jsonl(metadata_file)
    print(f"Loaded {len(papers)} papers")

    # Preview the first record; string fields longer than 100 characters
    # are cut to their first 100 characters and suffixed with "...".
    if papers:
        sample = papers[0]
        print("\nSample paper:")
        for field, content in sample.items():
            is_long_text = isinstance(content, str) and len(content) > 100
            shown = f"{content[:100]}..." if is_long_text else content
            print(f"  {field}: {shown}")

## Retrieval Testing

In [None]:
# Test retrieval if index exists
from src.ragvix.retriever.retriever import Retriever

# The presence of the index's config file is used as the signal that an
# index has been built.
index_config = settings.index_dir / "config.json"

if index_config.exists():
    print("Index found! Testing retrieval...")

    retriever = Retriever()

    # Test queries
    test_queries = [
        "attention mechanisms",
        "transformer models",
        "language modeling",
    ]

    for query in test_queries:
        # (The magnifier glyph was previously mis-decoded UTF-8.)
        print(f"\n🔍 Query: '{query}'")
        try:
            # Fetch the top 3 hits; each result is expected to carry a
            # "formatted" mapping with "title" and "score" entries.
            results = retriever.search(query, k=3)
            for i, result in enumerate(results, 1):
                fmt = result["formatted"]
                print(f"  {i}. {fmt['title']} (score: {fmt['score']})")
        except Exception as e:
            # Deliberately broad: in this scratch notebook a failing query
            # is reported and the remaining queries still run.
            print(f"  Error: {e}")
else:
    print("No index found. Build index first.")

## RAG Pipeline Testing

In [None]:
# Test RAG pipeline (stub)
from src.ragvix.rag.pipeline import answer

# Run one question through the RAG `answer` entry point and print the
# answer together with the sources it returns. Relies on `index_config`
# having been defined by the retrieval cell.
if not index_config.exists():
    print("No index found. Build index first.")
else:
    print("Testing RAG pipeline...")

    query = "What are attention mechanisms in neural networks?"
    response = answer(query, k=3)

    print(f"\nQuery: {response['query']}")
    print(f"Answer: {response['answer']}")

    # Each source is expected to carry a "metadata" mapping with "title"
    # and "arxiv_id" entries.
    sources = response["sources"]
    print(f"\nSources ({len(sources)}):")
    for rank, source in enumerate(sources, start=1):
        meta = source["metadata"]
        print(f"  {rank}. {meta['title']} ({meta['arxiv_id']})")

## Development Notes

- Week-1 scope: Retrieval-only RAG
- Next: Add LLM generation step
- TODO: Better chunking strategies
- TODO: Evaluation metrics