In [1]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
from datasets import load_dataset

from rag.pipeline import RAGPipeline
from rag.embeddings import create_embedder
from rag.retrieval import create_reranker
from rag.generation import create_llm
from rag.storage import PgvectorVectorStore, PostgresDocumentStore
from rag.config import settings

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


16:48:22 faiss.loader INFO   Loading faiss with AVX512 support.
16:48:22 faiss.loader INFO   Successfully loaded faiss with AVX512 support.


True

In [2]:
# Load the two configurations of rag-mini-bioasq used in this notebook:
# the "passages" split of the text corpus and the "test" split of the
# question / answer / relevant-passage-id records.
corpus_ds = load_dataset(
    "rag-datasets/rag-mini-bioasq",
    "text-corpus",
)["passages"]
queries_ds = load_dataset(
    "rag-datasets/rag-mini-bioasq",
    "question-answer-passages",
)["test"]

# Display the query split (features and row count) as the cell output.
queries_ds

Dataset({
    features: ['question', 'answer', 'relevant_passage_ids', 'id'],
    num_rows: 4719
})

In [3]:
# Storage backends, each configured from the shared project settings.
doc_store = PostgresDocumentStore(settings)
vec_store = PgvectorVectorStore(settings)

# Model-side components built by the project's factory helpers.
embedder = create_embedder(settings)
reranker = create_reranker(settings)
llm = create_llm(settings)

# Assemble the pipeline. The arguments are passed positionally, so their order
# here has to match the RAGPipeline constructor.
rag_pipeline = RAGPipeline(doc_store, vec_store, embedder, reranker, llm, settings)

[2025-10-25 16:48:25] [rag.storage.document_stores.postgres] [INFO] PostgresDocumentStore initialized
[2025-10-25 16:48:25] [rag.storage.document_stores.postgres] [INFO] PostgresDocumentStore initialized
16:48:25 rag.storage.document_stores.postgres INFO   PostgresDocumentStore initialized
[2025-10-25 16:48:25] [rag.storage.vector_stores.pgvector] [INFO] PgvectorVectorStore initialized (cosine distance)
[2025-10-25 16:48:25] [rag.storage.vector_stores.pgvector] [INFO] PgvectorVectorStore initialized (cosine distance)
16:48:25 rag.storage.vector_stores.pgvector INFO   PgvectorVectorStore initialized (cosine distance)
16:48:25 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 288.26it/s]


16:48:31 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
16:48:33 redisvl.index.index INFO   Index already exists, not overwriting.


In [4]:
from dotenv import load_dotenv
import os
from ragas import evaluate
from ragas.metrics import (
  context_precision,     # Are retrieved docs relevant? (needs ground truth)
  context_recall,        # Did we retrieve all relevant docs? (needs ground truth)
  faithfulness,          # Is answer grounded in context? (no hallucinations)
  answer_relevancy,      # Does answer address the question?
  answer_correctness,    # How correct vs ground truth? (needs ground truth)
)
from datasets import Dataset
import ast
from tqdm import tqdm

# The LLM-as-judge metrics need an OpenAI API key: load the .env file and stop
# here if the key is still missing from the environment.
load_dotenv()
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not found in .env"

print("✓ RAGAS setup complete")

✓ RAGAS setup complete


In [5]:
# Parse relevant_passage_ids from string to list
# (handled by parse_relevant_ids() in the evaluation cell below; this cell is a placeholder)


In [12]:
import numpy as np

# Evaluation sample selection.
# N_SAMPLES = None evaluates every query in queries_ds (one RAG pipeline call
# per query, so a full run is slow). Set it to a small integer to evaluate a
# seeded random subset instead, e.g. for a quick smoke test.
SEED = 42
N_SAMPLES = None

np.random.seed(SEED)
if N_SAMPLES is None:
  sample_ids = list(range(len(queries_ds)))
else:
  # Sample without replacement; the size is capped at the dataset length so an
  # oversized N_SAMPLES cannot make np.random.choice raise.
  sample_ids = np.random.choice(
    len(queries_ds),
    size=min(N_SAMPLES, len(queries_ds)),
    replace=False,
  ).tolist()

# One record per evaluated query, appended by the evaluation loop below.
eval_data = []

def parse_relevant_ids(ids_str):
  """Return the gold passage ids of a query record.

  Args:
    ids_str: Either the string form of a Python literal (e.g. "[1, 2, 3]"),
      which is parsed with ast.literal_eval, or an already-parsed list/tuple
      of ids, which is copied into a list without re-parsing.

  Returns:
    For list/tuple input, a new list with the same elements. For string
    input, whatever ast.literal_eval produces (a list for a list literal).

  Raises:
    ValueError, SyntaxError: If a string input is not a valid Python literal.
  """
  # Some dataset revisions may already deliver the ids as a sequence;
  # ast.literal_eval would reject that with "malformed node or string".
  if isinstance(ids_str, (list, tuple)):
    return list(ids_str)
  return ast.literal_eval(ids_str)

# Run every selected query through the RAG pipeline and collect, per query,
# the fields needed later for the RAGAS dataset and the IR metrics.
for i in tqdm(sample_ids):
  item = queries_ds[i]
  question = item['question']
  ground_truth_answer = item['answer']
  relevant_doc_ids = parse_relevant_ids(item['relevant_passage_ids'])

  # Run RAG pipeline
  answer, chunks = rag_pipeline.query(question)

  # Deduplicate doc IDs while preserving rank order (first occurrence kept).
  # This is critical for accurate IR metrics - duplicates would inflate precision.
  # The document id is taken as the part of the chunk id before the first '#'.
  doc_ids = list(dict.fromkeys(chunk.id.split('#')[0] for chunk in chunks))

  # Extract data
  eval_data.append({
      'question': question,
      'answer': answer,  # Generated answer
      'contexts': [chunk.text for chunk in chunks],  # Retrieved texts
      'ground_truth': ground_truth_answer,  # Gold answer
      'retrieved_doc_ids': doc_ids,  # For analysis (deduplicated, rank-ordered)
      'relevant_doc_ids': relevant_doc_ids,  # Gold doc IDs
  })

print(f"\n✓ Processed {len(sample_ids)} queries")

# Preview the first record; skipped when nothing was evaluated so an empty
# sample does not end the cell with an IndexError.
if eval_data:
  first_record = eval_data[0]
  print("\nSample output:")
  print(f"Question: {first_record['question'][:100]}...")
  print(f"Answer: {first_record['answer'][:100]}...")
  print(f"Contexts: {len(first_record['contexts'])} documents retrieved")

100%|██████████| 4719/4719 [4:01:31<00:00,  3.07s/it]  


✓ Processed 4719 queries

Sample output:
Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?...
Answer: Hirschsprung disease (HSCR) can be both a Mendelian and a multifactorial disorder. 

- The non-Mende...
Contexts: 10 documents retrieved





In [13]:
# RAGAS takes a HuggingFace Dataset with specific columns, so pivot the
# per-query records in eval_data into one list per required column.
ragas_data = {
    column: [record[column] for record in eval_data]
    for column in ('question', 'answer', 'contexts', 'ground_truth')
}

ragas_dataset = Dataset.from_dict(ragas_data)
print("RAGAS dataset created:", ragas_dataset, sep="\n")

RAGAS dataset created:
Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 4719
})


In [14]:
from rag.utils import evaluate_retrieval

# Score retrieval at the configured cut-off and round every metric to three
# decimals for display.
ir_metrics = {
    metric_name: round(score, 3)
    for metric_name, score in evaluate_retrieval(eval_data, k=settings.top_k).items()
}
ir_metrics

{'P@10': 0.275,
 'R@10': 0.342,
 'MRR@10': 0.706,
 'nDCG@10': 0.475,
 'Hit@10': 0.782}

In [9]:
import logging

# Quieten chatty third-party loggers so that only their errors (and the
# progress bar) show up in the cell output.
for _noisy_name in ("httpx", "openai", "ragas"):
    logging.getLogger(_noisy_name).setLevel(logging.ERROR)

# The project's own "rag" logger stays at INFO so its messages remain visible.
logging.getLogger("rag").setLevel(logging.INFO)

print("✓ Clean logging configured")

✓ Clean logging configured


In [10]:
from langchain.cache import RedisCache
from langchain.globals import set_llm_cache
import redis

# Connect to the Redis instance named in the project settings and install it
# as LangChain's global LLM cache, with entries expiring after settings.redis_ttl.
redis_client = redis.from_url(
    settings.redis_url,
    decode_responses=False,
)
set_llm_cache(
    RedisCache(redis_client, ttl=settings.redis_ttl)
)

In [11]:
from langchain_openai import ChatOpenAI
from ragas.run_config import RunConfig
from ragas import evaluate

# Judge model used by RAGAS to score the generated answers. It gets its own
# name so it does not rebind `llm`, which earlier in the notebook holds the
# generation model passed to RAGPipeline.
JUDGE_MODEL = "gpt-4o-mini"  # Cheaper, faster model
judge_llm = ChatOpenAI(
    model=JUDGE_MODEL,
    temperature=0,
)

# Configure parallel execution
run_config = RunConfig(
    max_workers=8,           # Number of parallel workers
    timeout=120,             # Timeout per evaluation (seconds)
    max_retries=3,           # Retry failed API calls
    max_wait=60,             # Max wait between retries
)

# Answer-quality metrics only. context_precision and context_recall are
# imported in the setup cell but are not part of this run; add them to the
# list to score retrieval with the judge as well.
result = evaluate(
    dataset=ragas_dataset,
    metrics=[
      faithfulness,         # Is answer grounded in context?
      answer_relevancy,     # Does answer address the question?
      answer_correctness,   # How correct is the answer?
    ],
    llm=judge_llm,
    run_config=run_config,
)

print("✓ Evaluation complete!\n")
print(result)

Evaluating: 100%|██████████| 300/300 [13:57<00:00,  2.79s/it]


✓ Evaluation complete!

{'faithfulness': 0.8169, 'answer_relevancy': 0.7953, 'answer_correctness': 0.6064}
