# RAG Experiment

This notebook runs two experiments:
1. Query the LLM directly for a set of questions and store the answers.
2. Load PDF documents into Qdrant and re-run the same questions using retrieval-augmented generation (RAG).


In [9]:
import json
from pathlib import Path

from qdrant_utils import (
    answer_with_context,
    get_qdrant_client,
    load_pdf_and_chunk,
    embed_chunks,
    store_embeddings_in_qdrant,
    retrieve_similar_chunks,
)

from experiment_utils import (
    save_results_to_csv,
    load_processed_ids,
    append_result_to_csv,
)

# --- Notebook configuration -------------------------------------------------
# JSON file holding the evaluation questions (read when the questions are loaded).
DATA_PATH = 'question.json'
# Directory that is scanned for '*.pdf' files when documents are loaded into Qdrant.
PDF_DIR = Path('papers')
# CSV that receives the Experiment 1 rows (LLM answers produced without context).
GROUND_TRUTH_CSV ='results_ground_truth.csv'
# CSV that receives the Experiment 2 (RAG) rows.
RAG_CSV = 'results_rag.csv'
# Name of the Qdrant collection the PDF chunks are stored in.
COLLECTION_NAME = 'julians_kleiner_mann'


## Load evaluation questions
The JSON file contains questions grouped by paper and additional metadata questions.
We flatten them into a single list with their gold answers.

In [10]:
# Read the evaluation dataset; everything of interest lives under 'evaluation_dataset'.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)['evaluation_dataset']

# Content questions: one record per question listed under each paper.
questions = [
    {
        'question_id': item['question_id'],
        'question': item['question'],
        'answer': item['answer'],
    }
    for paper in data['papers']
    for item in paper.get('questions', [])
]

# Metadata questions: the same question text is asked once per paper, so each
# (question, paper) pair becomes its own record with a paper-prefixed id.
for meta in data.get('metadata_questions', {}).get('questions', []):
    base_id, text = meta['question_id'], meta['question']
    questions.extend(
        {
            'question_id': f"P{entry['paper_id']}_{base_id}",
            'question': text,
            'answer': entry['answer'],
        }
        for entry in meta.get('papers', [])
    )

# Display the number of flattened questions as the cell output.
len(questions)


35

## Experiment 1: direct LLM answers
Each question is sent to the LLM without any additional context. The answers are saved to *results_ground_truth.csv*.

In [11]:
import os
import csv
from typing import List, Dict
from collections import Counter

# === ROUGE-N metric functions ===
def get_ngrams(text: str, n: int) -> Counter:
    """Count the n-grams of ``text``.

    The text is tokenized with ``str.split()`` (whitespace only; no case
    folding or punctuation handling). Each n-gram is stored as a tuple of
    ``n`` consecutive tokens.

    Args:
        text: Input string.
        n: Length of each n-gram.

    Returns:
        Counter mapping n-gram tuples to their number of occurrences; empty
        when the text has fewer than ``n`` tokens.
    """
    words = text.split()
    grams = Counter()
    # Slide a window of width n over the token list.
    for start in range(len(words) - n + 1):
        grams[tuple(words[start:start + n])] += 1
    return grams


def precision_n(pred: str, gold: str, n: int) -> float:
    """N-gram precision of ``pred`` against ``gold``.

    Precision is the clipped n-gram overlap (multiset intersection of both
    n-gram counters) divided by the number of n-grams in the prediction.

    Args:
        pred: Predicted (generated) text.
        gold: Reference text.
        n: N-gram length.

    Returns:
        Precision in [0, 1]; 0.0 when the prediction contains no n-grams.
    """
    predicted = get_ngrams(pred, n)
    reference = get_ngrams(gold, n)
    denominator = sum(predicted.values())
    if denominator <= 0:
        return 0.0
    # Counter & Counter keeps the minimum count per n-gram (clipped overlap).
    return sum((predicted & reference).values()) / denominator


def recall_n(pred: str, gold: str, n: int) -> float:
    """N-gram recall of ``pred`` against ``gold``.

    Recall is the clipped n-gram overlap (multiset intersection of both
    n-gram counters) divided by the number of n-grams in the reference.

    Args:
        pred: Predicted (generated) text.
        gold: Reference text.
        n: N-gram length.

    Returns:
        Recall in [0, 1]; 0.0 when the reference contains no n-grams.
    """
    predicted = get_ngrams(pred, n)
    reference = get_ngrams(gold, n)
    denominator = sum(reference.values())
    if denominator <= 0:
        return 0.0
    # Counter & Counter keeps the minimum count per n-gram (clipped overlap).
    return sum((predicted & reference).values()) / denominator


def f1_n(prec: float, rec: float) -> float:
    """Harmonic mean (F1) of a precision/recall pair.

    Args:
        prec: Precision value.
        rec: Recall value.

    Returns:
        ``2 * prec * rec / (prec + rec)``, or 0.0 when the sum of both
        inputs is not positive (avoids division by zero).
    """
    total = prec + rec
    if total > 0:
        return 2 * prec * rec / total
    return 0.0

# === CSV append and deduplication ===
# Output file for Experiment 1. This re-assigns the same value that the
# configuration cell already gives GROUND_TRUTH_CSV (redundant re-assignment).
GROUND_TRUTH_CSV = "results_ground_truth.csv"


def load_processed_ids(filepath: str) -> set:
    """Collect the ``question_id`` values already stored in a results CSV.

    Args:
        filepath: Path of the CSV file; its first row is read as the header.

    Returns:
        Set of ``question_id`` values found in the file, or an empty set
        when the file does not exist.

    Raises:
        KeyError: If a data row has no ``question_id`` column.
    """
    seen = set()
    if os.path.exists(filepath):
        with open(filepath, newline="", encoding="utf-8") as handle:
            for record in csv.DictReader(handle):
                seen.add(record["question_id"])
    return seen


def append_result_to_csv(row: Dict[str, str], filepath: str) -> None:
    """Append one result row to a CSV file, writing the header when needed.

    The parent directory is created if it is missing. The header row is
    written when the file does not exist yet *or* exists but is empty (for
    example after an interrupted run); otherwise an empty file would end up
    without a header and its first data row would later be misread as one.

    Args:
        row: Result values keyed by column name. Keys outside the fixed
            column list are ignored; missing columns are written as "".
        filepath: Destination CSV path.
    """
    # Fixed column layout, including the ROUGE metric columns.
    fieldnames = [
        "question_id", "question_string", "answer_llm", "answer_gold",
        "precision-1", "recall-1", "ROUGE-1",
        "precision-2", "recall-2", "ROUGE-2"
    ]

    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Decide before opening: append mode creates a missing file immediately.
    needs_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

    with open(filepath, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        # Restrict the row to the expected columns, defaulting absent ones to "".
        writer.writerow({key: row.get(key, "") for key in fieldnames})

# === Main loop ===
# Resume support: ids already present in the CSV are skipped, so the cell can
# be re-run after an interruption without querying the LLM again for them.
processed_ids = load_processed_ids(GROUND_TRUTH_CSV)

for q in questions:
    qid = q["question_id"]
    if qid not in processed_ids:
        # Experiment 1: the question is sent with an empty context list.
        llm_answer = answer_with_context(q["question"], [])
        gold = q["answer"]

        # Unigram and bigram overlap against the gold answer.
        unigram_precision = precision_n(llm_answer, gold, 1)
        unigram_recall = recall_n(llm_answer, gold, 1)
        bigram_precision = precision_n(llm_answer, gold, 2)
        bigram_recall = recall_n(llm_answer, gold, 2)

        append_result_to_csv(
            {
                "question_id": qid,
                "question_string": q["question"],
                "answer_llm": llm_answer,
                "answer_gold": gold,
                "precision-1": unigram_precision,
                "recall-1": unigram_recall,
                "ROUGE-1": f1_n(unigram_precision, unigram_recall),
                "precision-2": bigram_precision,
                "recall-2": bigram_recall,
                "ROUGE-2": f1_n(bigram_precision, bigram_recall),
            },
            GROUND_TRUTH_CSV,
        )
        # Remember the id so a duplicate later in the list is not processed again.
        processed_ids.add(qid)


## Load PDF documents into Qdrant
All PDF files are chunked, embedded and stored in the collection defined above.

In [12]:
# One shared client for ingestion; every PDF found directly inside PDF_DIR is
# chunked, embedded and written to the configured collection.
client = get_qdrant_client()
for pdf_path in PDF_DIR.glob('*.pdf'):
    pdf_chunks = load_pdf_and_chunk(str(pdf_path))
    store_embeddings_in_qdrant(
        client,
        COLLECTION_NAME,
        pdf_chunks,
        embed_chunks(pdf_chunks),
    )


## Experiment 2: RAG answers
For each question we retrieve relevant chunks from Qdrant and pass them to the LLM. Results are written to *results_rag.csv*.

In [None]:
import os
import csv
from typing import List, Dict
from collections import Counter

# === ROUGE-N metric functions ===
def get_ngrams(text: str, n: int) -> Counter:
    """Count the n-grams of ``text``.

    The text is tokenized with ``str.split()`` (whitespace only; no case
    folding or punctuation handling). Each n-gram is stored as a tuple of
    ``n`` consecutive tokens.

    Args:
        text: Input string.
        n: Length of each n-gram.

    Returns:
        Counter mapping n-gram tuples to their number of occurrences; empty
        when the text has fewer than ``n`` tokens.
    """
    words = text.split()
    grams = Counter()
    # Slide a window of width n over the token list.
    for start in range(len(words) - n + 1):
        grams[tuple(words[start:start + n])] += 1
    return grams


def precision_n(pred: str, gold: str, n: int) -> float:
    """N-gram precision of ``pred`` against ``gold``.

    Precision is the clipped n-gram overlap (multiset intersection of both
    n-gram counters) divided by the number of n-grams in the prediction.

    Args:
        pred: Predicted (generated) text.
        gold: Reference text.
        n: N-gram length.

    Returns:
        Precision in [0, 1]; 0.0 when the prediction contains no n-grams.
    """
    predicted = get_ngrams(pred, n)
    reference = get_ngrams(gold, n)
    denominator = sum(predicted.values())
    if denominator <= 0:
        return 0.0
    # Counter & Counter keeps the minimum count per n-gram (clipped overlap).
    return sum((predicted & reference).values()) / denominator


def recall_n(pred: str, gold: str, n: int) -> float:
    """N-gram recall of ``pred`` against ``gold``.

    Recall is the clipped n-gram overlap (multiset intersection of both
    n-gram counters) divided by the number of n-grams in the reference.

    Args:
        pred: Predicted (generated) text.
        gold: Reference text.
        n: N-gram length.

    Returns:
        Recall in [0, 1]; 0.0 when the reference contains no n-grams.
    """
    predicted = get_ngrams(pred, n)
    reference = get_ngrams(gold, n)
    denominator = sum(reference.values())
    if denominator <= 0:
        return 0.0
    # Counter & Counter keeps the minimum count per n-gram (clipped overlap).
    return sum((predicted & reference).values()) / denominator


def f1_n(prec: float, rec: float) -> float:
    """Harmonic mean (F1) of a precision/recall pair.

    Args:
        prec: Precision value.
        rec: Recall value.

    Returns:
        ``2 * prec * rec / (prec + rec)``, or 0.0 when the sum of both
        inputs is not positive (avoids division by zero).
    """
    total = prec + rec
    if total > 0:
        return 2 * prec * rec / total
    return 0.0


def load_processed_ids(filepath: str) -> set:
    """Collect the ``question_id`` values already stored in a results CSV.

    Args:
        filepath: Path of the CSV file; its first row is read as the header.

    Returns:
        Set of ``question_id`` values found in the file, or an empty set
        when the file does not exist.

    Raises:
        KeyError: If a data row has no ``question_id`` column.
    """
    seen = set()
    if os.path.exists(filepath):
        with open(filepath, newline="", encoding="utf-8") as handle:
            for record in csv.DictReader(handle):
                seen.add(record["question_id"])
    return seen


def append_result_to_csv(row: Dict[str, str], filepath: str) -> None:
    """Append one result row to a CSV file, writing the header when needed.

    The parent directory is created if it is missing. The header row is
    written when the file does not exist yet *or* exists but is empty (for
    example after an interrupted run); otherwise an empty file would end up
    without a header and its first data row would later be misread as one.

    Args:
        row: Result values keyed by column name. Keys outside the fixed
            column list are ignored; missing columns are written as "".
        filepath: Destination CSV path.
    """
    # Fixed column layout, including the ROUGE metric columns.
    fieldnames = [
        "question_id", "question_string", "answer_llm", "answer_gold",
        "precision-1", "recall-1", "ROUGE-1",
        "precision-2", "recall-2", "ROUGE-2"
    ]

    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Decide before opening: append mode creates a missing file immediately.
    needs_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0

    with open(filepath, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        # Restrict the row to the expected columns, defaulting absent ones to "".
        writer.writerow({key: row.get(key, "") for key in fieldnames})

# === Main loop ===
# Resume support: ids already present in the RAG CSV are skipped on re-run.
processed_ids = load_processed_ids(RAG_CSV)

for q in questions:
    qid = q["question_id"]
    if qid in processed_ids:
        continue

    # Experiment 2 must answer WITH retrieved context. The previous version
    # passed an empty context list here, which made this run identical to
    # Experiment 1 and left `retrieve_similar_chunks` unused.
    # NOTE(review): the (client, collection, query) argument order is assumed
    # to mirror store_embeddings_in_qdrant(client, COLLECTION_NAME, ...), and
    # the return value is assumed to be directly usable as the context
    # argument of answer_with_context -- confirm both against qdrant_utils.
    context_chunks = retrieve_similar_chunks(client, COLLECTION_NAME, q["question"])
    llm_answer = answer_with_context(q["question"], context_chunks)
    gold = q["answer"]

    # ROUGE-1
    p1 = precision_n(llm_answer, gold, 1)
    r1 = recall_n(llm_answer, gold, 1)
    f1_1 = f1_n(p1, r1)

    # ROUGE-2
    p2 = precision_n(llm_answer, gold, 2)
    r2 = recall_n(llm_answer, gold, 2)
    f1_2 = f1_n(p2, r2)

    row = {
        "question_id": qid,
        "question_string": q["question"],
        "answer_llm": llm_answer,
        "answer_gold": gold,
        "precision-1": p1,
        "recall-1": r1,
        "ROUGE-1": f1_1,
        "precision-2": p2,
        "recall-2": r2,
        "ROUGE-2": f1_2,
    }

    append_result_to_csv(row, RAG_CSV)
    processed_ids.add(qid)  # avoid re-processing
