# RAG Experiment

This notebook runs two experiments:
1. Query the LLM directly for a set of questions and store the answers.
2. Load PDF documents into Qdrant and re-run the questions using retrieval-augmented generation (RAG).


In [None]:
import json
from pathlib import Path

from Code.qdrant_utils import (
    answer_with_context,
    get_qdrant_client,
    load_pdf_and_chunk,
    embed_chunks,
    store_embeddings_in_qdrant,
    retrieve_similar_chunks,
)

from Experiment.experiment_utils import save_results_to_csv

# Inputs: the PDFs and the question file share one folder.
PDF_DIR = Path('RAG Paper')
DATA_PATH = PDF_DIR / 'question.json'

# Outputs: one CSV per experiment.
GROUND_TRUTH_CSV = Path('Experiment', 'results_ground_truth.csv')
RAG_CSV = Path('Experiment', 'results_rag.csv')

# Name of the Qdrant collection used for storing and retrieving PDF chunks.
COLLECTION_NAME = 'rag_papers'


## Load evaluation questions
The JSON file contains questions grouped by paper and additional metadata questions.
We flatten them into a single list with their gold answers.

In [None]:
# Read the dataset; everything used below sits under the 'evaluation_dataset' key.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)['evaluation_dataset']

# Content questions: one record per question of every paper.
questions = [
    {
        'question_id': item['question_id'],
        'question': item['question'],
        'answer': item['answer'],
    }
    for paper in data['papers']
    for item in paper.get('questions', [])
]

# Metadata questions share one question text but carry a separate answer per
# paper, so each (question, paper) pair becomes its own record with an id of
# the form "P<paper_id>_<question_id>".
for meta in data.get('metadata_questions', {}).get('questions', []):
    base_id, text = meta['question_id'], meta['question']
    questions.extend(
        {
            'question_id': f"P{entry['paper_id']}_{base_id}",
            'question': text,
            'answer': entry['answer'],
        }
        for entry in meta.get('papers', [])
    )

# Last expression of the cell: shows the total number of questions.
len(questions)


## Experiment 1: direct LLM answers
Each question is sent to the LLM without any retrieved context (an empty context list), giving a no-retrieval baseline. The LLM answers are saved next to the gold answers in *results_ground_truth.csv*.

In [None]:
# Direct run: the empty context list means the LLM receives only the question.
# Each row pairs the LLM answer with the gold answer for later comparison.
results_gt = [
    {
        'question_id': item['question_id'],
        'question_string': item['question'],
        'answer_llm': answer_with_context(item['question'], []),
        'answer_gold': item['answer'],
    }
    for item in questions
]

save_results_to_csv(results_gt, GROUND_TRUTH_CSV)
# Last expression of the cell: shows how many rows were written.
len(results_gt)


## Load PDF documents into Qdrant
All PDF files are chunked, embedded and stored in the collection defined above.

In [None]:
# One client is created here and reused by the retrieval cell further down.
client = get_qdrant_client()

# Path.glob yields matches in filesystem order, which can differ between
# machines and runs; sorting keeps the ingestion order reproducible.
for pdf in sorted(PDF_DIR.glob('*.pdf')):
    # Chunk the PDF, embed the chunks, then store both in the collection.
    chunks = load_pdf_and_chunk(str(pdf))
    embeddings = embed_chunks(chunks)
    store_embeddings_in_qdrant(client, COLLECTION_NAME, chunks, embeddings)


## Experiment 2: RAG answers
For each question we retrieve relevant chunks from Qdrant and pass them to the LLM. Results are written to *results_rag.csv*.

In [None]:
# RAG run: for every question, fetch similar chunks from the Qdrant collection
# and hand them to the LLM as context. Rows use the same columns as the
# direct-LLM run so both CSV files can be compared side by side.
results_rag = [
    {
        'question_id': item['question_id'],
        'question_string': item['question'],
        'answer_llm': answer_with_context(
            item['question'],
            retrieve_similar_chunks(item['question'], client, COLLECTION_NAME),
        ),
        'answer_gold': item['answer'],
    }
    for item in questions
]

save_results_to_csv(results_rag, RAG_CSV)
# Last expression of the cell: shows how many rows were written.
len(results_rag)
