# Landmark RAG Pipeline

Prepare Wikipedia content for Retrieval Augmented Generation:
1. Load landmark Wikipedia data
2. Chunk articles into retrievable segments
3. Compute embeddings
4. Test retrieval + generation

In [None]:
!pip install -q transformers openai dotenv

In [2]:
import pandas as pd
import numpy as np
import torch
import json
import re
import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import openai
from typing import List, Dict, Any
import time
import os
from dotenv import load_dotenv
import random

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device}")

Using: cuda


## 1. Load Data

In [2]:
df = pd.read_csv('wiki-context.csv')
print(f"Loaded {len(df)} landmarks")

Loaded 938 landmarks


## 2. Chunking

In [11]:
def split_sentences(text):
    text = re.sub(r'(\.)(\s*)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*\n', r'\1\n\3. ', text)
    text = re.sub(r'(\.)(\s+)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s{2,}', r'\1\n\3. ', text)

    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [s.strip() for s in sentences if s.strip()]


def chunk_text(text, tokenizer, max_tokens=200, overlap_tokens=50):
    if not text or pd.isna(text):
        return []

    sentences = split_sentences(text)
    if not sentences:
        return [text] if text.strip() else []

    chunks = []
    current = []
    current_len = 0

    for sent in sentences:
        sent_len = len(tokenizer.encode(sent, add_special_tokens=False))

        if sent_len > max_tokens:
            if current:
                chunks.append(' '.join(current))
                current, current_len = [], 0
            chunks.append(sent)
            continue

        if current_len + sent_len > max_tokens:
            chunks.append(' '.join(current))

            overlap = []
            overlap_len = 0
            for s in reversed(current):
                s_len = len(tokenizer.encode(s, add_special_tokens=False))
                if overlap_len + s_len <= overlap_tokens:
                    overlap.insert(0, s)
                    overlap_len += s_len
                else:
                    break
            current, current_len = overlap, overlap_len

        current.append(sent)
        current_len += sent_len

    if current:
        chunks.append(' '.join(current))

    return chunks

In [12]:
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

chunks = []
for _, row in df.iterrows():
    if pd.isna(row['page_content']):
        continue

    for i, text in enumerate(chunk_text(row['page_content'], tokenizer)):
        chunks.append({
            'landmark_name': row['landmark_name'],
            'wiki_url': row['wiki_url'],
            'chunk_index': i,
            'text': text
        })

Token indices sequence length is longer than the specified maximum sequence length for this model (606 > 512). Running this sequence through the model will result in indexing errors


## 3. Embeddings

In [13]:
model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5").to(device)
model.eval()

def compute_embeddings(texts, batch_size=32, is_query=False):
    embeddings = []

    if is_query:
        texts = ["Represent this sentence for searching relevant passages: " + t for t in texts]

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model(**inputs).last_hidden_state[:, 0]
        embeddings.append(output.cpu().numpy())
    return np.vstack(embeddings)

chunk_texts = [c['text'] for c in chunks]
embeddings = compute_embeddings(chunk_texts, is_query=False)
print(f"Embeddings shape: {embeddings.shape}")

2025-12-08 14:47:47.155109: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Embeddings shape: (25262, 768)


## 4. Retrieval

In [14]:
def retrieve(query, landmark_name, top_k=3):
    indices = []
    for i, c in enumerate(chunks):
        if c['landmark_name'] == landmark_name:
            indices.append(i)
    if not indices:
        return []

    landmark_chunks = [chunks[i] for i in indices]
    landmark_embeddings = embeddings[indices]

    query_emb = compute_embeddings([query], is_query=True)[0]

    sims = np.dot(landmark_embeddings, query_emb) / (
        np.linalg.norm(landmark_embeddings, axis=1) * np.linalg.norm(query_emb)
    )

    top_idx = np.argsort(sims)[-top_k:][::-1]
    return [(landmark_chunks[i], sims[i]) for i in top_idx]

## 5. Generation

In [3]:
gen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype=torch.float16,
    device_map="cuda"
)
gen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

`torch_dtype` is deprecated! Use `dtype` instead!
2025-12-08 17:22:25.578128: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
def answer(question, landmark_name, top_k=10):
    results = retrieve(question, landmark_name, top_k)
    
    context = "\n\n".join([c['text'] for c, _ in results])

    messages = [
        {"role": "system", "content": "Answer in 1-2 sentences using ONLY facts from the provided context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
    ]

    prompt = gen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = gen_tokenizer(prompt, return_tensors="pt").to(device)
    outputs = gen_model.generate(**inputs, max_new_tokens=100, pad_token_id=gen_tokenizer.eos_token_id)

    answer_text = gen_tokenizer.decode(outputs[0, inputs['input_ids'].shape[-1]:], skip_special_tokens=True).strip()
    
    return answer_text

test_cases = [
    ("What is the Lone Cypress?", "17-Mile-Drive"),
    ("How long is the underground river?", "Puerto Princesa Undeground River"),
    ("What are trulli houses made of?", "Alberobello's Trulli"),
]

for q, landmark in test_cases:
    print(f"{landmark}: {answer(q, landmark)}\n")

## 6. Evaluation

### No context

In [4]:
def baseline_answer(question: str):
    messages = [
        {"role": "system", "content": "Answer the question in 1-2 sentences using your knowledge."},
        {"role": "user", "content": question}
    ]

    prompt = gen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = gen_tokenizer(prompt, return_tensors="pt").to(device)
    outputs = gen_model.generate(**inputs, max_new_tokens=100, pad_token_id=gen_tokenizer.eos_token_id)
    answer_text = gen_tokenizer.decode(outputs[0, inputs['input_ids'].shape[-1]:], skip_special_tokens=True).strip()
    return answer_text

def generate_baseline(gen_path: str, save_path: str = "cached_baseline.json"):
    with open(gen_path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    baseline_data = []
    print("Generating baseline answers (no context)...")
    for item in payload["data"]:
        q = item["question"]
        candidate = baseline_answer(q)
        baseline_data.append({
            "id": item["id"],
            "question": q,
            "reference": item["reference"],
            "candidate": candidate
        })

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump({"data": baseline_data}, f, indent=2)
    print(f"Baseline answers saved to {save_path}")
    return save_path

baseline_path = generate_baseline("cached_generation.json")

Generating baseline answers (no context)...
Baseline answers saved to cached_baseline.json


In [6]:
grade_cached_generations(baseline_path, model="gpt-4o-mini")

Grading 100 cached generations...
  Batch 1
  Batch 2
  Batch 3
  Batch 4
  Batch 5
  Batch 6
  Batch 7
  Batch 8
  Batch 9
  Batch 10
Grading completed and saved.


'cached_baseline.json'

In [None]:
import random
import json

def compute_baseline_accuracy(gen_path: str, sample_n: int = 5):
    with open(gen_path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    grades = payload.get("grades", [])
    items = payload["data"]

    total = len(grades)
    correct = sum(1 for g in grades if g.get("correct", False))
    incorrect = total - correct
    accuracy = correct / total if total > 0 else 0.0

    print(f"Total examples: {total}")
    print(f"Correct: {correct} ({accuracy:.2%})")
    print(f"Incorrect: {incorrect} ({100 - accuracy*100:.2f}%)\n")

    correct_items = [it for it, g in zip(items, grades) if g.get("correct", False)]
    incorrect_items = [it for it, g in zip(items, grades) if not g.get("correct", False)]

    print("Sample Correct Examples:\n")
    for idx, it in enumerate(random.sample(correct_items, min(sample_n, len(correct_items))), 1):
        g = grades[it["id"]]
        print(f"Example {idx}:\nQuestion: {it['question']}\nCandidate Answer: {it['candidate']}\nReference Answer: {it['reference']}\nGrader Explanation: {g.get('explanation', '')}\n")

    print("Sample Incorrect Examples:\n")
    for idx, it in enumerate(random.sample(incorrect_items, min(sample_n, len(incorrect_items))), 1):
        g = grades[it["id"]]
        print(f"Example {idx}:\nQuestion: {it['question']}\nCandidate Answer: {it['candidate']}\nReference Answer: {it['reference']}\nGrader Explanation: {g.get('explanation', '')}\n")

    return accuracy

baseline_accuracy = compute_baseline_accuracy(baseline_path, sample_n=5)
print(f"Baseline Accuracy: {baseline_accuracy:.2%}")

Total examples: 100
Correct: 74 (74.00%)
Incorrect: 26 (26.00%)

Sample Correct Examples:

Example 1:
Question: In which country is Bruges?
Candidate Answer: Bruges is located in Belgium.
Reference Answer: Belgium
Grader Explanation: The student's answer correctly identifies Bruges as being located in Belgium, which is accurate.

Example 2:
Question: In which country is Ngorongoro Crater?
Candidate Answer: Ngorongoro Crater is located in Tanzania, East Africa.
Reference Answer: Tanzania
Grader Explanation: The student correctly identifies that Ngorongoro Crater is located in Tanzania, providing additional context that it is in East Africa.

Example 3:
Question: Who built the Great Pyramid of Giza?
Candidate Answer: The Great Pyramid of Giza was built by the ancient Egyptians as a tomb for Pharaoh Khufu.
Reference Answer: Khufu; Cheops
Grader Explanation: The student correctly identified that the Great Pyramid of Giza was built by Pharaoh Khufu.

Example 4:
Question: Who built the Alham

### With RAG Context

In [None]:
def generate_answers(
    tests: List[Dict],
    sample_n: int = 100,
    seed: int = 42,
    save_path: str = "cached_generation.json"
):
    random.seed(seed)
    sample_n = min(sample_n, len(tests))
    sampled = random.sample(tests, sample_n)

    generations = []

    print(f"Generating answers for {sample_n} samples...")

    for idx, item in enumerate(sampled):
        if (idx + 1) % 10 == 0:
            print(f"  {idx + 1}/{sample_n}", flush=True)

        q = item.get("question", "")
        lm = item.get("landmark_name", "")
        ref = "; ".join(item.get("keywords", [])) or lm

        generated_answer = answer(q, lm)

        generations.append({
            "id": idx,
            "question": q,
            "reference": ref,
            "candidate": generated_answer,
            "landmark": lm,
        })

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump({
            "seed": seed,
            "sample_n": sample_n,
            "data": generations
        }, f, indent=2)

    print(f"Saved generation cache to: {save_path}")
    return save_path


In [None]:
os.load_dotenv()
openai.api_key = os.get_env("OPENAI_API_KEY")

SYSTEM_INSTR = (
    "You are a grading assistant. "
    "For each question, compare the STUDENT ANSWER against the GROUND TRUTH ANSWER. "
    "Respond ONLY with JSON containing a 'results' array. Each element should be an object "
    "with 'correct': true/false and an 'explanation' string describing why it is correct or not."
)


def grade_batch_openai(items: List[Dict[str, str]], model: str = "gpt-4o-mini") -> List[Dict[str, Any]]:
    if not items:
        return []

    lines = []
    for it in items:
        idx = it["id"]
        q = it["question"].replace("\n", " ").strip()
        ref = it["reference"].replace("\n", " ").strip()
        cand = it["candidate"].replace("\n", " ").strip()
        lines.append(
            f"{idx}) QUESTION: {q}\n"
            f"GROUND TRUTH ANSWER: {ref}\n"
            f"STUDENT ANSWER: {cand}\n"
        )

    user_prompt = (
        "Here are items to grade (numbered):\n\n"
        + "\n".join(lines)
        + "\nRespond with a JSON object containing a 'results' array "
          "(one object per item) in the same order.\n"
    )

    resp = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_INSTR},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        max_tokens=2048,
        response_format={"type": "json_object"},
    )

    try:
        parsed = json.loads(resp.choices[0].message.content)
    except Exception:
        return [{"explanation": "Failed to parse grader response", "correct": False} for _ in items]

    results = parsed.get("results", []) if isinstance(parsed, dict) else parsed

    if not isinstance(results, list) or len(results) != len(items):
        return [{"explanation": "Malformed grader output", "correct": False} for _ in items]

    return [
        {
            "explanation": obj.get("explanation", ""),
            "correct": bool(obj.get("correct", False)),
        }
        if isinstance(obj, dict)
        else {"explanation": "Invalid entry", "correct": False}
        for obj in results
    ]


def grade_cached_generations(
    gen_path: str,
    model: str = "gpt-4o-mini",
    batch_size: int = 10,
):
    with open(gen_path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    items = payload["data"]
    grades = []

    print(f"Grading {len(items)} cached generations...")

    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        to_grade = [
            {
                "id": it["id"],
                "question": it["question"],
                "reference": it["reference"],
                "candidate": it["candidate"],
            }
            for it in batch
        ]

        print(f"  Batch {i//batch_size + 1}", flush=True)
        results = grade_batch_openai(to_grade, model=model)
        grades.extend(results)
        time.sleep(1)

    payload["grades"] = grades
    payload["grader_model"] = model

    with open(gen_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    print("Grading completed and saved.")
    return gen_path

In [55]:
def load_tests(path: str):
    """Load test questions from a JSON file."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data.get("test_questions", [])

gen_path = generate_answers(
    load_tests("test_questions_500 (1).json"),
    sample_n=100
)

Generating answers for 100 samples...
  10/100
  20/100
  30/100
  40/100
  50/100
  60/100
  70/100
  80/100
  90/100
  100/100
Saved generation cache to: cached_generation.json


In [61]:
grade_cached_generations(gen_path, model="gpt-4o-mini")

Grading 100 cached generations...
  Batch 1
  Batch 2
  Batch 3
  Batch 4
  Batch 5
  Batch 6
  Batch 7
  Batch 8
  Batch 9
  Batch 10
Grading completed and saved.


'cached_generation.json'

In [66]:
import json
import random

def evaluate_generations(gen_path: str, sample_errors: int = 5):
    """
    Compute accuracy and print sample correct/incorrect generations.
    
    Args:
        gen_path: Path to the JSON with 'data' and 'grades'.
        sample_errors: Number of incorrect examples to show.
    
    Returns:
        accuracy: float
    """
    with open(gen_path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    data = payload.get("data", [])
    grades = payload.get("grades", [])

    if not grades or not data:
        print("No grades or data found in the file.")
        return 0.0

    total = len(grades)
    correct_count = sum(1 for g in grades if g.get("correct", False))
    incorrect_count = total - correct_count
    accuracy = correct_count / total

    print(f"Total examples: {total}")
    print(f"Correct: {correct_count} ({accuracy:.2%})")
    print(f"Incorrect: {incorrect_count} ({incorrect_count/total:.2%})\n")

    # Pair data and grades
    paired = list(zip(data, grades))

    # Show some correct examples
    correct_examples = [p for p in paired if p[1].get("correct", False)]
    incorrect_examples = [p for p in paired if not p[1].get("correct", False)]

    print("Sample Correct Examples:")
    for i, (item, grade) in enumerate(random.sample(correct_examples, min(3, len(correct_examples)))):
        print(f"\nExample {i+1}:")
        print(f"Question: {item['question']}")
        print(f"Candidate Answer: {item['candidate']}")
        print(f"Reference Answer: {item['reference']}")
        print(f"Grader Explanation: {grade.get('explanation','')}")
    
    if incorrect_examples:
        print("\nSample Incorrect Examples:")
        for i, (item, grade) in enumerate(random.sample(incorrect_examples, min(sample_errors, len(incorrect_examples)))):
            print(f"\nExample {i+1}:")
            print(f"Question: {item['question']}")
            print(f"Candidate Answer: {item['candidate']}")
            print(f"Reference Answer: {item['reference']}")
            print(f"Grader Explanation: {grade.get('explanation','')}")

    return accuracy

# Usage
accuracy = evaluate_generations("cached_generation.json", sample_errors=5)

Total examples: 100
Correct: 81 (81.00%)
Incorrect: 19 (19.00%)

Sample Correct Examples:

Example 1:
Question: In which country is Sumela Monastery?
Candidate Answer: Sumela Monastery is located in Turkey.
Reference Answer: Turkey
Grader Explanation: The student accurately stated that Sumela Monastery is located in Turkey, matching the ground truth answer.

Example 2:
Question: In which country is Shwedagon Pagoda?
Candidate Answer: Shwedagon Pagoda is located in Yangon, Myanmar.
Reference Answer: Myanmar; Burma
Grader Explanation: The student answer correctly states that Shwedagon Pagoda is located in Yangon, Myanmar, which matches the ground truth.

Example 3:
Question: What famous rock formation is in Yosemite?
Candidate Answer: Half Dome is a famous rock formation in Yosemite.
Reference Answer: Half Dome; El Capitan
Grader Explanation: The student answer correctly identifies Half Dome as a famous rock formation in Yosemite, which is accurate.

Sample Incorrect Examples:

Example 1

## 7. Save Embeddings

In [12]:
with open("landmark_chunks.json", "w") as f:
    json.dump(chunks, f, indent=2)

np.save("chunk_embeddings.npy", embeddings)