# Landmark RAG Pipeline

Prepare Wikipedia content for Retrieval-Augmented Generation (RAG):
1. Load landmark Wikipedia data
2. Chunk articles into retrievable segments
3. Compute embeddings
4. Test retrieval + generation
5. Save the chunks and embeddings

In [None]:
!pip install -q transformers

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import re
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from tqdm import tqdm

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using: {device}")

## 1. Load Data

In [None]:
# One row per landmark; later cells read the `landmark_name`, `wiki_url`
# and `page_content` columns of this frame.
df = pd.read_csv('../src/rag/wiki-context.csv')
print(f"Loaded {df.shape[0]} landmarks")

## 2. Chunking

In [None]:
def split_sentences(text):
    """Split *text* into sentences.

    A boundary is a run of whitespace that follows ``.``, ``!`` or ``?`` and
    is followed by an ASCII capital letter. Each piece is stripped and empty
    pieces are dropped.
    """
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [s.strip() for s in sentences if s.strip()]


def _split_oversized(sentence, token_len, max_tokens):
    """Break a sentence longer than *max_tokens* into word-aligned pieces.

    Words are packed greedily using per-word token counts. A single word that
    alone exceeds *max_tokens* is kept whole as its own piece, because it
    cannot be divided on whitespace.
    """
    pieces = []
    words, words_len = [], 0
    for word in sentence.split():
        word_len = token_len(word)
        if words and words_len + word_len > max_tokens:
            pieces.append(' '.join(words))
            words, words_len = [], 0
        words.append(word)
        words_len += word_len
    if words:
        pieces.append(' '.join(words))
    return pieces


def chunk_text(text, tokenizer, max_tokens=400, overlap_tokens=50):
    """Split *text* into sentence-aligned chunks of at most *max_tokens* tokens.

    Sentences are packed greedily. When a chunk is full, its trailing
    sentences (up to *overlap_tokens* tokens) are repeated at the start of the
    next chunk, but only as far as the next sentence still fits within
    *max_tokens*. A sentence that is longer than *max_tokens* on its own is
    split on whitespace into several chunks and carries no overlap.

    Args:
        text: Article text. ``None``, NaN and empty/blank strings give ``[]``.
        tokenizer: Object with ``encode(str, add_special_tokens=False)``
            returning a sequence with one item per token.
        max_tokens: Token budget per chunk, measured as the sum of the
            per-sentence token counts.
        overlap_tokens: Maximum number of tokens repeated between
            consecutive chunks.

    Returns:
        List of chunk strings in document order.
    """
    if not text or pd.isna(text):
        return []

    def token_len(piece):
        return len(tokenizer.encode(piece, add_special_tokens=False))

    chunks = []
    # (sentence, token count) pairs, so each sentence is tokenized only once.
    current = []
    current_len = 0

    for sent in split_sentences(text):
        sent_len = token_len(sent)

        if sent_len > max_tokens:
            if current:
                chunks.append(' '.join(s for s, _ in current))
                current, current_len = [], 0
            chunks.extend(_split_oversized(sent, token_len, max_tokens))
            continue

        if current_len + sent_len > max_tokens:
            chunks.append(' '.join(s for s, _ in current))

            # The overlap may not push the next chunk past max_tokens, so its
            # budget is also capped by the room left after the new sentence.
            budget = min(overlap_tokens, max_tokens - sent_len)
            overlap, overlap_len = [], 0
            for s, s_len in reversed(current):
                if overlap_len + s_len > budget:
                    break
                overlap.append((s, s_len))
                overlap_len += s_len
            overlap.reverse()
            current, current_len = overlap, overlap_len

        current.append((sent, sent_len))
        current_len += sent_len

    if current:
        chunks.append(' '.join(s for s, _ in current))

    return chunks

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Flatten every article into chunk records; `chunk_index` is the chunk's
# position within its own article. Rows without page content are skipped.
chunks = []
for _, record in tqdm(df.iterrows(), total=len(df)):
    page_text = record['page_content']
    if pd.isna(page_text):
        continue

    chunks.extend(
        {
            'landmark_name': record['landmark_name'],
            'wiki_url': record['wiki_url'],
            'chunk_index': position,
            'text': piece,
        }
        for position, piece in enumerate(chunk_text(page_text, tokenizer))
    )

## 3. Embeddings

In [None]:
# Load the MiniLM encoder, move it to the selected device and switch it to
# evaluation mode.
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = model.to(device)
model.eval()

def compute_embeddings(texts, batch_size=32):
    """Embed *texts* with the encoder using attention-masked mean pooling.

    Each batch is padded to its longest member, so padding positions are
    excluded from the mean via the attention mask. Without the mask, a text's
    vector would depend on what else is in its batch, and a query embedded
    alone would not be comparable to chunks embedded in padded batches.

    Args:
        texts: Sequence of strings; inputs are truncated to 512 tokens.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        NumPy array with one row per input text, in input order. Empty input
        gives an array of shape ``(0, hidden_size)``.
    """
    if len(texts) == 0:
        # np.vstack raises on an empty list, so return an empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
            # 1 for real tokens, 0 for padding; broadcast over the hidden dim.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        embeddings.append(pooled.cpu().numpy())
    return np.vstack(embeddings)

# Embed every chunk; row i of `embeddings` corresponds to chunks[i].
chunk_texts = [entry['text'] for entry in chunks]
embeddings = compute_embeddings(chunk_texts)
print(f"Embeddings shape: {embeddings.shape}")

## 4. Retrieval

In [None]:
def retrieve(query, landmark_name, top_k=3):
    """Return the *top_k* chunks of one landmark most similar to *query*.

    Only chunks whose ``landmark_name`` equals *landmark_name* exactly are
    considered; they are ranked by cosine similarity between their stored
    embedding and the embedding of *query*.

    Args:
        query: Question text to embed.
        landmark_name: Landmark whose chunks are searched.
        top_k: Maximum number of results; values <= 0 give no results.

    Returns:
        List of ``(chunk_dict, similarity)`` tuples, best match first. Empty
        when the landmark has no chunks or *top_k* is not positive.
    """
    # Guard first: with top_k == 0 the slice [-0:] would select every chunk.
    if top_k <= 0:
        return []

    indices = [i for i, c in enumerate(chunks) if c['landmark_name'] == landmark_name]
    if not indices:
        return []

    landmark_embeddings = embeddings[indices]
    query_emb = compute_embeddings([query])[0]

    norms = np.linalg.norm(landmark_embeddings, axis=1) * np.linalg.norm(query_emb)
    # A zero-norm vector would yield NaN, which argsort places last and which
    # would therefore be returned as the best match; clamp the denominator.
    norms = np.maximum(norms, np.finfo(np.float32).eps)
    sims = np.dot(landmark_embeddings, query_emb) / norms

    top_idx = np.argsort(sims)[::-1][:top_k]
    return [(chunks[indices[i]], sims[i]) for i in top_idx]

## 5. Generation

In [None]:
# Load the generator on the same device the rest of the notebook uses, so the
# CPU fallback chosen for `device` also applies here. Half precision is only
# used on the GPU.
gen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map=device,
)
gen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

In [None]:
def answer(question, landmark_name, top_k=3):
    """Answer *question* about a landmark from its retrieved chunks.

    The *top_k* retrieved chunk texts are joined into a context block and
    passed to the chat model with an instruction to answer from that context
    only.

    Args:
        question: Natural-language question.
        landmark_name: Landmark whose chunks supply the context.
        top_k: Number of chunks to retrieve.

    Returns:
        The generated answer with the prompt and special tokens removed, or
        a short notice when no chunks were retrieved for the landmark.
    """
    results = retrieve(question, landmark_name, top_k)
    if not results:
        # With no context the model could only guess; report it instead.
        return f"No context available for landmark '{landmark_name}'."

    context = "\n\n".join([c['text'] for c, _ in results])

    messages = [
        {"role": "system", "content": "Answer in 1-2 sentences using only the provided context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
    ]

    prompt = gen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Place the inputs on the device the generator is on, which may differ
    # from the global `device`.
    inputs = gen_tokenizer(prompt, return_tensors="pt").to(gen_model.device)
    outputs = gen_model.generate(**inputs, max_new_tokens=100, pad_token_id=gen_tokenizer.eos_token_id)

    # generate() returns prompt + completion; keep only the new tokens.
    prompt_len = inputs['input_ids'].shape[-1]
    return gen_tokenizer.decode(outputs[0, prompt_len:], skip_special_tokens=True).strip()


# (question, landmark_name) pairs used as a quick end-to-end smoke test.
test_cases = [
    ("What is the Lone Cypress?", "17-Mile-Drive"),
    ("How long is the underground river?", "Puerto Princesa Undeground River"),
    ("What are trulli houses made of?", "Alberobello's Trulli"),
]

for question_text, place in test_cases:
    reply = answer(question_text, place)
    print(f"{place}: {reply}\n")

## 6. Save

In [None]:
# Persist the chunk records as JSON and the embedding matrix as .npy.
with open("../src/rag/landmark_chunks.json", "w") as out_file:
    out_file.write(json.dumps(chunks, indent=2))

np.save("../src/rag/chunk_embeddings.npy", embeddings)