## 1 — Imports & constants

In [None]:
!pip install -U "fsspec[http]==2024.6.1" "gcsfs==2024.6.1" "protobuf<6"
!pip install -U transformers accelerate safetensors huggingface_hub
!pip install -U qdrant-client==1.9.1 sentence-transformers==3.2.1
!pip install -U sacrebleu==2.4.2 rouge-score==0.1.2 tiktoken==0.7.0 psutil==6.0.0 datasets==3.0.1

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
from huggingface_hub import login
from getpass import getpass
from rouge_score import rouge_scorer

import pandas as pd
import sacrebleu
import tiktoken
import json
import os
import time
import psutil
import requests
import re
import random
import numpy as np
import torch

# Seed each RNG this notebook imports (stdlib, NumPy, PyTorch) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Project Gutenberg e-book id (used to build the download URL) and the folder
# that holds the on-disk Qdrant store.
BOOK_ID, VDB_PATH = 35, "./vdb"

# Retrieval fan-out: child chunks fetched per query, and how many parent chunks
# may be expanded from them.
K_CHILD, TOP_PARENTS = 6, 3

# Chunk geometry in tokens: window size and overlap for parents and children.
PARENT_SZ = 1200
PARENT_OV = 150
CHILD_SZ = 300
CHILD_OV = 50

# Output folders for the cleaned book text and the filtered QA pairs.
for _out_dir in ("data/clean", "data/narrativeqa"):
    os.makedirs(_out_dir, exist_ok=True)

## 2 — Download & clean the Gutenberg book

In [None]:
url = f"https://www.gutenberg.org/cache/epub/{BOOK_ID}/pg{BOOK_ID}.txt"

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() prevents an HTTP error page from being saved as the book.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
raw = resp.text

# Remove Gutenberg header and footer (everything outside the
# "*** START OF ... ***" / "*** END OF ... ***" markers, when they are present).
start = re.search(r"\*\*\* START OF.*?\*\*\*", raw, re.I)
end   = re.search(r"\*\*\* END OF.*?\*\*\*", raw, re.I)
text  = raw[(start.end() if start else 0):(end.start() if end else len(raw))]

# Normalise line endings and collapse runs of 3+ newlines into one blank line.
text  = text.replace("\r", "")
text  = re.sub(r"\n{3,}", "\n\n", text).strip()

# Context manager guarantees the file is flushed and closed before later cells run.
with open("data/clean/book_clean.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Saved:", "data/clean/book_clean.txt", "| chars:", len(text))

Saved: data/clean/book_clean.txt | chars: 179621


## 3 — NarrativeQA filter for the chosen title

In [None]:
# Load NarrativeQA metadata
docs = pd.read_csv("https://raw.githubusercontent.com/google-deepmind/narrativeqa/master/documents.csv")
qaps = pd.read_csv("https://raw.githubusercontent.com/google-deepmind/narrativeqa/master/qaps.csv")

# Find entries using Gutenberg URL.
# The pattern is derived from BOOK_ID so this cell follows the configured book,
# and the group is non-capturing: pandas warns when a capturing group is passed
# to str.contains because the captured text is never used.
is_test = docs["set"].str.lower().eq("test")
story_url_pattern = rf"/ebooks/{BOOK_ID}(?:\.txt|\.utf-8|$)"
mask = is_test & docs["story_url"].fillna("").str.contains(story_url_pattern, case=False, regex=True)

doc_ids = docs.loc[mask, "document_id"].unique()
print(f"Matched {len(doc_ids)} document(s) with Gutenberg ID {BOOK_ID}")

# Build QA df for those documents (test split only)
qa_df = qaps[(qaps["set"].str.lower() == "test") & (qaps["document_id"].isin(doc_ids))].copy()
print(f"Found {len(qa_df)} QA pairs linked to those docs")

# Helper to pick the preferred answer
def pick_answer(row):
    """Return the first non-empty reference answer of a QA row.

    Args:
        row: Mapping-like object with a ``.get`` method (a dict or a pandas
            Series) that may hold ``"answer1"`` and ``"answer2"``.

    Returns:
        ``answer1`` stripped of surrounding whitespace if it is non-empty,
        otherwise the stripped ``answer2`` (which may be ``""``).
    """
    def _clean(value):
        # A missing CSV cell arrives as float NaN, which is truthy; without this
        # check `str(nan)` would yield the literal answer "nan".
        if isinstance(value, float) and value != value:
            return ""
        return str(value or "").strip()

    return _clean(row.get("answer1")) or _clean(row.get("answer2"))

# Build the QA list: one record per row whose question is non-blank, keyed by
# the row's DataFrame index.
qa = []
for row_idx, row in qa_df.iterrows():
    question_text = str(row.get("question", ""))
    if not question_text.strip():
        continue
    qa.append({
        "qid": int(row_idx),
        "question": str(row["question"]),
        "answer": pick_answer(row),
    })

# Save to JSON
os.makedirs("data/narrativeqa", exist_ok=True)
path = "data/narrativeqa/test.json"
with open(path, "w", encoding="utf-8") as f:
    f.write(json.dumps(qa, ensure_ascii=False, indent=2))

print(f"Saved {len(qa)} QA items to {path} for 'The Time Machine' (Gutenberg ID 35)")

Matched 1 document(s) with Gutenberg ID 35
Found 29 QA pairs linked to those docs
Saved 29 QA items to data/narrativeqa/test.json for 'The Time Machine' (Gutenberg ID 35)


  mask = is_test & docs["story_url"].fillna("").str.contains(r"/ebooks/35(\.txt|\.utf-8|$)", case=False)


## 4 — Hierarchical chunking (parents & children)

In [None]:
# GPT-2 byte-pair encoding from tiktoken, stored in the module-level `enc`.
enc = tiktoken.get_encoding("gpt2")


def toklen(s):
    """Return how many tokens `enc` produces for the string `s`."""
    token_ids = enc.encode(s)
    return len(token_ids)

def sliding_windows(tokens, size, overlap):
    """Yield ``(start, window)`` pairs that cover ``tokens`` with overlapping windows.

    Windows start every ``size - overlap`` items. If that stride does not land
    exactly on the end of the sequence, one extra window anchored at
    ``len(tokens) - size`` is emitted so the tail is always covered. A sequence
    no longer than ``size`` yields a single window holding all of it.

    Args:
        tokens: Sliceable sequence (e.g. a list of token ids).
        size: Maximum window length; must be positive.
        overlap: Number of items shared by consecutive windows; must be
            smaller than ``size``.

    Yields:
        Tuples ``(start_index, tokens[start_index:start_index + size])``.
        Nothing is yielded for an empty ``tokens``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap >= size``
            (which would give a zero or negative stride).
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap >= size:
        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")

    n = len(tokens)
    if n == 0:
        # Nothing to chunk; avoid emitting a meaningless empty window.
        return

    step = size - overlap
    last_start = max(0, n - size)  # start index of the right-most full window
    for i in range(0, last_start + 1, step):
        yield i, tokens[i:i + size]
    # The stride skipped past the final position: add a tail-aligned window.
    if last_start % step != 0:
        yield last_start, tokens[last_start:]

# Tokenise the whole cleaned book once; both chunk levels are cut from these ids.
tokens = enc.encode(text)

# Parent chunks: wide windows over the full token stream. Each record keeps its
# token ids so children can be cut from them, plus its span in the book.
parents = [
    {
        "p_id": parent_idx,
        "toks": window,
        "text": enc.decode(window),
        "p_start": offset,
        "p_end": offset + len(window),
    }
    for parent_idx, (offset, window) in enumerate(
        sliding_windows(tokens, PARENT_SZ, PARENT_OV)
    )
]

# Child chunks: narrower windows cut inside each parent, numbered globally in
# parent order and tagged with the id of the parent they came from.
child_windows = (
    (parent["p_id"], window)
    for parent in parents
    for _, window in sliding_windows(parent["toks"], CHILD_SZ, CHILD_OV)
)
children = [
    {"c_id": child_idx, "p_id": parent_id, "text": enc.decode(window)}
    for child_idx, (parent_id, window) in enumerate(child_windows)
]

print("Parents:", len(parents), "| Children:", len(children))

Parents: 43 | Children: 215


## 5 — Embeddings + on-disk Qdrant index

In [None]:
os.makedirs(VDB_PATH, exist_ok=True)

# Embed every child chunk; X has one row per child, in `children` order.
emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
X = emb.encode([c["text"] for c in children], batch_size=64, show_progress_bar=True)

# Local, file-backed Qdrant instance stored under VDB_PATH.
qc = QdrantClient(path=VDB_PATH)
COL = f"gutenberg_{BOOK_ID}_children"

# Start from a clean collection so re-running the cell never mixes old and new
# points. The stale collection is dropped explicitly, so plain create_collection
# is enough; recreate_collection is deprecated and emits a warning.
existing = [c.name for c in qc.get_collections().collections]
if COL in existing:
    qc.delete_collection(COL)

qc.create_collection(
    collection_name=COL,
    vectors_config=qmodels.VectorParams(size=int(X.shape[1]), distance=qmodels.Distance.COSINE)
)

# Point id == child index, so a search hit maps straight back into `children`.
qc.upsert(
    collection_name=COL,
    points=[
        qmodels.PointStruct(id=i, vector=X[i].tolist(),
                            payload={"c_id": i, "p_id": children[i]["p_id"], "text": children[i]["text"]})
        for i in range(len(children))
    ]
)

print("Qdrant collection created:", COL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

  qc.recreate_collection(


Qdrant collection created: gutenberg_35_children


## 6 — Gemma-3-1b-it generator & prompt

In [None]:
# Take the Hugging Face token from the environment, falling back to an
# interactive prompt so the secret is never written into the notebook.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("Paste your HF token (starts with hf_):")
    HF_TOKEN = getpass()
login(token=HF_TOKEN, add_to_git_credential=False)

MODEL_ID = "google/gemma-3-1b-it"

# trust_remote_code is intentionally not set: it permits executing Python shipped
# in the model repository, and this model loads through the Auto classes without it.
tok = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
mdl = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", token=HF_TOKEN)
print("Loaded Gemma 3:", MODEL_ID)

# Greedy decoding. Sampling knobs (temperature, top_p) only take effect when
# do_sample=True, so they are left out rather than passed and silently ignored.
GEN_KW = dict(max_new_tokens=32, do_sample=False)

gen = pipeline("text-generation", model=mdl, tokenizer=tok)

def make_prompt(context, question):
    """Assemble the RAG prompt for one question.

    Args:
        context: Retrieved passage text to ground the answer in.
        question: The question to answer.

    Returns:
        A string that starts with a newline, gives the instructions, embeds
        `context` and `question` under their headings, and ends with
        ``"Answer: "`` so the model continues with the answer.
    """
    sections = [
        "",  # the prompt opens with a blank line
        "You are a helpful assistant answering questions about a book.",
        "Use ONLY the following context to answer the question. "
        "If the answer is not found in the context, state that you don't know.",
        "",
        "Context:",
        f"{context}",
        "",
        "Question:",
        f"{question}",
        "",
        "Answer: ",
    ]
    return "\n".join(sections)

print("Gemma-3-1b-it generator and make_prompt function defined.")

Paste your HF token (starts with hf_):
··········


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Device set to use cuda:0


Loaded Gemma 3: google/gemma-3-1b-it
Gemma-3-1b-it generator and make_prompt function defined.


## 7 — Retrieval

In [None]:
def retrieve(q, k_child=K_CHILD, top_par=TOP_PARENTS, ctx_budget_tokens=1400):
    """Retrieve parent-chunk context for a question via child-chunk search.

    The question is embedded with `emb`, the `k_child` nearest child chunks are
    fetched from the Qdrant collection `COL`, and the hits are mapped to their
    parent chunks. Parent texts are concatenated in rank order while the result
    stays within `ctx_budget_tokens` (measured with `toklen`).

    Args:
        q: Question text.
        k_child: Number of child chunks to fetch from the index.
        top_par: Maximum number of distinct parents to expand into context.
        ctx_budget_tokens: Token budget for the assembled context.

    Returns:
        Tuple ``(ctx, res)``: the context string (possibly empty) and the raw
        search hits.
    """
    q_vec = emb.encode([q])[0]
    res = qc.search(COL, query_vector=q_vec, limit=k_child)

    # Walk ALL child hits in rank order and keep the first `top_par` distinct
    # parents. Slicing the hits to `top_par` before de-duplicating would return
    # fewer parents whenever several top hits share one parent.
    p_ids = []
    for hit in res:
        if len(p_ids) >= top_par:
            break
        pid = children[hit.payload["c_id"]]["p_id"]
        if pid not in p_ids:
            p_ids.append(pid)

    ctx = ""
    for pid in p_ids:
        pt = parents[pid]["text"].strip()
        # Measure exactly what would be kept (no separator before the first parent).
        candidate = f"{ctx}\n\n{pt}" if ctx else pt
        if toklen(candidate) > ctx_budget_tokens:
            # Stop at the first parent that does not fit, preserving rank order.
            break
        ctx = candidate
    return ctx, res

## 8 — Run predictions (baseline & RAG)

In [None]:
def answer_baseline(q):
    """Answer `q` with the generator alone (no retrieved context).

    Args:
        q: Question text, passed to the model as the whole prompt.

    Returns:
        The model's continuation with surrounding whitespace stripped.
    """
    # return_full_text=False makes the pipeline return only the newly generated
    # text, so the prompt does not have to be cut off by splitting on `q`
    # (which misfires if the model repeats the question in its output).
    out = gen(q, return_full_text=False, **GEN_KW)[0]["generated_text"]
    return out.strip()

def answer_rag(q):
    """Answer `q` using context retrieved from the vector index.

    Args:
        q: Question text.

    Returns:
        The model's continuation of the RAG prompt, stripped of surrounding
        whitespace.
    """
    ctx, _ = retrieve(q)
    prompt = make_prompt(ctx, q)
    # return_full_text=False yields only the continuation. Splitting the full
    # text on "Answer:" is fragile: the marker can also occur inside the
    # retrieved context or be produced again by the model.
    out = gen(prompt, return_full_text=False, **GEN_KW)[0]["generated_text"]
    return out.strip()

# Reload the QA pairs from disk; the context manager closes the file promptly.
with open("data/narrativeqa/test.json", encoding="utf-8") as f:
    qa = json.load(f)

# Collect reference answers and both systems' predictions in matching order.
baseline_preds, rag_preds, refs = [], [], []
for r in qa:
    refs.append(r["answer"])
    baseline_preds.append(answer_baseline(r["question"]))
    rag_preds.append(answer_rag(r["question"]))

# Write inside a `with` block so the output is flushed and closed before the
# evaluation cell reads it back.
with open("outputs_mvp.json", "w", encoding="utf-8") as f:
    json.dump({"refs": refs, "baseline": baseline_preds, "rag": rag_preds},
              f, ensure_ascii=False, indent=2)
print("Saved new predictions to outputs_mvp.json with updated parameters and embedding model.")

## 9 — BLEU-4 & ROUGE-L summary table

In [None]:
# Read the saved predictions; the context manager closes the file handle.
with open("outputs_mvp.json", encoding="utf-8") as f:
    data = json.load(f)
refs, baseline_preds, rag_preds = data["refs"], data["baseline"], data["rag"]

# Corpus-level BLEU: hypotheses first, then a list of reference streams
# (a single stream here, one reference per hypothesis).
bleu_base = sacrebleu.corpus_bleu(baseline_preds, [refs]).score
bleu_rag  = sacrebleu.corpus_bleu(rag_preds, [refs]).score

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
def rougeL(preds, refs):
    """Mean ROUGE-L F-measure over paired predictions and references, scaled to 0-100.

    Args:
        preds: Predicted answer strings.
        refs: Reference answer strings, aligned with `preds`.

    Returns:
        100 * the mean F-measure, or 0.0 when there are no pairs.
    """
    # RougeScorer.score expects (target, prediction). The F-measure is the same
    # either way round, but the documented order keeps precision and recall
    # meaningful if they are ever read.
    f_scores = [scorer.score(ref, pred)['rougeL'].fmeasure for pred, ref in zip(preds, refs)]
    if not f_scores:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return 100 * np.mean(f_scores)

# Score both systems against the same references.
rouge_base, rouge_rag = (rougeL(preds, refs) for preds in (baseline_preds, rag_preds))

# Render the results as a Markdown table, one row per approach.
summary_rows = (
    ("Baseline (No RAG)", bleu_base, rouge_base),
    ("RAG (Hierarchical - Updated)", bleu_rag, rouge_rag),
)
print("| Approach | BLEU-4 | ROUGE-L |")
print("|---|---:|---:|")
for label, bleu_score, rouge_score_value in summary_rows:
    print(f"| {label} | {bleu_score:.2f} | {rouge_score_value:.2f} |")

| Approach | BLEU-4 | ROUGE-L |
|---|---:|---:|
| Baseline (No RAG) | 0.11 | 5.11 |
| RAG (Hierarchical - Updated) | 0.21 | 8.81 |
