<a href="https://colab.research.google.com/github/azz2021/rag-full-ai-act-project/blob/main/DSII_Challenge_DataPreparation_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q python-docx PyPDF2 tiktoken unidecode pandas numpy sentence-transformers faiss-cpu


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/253.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, re, json, gc, unicodedata
from docx import Document
from PyPDF2 import PdfReader
from unidecode import unidecode
import tiktoken

# Tokenizer for token-based chunking
enc = tiktoken.get_encoding("cl100k_base")

def normalize_text(s: str) -> str:
    """Return a cleaned copy of ``s``; ``None``/empty input yields ``""``.

    Steps, in order: ASCII transliteration via ``unidecode``, NFC
    normalisation, non-breaking-space replacement, collapsing runs of
    spaces/tabs, and trimming whitespace around newlines and at both ends.
    """
    transliterated = unidecode(s if s else "")
    composed = unicodedata.normalize("NFC", transliterated)
    without_nbsp = composed.replace("\u00A0", " ")
    single_spaced = re.sub(r"[ \t]+", " ", without_nbsp)
    return re.sub(r"\s*\n\s*", "\n", single_spaced).strip()

def detext_hyphenation(s: str) -> str:
    """Rejoin words that were split by a hyphen at a line break.

    A word character, a hyphen, whitespace that contains a newline, and a
    following word character collapse to the two word characters side by side.
    Hyphens that are not followed by a newline are left alone.
    """
    broken_word = re.compile(r"(\w)-\s*\n\s*(\w)")
    return broken_word.sub(lambda m: m.group(1) + m.group(2), s)

def chunk_tokens(text: str, chunk_size=600, overlap=100):
    """Yield consecutive chunks of ``text`` measured in tokens, with overlap.

    Args:
        text: Text to split; it is tokenised with the module-level ``enc``.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        overlap: Number of trailing tokens repeated at the start of the next
            chunk (must satisfy ``0 <= overlap < chunk_size``).

    Yields:
        Decoded chunk strings, in document order. Empty text yields nothing.

    Raises:
        ValueError: If the size/overlap combination cannot make progress.
            Because this is a generator, the error surfaces when the first
            chunk is requested.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would restart every window at (or before) the
        # previous start and never terminate on texts longer than one chunk.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )
    ids = enc.encode(text)
    step = chunk_size - overlap  # distance between consecutive window starts
    for start in range(0, len(ids), step):
        end = min(start + chunk_size, len(ids))
        yield enc.decode(ids[start:end])
        if end == len(ids):
            break  # the last window already reached the end of the text


In [None]:
OUT_PATH = "/content/chunks.jsonl"  # JSONL dataset written by the chunking step

def write_record(rec, path=OUT_PATH):
    """Append ``rec`` to ``path`` as a single JSON line.

    The file is opened in append mode with UTF-8 encoding and non-ASCII
    characters are written as-is (no ``\\u`` escapes).
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(rec, ensure_ascii=False)
        sink.write(f"{serialized}\n")

def stream_docx_paragraphs(path):
    """Yield the normalised text of each DOCX paragraph longer than 20 characters.

    Once every paragraph has been consumed, the document object is dropped and
    a garbage collection is requested.
    """
    document = Document(path)
    for paragraph in document.paragraphs:
        cleaned = normalize_text(paragraph.text)
        if len(cleaned) <= 20:
            continue  # skip empty or very short paragraphs
        yield cleaned
    del document
    gc.collect()

def stream_pdf_pages(path, page_start=0, page_end=None):
    """Yield ``(text, page_index)`` for PDF pages in ``[page_start, page_end)``.

    ``page_end=None`` means "through the last page"; a larger ``page_end`` is
    clamped to the page count. Each page's extracted text is normalised and
    de-hyphenated, and pages whose cleaned text is 20 characters or shorter
    are skipped. A garbage collection is requested after every page whose
    1-based number is a multiple of 10, and once more when the range is done.
    """
    reader = PdfReader(path)
    page_count = len(reader.pages)
    stop = page_count if page_end is None else min(page_end, page_count)
    for page_no in range(page_start, stop):
        current = reader.pages[page_no]
        raw = current.extract_text() or ""
        cleaned = detext_hyphenation(normalize_text(raw))
        if len(cleaned) > 20:
            yield cleaned, page_no
        del current
        if (page_no + 1) % 10 == 0:
            gc.collect()
    del reader
    gc.collect()

def stream_and_chunk_texts(text_iter, doc_label, prefix, add_page=False,
                           chunk_size=600, overlap=100, progress_every=200):
    """Chunk every text unit from ``text_iter`` and append the chunks to the dataset.

    Args:
        text_iter: Iterable of strings, or of ``(text, page_index)`` pairs
            when ``add_page`` is true.
        doc_label: Value stored under the ``"doc"`` key of every record.
        prefix: Prefix of the generated ``chunk_id``.
        add_page: If true, each item is unpacked as ``(text, page_index)`` and
            the page index is stored under ``"page"``.
        chunk_size, overlap: Forwarded to ``chunk_tokens``.
        progress_every: Print a progress line every this many units; a falsy
            value disables progress output.

    Each record is written immediately through ``write_record``. The
    ``chunk_id`` is ``"{prefix}_{unit}_{chunk}"``. For paged input ``unit`` is
    the page index rather than the per-call counter, so ids stay unique when a
    document is processed in several slices (each call restarts its counter
    at 0, which previously produced colliding ids across slices).
    """
    written = 0
    for idx, item in enumerate(text_iter):
        if add_page:
            text, page_idx = item
            meta = {"page": int(page_idx)}
            unit_id = int(page_idx)  # globally unique across slice calls
        else:
            text = item
            meta = {}
            unit_id = idx
        # chunk and write each unit immediately
        for j, c in enumerate(chunk_tokens(text, chunk_size=chunk_size, overlap=overlap)):
            rec = {"doc": doc_label, "chunk_id": f"{prefix}_{unit_id}_{j}", "text": c}
            rec.update(meta)
            write_record(rec)
            written += 1
        # guard against progress_every=0, which would raise ZeroDivisionError
        if progress_every and (idx + 1) % progress_every == 0:
            print(f"{doc_label}: processed {idx+1} units, ~{written} chunks so far")
            gc.collect()
    print(f"{doc_label}: done.")


In [None]:
# Driver cell: rebuilds /content/chunks.jsonl from the two source documents.
# Make sure your source files are uploaded to /content/ first.
DOCX_PATH = "/content/EU AI Act Doc (1) (3).docx"
PDF_PATH  = "/content/Attention_is_all_you_need (1) (3).pdf"

# Start fresh: write_record appends, so a leftover file would duplicate records
if os.path.exists(OUT_PATH):
    os.remove(OUT_PATH)

# DOCX first (paragraph by paragraph); records carry no "page" key
stream_and_chunk_texts(
    text_iter=stream_docx_paragraphs(DOCX_PATH),
    doc_label="EU_AI_Act",
    prefix="eu",
    add_page=False,
    chunk_size=600, overlap=100
)

# PDF next (in small slices to keep RAM low)
# Open the PDF once only to learn the page count, then release the reader;
# stream_pdf_pages re-opens the file for every slice.
reader = PdfReader(PDF_PATH)
total_pages = len(reader.pages)
del reader; gc.collect()

SLICE = 25  # reduce to 15 if memory is tight
start = 0
while start < total_pages:
    # [start, end) is the half-open page range handled in this iteration
    end = min(start + SLICE, total_pages)
    print(f"Processing PDF pages {start}-{end-1}")
    stream_and_chunk_texts(
        text_iter=stream_pdf_pages(PDF_PATH, page_start=start, page_end=end),
        doc_label="Transformer",
        prefix="tr",
        add_page=True,
        chunk_size=600, overlap=100
    )
    start = end; gc.collect()

# Sanity check: file size and the first three records
!ls -lh /content/chunks.jsonl
!head -n 3 /content/chunks.jsonl


EU_AI_Act: done.
Processing PDF pages 0-14
Transformer: done.
-rw-r--r-- 1 root root 63K Nov  9 10:23 /content/chunks.jsonl
{"doc": "EU_AI_Act", "chunk_id": "eu_0_0", "text": "High-level summary of the AI Act"}
{"doc": "EU_AI_Act", "chunk_id": "eu_1_0", "text": "Updated on 30 May in accordance with the Corrigendum version of the AI Act."}
{"doc": "EU_AI_Act", "chunk_id": "eu_2_0", "text": "In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you regardless of who you are. We provide links to the original document where relevant so that you can always reference the Act text."}


In [None]:
from google.colab import files
files.download("/content/chunks.jsonl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np, pandas as pd, faiss, torch, json, os, gc

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device=device)

def embed(texts, batch_size=64):
    """Encode ``texts`` with the module-level ``model`` on ``device``.

    Returns whatever ``model.encode`` returns for NumPy output with
    normalised embeddings; a progress bar is always requested.
    """
    encode_options = dict(
        batch_size=batch_size,         # lower to 32/16 if OOM
        convert_to_numpy=True,
        normalize_embeddings=True,     # cosine via inner product in FAISS
        show_progress_bar=True,
        device=device,
    )
    return model.encode(texts, **encode_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
CHUNKS_PATH = "/content/chunks.jsonl"
assert os.path.exists(CHUNKS_PATH), "chunks.jsonl not found. Run the streaming step first."

# Count records up front so the memmap and metadata arrays can be pre-sized.
# The context manager closes the handle right away instead of leaving an
# anonymous open file for the garbage collector.
with open(CHUNKS_PATH, "r", encoding="utf-8") as chunks_file:
    N = sum(1 for _ in chunks_file)
print("Total chunks:", N)
# A zero-row memmap cannot be created; fail with an actionable message instead.
assert N > 0, "chunks.jsonl is empty. Run the streaming step first."

d = model.get_sentence_embedding_dimension()
print("Embedding dim:", d)

# Disk-backed (N, d) float32 matrix that receives the embeddings batch by batch
EMB_PATH = "/content/embeddings.memmap"
emb_mmap = np.memmap(EMB_PATH, dtype="float32", mode="w+", shape=(N, d))

# minimal metadata, one slot per chunk (page stays -1 when a record has no page)
ids = np.empty(N, dtype=object)
docs = np.empty(N, dtype=object)
pages = np.full(N, -1, dtype=np.int32)
previews = np.empty(N, dtype=object)


Total chunks: 114
Embedding dim: 384


In [None]:
BATCH = 1000  # reduce to 500 if memory is tight
buf_texts, buf_meta = [], []
w = 0

def flush_buffer():
    """Embed the buffered records and store vectors plus metadata at offset ``w``.

    Does nothing when the buffer is empty. Otherwise the vectors are written
    to ``emb_mmap[w:w+n]``, the per-row metadata arrays are filled (``page``
    defaults to -1; the preview is the first 140 characters of the text with
    newlines replaced by spaces), ``w`` advances by ``n``, both buffers are
    rebound to fresh empty lists, and memory is released.
    """
    global w, buf_texts, buf_meta
    if len(buf_texts) == 0:
        return
    vectors = embed(buf_texts, batch_size=64).astype("float32")
    count = len(vectors)
    emb_mmap[w:w + count] = vectors
    for row, record in enumerate(buf_meta, start=w):
        ids[row] = record["chunk_id"]
        docs[row] = record["doc"]
        pages[row] = record.get("page", -1)
        previews[row] = record["text"][:140].replace("\n", " ")
    w = w + count
    buf_texts = []
    buf_meta = []
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Stream the dataset, embedding BATCH records at a time to bound memory use
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        buf_texts.append(rec["text"])
        buf_meta.append(rec)
        if len(buf_texts) >= BATCH:
            flush_buffer()
    flush_buffer()  # embed the final, possibly partial, batch

emb_mmap.flush()
# Every pre-allocated row must have been filled. If N is stale (chunks.jsonl was
# regenerated after allocation), the remaining rows would otherwise be indexed
# as zero vectors with empty metadata.
assert w == N, f"Embedded {w} vectors but {N} rows were allocated; re-run the allocation cell."
print("Embedded:", w, "vectors saved to", EMB_PATH)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Embedded: 114 vectors saved to /content/embeddings.memmap


In [None]:
# Exact inner-product index over the d-dimensional embeddings
index = faiss.IndexFlatIP(d)     # cosine equals inner product because embed() normalizes the vectors
index.add(emb_mmap)
print("FAISS index size:", index.ntotal)

# Persist the index so later cells can reload it with faiss.read_index
FAISS_PATH = "/content/index.faiss"
faiss.write_index(index, FAISS_PATH)
print("Saved index:", FAISS_PATH)

# One metadata row per vector; "idx" is the row position 0..N-1, i.e. the
# position at which the matching vector was added to the index above.
meta_df = pd.DataFrame({
    "idx": np.arange(N),
    "chunk_id": ids,
    "doc": docs,
    "page": pages,          # -1 where the source record had no "page" key
    "preview": previews     # first 140 characters of the chunk text
})
META_PATH = "/content/meta.csv"
meta_df.to_csv(META_PATH, index=False)
print("Saved meta:", META_PATH)


FAISS index size: 114
Saved index: /content/index.faiss
Saved meta: /content/meta.csv


In [None]:
# Reload fresh (proves persistence works)
index = faiss.read_index(FAISS_PATH)
meta = pd.read_csv(META_PATH)

def search(query, k=5):
    """Return up to ``k`` nearest chunks for ``query`` as a DataFrame.

    The query is embedded with ``embed`` and searched in the module-level
    FAISS ``index``; rows are taken from ``meta`` by position.

    Returns:
        DataFrame with columns ``doc``, ``page``, ``score``, ``preview`` in
        descending score order. Fewer than ``k`` rows are returned when the
        index holds fewer than ``k`` vectors.
    """
    q = embed([query], batch_size=1).astype("float32")
    D, I = index.search(q, k)
    # FAISS pads missing neighbours with label -1; without this filter
    # meta.iloc[-1] would silently return the last row as a bogus hit.
    found = I[0] >= 0
    out = meta.iloc[I[0][found]].copy()
    out["score"] = D[0][found]
    return out[["doc","page","score","preview"]]

display(search("What are the AI risk categories in the EU AI Act?", k=5))
print("----")
display(search("Explain scaled dot-product attention.", k=5))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,doc,page,score,preview
4,EU_AI_Act,-1,0.825402,The AI Act classifies AI according to its risk:
38,EU_AI_Act,-1,0.721808,Some AI systems are considered 'High risk' und...
0,EU_AI_Act,-1,0.695345,High-level summary of the AI Act
2,EU_AI_Act,-1,0.693697,In this article we provide you with a high-lev...
79,EU_AI_Act,-1,0.677369,How will the AI Act be implemented?


----


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,doc,page,score,preview
94,Transformer,3,0.594114,Scaled Dot-Product Attention Multi-Head Attent...
111,Transformer,12,0.482228,Attention Visualizations Input-Input Layer5 It...
112,Transformer,13,0.413221,Input-Input Layer5 The Law will never be perfe...
95,Transformer,4,0.389805,output values. These are concatenated and once...
106,Transformer,9,0.36694,of the art. In the former task our best model...


In [None]:
from google.colab import files
files.download(FAISS_PATH)
files.download(META_PATH)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
DOCX_PATH = "/content/EU AI Act Doc (1) (3).docx"
PDF_PATH  = "/content/Attention_is_all_you_need (1) (3).pdf"


In [None]:
import os

print(os.path.exists("/content/EU AI Act Doc (1) (3).docx"))
print(os.path.exists("/content/Attention_is_all_you_need (1) (3).pdf"))


True
True


In [None]:
from docx import Document
from PyPDF2 import PdfReader

# --- DOCX (EU AI Act) ---
def read_docx(path):
    """Return the normalised text of every DOCX paragraph longer than 20 characters."""
    cleaned = (normalize_text(par.text) for par in Document(path).paragraphs)
    # keep only paragraphs with real content; short/empty lines are dropped
    return [text for text in cleaned if len(text) > 20]

# --- PDF (Attention Is All You Need) ---
def read_pdf(path):
    """Return the cleaned text of every PDF page longer than 20 characters.

    Each page is normalised and then de-hyphenated, matching the other PDF
    loaders in this notebook, so words split across line breaks
    ("atten-\\ntion") are rejoined before chunking.
    """
    pdf = PdfReader(path)
    texts = []
    for page in pdf.pages:
        # extract_text() may return None for pages without a text layer
        t = normalize_text(page.extract_text() or "")
        t = detext_hyphenation(t)
        if len(t) > 20:
            texts.append(t)
    return texts

docx_texts = read_docx(DOCX_PATH)
pdf_texts = read_pdf(PDF_PATH)

print(f"EU AI Act paragraphs: {len(docx_texts)}")
print(f"Transformer paper pages: {len(pdf_texts)}")
print("\nSample cleaned paragraph:\n", docx_texts[5][:500])


EU AI Act paragraphs: 89
Transformer paper pages: 15

Sample cleaned paragraph:
 Unacceptable risk is prohibited (e.g. social scoring systems and manipulative AI).


In [None]:
# Example: chunk a few cleaned paragraphs
for i, text in enumerate(docx_texts[:2]):
    chunks = list(chunk_tokens(text, chunk_size=600, overlap=100))
    print(f"Paragraph {i} → {len(chunks)} chunks")
    print(chunks[0][:300], "\n---\n")


Paragraph 0 → 1 chunks
High-level summary of the AI Act 
---

Paragraph 1 → 1 chunks
Updated on 30 May in accordance with the Corrigendum version of the AI Act. 
---



In [None]:
OUT_PATH = "/content/chunks.jsonl"  # destination of the JSONL chunk dataset

def write_record(rec, path=OUT_PATH):
    """Append one record to ``path`` as a UTF-8 JSON line (non-ASCII kept verbatim)."""
    with open(path, "a", encoding="utf-8") as out:
        # print() appends the same single "\n" terminator after the JSON text
        print(json.dumps(rec, ensure_ascii=False), file=out)

# Reset old file (write_record appends, so stale records would be duplicated)
if os.path.exists(OUT_PATH):
    os.remove(OUT_PATH)

# Process both documents
# chunk_id is "<prefix>_<unit index>_<chunk index>", where the unit index is the
# position in docx_texts / pdf_texts after short items were filtered out.
for i, text in enumerate(docx_texts):
    for j, chunk in enumerate(chunk_tokens(text, chunk_size=600, overlap=100)):
        rec = {"doc":"EU_AI_Act", "chunk_id":f"eu_{i}_{j}", "text":chunk}
        write_record(rec)

# NOTE: these Transformer records carry no "page" key.
for i, text in enumerate(pdf_texts):
    for j, chunk in enumerate(chunk_tokens(text, chunk_size=600, overlap=100)):
        rec = {"doc":"Transformer", "chunk_id":f"tr_{i}_{j}", "text":chunk}
        write_record(rec)

!head -n 3 /content/chunks.jsonl


{"doc": "EU_AI_Act", "chunk_id": "eu_0_0", "text": "High-level summary of the AI Act"}
{"doc": "EU_AI_Act", "chunk_id": "eu_1_0", "text": "Updated on 30 May in accordance with the Corrigendum version of the AI Act."}
{"doc": "EU_AI_Act", "chunk_id": "eu_2_0", "text": "In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you regardless of who you are. We provide links to the original document where relevant so that you can always reference the Act text."}


In [None]:
!ls -lh /content/chunks.jsonl
!head -n 2 /content/chunks.jsonl


-rw-r--r-- 1 root root 62K Nov  9 10:25 /content/chunks.jsonl
{"doc": "EU_AI_Act", "chunk_id": "eu_0_0", "text": "High-level summary of the AI Act"}
{"doc": "EU_AI_Act", "chunk_id": "eu_1_0", "text": "Updated on 30 May in accordance with the Corrigendum version of the AI Act."}


In [None]:
from google.colab import files
files.download("/content/chunks.jsonl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# === Data Preparation: cleaning, text normalisation, chunking (one cell) ===
# Installs (safe to re-run)
!pip -q install python-docx PyPDF2 tiktoken unidecode

# --- Config: change these if your filenames differ ---
DOCX_PATH = "/content/EU AI Act Doc (1) (3).docx"
PDF_PATH  = "/content/Attention_is_all_you_need (1) (3).pdf"
OUT_PATH  = "/content/chunks.jsonl"

# --- Imports & utilities ---
import os, re, json, gc, unicodedata
from docx import Document
from PyPDF2 import PdfReader
from unidecode import unidecode
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def normalize_text(s: str) -> str:
    """Text normalisation + basic cleaning.

    ``None``/empty input becomes ``""``. The text is transliterated to ASCII,
    NFC-normalised, stripped of non-breaking spaces, and has its whitespace
    tidied (runs of spaces/tabs collapsed, newlines trimmed, ends stripped).
    """
    text = unicodedata.normalize("NFC", unidecode(s or ""))
    text = text.replace("\u00A0", " ")
    whitespace_rules = (
        (r"[ \t]+", " "),        # collapse spaces
        (r"\s*\n\s*", "\n"),     # tidy newlines
    )
    for pattern, replacement in whitespace_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def dehyphenate_pdf(s: str) -> str:
    """Join words broken by line hyphenation (common in PDFs).

    Two passes run in order: first a hyphen followed by whitespace containing
    a newline ("hydrox-\\n yapatite"), then a hyphen followed by any
    whitespace ("hydrox- yapatite"). Both yield "hydroxyapatite".

    NOTE: the second pass also joins suspended hyphens, e.g. "pre- and"
    becomes "preand".
    """
    passes = (
        r"(\w)-\s*\n\s*(\w)",   # hyphen at a line break
        r"(\w)-\s+(\w)",        # hyphen followed by plain whitespace
    )
    for pattern in passes:
        s = re.sub(pattern, r"\1\2", s)
    return s

def chunk_tokens(text: str, chunk_size=600, overlap=100):
    """Token-based chunking with overlap for context continuity.

    Args:
        text: Text to split; tokenised with the module-level ``enc``.
        chunk_size: Maximum tokens per chunk; must be positive.
        overlap: Tokens shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size`` so every window moves forward.

    Yields:
        Decoded chunk strings in order; nothing for empty text.

    Raises:
        ValueError: For a size/overlap pair that cannot advance. Because this
            is a generator, the error is raised on the first iteration.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        # Without this check overlap >= chunk_size loops forever on any text
        # longer than a single chunk.
        raise ValueError(
            f"need chunk_size > 0 and 0 <= overlap < chunk_size, got chunk_size={chunk_size}, overlap={overlap}"
        )
    ids = enc.encode(text)
    total = len(ids)
    i = 0
    while i < total:
        j = min(i + chunk_size, total)
        yield enc.decode(ids[i:j])
        if j == total:
            break
        i = j - overlap  # strictly greater than the previous i given the check above

def write_record(rec, path=OUT_PATH):
    """Append ``rec`` to the JSONL file at ``path`` (UTF-8, one object per line)."""
    with open(path, "a", encoding="utf-8") as jsonl:
        jsonl.write("".join((json.dumps(rec, ensure_ascii=False), "\n")))

# --- Safety checks for files ---
assert os.path.exists(DOCX_PATH), f"Missing file: {DOCX_PATH}"
assert os.path.exists(PDF_PATH),  f"Missing file: {PDF_PATH}"

# --- Start fresh output ---
if os.path.exists(OUT_PATH):
    os.remove(OUT_PATH)

# --- 1) LOAD + NORMALISE + CLEAN ---
# DOCX (paragraph-by-paragraph)
doc = Document(DOCX_PATH)
docx_texts = []  # cleaned paragraph strings, in document order
for p in doc.paragraphs:
    t = normalize_text(p.text)
    if len(t) > 20:                 # drop very short/empty lines
        docx_texts.append(t)
del doc; gc.collect()               # release the parsed document before loading the PDF

# PDF (page-by-page)
pdf = PdfReader(PDF_PATH)
pdf_texts = []   # (cleaned page text, 0-based page index) pairs
for i, page in enumerate(pdf.pages):
    t = page.extract_text() or ""   # fall back to "" when extract_text() returns a falsy value
    t = normalize_text(t)
    t = dehyphenate_pdf(t)          # must follow normalize_text, which tidies the newlines it matches on
    if len(t) > 20:
        pdf_texts.append((t, i))    # keep page number for metadata
del pdf; gc.collect()

print(f" Loaded & cleaned: {len(docx_texts)} DOCX paragraphs, {len(pdf_texts)} PDF pages\n")

# --- Show SMALL samples for your report ---
if docx_texts:
    print("Sample cleaned DOCX paragraph:\n", docx_texts[0][:400], "\n---\n")
if pdf_texts:
    print("Sample cleaned PDF page text:\n", pdf_texts[0][0][:400], "\n---\n")

# --- 2) CHUNKING + WRITE DATASET (JSONL) ---
# DOCX chunks
# chunk_id is "eu_<paragraph position in docx_texts>_<chunk position>"
doc_chunks = 0
for i, text in enumerate(docx_texts):
    for j, c in enumerate(chunk_tokens(text, chunk_size=600, overlap=100)):
        write_record({"doc":"EU_AI_Act", "chunk_id":f"eu_{i}_{j}", "text":c})
        doc_chunks += 1

# PDF chunks (with page metadata)
# chunk_id uses the position i in pdf_texts, while "page" stores the original
# page index; the two differ whenever a short page was filtered out earlier.
pdf_chunks = 0
for i, (text, page_idx) in enumerate(pdf_texts):
    for j, c in enumerate(chunk_tokens(text, chunk_size=600, overlap=100)):
        write_record({"doc":"Transformer", "chunk_id":f"tr_{i}_{j}", "page":int(page_idx), "text":c})
        pdf_chunks += 1

print(f" Chunked & saved: {doc_chunks} DOCX chunks, {pdf_chunks} PDF chunks")
print(f"Output file: {OUT_PATH}\n")

# --- Show SMALL samples from output for your submission ---
!ls -lh /content/chunks.jsonl
print("\nFirst 2 JSONL records:")
!head -n 2 /content/chunks.jsonl

# Also show example chunking for report (first paragraph/page only)
if docx_texts:
    example_chunks = list(chunk_tokens(docx_texts[0], chunk_size=600, overlap=100))
    print(f"\nExample: DOCX paragraph 0 -> {len(example_chunks)} chunks; preview of chunk 0:\n{example_chunks[0][:300]}")
if pdf_texts:
    example_chunks_pdf = list(chunk_tokens(pdf_texts[0][0], chunk_size=600, overlap=100))
    print(f"\nExample: PDF page 0 -> {len(example_chunks_pdf)} chunks; preview of chunk 0:\n{example_chunks_pdf[0][:300]}")


✅ Loaded & cleaned: 89 DOCX paragraphs, 15 PDF pages

Sample cleaned DOCX paragraph:
 High-level summary of the AI Act 
---

Sample cleaned PDF page text:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani*
Google Brain
avaswani@google.comNoam Shazeer*
Google Brain
noam@google.comNiki Parmar*
Google Research
nikip@google.comJakob Uszkoreit*
Google Research
usz@google.com
Llion Jones*
Goog 
---

✅ Chunked & saved: 89 DOCX chunks, 25 PDF chunks
Output file: /content/chunks.jsonl

-rw-r--r-- 1 root root 63K Nov  9 10:25 /content/chunks.jsonl

First 2 JSONL records:
{"doc": "EU_AI_Act", "chunk_id": "eu_0_0", "text": "High-level summary of the AI Act"}
{"doc": "EU_AI_Act", "chunk_id": "eu_1_0", "text": "Updated on 30 May in accordance with the Corrigendum version of the AI Act."}

Example: DOCX paragraph 0 -> 1 chunks; preview o

In [None]:
import os
os.makedirs("eval", exist_ok=True)
print("Created eval/ folder")


Created eval/ folder


In [None]:
import json

queries = [
  # EU AI Act (policy/regulation)
  {"id":"q01","doc_hint":"EU_AI_Act","type":"list",
   "query":"What are the AI risk categories defined by the EU AI Act?"},
  {"id":"q02","doc_hint":"EU_AI_Act","type":"list",
   "query":"List the prohibited AI practices under the EU AI Act."},
  {"id":"q03","doc_hint":"EU_AI_Act","type":"obligations",
   "query":"Name three core obligations for providers of high-risk AI systems."},
  {"id":"q04","doc_hint":"EU_AI_Act","type":"timeline",
   "query":"When do the obligations for high-risk AI systems start to apply?"},
  {"id":"q05","doc_hint":"EU_AI_Act","type":"definition",
   "query":"What qualifies a GPAI model as creating systemic risk under the Act?"},

  # Transformer paper (technical/ML)
  {"id":"q06","doc_hint":"Transformer","type":"why",
   "query":"Why does the Transformer remove recurrence and convolution?"},
  {"id":"q07","doc_hint":"Transformer","type":"definition",
   "query":"Explain scaled dot-product attention."},
  {"id":"q08","doc_hint":"Transformer","type":"mechanism",
   "query":"What is multi-head attention and why is it useful?"},
  {"id":"q09","doc_hint":"Transformer","type":"mechanism",
   "query":"How do sinusoidal positional encodings work?"},
  {"id":"q10","doc_hint":"Transformer","type":"numbers",
   "query":"State the base model’s typical hyperparameters (e.g., d_model, heads)."}
]

with open("eval/queries.jsonl", "w", encoding="utf-8") as f:
    for q in queries:
        f.write(json.dumps(q, ensure_ascii=False) + "\n")

print("Saved eval/queries.jsonl with", len(queries), "queries.")


Saved eval/queries.jsonl with 10 queries.


In [None]:
rationale = """# Rationale for Query Selection

- **Breadth across domains:** 5 queries from the EU AI Act (policy/regulation) and 5 from “Attention Is All You Need” (technical ML) to test retrieval on heterogeneous language and structure.
- **Varied answer types:** definitions (q05, q07), lists (q01, q02), mechanisms/how (q08–q09), timelines (q04), and numeric facts (q10).
- **Retrieval difficulty mix:** short exact answers (q10) vs. multi-sentence explanations (q03, q08–q09) to exercise chunking and top-k retrieval.
- **Verifiability:** each query targets a specific, checkable passage in the selected documents, reducing ambiguity and hallucination risk.
- **Consistency:** this exact file (`eval/queries.jsonl`) will be used unchanged for all experiments and reports in later tasks.
"""
# Use a context manager so the handle is flushed and closed deterministically;
# a bare open(...).write(...) leaves closing to the garbage collector.
with open("eval/rationale.md", "w", encoding="utf-8") as rationale_file:
    rationale_file.write(rationale)
print("Saved eval/rationale.md")


Saved eval/rationale.md


In [None]:
import json

# Enrich selected queries with expected keywords and overwrite the file (optional)
expected_by_id = {
    "q01": ["unacceptable","high-risk","limited-risk","minimal-risk"],
    "q02": ["prohibited"],  # you can refine later
    "q07": ["softmax","query","key","value","dot-product","scale"],
}

enriched = []
for q in queries:
    keywords = expected_by_id.get(q["id"])
    if keywords is not None:
        q["expected_keywords"] = keywords  # mutates the dict held in `queries`
    enriched.append(q)

with open("eval/queries.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in enriched)

print("Updated eval/queries.jsonl with expected_keywords for a few items.")


Updated eval/queries.jsonl with expected_keywords for a few items.


In [None]:
!ls -lh eval/
!head -n 3 eval/queries.jsonl
!sed -n '1,80p' eval/rationale.md


total 8.0K
-rw-r--r-- 1 root root 1.5K Nov  9 10:25 queries.jsonl
-rw-r--r-- 1 root root  786 Nov  9 10:25 rationale.md
{"id": "q01", "doc_hint": "EU_AI_Act", "type": "list", "query": "What are the AI risk categories defined by the EU AI Act?", "expected_keywords": ["unacceptable", "high-risk", "limited-risk", "minimal-risk"]}
{"id": "q02", "doc_hint": "EU_AI_Act", "type": "list", "query": "List the prohibited AI practices under the EU AI Act.", "expected_keywords": ["prohibited"]}
{"id": "q03", "doc_hint": "EU_AI_Act", "type": "obligations", "query": "Name three core obligations for providers of high-risk AI systems."}
# Rationale for Query Selection

- **Breadth across domains:** 5 queries from the EU AI Act (policy/regulation) and 5 from “Attention Is All You Need” (technical ML) to test retrieval on heterogeneous language and structure.
- **Varied answer types:** definitions (q05, q07), lists (q01, q02), mechanisms/how (q08–q09), timelines (q04), and numeric facts (q10).
- **Retrie

In [None]:
from google.colab import files
files.download("eval/queries.jsonl")
files.download("eval/rationale.md")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json

def load_queries(path="eval/queries.jsonl"):
    """Load evaluation queries from a JSONL file.

    Args:
        path: File with one JSON object per line.

    Returns:
        List of parsed objects in file order. Blank or whitespace-only lines
        (e.g. a trailing empty line) are skipped instead of crashing the parser.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

QUERIES = load_queries()
print("Loaded", len(QUERIES), "queries:", [q["id"] for q in QUERIES])


Loaded 10 queries: ['q01', 'q02', 'q03', 'q04', 'q05', 'q06', 'q07', 'q08', 'q09', 'q10']


In [None]:
!pip -q install sentence-transformers faiss-cpu rank-bm25


In [None]:
import faiss, pandas as pd, numpy as np, json
from sentence_transformers import SentenceTransformer
import torch, os

FAISS_PATH = "/content/index.faiss"
META_PATH  = "/content/meta.csv"
CHUNKS_PATH= "/content/chunks.jsonl"

assert os.path.exists(FAISS_PATH) and os.path.exists(META_PATH)

index = faiss.read_index(FAISS_PATH)
meta  = pd.read_csv(META_PATH)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

def embed(qs, batch_size=32):
    """Encode ``qs`` into normalised NumPy embeddings with the module-level ``model``.

    Args:
        qs: List of strings to encode.
        batch_size: Encoding batch size. The parameter keeps this redefinition
            call-compatible with the earlier ``embed(texts, batch_size=...)``,
            which ``search()`` still calls with ``batch_size=1``; 32 is the
            sentence-transformers default, so omitting it behaves as before.
    """
    return model.encode(
        qs,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
        device=device,
    )


In [None]:
def dense_search(query, k=5):
    """Dense (embedding) retrieval of up to ``k`` chunks for ``query``.

    Returns:
        DataFrame with columns ``doc``, ``page``, ``score``, ``preview``,
        ``idx``, ``chunk_id`` in descending score order. Fewer than ``k`` rows
        are returned when the index holds fewer than ``k`` vectors.
    """
    q = embed([query]).astype("float32")
    D, I = index.search(q, k)
    # FAISS reports missing neighbours as label -1; drop them so that
    # meta.iloc[-1] (the last row) is not returned as a false hit.
    found = I[0] >= 0
    out = meta.iloc[I[0][found]].copy()
    out["score"] = D[0][found]
    cols = ["doc","page","score","preview","idx","chunk_id"]
    return out[cols]


In [None]:
dense_search("Explain scaled dot-product attention.", k=5)


Unnamed: 0,doc,page,score,preview,idx,chunk_id
94,Transformer,3,0.594114,Scaled Dot-Product Attention Multi-Head Attent...,94,tr_3_0
111,Transformer,12,0.482228,Attention Visualizations Input-Input Layer5 It...,111,tr_12_0
112,Transformer,13,0.413221,Input-Input Layer5 The Law will never be perfe...,112,tr_13_0
95,Transformer,4,0.389805,output values. These are concatenated and once...,95,tr_4_0
106,Transformer,9,0.36694,of the art. In the former task our best model...,106,tr_9_1


In [None]:
from rank_bm25 import BM25Okapi
import re

# Load corpus (text + ids)
# texts[n] and ids[n] describe the n-th record of chunks.jsonl; BM25 scores are
# later returned in this same order.
# NOTE: this rebinds the module-level name `ids` to a plain list.
texts, ids = [], []
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        texts.append(rec["text"])
        ids.append(rec.get("chunk_id"))  # None when a record has no chunk_id

# Simple tokenizer
def tok(s):
    """Lower-case ``s`` and return its runs of word characters as a list."""
    return [match.group(0) for match in re.finditer(r"\w+", s.lower())]
tokenized_corpus = [tok(t) for t in texts]
bm25 = BM25Okapi(tokenized_corpus)

# Map chunk_id -> positional row in meta (for quick join via meta.iloc).
# The position comes from enumerate(); the previous comprehension never bound
# `i`, so it picked up a stale global loop variable and mapped every chunk_id
# to the same row.
id2row = {cid: pos for pos, cid in enumerate(meta["chunk_id"])}


In [None]:
def bm25_search(query, k=100):
    """Return the ``k`` best BM25 matches for ``query`` as rows of ``meta``.

    Corpus positions are ranked by descending BM25 score, translated to
    ``meta`` row positions through ``ids`` and ``id2row``, and returned with
    an added ``bm25_score`` column.
    """
    all_scores = bm25.get_scores(tok(query))
    ranked = np.argsort(all_scores)[::-1]
    top = ranked[:k]
    meta_rows = []
    for corpus_pos in top:
        meta_rows.append(id2row[ids[corpus_pos]])
    result = meta.iloc[meta_rows].copy()
    result["bm25_score"] = all_scores[top]
    return result


In [None]:
def rrf_fusion(query, k=10, k_dense=50, k_bm25=200, c=60):
    """Fuse dense and BM25 rankings with Reciprocal Rank Fusion (RRF).

    Args:
        query: Query string.
        k: Number of fused results to return.
        k_dense: Candidates requested from the FAISS index.
        k_bm25: Candidates requested from ``bm25_search``.
        c: RRF smoothing constant; each list contributes ``1 / (c + rank)``.

    Returns:
        DataFrame with columns ``doc``, ``page``, ``rrf_score``, ``preview``,
        ``chunk_id``, ``idx``, ordered by descending fused score.
    """
    absent_rank = 10**9  # rank used for a list that does not contain the candidate

    # dense results
    q = embed([query]).astype("float32")
    D, I = index.search(q, k_dense)
    # FAISS pads with label -1 when fewer than k_dense vectors exist; those
    # must not be looked up (iloc[-1] would inject the last row as a hit).
    hits = I[0][I[0] >= 0]
    dense_ids = meta.iloc[hits]["chunk_id"].tolist()
    dense_rank = {}
    for r, cid in enumerate(dense_ids, start=1):
        dense_rank.setdefault(cid, r)  # keep the best rank if an id repeats

    # bm25 results
    bm = bm25_search(query, k=k_bm25)
    bm_ids = bm["chunk_id"].tolist()
    bm_rank = {}
    for r, cid in enumerate(bm_ids, start=1):
        bm_rank.setdefault(cid, r)

    # union of candidates; dict.fromkeys keeps first-seen order so that ties
    # are resolved deterministically (a set's iteration order is not stable)
    cand = list(dict.fromkeys(dense_ids + bm_ids))

    # RRF score
    scores = []
    for cid in cand:
        rd = dense_rank.get(cid, absent_rank)
        rb = bm_rank.get(cid, absent_rank)
        scores.append((cid, 1.0/(c+rd) + 1.0/(c+rb)))
    scores.sort(key=lambda x: x[1], reverse=True)
    best = scores[:k]
    top = [cid for cid, _ in best]

    out = meta.set_index("chunk_id").loc[top].reset_index()
    out["rrf_score"] = [s for _, s in best]
    cols = ["doc","page","rrf_score","preview","chunk_id","idx"]
    return out[cols]


In [None]:
rrf_fusion("What are the AI risk categories defined by the EU AI Act?", k=5)


Unnamed: 0,doc,page,rrf_score,preview,chunk_id,idx
0,EU_AI_Act,-1,0.016736,"This applies to users located in the EU, and t...",eu_14_0,14
1,EU_AI_Act,-1,0.016393,The AI Act classifies AI according to its risk:,eu_4_0,4
2,EU_AI_Act,-1,0.016129,Some AI systems are considered 'High risk' und...,eu_38_0,38
3,EU_AI_Act,-1,0.015873,High-level summary of the AI Act,eu_0_0,0
4,EU_AI_Act,-1,0.015625,In this article we provide you with a high-lev...,eu_2_0,2


In [None]:
def mmr(query, candidates_texts, candidates_ids, lambda_=0.7, k=5):
    """Select up to ``k`` candidates by Maximal Marginal Relevance.

    Args:
        query: Query string.
        candidates_texts: Candidate texts; embedded with ``embed``.
        candidates_ids: Identifiers aligned with ``candidates_texts``.
        lambda_: Weight of query relevance; ``1 - lambda_`` weights the
            penalty for similarity to already selected candidates.
        k: Maximum number of ids to return.

    Returns:
        List of selected ids in selection order. Empty when there are no
        candidates or ``k <= 0``.
    """
    if not candidates_texts or k <= 0:
        # argmax over an empty array raises ValueError, so answer directly.
        return []

    q_vec = embed([query])[0]
    C = embed(candidates_texts)

    # Dot products act as cosine similarities, assuming embed() returns
    # unit-length vectors.
    sims_q = C @ q_vec
    sims_C = C @ C.T

    remaining = list(range(len(candidates_texts)))
    # Greedy selection: start with the most relevant candidate
    selected = [int(np.argmax(sims_q))]
    remaining.remove(selected[0])

    target = min(k, len(candidates_texts))
    while len(selected) < target:
        # redundancy = highest similarity to anything already selected
        redundancy = sims_C[np.ix_(remaining, selected)].max(axis=1)
        gain = lambda_ * sims_q[remaining] - (1 - lambda_) * redundancy
        best = remaining[int(np.argmax(gain))]  # first maximum wins on ties
        selected.append(best)
        remaining.remove(best)
    return [candidates_ids[i] for i in selected]

def rrf_mmr(query, k_return=5, pool=50):
    """Re-rank an RRF candidate pool with MMR and return the selected rows.

    ``pool`` fused candidates are fetched, MMR (lambda 0.7) picks ``k_return``
    of them, and the matching rows are returned in MMR selection order with
    ``chunk_id`` as the first column.

    NOTE(review): MMR embeds the ``preview`` column, not the full chunk text;
    confirm that this is intended.
    """
    candidates = rrf_fusion(query, k=pool)
    chosen = mmr(
        query,
        candidates["preview"].fillna("").tolist(),
        candidates["chunk_id"].tolist(),
        lambda_=0.7,
        k=k_return,
    )
    return candidates.set_index("chunk_id").loc[chosen].reset_index()


In [None]:
rrf_mmr("Explain scaled dot-product attention.", k_return=5, pool=50)


Unnamed: 0,chunk_id,doc,page,rrf_score,preview,idx
0,tr_3_0,Transformer,3,0.016393,Scaled Dot-Product Attention Multi-Head Attent...,94
1,tr_1_1,Transformer,1,0.014925,"operations, albeit at the cost of reduced eff...",92
2,tr_6_0,Transformer,6,0.014286,length nis smaller than the representation dim...,99
3,tr_2_0,Transformer,2,0.014706,Figure 1: The Transformer - model architecture...,93
4,tr_4_0,Transformer,4,0.015625,output values. These are concatenated and once...,95


In [None]:
def show(query, method="dense", k=5):
    """Run one retrieval method, print a ranked summary, and return the hits.

    NOTE: a later cell defines another ``show(query, k=5)``; once that cell is
    executed it replaces this function.

    Args:
        query: Query string.
        method: ``"dense"``, ``"rrf"`` or ``"rrf_mmr"``.
        k: Number of results requested.

    Returns:
        The DataFrame produced by the chosen retriever (not re-indexed).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "dense":
        df = dense_search(query, k)
    elif method == "rrf":
        df = rrf_fusion(query, k)
    elif method == "rrf_mmr":
        df = rrf_mmr(query, k_return=k, pool=50)
    else:
        raise ValueError("method must be 'dense' | 'rrf' | 'rrf_mmr'")

    print(f"\nQuery: {query}\nTop-{k} ({method})")
    for i, r in df.reset_index(drop=True).iterrows():
        p = "" if pd.isna(r.get("page", None)) else f" p.{int(r['page'])}"
        # Use the first score column that actually holds a value.
        s = None
        for col in ("score", "rrf_score"):
            val = r.get(col, None)
            if val is not None and not pd.isna(val):
                s = val
                break
        # Formatting None with ".4f" raises TypeError, so show "n/a" instead.
        score_txt = "n/a" if s is None else f"{s:.4f}"
        print(f"{i+1}. [{r['doc']}{p}]  score={score_txt}")
        print("   ", str(r["preview"])[:180].replace("\n"," ") + " …")
    return df

# Examples: one query through hybrid RRF, one through RRF followed by MMR.
# Only the last call's return value is rendered as the cell output.
show("What are the AI risk categories defined by the EU AI Act?", method="rrf", k=5)
show("Explain scaled dot-product attention.", method="rrf_mmr", k=5)



Query: What are the AI risk categories defined by the EU AI Act?
Top-5 (rrf)
1. [EU_AI_Act p.-1]  score=0.0167
    This applies to users located in the EU, and third country users where the AI system's output is used in the EU. …
2. [EU_AI_Act p.-1]  score=0.0164
    The AI Act classifies AI according to its risk: …
3. [EU_AI_Act p.-1]  score=0.0161
    Some AI systems are considered 'High risk' under the AI Act. Providers of those systems will be subject to additional requirements. …
4. [EU_AI_Act p.-1]  score=0.0159
    High-level summary of the AI Act …
5. [EU_AI_Act p.-1]  score=0.0156
    In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you rega …

Query: Explain scaled dot-product attention.
Top-5 (rrf_mmr)
1. [Transformer p.3]  score=0.0164
    Scaled Dot-Product Attention Multi-Head Attention Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of se …
2. [Transfo

Unnamed: 0,chunk_id,doc,page,rrf_score,preview,idx
0,tr_3_0,Transformer,3,0.016393,Scaled Dot-Product Attention Multi-Head Attent...,94
1,tr_1_1,Transformer,1,0.014925,"operations, albeit at the cost of reduced eff...",92
2,tr_6_0,Transformer,6,0.014286,length nis smaller than the representation dim...,99
3,tr_2_0,Transformer,2,0.014706,Figure 1: The Transformer - model architecture...,93
4,tr_4_0,Transformer,4,0.015625,output values. These are concatenated and once...,95


In [None]:
!pip -q install sentence-transformers faiss-cpu

import os, faiss, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
import torch

# Persisted artifacts: the FAISS index and the metadata table read alongside it.
FAISS_PATH = "/content/index.faiss"
META_PATH  = "/content/meta.csv"

# Stop immediately if either artifact is absent.
assert all(os.path.exists(p) for p in (FAISS_PATH, META_PATH)), "index.faiss or meta.csv missing."

# Vector index and the metadata frame that dense_search indexes by position.
index = faiss.read_index(FAISS_PATH)
meta  = pd.read_csv(META_PATH)

# Sentence embedder; placed on the GPU when torch reports CUDA as available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

def embed(qs):
    """Encode ``qs`` with the module-level ``model``.

    Asks ``model.encode`` for NumPy output with normalised embeddings, computed
    on the module-level ``device``.
    """
    encode_options = {
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "device": device,
    }
    return model.encode(qs, **encode_options)


In [None]:
def dense_search(query, k=5):
    """Return up to ``k`` nearest metadata rows for ``query`` with their scores.

    Args:
        query: Query string, embedded with ``embed``.
        k: Number of neighbours requested from the FAISS index.

    Returns:
        DataFrame with columns ``doc, page, score, preview, chunk_id`` taken
        from ``meta`` by position, in the order FAISS returned them. Fewer than
        ``k`` rows are returned when the index yields fewer hits.
    """
    q = embed([query]).astype("float32")
    D, I = index.search(q, k)
    # FAISS pads missing neighbours with id -1. Without this filter,
    # meta.iloc[-1] would silently return the last metadata row for each pad.
    found = I[0] >= 0
    out = meta.iloc[I[0][found]].copy()
    out["score"] = D[0][found]
    return out[["doc","page","score","preview","chunk_id"]]

def show(query, k=5):
    """Print the top-``k`` dense hits for ``query`` and return them re-indexed from 0."""
    hits = dense_search(query, k=k).reset_index(drop=True)
    print(f"\nQuery: {query}\nTop-{k} (dense)")
    for rank, (_, row) in enumerate(hits.iterrows(), start=1):
        page = row["page"]
        # The page tag is omitted when the page value is missing.
        page_tag = f" p.{int(page)}" if not pd.isna(page) else ""
        print(f"{rank}. [{row['doc']}{page_tag}]  score={row['score']:.4f}")
        # One-line snippet: first 180 characters, newlines flattened to spaces.
        snippet = str(row["preview"])[:180].replace("\n"," ")
        print("   ", snippet, "…")
    return hits


In [None]:
# Example (pick any of your eval/queries.jsonl items).
# Prints the ranked hits; the returned DataFrame is rendered as the cell output.
show("What are the AI risk categories defined by the EU AI Act?", k=5)



Query: What are the AI risk categories defined by the EU AI Act?
Top-5 (dense)
1. [EU_AI_Act p.-1]  score=0.8238
    The AI Act classifies AI according to its risk: …
2. [EU_AI_Act p.-1]  score=0.7093
    Some AI systems are considered 'High risk' under the AI Act. Providers of those systems will be subject to additional requirements. …
3. [EU_AI_Act p.-1]  score=0.6673
    High-level summary of the AI Act …
4. [EU_AI_Act p.-1]  score=0.6657
    In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you rega …
5. [EU_AI_Act p.-1]  score=0.6547
    Those that intend to place on the market or put into service high-risk AI systems in the EU, regardless of whether they are based in the EU  …


Unnamed: 0,doc,page,score,preview,chunk_id
0,EU_AI_Act,-1,0.823752,The AI Act classifies AI according to its risk:,eu_4_0
1,EU_AI_Act,-1,0.709343,Some AI systems are considered 'High risk' und...,eu_38_0
2,EU_AI_Act,-1,0.667293,High-level summary of the AI Act,eu_0_0
3,EU_AI_Act,-1,0.665679,In this article we provide you with a high-lev...,eu_2_0
4,EU_AI_Act,-1,0.654711,Those that intend to place on the market or pu...,eu_10_0


In [None]:
!pip -q install rank-bm25
from rank_bm25 import BM25Okapi
import json, re

CHUNKS_PATH = "/content/chunks.jsonl"
assert os.path.exists(CHUNKS_PATH), "chunks.jsonl missing."

# Parallel lists: texts[i] is the chunk text whose id is ids[i].
texts, ids = [], []
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not JSON records; skip them.
            continue
        rec = json.loads(line)
        texts.append(rec["text"])
        ids.append(rec["chunk_id"])

def bm25_tokenize(s):
    """Lower-case ``s`` and split it into runs of word characters."""
    return re.findall(r"\w+", s.lower())

# Backward-compatible alias. The BM25 code below calls bm25_tokenize directly
# because a later cell in this notebook rebinds the name `tok` to a
# Hugging Face tokenizer, which would otherwise break bm25_top_ids.
tok = bm25_tokenize

tokenized = [bm25_tokenize(t) for t in texts]
bm25 = BM25Okapi(tokenized)

# chunk_id -> positional row in meta. The original comprehension read a
# variable `i` that its own loop never bound.
id2row = {cid: i for i, cid in enumerate(meta["chunk_id"])}

def bm25_top_ids(query, k=200):
    """Return the chunk ids of the ``k`` highest-scoring BM25 matches, best first."""
    scores = bm25.get_scores(bm25_tokenize(query))
    idxs = np.argsort(scores)[::-1][:k]
    return [ids[i] for i in idxs]


In [None]:
def rrf_fusion(query, k=5, k_dense=50, k_bm25=200, c=60):
    """Fuse dense and BM25 rankings with Reciprocal Rank Fusion.

    Each candidate scores ``1/(c + dense_rank) + 1/(c + bm25_rank)``. A
    candidate missing from one list gets a very large rank there, so that term
    is effectively zero.

    Args:
        query: Query string.
        k: Number of fused results to return.
        k_dense: Neighbours requested from the FAISS index.
        k_bm25: Candidates requested from ``bm25_top_ids``.
        c: Rank-smoothing constant in the RRF formula.

    Returns:
        DataFrame with columns ``doc, page, rrf_score, preview, chunk_id``,
        best first. Ties are broken by chunk id so results are reproducible.
    """
    # dense
    q = embed([query]).astype("float32")
    _, I = index.search(q, k_dense)
    # FAISS pads missing neighbours with id -1; iloc[-1] would return the last row.
    dense_rows = [int(i) for i in I[0] if i >= 0]
    dense_ids = meta.iloc[dense_rows]["chunk_id"].tolist()
    dense_rank = {cid: r for r, cid in enumerate(dense_ids, start=1)}
    # bm25
    bm_ids = bm25_top_ids(query, k=k_bm25)
    bm_rank = {cid: r for r, cid in enumerate(bm_ids, start=1)}
    # fuse
    missing_rank = 10**9
    scored = [
        (cid, 1/(c + dense_rank.get(cid, missing_rank)) + 1/(c + bm_rank.get(cid, missing_rank)))
        for cid in set(dense_ids) | set(bm_ids)
    ]
    # Sort by score descending, then chunk id. Set iteration order alone would
    # leave tied candidates in an arbitrary order.
    scored.sort(key=lambda pair: (-pair[1], str(pair[0])))
    top = scored[:k]
    df = meta.set_index("chunk_id").loc[[cid for cid, _ in top]].reset_index()
    df["rrf_score"] = [s for _, s in top]
    return df[["doc","page","rrf_score","preview","chunk_id"]]

def show_hybrid(query, k=5):
    """Print the top-``k`` fused (dense + BM25) hits for ``query`` and return them."""
    fused = rrf_fusion(query, k=k).reset_index(drop=True)
    print(f"\nQuery: {query}\nTop-{k} (hybrid RRF: dense+BM25)")
    for rank, (_, row) in enumerate(fused.iterrows(), start=1):
        page = row["page"]
        # The page tag is omitted when the page value is missing.
        page_tag = f" p.{int(page)}" if not pd.isna(page) else ""
        print(f"{rank}. [{row['doc']}{page_tag}]  rrf={row['rrf_score']:.4f}")
        # One-line snippet: first 180 characters, newlines flattened to spaces.
        snippet = str(row["preview"])[:180].replace("\n"," ")
        print("   ", snippet, "…")
    return fused


In [None]:
# Demo: hybrid (dense + BM25) retrieval; the returned frame is rendered as the cell output.
show_hybrid("Explain scaled dot-product attention.", k=5)



Query: Explain scaled dot-product attention.
Top-5 (hybrid RRF: dense+BM25)
1. [Transformer p.3]  rrf=0.0328
    Scaled Dot-Product Attention Multi-Head Attention Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of se …
2. [Transformer p.4]  rrf=0.0318
    output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2. Multi-head attention …
3. [Transformer p.12]  rrf=0.0308
    Attention Visualizations Input-Input Layer5 It is in this spirit that a majority of American governments have passed new laws since 2009 mak …
4. [Transformer p.8]  rrf=0.0301
    9 BLEU worse than the best setting, quality also drops off with too many heads. In Table 3 rows (B), we observe that reducing the attention  …
5. [Transformer p.1]  rrf=0.0299
     operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with  …


Unnamed: 0,doc,page,rrf_score,preview,chunk_id
0,Transformer,3,0.032787,Scaled Dot-Product Attention Multi-Head Attent...,tr_3_0
1,Transformer,4,0.031754,output values. These are concatenated and once...,tr_4_0
2,Transformer,12,0.030835,Attention Visualizations Input-Input Layer5 It...,tr_12_0
3,Transformer,8,0.030118,"9 BLEU worse than the best setting, quality al...",tr_8_1
4,Transformer,1,0.029851,"operations, albeit at the cost of reduced eff...",tr_1_1


In [None]:
# Run the same query through both retrievers and save each result table as CSV.
demo_query = "Explain scaled dot-product attention."
df_dense  = show(demo_query, k=5)
df_hybrid = show_hybrid(demo_query, k=5)
for demo_frame, demo_path in (
    (df_dense, "/content/demo_dense.csv"),
    (df_hybrid, "/content/demo_hybrid.csv"),
):
    demo_frame.to_csv(demo_path, index=False)



Query: Explain scaled dot-product attention.
Top-5 (dense)
1. [Transformer p.3]  score=0.5941
    Scaled Dot-Product Attention Multi-Head Attention Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of se …
2. [Transformer p.12]  score=0.4822
    Attention Visualizations Input-Input Layer5 It is in this spirit that a majority of American governments have passed new laws since 2009 mak …
3. [Transformer p.13]  score=0.4132
    Input-Input Layer5 The Law will never be perfect , but its application should be just - this is what we are missing , in my opinion . <EOS>  …
4. [Transformer p.4]  score=0.3898
    output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2. Multi-head attention …
5. [Transformer p.9]  score=0.3669
     of the art. In the former task our best model outperforms even all previously reported ensembles. We are excited about the future of attent …

Query: Explain scaled dot-p

In [None]:
!pip -q install transformers accelerate sentence-transformers faiss-cpu

import os, faiss, pandas as pd, numpy as np, json, torch, re
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Artifact locations used by the generation cells.
FAISS_PATH, META_PATH, CHUNKS_PATH = (
    "/content/index.faiss",
    "/content/meta.csv",
    "/content/chunks.jsonl",
)

# Only the index and the metadata table are checked here; CHUNKS_PATH is not.
assert all(map(os.path.exists, (FAISS_PATH, META_PATH))), "Missing index.faiss/meta.csv"


In [None]:
# Vector index plus the metadata table that dense_search indexes by position.
index = faiss.read_index(FAISS_PATH)
meta  = pd.read_csv(META_PATH)

# Query embedder; placed on the GPU when torch reports CUDA as available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

def embed(qs):
    """Encode ``qs`` with the module-level ``embedder``.

    Asks ``embedder.encode`` for NumPy output with normalised embeddings,
    computed on the module-level ``device``.
    """
    encode_options = {
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "device": device,
    }
    return embedder.encode(qs, **encode_options)

def dense_search(query, k=5):
    """Return up to ``k`` nearest metadata rows for ``query`` with their scores.

    Args:
        query: Query string, embedded with ``embed``.
        k: Number of neighbours requested from the FAISS index.

    Returns:
        DataFrame with columns ``doc, page, score, preview, chunk_id, idx``
        taken from ``meta`` by position, in the order FAISS returned them.
        Fewer than ``k`` rows are returned when the index yields fewer hits.
    """
    q = embed([query]).astype("float32")
    D, I = index.search(q, k)
    # FAISS pads missing neighbours with id -1. Without this filter,
    # meta.iloc[-1] would silently return the last metadata row for each pad.
    found = I[0] >= 0
    out = meta.iloc[I[0][found]].copy()
    out["score"] = D[0][found]
    return out[["doc","page","score","preview","chunk_id","idx"]]


In [None]:
# Seq2seq generator used to answer questions from the retrieved context.
# NOTE(review): this rebinds the global name `tok`, which the BM25 cell also
# defines as its word-tokenizing function -- confirm the cell order is intended.
model_name = "google/flan-t5-base"     # small, works on CPU; upgrade to flan-t5-large if you have GPU
tok = AutoTokenizer.from_pretrained(model_name)
mdl = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Device argument for the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    gen_device = 0
else:
    gen_device = -1
gen = pipeline("text2text-generation", model=mdl, tokenizer=tok, device=gen_device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def make_context_blocks(df, max_chars=3000):
    """Format retrieved rows as numbered context blocks within a character budget.

    Each block is a header line ``[C<n>] (chunk_id=..., doc=..., page=...)``
    followed by the row's preview text. ``<n>`` is the row's 1-based position
    in ``df``, independent of the DataFrame index labels.

    Args:
        df: Rows with ``preview``, ``chunk_id``, ``doc`` and ``page`` columns.
        max_chars: Budget for the summed block lengths; the first block that
            would exceed it ends the loop.

    Returns:
        ``(blocks, joined)``: the list of block strings and the same blocks
        joined with newlines.
    """
    blocks, total = [], 0
    # Number by position: index labels may not be 0..n-1 (or may not be integers).
    for pos, (_, r) in enumerate(df.iterrows(), start=1):
        text = str(r["preview"]) if pd.notna(r["preview"]) else ""
        page = int(r["page"]) if pd.notna(r["page"]) else -1
        header = f"[C{pos}] (chunk_id={r['chunk_id']}, doc={r['doc']}, page={page})"
        block = f"{header}\n{text}\n"
        if total + len(block) > max_chars:
            break
        blocks.append(block)
        total += len(block)
    return blocks, "\n".join(blocks)

def build_prompt(query, df, max_chars=3000):
    """Build the grounded-answer prompt for ``query`` from retrieved rows.

    Args:
        query: The user question.
        df: Retrieved rows, formatted by ``make_context_blocks``.
        max_chars: Character budget forwarded to ``make_context_blocks``.

    Returns:
        The prompt string: instructions, the context blocks, the question, and
        an answer cue asking for ``[C#]`` citations.
    """
    _, ctx = make_context_blocks(df, max_chars=max_chars)
    prompt = f"""You are a careful assistant. Use ONLY the CONTEXT to answer.
If the answer is not in the context, say: "I don't know based on the provided context."

CONTEXT:
{ctx}

QUESTION:
{query}

ANSWER (cite chunk_ids you used, e.g., [C1], [C2]):
"""
    return prompt


In [None]:
def answer_with_context(query, k=5, max_new_tokens=256, temperature=0.2, do_sample=False):
    """Retrieve context for ``query``, generate an answer, and print a summary.

    Args:
        query: The user question.
        k: Number of chunks to retrieve with ``dense_search``.
        max_new_tokens: Generation length limit passed to the pipeline.
        temperature: Sampling temperature; forwarded only when ``do_sample``
            is true, because greedy decoding ignores it and warns about it.
        do_sample: Whether to sample instead of decoding greedily.

    Returns:
        ``(answer_text, retrieved_df)``: the raw generated text and the
        retrieved rows re-indexed from 0.
    """
    # 1) retrieve
    df = dense_search(query, k=k).reset_index(drop=True)
    # 2) build prompt
    prompt = build_prompt(query, df)
    # 3) generate
    gen_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        gen_kwargs["temperature"] = temperature
    out = gen(prompt, **gen_kwargs)[0]["generated_text"]
    # 4) show answer + which chunks were retrieved, labelled by 1-based position
    used_chunks = [f"[C{pos}] {cid}" for pos, cid in enumerate(df["chunk_id"], start=1)]
    print("Answer:\n", out.strip(), "\n")
    print("Context chunks provided:")
    for uc in used_chunks:
        print("  ", uc)
    return out, df


In [None]:
# The (answer, DataFrame) return value is assigned to `_` so only the printed
# answer and chunk list appear in the cell output.
# Example 1 — EU AI Act
_ = answer_with_context("What are the AI risk categories defined by the EU AI Act?", k=5)

# Example 2 — Transformer paper
_ = answer_with_context("Explain scaled dot-product attention.", k=5)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
 [C2] (chunk_id=eu_38_0, doc=EU_AI_Act, page=-1) Some AI systems are considered 'High risk' under the AI Act. Providers of those systems will be subject to additional requirements. [C3] (chunk_id=eu_0_0, doc=EU_AI_Act, page=-1) High-level summary of the AI Act [C4] (chunk_id=eu_2_0, doc=EU_AI_Act, page=-1) In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you rega [C5] (chunk_id=eu_10_0, doc=EU_AI_Act, page=-1) Those that intend to place on the market or put into service high-risk AI systems in the EU, regardless of whether they are based in the EU 

Context chunks provided:
   [C1] eu_4_0
   [C2] eu_38_0
   [C3] eu_0_0
   [C4] eu_2_0
   [C5] eu_10_0
Answer:
 [C1] (chunk_id=tr_3_0, doc=Transformer, page=3) Scaled Dot-Product Attention Multi-Head Attention consists of se [C2] (chunk_id=tr_12_0, doc=Transformer, page=12) Attention Visualizations Input-Input Layer5 It is in this spirit that a majorit

In [None]:

import json, os, pandas as pd
# Generate an answer for every evaluation query and save them as one CSV.
qs_path = "/content/eval/queries.jsonl"
if os.path.exists(qs_path):
    # Close the file deterministically and ignore blank lines in the JSONL.
    with open(qs_path, "r", encoding="utf-8") as fh:
        queries = [json.loads(l) for l in fh if l.strip()]
    rows = []
    for q in queries:
        ans, df = answer_with_context(q["query"], k=5)
        # "id" is treated as optional, matching evaluate().
        rows.append({"id": q.get("id", ""), "query": q["query"], "answer": ans})
    pd.DataFrame(rows).to_csv("/content/generation_answers.csv", index=False)
    print("\nSaved: /content/generation_answers.csv")
else:
    # Say so explicitly instead of silently producing no file.
    print(f"Skipped: {qs_path} not found; no answers generated.")

Answer:
 [C2] (chunk_id=eu_38_0, doc=EU_AI_Act, page=-1) Some AI systems are considered 'High risk' under the AI Act. Providers of those systems will be subject to additional requirements. [C3] (chunk_id=eu_0_0, doc=EU_AI_Act, page=-1) High-level summary of the AI Act [C4] (chunk_id=eu_2_0, doc=EU_AI_Act, page=-1) In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you rega [C5] (chunk_id=eu_10_0, doc=EU_AI_Act, page=-1) Those that intend to place on the market or put into service high-risk AI systems in the EU, regardless of whether they are based in the EU 

Context chunks provided:
   [C1] eu_4_0
   [C2] eu_38_0
   [C3] eu_0_0
   [C4] eu_2_0
   [C5] eu_10_0
Answer:
 [C1], [C2] 

Context chunks provided:
   [C1] eu_20_0
   [C2] eu_79_0
   [C3] eu_0_0
   [C4] eu_4_0
   [C5] eu_19_0
Answer:
 [C3] (chunk_id=eu_49_0, doc=EU_AI_Act, page=-1) Requirements for providers of high-risk AI systems (Art. 8-17) [C4] (

In [None]:
from google.colab import files

# CSV written by the batch-generation cell.
csv_path = "/content/generation_answers.csv"

# Download through the Colab helper only if the file is actually there.
import os
if not os.path.exists(csv_path):
    print(" File not found! Make sure you ran the generation step and the file name/path is correct.")
else:
    print(" File found — downloading now...")
    files.download(csv_path)


 File found — downloading now...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os, json, re
import pandas as pd
import numpy as np

# ---------- helpers ----------
def load_queries(path="eval/queries.jsonl"):
    """Load evaluation queries from a JSON Lines file.

    Args:
        path: File with one JSON object per line.

    Returns:
        A list of the parsed objects in file order. Blank or whitespace-only
        lines are skipped rather than passed to ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def contains_any(text, keywords):
    """Return True when at least one keyword is a case-insensitive substring of ``text``."""
    haystack = text.lower()
    for keyword in keywords:
        if keyword.lower() in haystack:
            return True
    return False

def contains_all(text, keywords):
    """Return True when every keyword is a case-insensitive substring of ``text``."""
    haystack = text.lower()
    for keyword in keywords:
        if keyword.lower() not in haystack:
            return False
    # Also reached for an empty keyword list.
    return True

def concat_context(df):
    """Join all ``preview`` values of ``df`` into one lower-cased, space-separated string.

    Missing previews contribute an empty string.
    """
    previews = df["preview"].fillna("")
    pieces = [str(item) for item in previews.tolist()]
    return " ".join(pieces).lower()

def first_hit_rank(retrieved_df, gold_doc_hint):
    """Return the 1-based position of the first row whose ``doc`` equals the hint.

    Both sides are compared as strings. Returns None when no row matches.
    """
    wanted = str(gold_doc_hint)
    ranked_rows = enumerate(retrieved_df.reset_index(drop=True).iterrows(), start=1)
    return next(
        (rank for rank, (_, row) in ranked_rows if str(row.get("doc", "")) == wanted),
        None,
    )

# ---------- evaluation loop ----------
def evaluate(queries, k=5):
    """Score retrieval and generation for each query and aggregate the metrics.

    Per query (optional inputs leave the matching metrics as None):
      * recall@k / mrr -- from the rank of the first retrieved row whose doc
        matches ``doc_hint``.
      * acc_any / acc_all -- whether the answer contains any / all of
        ``expected_keywords``.
      * grounded -- 1.0 if the answer is the "I don't know" refusal, or if at
        least one expected keyword appears in BOTH the answer and the
        retrieved context; 0.0 otherwise.

    Args:
        queries: Iterable of dicts with ``query`` and optional ``id``,
            ``doc_hint`` and ``expected_keywords``.
        k: Number of chunks to retrieve for each query.

    Returns:
        ``(per_query_df, summary)``: one row per query, and a Series holding
        the mean of each metric with missing values ignored.
    """
    recall_col = "recall@{}".format(k)
    rows = []
    for q in queries:
        qid   = q.get("id", "")
        qtext = q["query"]
        gold  = q.get("doc_hint", None)              # optional
        expkw = q.get("expected_keywords", None)     # optional list

        # 1) Retrieval metrics from the dense retriever
        topk = dense_search(qtext, k=k)
        rec_k = None
        mrr   = None
        if gold:
            rank = first_hit_rank(topk, gold)        # 1..k or None
            rec_k = 1 if (rank is not None and rank <= k) else 0
            mrr   = (1.0 / rank) if rank else 0.0

        # 2) Generation with context
        answer, ctx_df = answer_with_context(qtext, k=k)
        ctx_text = concat_context(ctx_df)

        # 3) Keyword-based accuracy (only when keywords are provided)
        acc_any = acc_all = None
        if expkw:
            acc_any = 1.0 if contains_any(answer, expkw) else 0.0
            acc_all = 1.0 if contains_all(answer, expkw) else 0.0

        # 4) Groundedness / hallucination
        grounded = None
        answer_lc = answer.lower()
        if "i don't know based on the provided context" in answer_lc:
            grounded = 1.0
        elif expkw:
            # The SAME keyword must be in the answer and in the context. Testing
            # the two conditions separately lets different keywords satisfy each.
            supported = any(
                kw.lower() in answer_lc and kw.lower() in ctx_text for kw in expkw
            )
            grounded = 1.0 if supported else 0.0

        rows.append({
            "id": qid, "query": qtext, "gold_doc": gold,
            recall_col: rec_k,
            "mrr": mrr,
            "acc_any": acc_any,
            "acc_all": acc_all,
            "grounded": grounded,
            "answer": answer[:500],  # preview
        })

    df = pd.DataFrame(rows)

    def _mean(col):
        # None becomes NaN and is skipped; an all-missing column yields NaN
        # without the empty-slice warning that np.nanmean([]) emits.
        return pd.to_numeric(df[col], errors="coerce").mean()

    if len(df):
        agg = {name: _mean(name) for name in (recall_col, "mrr", "acc_any", "acc_all", "grounded")}
    else:
        # No queries: the frame has no columns, so use the same fixed defaults as before.
        agg = {recall_col: 0, "mrr": 0, "acc_any": None, "acc_all": None, "grounded": None}
    return df, pd.Series(agg)

# ---------- run it ----------
# Load the query file, score every query, then show both result tables.
QUERIES = load_queries("eval/queries.jsonl")  # your Task-2 file
results_df, summary = evaluate(QUERIES, k=5)

for heading, table in (
    ("Per-query results:", results_df),
    ("\nSummary metrics:", summary),
):
    print(heading)
    display(table)


Answer:
 [C2] (chunk_id=eu_38_0, doc=EU_AI_Act, page=-1) Some AI systems are considered 'High risk' under the AI Act. Providers of those systems will be subject to additional requirements. [C3] (chunk_id=eu_0_0, doc=EU_AI_Act, page=-1) High-level summary of the AI Act [C4] (chunk_id=eu_2_0, doc=EU_AI_Act, page=-1) In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you rega [C5] (chunk_id=eu_10_0, doc=EU_AI_Act, page=-1) Those that intend to place on the market or put into service high-risk AI systems in the EU, regardless of whether they are based in the EU 

Context chunks provided:
   [C1] eu_4_0
   [C2] eu_38_0
   [C3] eu_0_0
   [C4] eu_2_0
   [C5] eu_10_0
Answer:
 [C1], [C2] 

Context chunks provided:
   [C1] eu_20_0
   [C2] eu_79_0
   [C3] eu_0_0
   [C4] eu_4_0
   [C5] eu_19_0
Answer:
 [C3] (chunk_id=eu_49_0, doc=EU_AI_Act, page=-1) Requirements for providers of high-risk AI systems (Art. 8-17) [C4] (

Unnamed: 0,id,query,gold_doc,recall@5,mrr,acc_any,acc_all,grounded,answer
0,q01,What are the AI risk categories defined by the...,EU_AI_Act,1,1.0,1.0,0.0,1.0,"[C2] (chunk_id=eu_38_0, doc=EU_AI_Act, page=-1..."
1,q02,List the prohibited AI practices under the EU ...,EU_AI_Act,1,1.0,0.0,0.0,0.0,"[C1], [C2]"
2,q03,Name three core obligations for providers of h...,EU_AI_Act,1,1.0,,,,"[C3] (chunk_id=eu_49_0, doc=EU_AI_Act, page=-1..."
3,q04,When do the obligations for high-risk AI syste...,EU_AI_Act,1,1.0,,,,24 months for high risk AI systems under Annex...
4,q05,What qualifies a GPAI model as creating system...,EU_AI_Act,1,1.0,,,,when the cumulative amount of compute used for...
5,q06,Why does the Transformer remove recurrence and...,Transformer,1,1.0,,,,reduced effective resolution due to averaging ...
6,q07,Explain scaled dot-product attention.,Transformer,1,1.0,1.0,0.0,1.0,"[C1] (chunk_id=tr_3_0, doc=Transformer, page=3..."
7,q08,What is multi-head attention and why is it use...,Transformer,1,1.0,,,,"Multi-head attention [C2] (chunk_id=tr_12_0, d..."
8,q09,How do sinusoidal positional encodings work?,Transformer,1,1.0,,,,"[C1] (chunk_id=tr_4_1, doc=Transformer, page=4..."
9,q10,State the base model’s typical hyperparameters...,Transformer,1,1.0,,,,"[C3] (chunk_id=tr_4_1, doc=Transformer, page=4..."



Summary metrics:


Unnamed: 0,0
recall@5,1.0
mrr,1.0
acc_any,0.666667
acc_all,0.0
grounded,0.666667



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Save the per-query table (without its index) and the summary Series (index kept,
# so the metric names are written alongside the values).
results_df.to_csv("/content/eval_results.csv", index=False)
summary.to_csv("/content/eval_summary.csv")

print("Saved to /content/eval_results.csv and /content/eval_summary.csv")


Saved to /content/eval_results.csv and /content/eval_summary.csv
