# RAG Hands-on

In [None]:
!pip install chromadb sentence-transformers

Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.0-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
# --- 1. Sample corpus ---
# Toy knowledge base for the first demo: five independent one-sentence facts.
# Each string is embedded and stored as its own document further down.
documents = [
    # course / people facts
    "The Applied Machine Learning course at the University of Southern Denmark covers modern ML techniques.",
    "Retrieval Augmented Generation (RAG) improves large language models by grounding answers in external documents.",
    "Tariq Yousef leads the Centre for Visual Data Science and works with NLP for scientific and historical texts.",
    # tooling facts
    "Vector databases like ChromaDB and FAISS store embeddings and support similarity search.",
    "RAG systems combine retrieval models with LLMs such as GPT, Llama, or Mistral.",
]

In [None]:
# --- 2. Initialize ChromaDB ---
import chromadb

persist_dir = "eu_rag_db"   # folder where DB will be saved

# PersistentClient keeps the database on disk under persist_dir, so it
# survives kernel restarts.
client = chromadb.PersistentClient(path=persist_dir)

# Because the store is persistent, the collection already exists on every run
# after the first one. create_collection() would fail in that case, so fetch
# the existing collection or create it when missing.
collection = client.get_or_create_collection("simple_rag")

In [None]:
# --- 3. Embed documents ---
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every retrieval step in this notebook.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding per document, converted to plain Python lists for ChromaDB.
embs = embedder.encode(documents).tolist()

# upsert (instead of add) keeps this cell idempotent: re-running it against the
# persistent collection overwrites doc0..docN rather than leaving stale text or
# embeddings behind when `documents` has been edited.
collection.upsert(
    documents=documents,
    embeddings=embs,
    ids=[f"doc{i}" for i in range(len(documents))]
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# --- 4. Retrieval function ---
def retrieve(query, k=3):
    """Return the texts of the `k` stored documents that best match `query`.

    The query is embedded with the notebook-level `embedder` and looked up in
    the notebook-level `collection`.

    Args:
        query: Question or search string.
        k: Number of results requested from the collection.

    Returns:
        The list of document texts returned for this single query.
    """
    # encode() is given a one-element batch; unwrap the single vector.
    query_vector = embedder.encode([query]).tolist()[0]
    hits = collection.query(query_embeddings=[query_vector], n_results=k)
    # Results are grouped per query embedding; only one was sent.
    documents_per_query = hits["documents"]
    return documents_per_query[0]


In [None]:
# --- 5. Load LLM (Phi-3 Mini) ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/Phi-3-mini-4k-instruct"

# Use the GPU when one is visible and fall back to CPU otherwise, so the cell
# no longer fails on CPU-only runtimes (generation will be slow there).
llm_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=llm_device,
    # NOTE: recent transformers releases warn that `torch_dtype` is deprecated
    # in favour of `dtype`; the old name is kept so older installs still work.
    torch_dtype="auto"
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# --- 6. RAG answer generation ---
def rag_answer(query, k=3, max_new_tokens=200):
    """Answer `query` with the LLM, grounded in documents retrieved for it.

    Args:
        query: The user question.
        k: How many documents to retrieve as context (default 3, the value
            that was previously implied by `retrieve`'s default).
        max_new_tokens: Upper bound on generated tokens (default 200, the
            value that was previously hard-coded).

    Returns:
        The decoded output sequence of `model.generate`. In the recorded run
        this text starts with the prompt itself, followed by the answer.
    """
    retrieved = retrieve(query, k=k)
    context = "\n".join(retrieved)

    prompt = f"""
Use the following context to answer the question.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""
    # Move the encoded prompt to wherever the model weights live.
    tokens = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**tokens, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
# --- 7. Test queries ---
# Run the simple RAG pipeline on a few sample questions, one after the other.
for sample_question in (
    "What is RAG and why is it useful?",
    "Who leads the Centre for Visual Data Science?",
    "What database can store embeddings?",
):
    print(rag_answer(sample_question))


Use the following context to answer the question.

CONTEXT:
RAG systems combine retrieval models with LLMs such as GPT, Llama, or Mistral.
Retrieval Augmented Generation (RAG) improves large language models by grounding answers in external documents.
Tariq Yousef leads the Centre for Visual Data Science and works with NLP for scientific and historical texts.

QUESTION:
What is RAG and why is it useful?

ANSWER:
RAG stands for Retrieval Augmented Generation. It is a technique that combines retrieval models with large language models (LLMs) like GPT, Llama, or Mistral. RAG improves the performance of LLMs by grounding their answers in external documents. This means that RAG can provide more accurate and relevant information by retrieving relevant documents from a large corpus and using them to generate responses.

RAG is useful because it enhances the capabilities of LLMs in various domains, such as scientific and historical texts. By grounding answers in external documents, RAG can pro

# Advanced PDF RAG

In [None]:
# https://www.eit.europa.eu/our-activities/knowledge-centre
pdf_urls = ["https://www.eit.europa.eu/sites/default/files/eit-digital-artificial-intelligence-report.pdf",
            "https://www.eit.europa.eu/sites/default/files/2025-11/EIT%20KICs%20Education%20Guide%202025.pdf",
            "https://www.eit.europa.eu/sites/default/files/2025-07/EIT%20KICs%20Impactful%20Innovation%20Guide%202025updated24042025.pdf"]

import requests

# Download every URL to doc_<n>.pdf. Only files whose download succeeded are
# recorded in pdf_paths, so later cells never try to parse an HTTP error page
# that was saved under a .pdf name.
pdf_paths = []
for i, url in enumerate(pdf_urls, start=1):
    path = f"doc_{i}.pdf"
    try:
        # A timeout keeps the cell from hanging forever on a stalled server;
        # raise_for_status turns 4xx/5xx responses into exceptions.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        continue
    # Open the file only after the request succeeded, so a failed download
    # does not leave an empty file behind.
    with open(path, "wb") as f:
        f.write(response.content)
    pdf_paths.append(path)

pdf_paths

['doc_1.pdf', 'doc_2.pdf', 'doc_3.pdf']

In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.4.0-py3-none-any.whl (329 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.5/329.5 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.4.0


In [None]:
from pypdf import PdfReader
from tqdm import tqdm
def load_multiple_pdfs(paths):
    """Extract the text of every page of several PDF files.

    Loading is best-effort: if reading a file raises, the error is printed and
    the function moves on to the next path. Pages collected from that file
    before the error stay in the result.

    Args:
        paths: Iterable of PDF locations (strings or path-like objects).

    Returns:
        A list with one dict per page that yielded text, each shaped as
        {"text": <page text>, "metadata": {"file", "id", "title"}}.
        Pages whose extracted text is empty are skipped.
    """
    # Local import so this cell does not depend on another cell's imports.
    from pathlib import Path

    corpus = []

    for path in paths:
        try:
            print(f"Loading {path}")
            reader = PdfReader(path)
            # Path(...).name handles any OS separator and path-like inputs,
            # unlike splitting the string on "/".
            filename = Path(path).name

            for page_idx, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    continue

                corpus.append({
                    "text": text,
                    "metadata": {
                        "file": filename,
                        # Page numbers are 1-based in ids and titles.
                        "id": f"{filename}_page_{page_idx+1}",
                        "title": f"{filename} — Page {page_idx+1}"
                    }
                })
            print(f"Loading {path} complete!")
        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the others.
            print(f"Error loading {path}: {e}")
    return corpus

# Parse all downloaded PDFs; files that raise while loading are reported by
# load_multiple_pdfs and skipped rather than aborting the cell.
pdf_corpus = load_multiple_pdfs(pdf_paths)
# Sanity check: number of page records and the first record.
len(pdf_corpus), pdf_corpus[0]


Loading doc_1.pdf
Loading doc_1.pdf complete!
Loading doc_2.pdf




Loading doc_2.pdf complete!
Loading doc_3.pdf
Error loading doc_3.pdf: Stream has ended unexpectedly


(110,
 {'text': 'A EUROPEAN APPROACH  \nTO ARTIFICIAL INTELLIGENCE\nA POLICY PERSPECTIVE\n',
  'metadata': {'file': 'doc_1.pdf',
   'id': 'doc_1.pdf_page_1',
   'title': 'doc_1.pdf — Page 1'}})

In [None]:
# Spot-check one page record (index 10) to see what the extracted text looks like.
pdf_corpus[10]

{'text': 'A EUROPEAN APPROACH TO ARTIFICIAL INTELLIGENCE - A POLICY PERSPECTIVE\n11\nGENERIC \nThere are five issues that, though from slightly different angles, \nare considered strategic and a potential source of barriers and \nbottlenecks: data, organisation, human capital, trust, markets. The \navailability and quality of data, as well as data governance are of \nstrategic importance. Strictly technical issues (i.e., inter-operabi-\nlity, standardisation) are mostly being solved, whereas internal and \nexternal data governance still restrain the full potential of AI Inno-\nvation. Organisational resources and, also, cognitive and cultural \nroutines are a challenge to cope with for full deployment. On the \none hand, there is the issue of the needed investments when evi-\ndence on return is not yet consolidated. On the other hand, equally \nimportant, are cultural conservatism and misalignment between \nanalytical and business objectives. Skills shortages are a main \nbottleneck in

In [None]:
#@title Chunk the content

def chunk_text(text, chunk_size=450, overlap=100):
    """Split `text` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start `chunk_size - overlap` words apart, so each chunk
    repeats the last `overlap` words of the previous one.

    Args:
        text: Input string; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by neighbouring chunks. Must satisfy
            0 <= overlap < chunk_size.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If `chunk_size` / `overlap` would make the window stand
            still or move backwards (previously an endless loop) or would
            skip words.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap  # strictly positive thanks to the checks above
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

def build_chunks(corpus):
    """Chunk every corpus record and flatten the result into parallel lists.

    Args:
        corpus: Iterable of dicts with a "text" string and a "metadata" dict
            that contains at least an "id" entry.

    Returns:
        A tuple (chunk_texts, metadatas, ids) of equally long lists. Each
        metadata dict is a copy of the record's metadata plus "chunk_idx",
        and each id is "<metadata id>_chunk_<chunk_idx>".
    """
    chunk_texts = []
    metadatas = []
    ids = []

    for record in corpus:
        pieces = chunk_text(record["text"])
        for position, piece in enumerate(pieces):
            # Copy so every chunk gets its own metadata dict.
            chunk_meta = {**record["metadata"], "chunk_idx": position}

            chunk_texts.append(piece)
            metadatas.append(chunk_meta)
            ids.append(f'{chunk_meta["id"]}_chunk_{position}')

    return chunk_texts, metadatas, ids

# Flatten all page records into parallel lists of chunk texts, metadata and ids.
chunk_texts, metadatas, ids = build_chunks(pdf_corpus)

# Total number of chunks produced from the corpus.
len(chunk_texts)


163

In [None]:
# Spot-check the metadata attached to one chunk (index 11), including its chunk_idx.
metadatas[11]

{'file': 'doc_1.pdf',
 'id': 'doc_1.pdf_page_7',
 'title': 'doc_1.pdf — Page 7',
 'chunk_idx': 1}

In [None]:
#@title Build a vector database for the new PDF

# Store the PDF collection on disk in persist_dir (defined in the ChromaDB
# setup cell) instead of an in-memory client, so that "eu_rag" can be reopened
# later with chromadb.PersistentClient(path=persist_dir).get_collection("eu_rag").
client2 = chromadb.PersistentClient(path=persist_dir)
eu_collection = client2.get_or_create_collection("eu_rag")

# reuse embedder from previous example
eu_embeddings = embedder.encode(chunk_texts, batch_size=32).tolist()

# upsert keeps re-runs idempotent: chunks whose ids already exist in the
# persisted collection are overwritten with the current text and embedding.
eu_collection.upsert(
    documents=chunk_texts,
    embeddings=eu_embeddings,
    metadatas=metadatas,
    ids=ids
)

eu_collection.count()


163

In [None]:
#@title Multi-query rewriting
def generate_alternative_queries(question, n_queries=3):
    """Ask the LLM for up to `n_queries` alternative phrasings of `question`.

    Args:
        question: The original user question.
        n_queries: Number of rewrites requested from the model and the
            maximum number returned.

    Returns:
        A list of at most `n_queries` cleaned, non-empty query strings. Lines
        that merely repeat the original question are dropped. The list can be
        shorter than `n_queries` when the model produces fewer usable lines.
    """
    import re  # local import: only used to clean up the model output

    prompt = (
        f"Rewrite the following question into {n_queries} alternative search queries, "
        f"each on a new line:\n\nQuestion: {question}"
    )

    # Alternative, more explicit prompt kept for experimentation:
    # advanced_prompt = (
    #     f"You are a query rewriting assistant."
    #     f"Generate {n_queries} alternative search queries that express the same information need as the user's question."
    #     f"Rules:"
    #     f"- Each rewritten query MUST appear on a new line."
    #     f"- Do NOT number the lines."
    #     f"- Use different phrasings and synonyms."
    #     f"- Expand implicit references."
    #     f"- Do NOT include explanations or commentary."
    #     f"User question: '{question}'"
    #     f"Alternative search queries:"
    # )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=150)

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them; otherwise the instruction line of the prompt survives
    # the filtering below and is returned as the first "alternative query".
    prompt_len = inputs["input_ids"].shape[-1]
    text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    lines = []
    for raw_line in text.split("\n"):
        # Drop list decoration: leading "1." / "2)" numbering and bullets.
        line = re.sub(r"^\s*\d+[.)]\s*", "", raw_line).strip("-• ").strip()
        if not line:
            continue
        # Skip lines that just echo the original question.
        if question.lower() in line.lower():
            continue
        lines.append(line)

    return lines[:n_queries]

In [None]:
#@title Multi-query retrieval on the PDF corpus
def retrieve_multi_query_pdf(question, k_per_query=3, n_queries=3):
    """Retrieve PDF chunks for `question` and for LLM-generated rewrites of it.

    Args:
        question: The original user question (always searched first).
        k_per_query: Number of chunks requested from the collection per query.
        n_queries: Number of alternative queries to generate.

    Returns:
        A list of {"id", "text", "metadata"} dicts, de-duplicated by chunk id
        and ordered by first appearance across the queries.
    """
    rewrites = generate_alternative_queries(question, n_queries=n_queries)
    search_queries = [question] + rewrites

    # Dicts preserve insertion order, so this serves both as the "seen" set
    # and as the ordered result container.
    hits_by_id = {}

    for search_query in search_queries:
        query_vector = embedder.encode([search_query]).tolist()[0]
        response = eu_collection.query(
            query_embeddings=[query_vector],
            n_results=k_per_query,
        )

        # Each field is grouped per query embedding; only one was sent.
        hit_ids = response["ids"][0]
        hit_texts = response["documents"][0]
        hit_metas = response["metadatas"][0]

        for hit_id, hit_text, hit_meta in zip(hit_ids, hit_texts, hit_metas):
            if hit_id in hits_by_id:
                continue
            hits_by_id[hit_id] = {"id": hit_id, "text": hit_text, "metadata": hit_meta}

    return list(hits_by_id.values())


In [None]:
#@title Build RAG prompt with citations
def build_context_for_pdf(docs, limit_chars=3500):
    """Format retrieved chunks as numbered source blocks for the prompt.

    Every block is a "[n] Source: <title> (chunk <idx>)" header followed by
    the first 900 characters of the chunk text. Blocks are added in order
    until the next one would push the running total past `limit_chars`; that
    block and all later ones are left out.

    Args:
        docs: Sequence of dicts with "text" and a "metadata" dict holding
            "title" and "chunk_idx".
        limit_chars: Budget for the summed length of the blocks (the newline
            separators added when joining are not counted).

    Returns:
        The selected blocks joined by newlines, or "" if none fit.
    """
    blocks = []
    used_chars = 0

    for number, doc in enumerate(docs, start=1):
        meta = doc["metadata"]
        header = f"[{number}] Source: {meta['title']} (chunk {meta['chunk_idx']})"
        excerpt = doc["text"][:900]
        block = f"{header}\n{excerpt}\n"

        used_chars += len(block)
        if used_chars > limit_chars:
            # Budget exhausted: stop at the first block that does not fit.
            break
        blocks.append(block)

    return "\n".join(blocks)


In [None]:
#@title Final RAG answer generator

def answer_pdf_rag(question):
    """Answer `question` from the PDF collection, asking the LLM to cite sources.

    Retrieves chunks with multi-query retrieval, formats them as numbered
    sources, and generates without sampling.

    Args:
        question: The user question.

    Returns:
        The decoded output sequence of `model.generate`. The caller in this
        notebook splits it on "Answer with citations:" to get the answer part.
    """
    context = build_context_for_pdf(retrieve_multi_query_pdf(question))

    prompt = f"""
You are a helpful assistant answering based ONLY on the following PDF content.
Use citations like [1], [2] referring to the sources provided.

==================== SOURCES ====================
{context}
=================================================

Question: {question}

Answer with citations:
"""

    # Encode the prompt and move it to the device that holds the model.
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)

    generated = model.generate(
        **encoded,
        max_new_tokens=250,
        do_sample=False,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
# Sample questions about the downloaded EU/EIT reports.
queries = [
    "What are the main goals of the EU regarding artificial intelligence?",
    "How does the EU plan to regulate AI?",
    "What are the ethical risks mentioned in the report?",
    "What does the European Parliament say about transparency in AI?",
]

for q in queries:
    print("QUESTION:", q)
    answer = answer_pdf_rag(q)
    # Keep only the text after the prompt's "Answer with citations:" marker.
    answer_parts = answer.split("Answer with citations:")
    print("ANSWER:", answer_parts[1])
    print(f"\n{'=' * 100}\n")


QUESTION: What are the main goals of the EU regarding artificial intelligence?
ANSWER: 

The main goals of the European Union (EU) regarding artificial intelligence (AI) are to foster innovation and growth while ensuring fairness and trust among its citizens. The EU aims to create a regulatory and governance framework that supports the development of AI technologies in a way that is beneficial to society as a whole. This involves addressing the challenges and barriers associated with AI, such as unfair discrimination and growing distrust, and finding concrete solutions that can be tailored to specific situations. The EU's approach is not to avoid AI altogether but to manage its risks and maximize its benefits. This is reflected in the EU's AI White Paper, which suggests an approach that is sector-specific and considers the unique features of different industries. The EU's goal is to ensure that AI technologies are developed and used in a way that promotes innovation and growth, while a

In [None]:
# #@title: Load database next time
# import chromadb

# persist_dir = "eu_rag_db"

# client = chromadb.PersistentClient(path=persist_dir)
# eu_collection = client.get_collection("eu_rag")

# print("Loaded documents:", eu_collection.count())
