# Baseline with RAG Integration + Chroma (persistent vectors)

This notebook is an implementation of the RAG pipeline on top of the plain Baseline model. Addtionally persistance has been implemented to analyse and preview vector embeddings using a Chroma user interface.

# Baseline with RAG Integration

This notebook extends the baseline LLM with retrieval-augmented generation (RAG).

Workflow:
- Load the same baseline model used in the prompt-engineering notebook
- Ingest a user-provided knowledge base (e.g., JSONL/CSV/TXT with recent release info)
- Build embeddings + FAISS index
- Compare answers: baseline `ask()` vs RAG `ask_rag()`



In [1]:
# Installs (Colab/Local)
%pip -q install -U transformers accelerate sentencepiece bitsandbytes sentence-transformers faiss-cpu pandas python-dotenv


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0m

In [None]:
from huggingface_hub import login
login(token="acess token")


In [3]:
import os
import json
import pandas as pd
import torch
from typing import List, Dict, Optional, Tuple
from transformers import AutoTokenizer, AutoModelForCausalLM

try:
    from transformers import BitsAndBytesConfig
    _bnb_available = True
except Exception:
    _bnb_available = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "google/gemma-2-2b-it"

GEN_CFG = {
    "max_new_tokens": 800,
    "temperature": 0.3,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
}

print("Device:", DEVICE)
print("Model:", MODEL_ID)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

if DEVICE == "cuda" and _bnb_available:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
    )
else:
    dtype = torch.float32 if DEVICE == "cpu" else torch.float16
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=dtype)
    model.to(DEVICE)

model.eval()
print("Model loaded.")


Device: cuda
Model: google/gemma-2-2b-it


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Model loaded.


In [4]:
PYTHON_ASSISTANT_SYSTEM_PROMPT = "You are a Python programming assistant."

def _format_chat(messages: List[Dict[str, str]], add_generation_prompt: bool = True) -> Dict[str, torch.Tensor]:
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
        effective_messages = messages
        if messages and messages[0].get("role") == "system":
            system_text = messages[0]["content"]
            effective_messages = messages[1:]
            if effective_messages and effective_messages[0].get("role") == "user":
                effective_messages = effective_messages.copy()
                effective_messages[0] = {
                    "role": "user",
                    "content": f"{system_text}\n\n{effective_messages[0]['content']}"
                }
            else:
                effective_messages = [{"role": "user", "content": system_text}]
        prompt_text = tokenizer.apply_chat_template(
            effective_messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt
        )
    else:
        sys_msg = ""
        if messages and messages[0].get("role") == "system":
            sys_msg = f"System: {messages[0]['content']}\n"
            user_msgs = messages[1:]
        else:
            user_msgs = messages
        convo = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in user_msgs])
        prompt_text = (sys_msg + convo + ("\nAssistant:" if add_generation_prompt else ""))

    inputs = tokenizer(prompt_text, return_tensors="pt")
    return {k: v.to(DEVICE) for k, v in inputs.items()}

@torch.inference_mode()
def generate_from_messages(
    messages: List[Dict[str, str]],
    max_new_tokens: int = GEN_CFG["max_new_tokens"],
    temperature: float = GEN_CFG["temperature"],
    top_p: float = GEN_CFG["top_p"],
    repetition_penalty: float = GEN_CFG["repetition_penalty"],
) -> str:
    inputs = _format_chat(messages, add_generation_prompt=True)
    input_len = inputs["input_ids"].shape[-1]
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    gen_ids = outputs[0][input_len:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
    return text.strip()

def ask(question: str, system_prompt: Optional[str] = PYTHON_ASSISTANT_SYSTEM_PROMPT, **gen_kwargs) -> str:
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": question})
    return generate_from_messages(messages, **gen_kwargs)


In [5]:
# Knowledge base ingestion
from pathlib import Path
from typing import Union

def read_jsonl(path: Union[str, Path]) -> List[Dict[str, str]]:
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            content = obj.get("content") or obj.get("text") or obj.get("body")
            if not content:
                continue
            records.append({
                "id": obj.get("id") or obj.get("_id") or str(len(records)),
                "title": obj.get("title") or "",
                "content": content,
                "meta": {k: v for k, v in obj.items() if k not in {"id", "_id", "title", "content", "text", "body"}}
            })
    return records

def read_csv(path: Union[str, Path]) -> List[Dict[str, str]]:
    df = pd.read_csv(path)
    if "content" not in df.columns:
        raise ValueError("CSV must have a 'content' column")
    recs = []
    for i, row in df.iterrows():
        recs.append({
            "id": str(row.get("id", i)),
            "title": str(row.get("title", "")),
            "content": str(row["content"]),
            "meta": {k: row[k] for k in df.columns if k not in {"id", "title", "content"}}
        })
    return recs

def read_texts(paths: List[Union[str, Path]]) -> List[Dict[str, str]]:
    recs = []
    for p in paths:
        p = Path(p)
        with open(p, "r", encoding="utf-8") as f:
            text = f.read()
        recs.append({
            "id": p.stem,
            "title": p.name,
            "content": text,
            "meta": {"path": str(p)}
        })
    return recs

def load_kb(path_or_dir: Union[str, Path]) -> List[Dict[str, str]]:
    p = Path(path_or_dir)
    if p.is_dir():
        files = list(p.rglob("*.jsonl")) + list(p.rglob("*.csv")) + list(p.rglob("*.txt")) + list(p.rglob("*.md"))
        recs: List[Dict[str, str]] = []
        for fpath in files:
            if fpath.suffix == ".jsonl":
                recs.extend(read_jsonl(fpath))
            elif fpath.suffix == ".csv":
                recs.extend(read_csv(fpath))
            else:
                recs.extend(read_texts([fpath]))
        return recs
    else:
        if p.suffix == ".jsonl":
            return read_jsonl(p)
        if p.suffix == ".csv":
            return read_csv(p)
        if p.suffix in {".txt", ".md"}:
            return read_texts([p])
        raise ValueError("Unsupported KB format; use .jsonl, .csv, .txt, .md or a directory containing them")


In [6]:
# Embeddings + FAISS index (unchanged)
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # small, fast

class VectorStore:
    def __init__(self, dim: int):
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)  # cosine via normalized dot product
        self.docs: List[Dict[str, str]] = []

    def add(self, embeddings: np.ndarray, docs: List[Dict[str, str]]):
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)
        self.docs.extend(docs)

    def search(self, query_emb: np.ndarray, k: int = 5) -> List[Tuple[int, float]]:
        faiss.normalize_L2(query_emb)
        D, I = self.index.search(query_emb, k)
        return list(zip(I[0].tolist(), D[0].tolist()))

embedder = SentenceTransformer(EMBED_MODEL_ID)
emb_dim = embedder.get_sentence_embedding_dimension()
store = VectorStore(dim=emb_dim)

def build_index(records: List[Dict[str, str]], batch_size: int = 64):
    texts = [r["content"] for r in records]
    embeddings = []
    for i in range(0, len(texts), batch_size):
        chunk = texts[i:i+batch_size]
        vecs = embedder.encode(chunk, convert_to_numpy=True, show_progress_bar=False, normalize_embeddings=False)
        embeddings.append(vecs)
    if not embeddings:
        return
    embeddings = np.vstack(embeddings)
    store.add(embeddings, records)

print("Embedding model:", EMBED_MODEL_ID)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model: sentence-transformers/all-MiniLM-L6-v2


In [7]:
# Retrieval and RAG ask() (unchanged)
from textwrap import dedent

def embed(texts: List[str]) -> np.ndarray:
    vecs = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False, normalize_embeddings=False)
    return vecs

def retrieve(query: str, k: int = 4) -> List[Dict[str, str]]:
    qvec = embed([query])
    hits = store.search(qvec, k=k)
    results = []
    for idx, score in hits:
        if idx == -1:
            continue
        doc = store.docs[idx]
        results.append({"score": float(score), **doc})
    return results

def make_context(snippets: List[Dict[str, str]]) -> str:
    blocks = []
    for s in snippets:
        title = s.get("title") or s.get("id") or ""
        prefix = f"Title: {title}\n" if title else ""
        blocks.append(prefix + s["content"])
    return "\n\n---\n\n".join(blocks)

def answer_with_context(question: str, context: str) -> str:
    sys = PYTHON_ASSISTANT_SYSTEM_PROMPT
    prompt = dedent(f"""
    Use the context below to answer the user question. If the answer is not in the context, say you don't know.

    Context:
    {context}

    Question: {question}
    """)
    return ask(prompt, system_prompt=sys, max_new_tokens=GEN_CFG["max_new_tokens"], temperature=GEN_CFG["temperature"])

# Public API (unchanged)

def ask_baseline(question: str) -> str:
    return ask(question)

def ask_rag(question: str, k: int = 4) -> Tuple[str, List[Dict[str, str]]]:
    snippets = retrieve(question, k=k)
    context = make_context(snippets)
    answer = answer_with_context(question, context)
    return answer, snippets


In [13]:
KB_PATH = "/content/updated_python_kb.jsonl"

records = load_kb(KB_PATH)
print(f"Loaded {len(records)} records from KB.")

store = VectorStore(dim=emb_dim)
build_index(records)
print(f"Indexed {len(store.docs)} documents.")

question = "Which CVEs were fixed in Python 3.12.x during mid‑2024, and which modules were impacted?"

print("\nQ:", question)
base_answer = ask_baseline(question)
rag_answer, used = ask_rag(question, k=4)
#print("\nBaseline:\n", base_answer)
print("\nRAG:\n", rag_answer)
print("\nContext snippets used (top-4):")
for i, snip in enumerate(used, 1):
    print(f"[{i}] id={snip.get('id')} title={snip.get('title','')[:80]} score={snip.get('score'):.3f}")


Loaded 7 records from KB.
Indexed 7 documents.

Q: Which CVEs were fixed in Python 3.12.x during mid‑2024, and which modules were impacted?

RAG:
 Here's the information based on the provided context:

* **CVE-2024-12345:**  Fixed an arbitrary file write vulnerability in the **`tempfile` module** on non-Windows systems.
* **CVE-2024-67890:** A potential ReDoS (Regular Expression Denial of Service) in the **`re` module** when parsing certain complex patterns.
* **CVE-2024-54321:** Fixed an issue in the **`email.utils` module** where malformed email addresses could lead to application crashes.

These CVEs were fixed in the following modules:

* **`tempfile` module**
* **`re` module**
* **`email.utils` module**

Context snippets used (top-4):
[1] id=q2_py3123 title=Python 3.12.3 (April 2024) – Bug fixes and security advisories score=0.726
[2] id=q4_py312x_mid2024_cves title=Python 3.12.x Mid-2024 CVEs and impacted modules score=0.720
[3] id=q1_py3122 title=Python 3.12.2 (March 2024) – Pat

---

## Chroma integration
Persists the same embeddings and documents to `data/chroma/` for live preview.


In [14]:
%pip -q install chromadb

import os, json, chromadb
os.makedirs("data/chroma", exist_ok=True)

# Use the new PersistentClient API
client = chromadb.PersistentClient(path="data/chroma")
col = client.get_or_create_collection(name="python_kb")

# Sanitize metadata to scalars (Chroma requires str/int/float/bool/None)
def sanitize_meta(m):
    out = {}
    for k, v in (m or {}).items():
        if isinstance(v, (str, int, float, bool)) or v is None:
            out[k] = v
        elif isinstance(v, (list, dict)):
            out[k] = json.dumps(v, ensure_ascii=False)
        else:
            out[k] = str(v)
    return out

# Upsert the same records and vectors already built above
_docs = [r["content"] for r in records]
_ids = [str(r["id"]) for r in records]
_metas = [{"title": r.get("title", ""), **sanitize_meta(r.get("meta"))} for r in records]
_vecs = embedder.encode(_docs, convert_to_numpy=True).tolist()

col.upsert(ids=_ids, documents=_docs, metadatas=_metas, embeddings=_vecs)
print(f"Chroma persisted {len(_ids)} docs to data/chroma/")

Chroma persisted 7 docs to data/chroma/


In [15]:
# Preview: show a stored embedding and a query result
res = col.get(ids=[_ids[0]], include=["embeddings","metadatas","documents"])
emb = res["embeddings"][0]
print("Stored doc emb dim:", len(emb), "preview:", emb[:8], "title:", res["metadatas"][0].get("title"))

q = "What PEP replaced PEP 722 for inline script metadata?"
qvec = embedder.encode([q], convert_to_numpy=True).tolist()
qout = col.query(query_embeddings=qvec, n_results=3, include=["distances","documents","metadatas","embeddings"])
print("Query emb dim:", len(qvec[0]), "preview:", qvec[0][:8])
for i,(doc,meta,dist) in enumerate(zip(qout["documents"][0], qout["metadatas"][0], qout["distances"][0]), 1):
    print(i, "cos_sim≈", 1-dist, "title=", meta.get("title"))


Stored doc emb dim: 384 preview: [-0.05569929 -0.0222838  -0.0356492   0.0378988   0.01280342 -0.1074873
 -0.0390874   0.01382053] title: Python 3.12.2 (March 2024) – Patch contents
Query emb dim: 384 preview: [-0.061740849167108536, 0.023628272116184235, -0.09414426237344742, 0.00542877521365881, 0.05634896084666252, 0.03960898891091347, -0.03972223401069641, 0.10453187674283981]
1 cos_sim≈ -0.08608782291412354 title= PEPs targeting Python 3.14: Status changes (alpha to beta)
2 cos_sim≈ -0.29325008392333984 title= Python 3.13.1 Release Details (Date, Issues, Regressions)
3 cos_sim≈ -0.3012566566467285 title= Python 3.13.2 Release Details (Date, Issues, Regressions)


---

## Gradio retrieval UI
Quick interactive search to show nearest neighbors and cosine similarity.


In [16]:
%pip -q install gradio pandas
import gradio as gr, pandas as pd

def search_ui(q: str, k: int = 5):
    if not q.strip():
        return pd.DataFrame(columns=["title","cosine_similarity","snippet"])
    qvec = embedder.encode([q], convert_to_numpy=True).tolist()
    out = col.query(query_embeddings=qvec, n_results=int(k), include=["distances","metadatas","documents"])
    rows = []
    for meta, dist, doc in zip(out["metadatas"][0], out["distances"][0], out["documents"][0]):
        title = (meta or {}).get("title", "")
        snippet = (doc or "").replace("\n"," ")[:160] + ("..." if len(doc or "") > 160 else "")
        rows.append({"title": title, "cosine_similarity": round(1-float(dist), 4), "snippet": snippet})
    return pd.DataFrame(rows, columns=["title","cosine_similarity","snippet"])

demo = gr.Interface(
    fn=search_ui,
    inputs=[gr.Textbox(label="Query"), gr.Slider(1, 10, value=5, step=1, label="Top K")],
    outputs=gr.Dataframe(label="Nearest neighbors"),
    title="Chroma Retrieval Demo",
    description="Type a question; see top-K neighbors with cosine similarity and snippets."
)

demo.launch(share=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

