In [4]:
# Imports
import json
import re
import os
import os.path as op
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Configuration: folder that holds the processed data files
# (a relative path, so it is resolved against the working directory set below)
PROCESSED_DIR = "processed_data"

# If the kernel was started from inside the notebooks folder, step up one
# level so that relative paths resolve from its parent directory instead.
current_dir = os.getcwd()
launched_from_notebooks_dir = op.basename(current_dir) == "data_processing_notebooks"
if launched_from_notebooks_dir:
    os.chdir("..")
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\sande\OneDrive\Bureau\UofT\CSC2701_Communication4CS\internship-ai-assisstant\server\data


The goal of this notebook is to prepare the data for retrieval-augmented generation (RAG). We do this in two main steps:
1. Creating an embedding for each of the stored document chunks
2. Storing the embeddings in a vector database (Chroma)

## 1. Creating the embeddings

In [5]:
# 1.1 Load the data
# Date stamp embedded in the name of the JSONL export to load
# (looks like YYYYMMDD — TODO confirm the naming convention).
date_file_to_load = "20251111"
# File name of the RAG-ready documents export; it is joined with PROCESSED_DIR
# when the loader is called.
datafile_to_load = f"rag_ready_docs_{date_file_to_load}.jsonl"

def load_jsonl_docs(path):
    """Load RAG documents from a JSON Lines file and clean their text.

    Every non-blank line is parsed as JSON. A line that is not valid JSON, or
    whose JSON value is not an object, is reported with ``print`` and skipped
    so that one bad record does not abort the whole load.

    The ``text`` field is stripped of serialisation artifacts: a leading
    ``page_content='`` prefix, a trailing ``' metadata=...`` suffix, and any
    surrounding spaces or single quotes. A missing or ``null`` text becomes
    ``""``; any other non-string value is converted with ``str``.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of dicts with the keys ``"id"`` (``None`` when absent),
        ``"text"`` (the cleaned string) and ``"metadata"`` (``{}`` when the
        field is absent or ``null``).

    Raises:
        OSError: If the file cannot be opened.
    """
    docs = []
    with open(path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping bad line: {e}")
                continue

            # Valid JSON that is not an object (list, number, string, ...) has
            # no fields to read; skip it like any other malformed record.
            if not isinstance(item, dict):
                print(f"Skipping bad line: line {line_number} is not a JSON object")
                continue

            # Extract the text; a JSON null or a non-string value must not
            # reach re.sub, which only accepts strings.
            raw_text = item.get("text")
            text = "" if raw_text is None else str(raw_text)
            # Remove accidental prefixes like "page_content='...'"
            text = re.sub(r"^page_content='", "", text)
            # Remove leftover quotes, 'metadata=...' artifacts, etc.
            text = re.sub(r"' metadata=.*$", "", text)
            text = text.strip(" '")

            docs.append({
                "id": item.get("id"),
                "text": text,
                # A null metadata field is normalised to an empty dict
                "metadata": item.get("metadata") or {}
            })
    return docs

# Usage: build the path of the export inside PROCESSED_DIR, then load it
datafile_path = op.join(PROCESSED_DIR, datafile_to_load)
rag_docs = load_jsonl_docs(datafile_path)

In [None]:
# 1.2 Create embeddings
# "all-MiniLM-L6-v2": compact, fast model for creating embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
# One text per document, in the same order as rag_docs
texts = [record["text"] for record in rag_docs]
embeddings = model.encode(
    texts,
    show_progress_bar=True,
)

Batches: 100%|██████████| 27/27 [00:18<00:00,  1.50it/s]


## 2. Store them in a Chroma database

In [52]:
# Create the client.
# chromadb.Client(Settings(persist_directory=...)) does not write anything to
# disk on its own (persistence is off by default), so the vectors would be
# lost with the kernel. PersistentClient stores them under the given folder,
# which is resolved relative to the current working directory.
client = chromadb.PersistentClient(path="./chroma_db")  # folder where vectors will be stored

# Create or get a collection
collection = client.get_or_create_collection(name="chatbot_resources")

In [54]:
# Clean metadata 
def clean_metadata(meta):
    """Return a new dict holding the non-None entries of ``meta`` as strings.

    Each kept value is converted with ``str``; entries whose value is None
    are dropped. Any input that is not a dict yields an empty dict.
    """
    if isinstance(meta, dict):
        return {key: str(value) for key, value in meta.items() if value is not None}
    return {}
# One cleaned metadata dict per document, in the same order as rag_docs
clean_metadatas = [clean_metadata(d.get("metadata", {})) for d in rag_docs]

# Write the data to Chroma.
# upsert (rather than add) keeps this cell re-runnable: ids that are already
# in the collection are updated and new ids are inserted, so running the cell
# again does not fail or leave stale vectors behind.
collection.upsert(
    ids=[d["id"] for d in rag_docs],
    embeddings=embeddings.tolist(),
    metadatas=clean_metadatas,
    documents=texts)

## 3. Sanity check: retrieval test

In [55]:
# Embed a sample question with the model that produced the document embeddings
query = "Help me for technical interviews?"
query_embedding = model.encode([query])

# Ask the collection for 3 results for that single query embedding
results = collection.query(
    n_results=3,
    query_embeddings=query_embedding.tolist(),
)

# Index 0 selects the documents returned for the one query sent above
top_documents = results["documents"][0]
for i, doc in enumerate(top_documents):
    print(f"\nResult {i+1}:")
    # Show only the first 400 characters of each hit
    print(doc[:400], "...")


Result 1:
practice interview questions Rehearse story, practice question, review mistake. Wake up and have breakfast, be on time, be conﬁdent, talk out loud Questions? Comments? For more questions and comments Daniel Giovannini, PhD Associate Director, MScAC Partnerships MSc in Applied Computing (MScAC) Program University of Toronto, 9th Floor, 700 University Avenue  dgiovannini@cs.toronto.edu  416-978-1679 ...

Result 2:
you got into this industry that you wish someone had told you? What questions should I ask in an informational interview? Informational interviews and networking MScAC Talks: monthly seminar series, September to June (mscac.utoronto.ca/talks)  Applied Research in Action (ARIA): November 13th, 2025  MScAC Partner Events: starting October 2025 (dates TBA)  MScAC Internship Expo: week beginning Janua ...

Result 3:
https://firecode.io/, Gayle Laakmann McDowell resources: https://www.gayle.com/consulting, Python Tutor: https://pythontutor.com/,Technical interview externa