In [1]:
import os
import getpass
import json
import re
import uuid
import pickle
import torch
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset, concatenate_datasets

# LangChain Imports
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# Sentence-Transformers (for fine-tuning) Imports
from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer

In [2]:
from huggingface_hub import login

# Never hard-code the Hugging Face token in the notebook: read it from the
# HF_TOKEN environment variable and prompt for it only when that is unset/empty.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Enter your Hugging Face token: ")
login(token=hf_token, add_to_git_credential=True)

In [3]:
# Securely get the OpenAI API key.
# An exported-but-empty variable is treated as missing, so a blank value
# triggers the prompt instead of an authentication failure later on.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")

### Load in Data

In [4]:
# --- NEW CELL (Call it 'Cell 3.5') ---
# SCRIPT TO SPLIT LARGE CHUNKS BY FIXED SIZE (NO DE-DUPLICATION)

import json
# We'll use LangChain's text splitter for this
from langchain_text_splitters import RecursiveCharacterTextSplitter

ORIGINAL_FILE = "ms_applied_data_science_enhanced_chunks.json"
NEW_FILE = "ms_applied_data_science_RE_CHUNKED_FIXED_SIZE.json"

# --- DYNAMIC SETTINGS ---
CHARACTER_THRESHOLD = 3000  # chunks longer than this many characters are split
CHUNK_OVERLAP = 150         # characters shared by neighbouring sub-chunks
# --- END SETTINGS ---

# Splitter that produces pieces of at most CHARACTER_THRESHOLD characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHARACTER_THRESHOLD,
)

print(f"Loading original file: {ORIGINAL_FILE}...")
with open(ORIGINAL_FILE, 'r', encoding='utf-8') as f:
    original_data = json.load(f)

original_chunk_count = len(original_data)
new_chunk_list = []
chunks_kept_as_is = chunks_split = new_sub_chunks_created = 0

print(f"Processing {original_chunk_count} chunks...")
print(f"Splitting any chunk over {CHARACTER_THRESHOLD} characters.")

for chunk_item in original_data:
    chunk_text = chunk_item.get('text', '')

    if len(chunk_text) > CHARACTER_THRESHOLD:
        # Oversized chunk: replace it with sub-chunks, each carrying its own
        # (shallow) copy of the parent's metadata.
        parent_metadata = chunk_item.get('metadata', {})
        pieces = text_splitter.split_text(chunk_text)
        new_chunk_list.extend(
            {"text": piece, "metadata": parent_metadata.copy()}
            for piece in pieces
        )
        chunks_split += 1
        new_sub_chunks_created += len(pieces)
    else:
        # Already within the threshold: keep the original item untouched.
        new_chunk_list.append(chunk_item)
        chunks_kept_as_is += 1

# --- Save the new file ---
print(f"\nSaving new fixed-size-split file to: {NEW_FILE}...")
with open(NEW_FILE, 'w', encoding='utf-8') as f:
    json.dump(new_chunk_list, f, indent=2)

print("\n--- Fixed-Size Re-chunking Complete ---")
print(f"Original chunks processed: {original_chunk_count}")
print(f"Chunks kept (under threshold): {chunks_kept_as_is}")
print(f"Large chunks split: {chunks_split}")
print(f"New sub-chunks created: {new_sub_chunks_created}")
print(f"Total chunks in new file: {len(new_chunk_list)}")

Loading original file: ms_applied_data_science_enhanced_chunks.json...
Processing 802 chunks...
Splitting any chunk over 3000 characters.

Saving new fixed-size-split file to: ms_applied_data_science_RE_CHUNKED_FIXED_SIZE.json...

--- Fixed-Size Re-chunking Complete ---
Original chunks processed: 802
Chunks kept (under threshold): 788
Large chunks split: 14
New sub-chunks created: 134
Total chunks in new file: 922


In [5]:
# Define the file path for your JSON data
DATA_FILE = "ms_applied_data_science_RE_CHUNKED_FIXED_SIZE.json"

# Load the JSON data.
# The errors are re-raised after the hint is printed: swallowing them would
# leave `json_data` undefined and surface later as an unrelated NameError.
print(f"Loading data from {DATA_FILE}...")
try:
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
except FileNotFoundError:
    print(f"Error: {DATA_FILE} not found.")
    print("Please make sure the file is in the same directory as this script.")
    raise
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {DATA_FILE}.")
    raise

# Convert JSON data into LangChain Document objects (our "nodes").
# No loader/splitter is needed here because the JSON is already pre-chunked.
nodes = []
for i, item in enumerate(json_data):
    doc = Document(
        page_content=item.get('text', ''),
        metadata=item.get('metadata', {})
    )
    # Positional id; it is what later cells use to link questions back to chunks.
    doc.metadata["id"] = f"node_{i}"
    nodes.append(doc)

print(f"Successfully loaded and created {len(nodes)} Document nodes.")

Loading data from ms_applied_data_science_RE_CHUNKED_FIXED_SIZE.json...
Successfully loaded and created 922 Document nodes.


In [6]:
import textwrap

# Peek at the first three Document nodes built in the previous cell:
# wrapped text first, then the raw metadata dict.
preview_nodes = nodes[:3]
divider = "-" * 80

print(f"--- Showing first 3 of {len(nodes)} loaded nodes ---")

for position, preview_node in enumerate(preview_nodes):
    wrapped_content = textwrap.fill(preview_node.page_content, 80)

    print(f"\n--- NODE {position} (Chunk {position}) ---")
    print(f"CONTENT:\n{wrapped_content}")
    print(f"\nMETADATA:\n{preview_node.metadata}")
    print(divider)

--- Showing first 3 of 922 loaded nodes ---

--- NODE 0 (Chunk 0) ---
CONTENT:
The University of Chicago’s MS in Applied Data Science program equips you with
in-demand expertise and an unparalleled network of global alumni. Take the next
step and start your application today.

METADATA:
{'source': 'https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/', 'title': "Master's in Applied Data Science - DSI", 'page_type': 'main', 'last_scraped': '2025-11-07 18:20:38', 'content_type': 'paragraph', 'keywords': ['the university of chicagos ms', 'applied data science program', 'an unparalleled network', 'global alumni', 'demand', 'program', 'chicago', 'application', 'today', 'applied data science'], 'semantic_tags': {'dates': ['today'], 'application_info': True, 'career_info': True}, 'embedded_links': [], 'id': 'node_0'}
--------------------------------------------------------------------------------

--- NODE 1 (Chunk 1) ---
CONTENT:
Choose from full- and part-

### Generate Synthetic Data for Fine-Tuning Embedding

In [14]:
def generate_queries_batch(nodes, llm, num_questions_per_chunk=1, batch_size=20):
    """
    Generate synthetic questions for each node by calling the LLM in concurrent batches.

    Args:
        nodes: Documents to generate questions for. Each must expose
            ``page_content`` and a ``metadata["id"]`` entry.
        llm: Chat model piped after the prompt template.
        num_questions_per_chunk: Number of questions requested per node in the prompt.
        batch_size: Passed to ``chain.batch`` as ``max_concurrency``.

    Returns:
        A ``(queries, relevant_docs)`` tuple. ``queries`` maps a fresh UUID string
        to one question; ``relevant_docs`` maps the same UUID to a one-element
        list holding the id of the node the question was generated from.
    """
    # Your new "student" prompt
    prompt_template_str = """
    Context information is below.
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge.
    generate only questions based on the below query.

    You are a prospective student or current student interested in the \
    University of Chicago's MS in Applied Data Science program. \
    Your task is to ask {num_questions_per_chunk} specific questions about \
    the program details based *only* on the context provided. \
    The questions should be typical of someone asking a chatbot for details.

    Restrict your questions *only* to the information in the context.
    Do not ask questions if the context does not provide the answer."""

    prompt_template = ChatPromptTemplate.from_template(prompt_template_str)
    chain = prompt_template | llm | StrOutputParser()

    # Leading list marker: digits followed by ")", "." or whitespace ("1.", "2)", "3 ").
    # The previous class `[\\).\\s]` sat in a raw string, so it matched a literal
    # backslash and the letter "s" instead of whitespace.
    list_marker = re.compile(r"^\d+[).\s]")

    # 1. Prepare one prompt input per node
    inputs = [
        {
            "context_str": node.page_content,
            "num_questions_per_chunk": num_questions_per_chunk,
        }
        for node in nodes
    ]

    print(f"Generating questions for {len(inputs)} nodes in parallel (max concurrency: {batch_size})...")

    # 2. Run the batch; max_concurrency caps simultaneous requests to limit rate-limit errors.
    responses = chain.batch(inputs, config={"max_concurrency": batch_size})

    # 3. Turn each response into individual questions linked to their source node
    print("Processing responses...")
    queries = {}
    relevant_docs = {}
    for node, response in zip(nodes, responses):
        node_id = node.metadata["id"]

        for line in response.strip().split("\n"):
            # Strip per-line whitespace first so indented numbered lines are also cleaned.
            question = list_marker.sub("", line.strip()).strip()
            if not question:
                continue
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

    return queries, relevant_docs

In [7]:
# --- THIS IS THE NEW CRASH-PROOF RESUMABLE EXECUTION CELL ---
import time
from openai import RateLimitError # Need to import the error to catch it

synthetic_data_file = "uchicago_synthetic_data_v2.pkl"

# --- BATCH SETTINGS ---
# "Super-batch": progress is written to disk after this many nodes.
SUPER_BATCH_SIZE = 50
# "Inner batch": max concurrent API calls, kept low to avoid rate limits.
INNER_BATCH_SIZE = 5
# Seconds to wait after a rate-limit error before retrying the same super-batch.
RATE_LIMIT_PAUSE_SECONDS = 60
# Consecutive rate-limit failures tolerated before giving up, so the cell
# cannot spin forever when the quota is exhausted.
MAX_RATE_LIMIT_RETRIES = 5
# --- END SETTINGS ---

# 1. Resume from an existing progress file, if there is one.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
if os.path.exists(synthetic_data_file):
    print(f"Loading existing synthetic data from {synthetic_data_file}...")
    with open(synthetic_data_file, "rb") as f:
        data_to_save = pickle.load(f)
        queries = data_to_save['queries']
        relevant_docs = data_to_save['relevant_docs']
    print(f"Loaded {len(queries)} existing questions.")

    # A node counts as processed once at least one saved question points at it.
    processed_node_ids = {node_ids[0] for node_ids in relevant_docs.values()}
    print(f"Found {len(processed_node_ids)} already processed nodes.")

else:
    print("No existing data file found. Starting from scratch.")
    queries = {}
    relevant_docs = {}
    processed_node_ids = set()

# 2. Create the list of nodes that STILL need to be processed
nodes_to_process = [
    node for node in nodes if node.metadata["id"] not in processed_node_ids
]

# 3. Run the generation only on the remaining nodes, one super-batch at a time
if not nodes_to_process:
    print("All nodes have already been processed! Nothing to do.")
else:
    print(f"--- {len(nodes_to_process)} remaining nodes to process. ---")
    llm_qgen = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.7)

    rate_limit_retries = 0
    while nodes_to_process:

        # A. Get the next super-batch of nodes to process
        current_batch_nodes = nodes_to_process[:SUPER_BATCH_SIZE]
        print(f"\n--- Processing a new super-batch of {len(current_batch_nodes)} nodes... ---")

        try:
            # B. Run the generation *only* on this super-batch
            new_queries, new_relevant_docs = generate_queries_batch(
                current_batch_nodes,
                llm_qgen,
                num_questions_per_chunk=1,
                batch_size=INNER_BATCH_SIZE
            )
            rate_limit_retries = 0  # the batch went through; reset the retry budget

            # C. Add the new results to our main dataset
            print(f"Adding {len(new_queries)} new questions to our dataset...")
            queries.update(new_queries)
            relevant_docs.update(new_relevant_docs)

            # D. Save the COMBINED data after every super-batch.
            # Write to a temp file and swap it in with os.replace, so an
            # interruption mid-write cannot corrupt the existing checkpoint.
            print(f"Saving combined data ({len(queries)} total questions) to {synthetic_data_file}...")
            data_to_save = {'queries': queries, 'relevant_docs': relevant_docs}
            temp_data_file = synthetic_data_file + ".tmp"
            with open(temp_data_file, "wb") as f:
                pickle.dump(data_to_save, f)
            os.replace(temp_data_file, synthetic_data_file)

            # E. Update the list of nodes to process *in memory*
            nodes_to_process = nodes_to_process[SUPER_BATCH_SIZE:]
            print(f"--- Super-batch complete. {len(nodes_to_process)} nodes remaining. ---")

        except RateLimitError as e:
            rate_limit_retries += 1
            print(f"\n!!! RATE LIMIT ERROR HIT !!!")
            if rate_limit_retries > MAX_RATE_LIMIT_RETRIES:
                print(f"Giving up after {MAX_RATE_LIMIT_RETRIES} consecutive rate-limit retries.")
                print(f"Progress up to the *previous* super-batch is saved in {synthetic_data_file}.")
                print("You can re-run this cell to resume from the last saved point.")
                break
            # 'continue' retries this *same* super-batch, since nodes_to_process is unchanged.
            print(f"Pausing for {RATE_LIMIT_PAUSE_SECONDS} seconds. The script will then retry this *same* batch.")
            time.sleep(RATE_LIMIT_PAUSE_SECONDS)
            continue

        except Exception as e:
            # Any other error: stop, leaving the last checkpoint intact.
            print(f"\n!!! AN UNEXPECTED ERROR OCCURRED !!!")
            print(f"Error: {e}")
            print("Stopping the script to prevent data loss.")
            print(f"Progress up to the *previous* super-batch is saved in {synthetic_data_file}.")
            print("You can re-run this cell to resume from the last saved point.")
            break # Exit the while loop

print(f"\n--- Process complete. Total questions in file: {len(queries)} ---")

Loading existing synthetic data from uchicago_synthetic_data_v2.pkl...
Loaded 922 existing questions.
Found 922 already processed nodes.
All nodes have already been processed! Nothing to do.

--- Process complete. Total questions in file: 922 ---


In [8]:
import pickle
import textwrap
import random

# 1. Read the generated questions back from disk
synthetic_data_file = "uchicago_synthetic_data_v2.pkl"
try:
    with open(synthetic_data_file, "rb") as f:
        loaded_data = pickle.load(f)
except FileNotFoundError:
    print(f"Error: Could not find {synthetic_data_file}.")
    print("Please make sure the Part 2 generation cell ran successfully.")
    queries, relevant_docs = {}, {}  # empty fallbacks so the rest of the cell still runs
else:
    queries = loaded_data['queries']
    relevant_docs = loaded_data['relevant_docs']

# 2. Build the node_id -> chunk text lookup (needs `nodes` from Part 1)
try:
    node_id_to_content = dict(
        (node.metadata["id"], node.page_content) for node in nodes
    )
except NameError:
    print("Error: 'nodes' variable not found. Please re-run Part 1 to load the data.")
    node_id_to_content = {}

# 3. Display up to five randomly chosen question/context pairs
print(f"--- Showing 5 random pairs from {len(queries)} total generated pairs ---")

if not queries:
    print("No queries found to display.")
else:
    question_ids = list(queries.keys())
    num_to_show = 5

    # With fewer than five questions, show them all in stored order; otherwise sample.
    random_q_ids = (
        question_ids
        if len(question_ids) < num_to_show
        else random.sample(question_ids, num_to_show)
    )

    for pair_number, q_id in enumerate(random_q_ids, start=1):
        node_id = relevant_docs[q_id][0]
        wrapped_question = textwrap.fill(queries[q_id], 80)
        wrapped_context = textwrap.fill(
            node_id_to_content.get(node_id, "CONTEXT NOT FOUND"), 80
        )

        print(f"\n--- RANDOM PAIR {pair_number} (from node '{node_id}') ---")
        print(f"GENERATED QUESTION:\n{wrapped_question}")
        print(f"\nORIGINAL CONTEXT (The 'Answer'):\n{wrapped_context}")
        print("-" * 80)

--- Showing 5 random pairs from 922 total generated pairs ---

--- RANDOM PAIR 1 (from node 'node_316') ---
GENERATED QUESTION:
Can students with significant full-time work experience waive the Career Seminar
course in the program?

ORIGINAL CONTEXT (The 'Answer'):
Course: Unknown Course Description: The Career Seminar (Pass/Fail) supports the
development of industry professional skills, job and/or internship searches, and
other in-demand areas of competency among today’s employers. Students enroll in
the Career Seminar each quarter in order to engage in unique content throughout
their degree program. Students with significant full-time work experience may be
eligible to waive this course. 0 units, no cost. Details: • Career Seminar
(Required) Pass/Fail The Career Seminar (Pass/Fail) supports the development of
industry professional skills, job and/or internship searches, and other in-
demand areas of competency among today’s employers. Students enroll in the
Career Seminar each quarte

### Create Train and Test Data

In [9]:
train_dataset_file = "uchicago_train_dataset_v2.json"
test_dataset_file = "uchicago_test_dataset_v2.json"
# Fixed seed so a fresh run (no cached JSON files) reproduces the same split.
SPLIT_SEED = 42

if os.path.exists(train_dataset_file) and os.path.exists(test_dataset_file):
    # Cached split found: reuse it so train/test membership stays stable across runs.
    print("Train and test JSON files already exist. Loading them.")
    train_dataset = load_dataset("json", data_files=train_dataset_file, split="train")
    test_dataset = load_dataset("json", data_files=test_dataset_file, split="train")
else:
    print("Creating and saving train/test JSON files...")
    # Create a lookup for node content
    node_id_to_content = {node.metadata["id"]: node.page_content for node in nodes}

    # Pair every question with the text of the chunk it was generated from;
    # questions whose node is unknown or has empty text are skipped.
    prepared_data = []
    for q_id, question in queries.items():
        node_id = relevant_docs.get(q_id, [None])[0]
        context = node_id_to_content.get(node_id) if node_id else None
        if context:
            prepared_data.append({
                "question": question,
                "context": context
            })

    # Convert to a Hugging Face Dataset
    dataset = Dataset.from_list(prepared_data)
    # Rename columns to match sentence-transformers convention
    dataset = dataset.rename_column("question", "anchor")
    dataset = dataset.rename_column("context", "positive")
    # Add a unique ID for each pair
    dataset = dataset.add_column("id", list(range(len(dataset))))

    # Train/test split (90% train, 10% test), seeded for reproducibility
    dataset_split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

    # Save datasets to JSON
    dataset_split["train"].to_json(train_dataset_file, orient="records", lines=True)
    dataset_split["test"].to_json(test_dataset_file, orient="records", lines=True)

    train_dataset = dataset_split["train"]
    test_dataset = dataset_split["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train and test JSON files already exist. Loading them.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train dataset size: 829
Test dataset size: 93


### Fine-Tune the Embedding Model

In [12]:
# Base checkpoint, the Matryoshka truncation sizes to evaluate, and the output folder
model_id = "sentence-transformers/all-MiniLM-L6-v2"
matryoshka_dimensions = [384, 256, 128, 64]
finetuned_model_output_dir = "minilm-uchicago-msads-finetuned_v2"

print("Setting up evaluator...")
# Retrieval corpus = every context (train + test); queries = test questions only.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])
corpus = dict(zip(corpus_dataset["id"], corpus_dataset["positive"]))
queries_test = dict(zip(test_dataset["id"], test_dataset["anchor"]))

# A question and its source context share the same id, so each test query
# has exactly one relevant document: the one with its own id.
relevant_docs_test = {query_id: [query_id] for query_id in queries_test}

# One IR evaluator per truncation dimension, run one after another.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries_test,
        corpus=corpus,
        relevant_docs=relevant_docs_test,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": cos_sim},
    )
    for dim in matryoshka_dimensions
]

evaluator = SequentialEvaluator(matryoshka_evaluators)
print("Evaluator is ready.")

Setting up evaluator...
Evaluator is ready.


#### Eval initial model before finetuning

In [19]:
print("Evaluating baseline model (before fine-tuning)...")
base_model = SentenceTransformer(model_id)
baseline_results = evaluator(base_model)

print("\n--- Baseline Model Performance (ndcg@10) ---")
for dim in matryoshka_dimensions:
    key = f"dim_{dim}_cosine_ndcg@10"
    score = baseline_results.get(key)
    # Applying ':.4f' to the 'N/A' fallback string raises ValueError,
    # so only real scores go through the float format.
    print(f"{key}: {score:.4f}" if score is not None else f"{key}: N/A")

Evaluating baseline model (before fine-tuning)...

--- Baseline Model Performance (ndcg@10) ---
dim_384_cosine_ndcg@10: 0.2570
dim_256_cosine_ndcg@10: 0.2345
dim_128_cosine_ndcg@10: 0.2309
dim_64_cosine_ndcg@10: 0.1768


#### Finetuning

In [13]:
print("\nConfiguring fine-tuning...")
# Pick the best available accelerator: CUDA, then Apple Silicon (MPS), then CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

# Metadata recorded in the model card of the fine-tuned checkpoint
model_card = SentenceTransformerModelCardData(
    language="en",
    license="apache-2.0",
    model_name="all-MiniLM-L6-v2-uchicago-msads-matryoshka_v2",
)

# Load the base model for training on the selected device
model = SentenceTransformer(model_id, model_card_data=model_card, device=device)

# Loss: MultipleNegativesRankingLoss wrapped in MatryoshkaLoss over the configured dimensions
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model,
    inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)


Configuring fine-tuning...
Using device: mps


In [21]:
# --- SAFE MODE FIX FOR LAPTOP FREEZING ---
# Builds the SentenceTransformerTrainingArguments used for fine-tuning, with
# small train/eval batch sizes intended to keep memory usage low and avoid
# system-wide freezes.

print("Using 'Safe Mode' Training Arguments to prevent RAM exhaustion.")

# Set device (recomputed here; `args` below does not take it as an argument)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available(): # For Apple Silicon
    device = "mps"
else:
    device = "cpu"

args = SentenceTransformerTrainingArguments(
    output_dir=finetuned_model_output_dir,
    num_train_epochs=5,

    # --- HERE IS THE FIX ---
    per_device_train_batch_size=4,  # 1. Training fix: small per-step batch to limit memory
    gradient_accumulation_steps=8,  # 2. Accumulate 8 steps per update: 4 * 8 = 32 examples (an earlier note said "2 * 8 = 16", which does not match these values)
    per_device_eval_batch_size=2,   # 3. Evaluator fix: Force evaluation to run in tiny batches
    # --- END OF FIX ---
    # NOTE(review): with MultipleNegativesRankingLoss the in-batch negatives
    # presumably still come from each 4-example micro-batch, so accumulation may
    # not fully reproduce large-batch quality. TODO confirm.

    learning_rate=2e-5,
    warmup_ratio=0.1,
    optim="adamw_torch",
    fp16=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Best checkpoint is selected by the 128-dim evaluator's NDCG@10.
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",
    logging_steps=25,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

Using 'Safe Mode' Training Arguments to prevent RAM exhaustion.


#### Training model with synthetic data

In [22]:
# Build the trainer. The loss consumes columns in the order (anchor, positive),
# so the training set is narrowed to exactly those two columns.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.select_columns(["anchor", "positive"]),
    loss=train_loss,
    evaluator=evaluator,
)

# Fine-tune, then write the resulting model to the configured output directory.
print("--- Starting Model Fine-Tuning ---\nThis will take some time...")
trainer.train()
print("--- Training Complete ---")

trainer.save_model()
print(f"Fine-tuned model saved to {finetuned_model_output_dir}")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

--- Starting Model Fine-Tuning ---
This will take some time...




Epoch,Training Loss,Validation Loss,Dim 384 Cosine Accuracy@1,Dim 384 Cosine Accuracy@3,Dim 384 Cosine Accuracy@5,Dim 384 Cosine Accuracy@10,Dim 384 Cosine Precision@1,Dim 384 Cosine Precision@3,Dim 384 Cosine Precision@5,Dim 384 Cosine Precision@10,Dim 384 Cosine Recall@1,Dim 384 Cosine Recall@3,Dim 384 Cosine Recall@5,Dim 384 Cosine Recall@10,Dim 384 Cosine Ndcg@10,Dim 384 Cosine Mrr@10,Dim 384 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
1,1.8774,No log,0.268817,0.44086,0.483871,0.569892,0.268817,0.146953,0.096774,0.056989,0.268817,0.44086,0.483871,0.569892,0.414863,0.365707,0.383556,0.258065,0.430108,0.473118,0.55914,0.258065,0.143369,0.094624,0.055914,0.258065,0.430108,0.473118,0.55914,0.403086,0.353546,0.373586,0.236559,0.44086,0.537634,0.612903,0.236559,0.146953,0.107527,0.06129,0.236559,0.44086,0.537634,0.612903,0.418961,0.357015,0.371233,0.225806,0.354839,0.430108,0.548387,0.225806,0.11828,0.086022,0.054839,0.225806,0.354839,0.430108,0.548387,0.372112,0.317405,0.334989,0.372112
2,0.5887,No log,0.258065,0.419355,0.494624,0.580645,0.258065,0.139785,0.098925,0.058065,0.258065,0.419355,0.494624,0.580645,0.408579,0.354583,0.373718,0.247312,0.397849,0.473118,0.569892,0.247312,0.132616,0.094624,0.056989,0.247312,0.397849,0.473118,0.569892,0.39673,0.342469,0.362716,0.236559,0.397849,0.483871,0.569892,0.236559,0.132616,0.096774,0.056989,0.236559,0.397849,0.483871,0.569892,0.395313,0.340199,0.35793,0.236559,0.376344,0.473118,0.580645,0.236559,0.125448,0.094624,0.058065,0.236559,0.376344,0.473118,0.580645,0.391408,0.332642,0.349559,0.391408
3,0.4201,No log,0.27957,0.451613,0.505376,0.591398,0.27957,0.150538,0.101075,0.05914,0.27957,0.451613,0.505376,0.591398,0.430337,0.379254,0.39683,0.258065,0.419355,0.505376,0.602151,0.258065,0.139785,0.101075,0.060215,0.258065,0.419355,0.505376,0.602151,0.418645,0.36097,0.378207,0.258065,0.44086,0.526882,0.602151,0.258065,0.146953,0.105376,0.060215,0.258065,0.44086,0.526882,0.602151,0.425608,0.369265,0.385283,0.268817,0.419355,0.483871,0.623656,0.268817,0.139785,0.096774,0.062366,0.268817,0.419355,0.483871,0.623656,0.424737,0.363539,0.377332,0.424737
4,0.3929,No log,0.290323,0.451613,0.505376,0.623656,0.290323,0.150538,0.101075,0.062366,0.290323,0.451613,0.505376,0.623656,0.444043,0.388245,0.406184,0.268817,0.430108,0.516129,0.634409,0.268817,0.143369,0.103226,0.063441,0.268817,0.430108,0.516129,0.634409,0.435653,0.373622,0.390762,0.258065,0.44086,0.537634,0.634409,0.258065,0.146953,0.107527,0.063441,0.258065,0.44086,0.537634,0.634409,0.435776,0.373088,0.388734,0.258065,0.376344,0.483871,0.612903,0.258065,0.125448,0.096774,0.06129,0.258065,0.376344,0.483871,0.612903,0.411514,0.349441,0.363992,0.411514
5,0.3656,No log,0.290323,0.451613,0.505376,0.602151,0.290323,0.150538,0.101075,0.060215,0.290323,0.451613,0.505376,0.602151,0.437221,0.385514,0.404241,0.27957,0.451613,0.526882,0.623656,0.27957,0.150538,0.105376,0.062366,0.27957,0.451613,0.526882,0.623656,0.439854,0.382079,0.398815,0.247312,0.430108,0.526882,0.580645,0.247312,0.143369,0.105376,0.058065,0.247312,0.430108,0.526882,0.580645,0.411766,0.357258,0.376722,0.268817,0.397849,0.494624,0.612903,0.268817,0.132616,0.098925,0.06129,0.268817,0.397849,0.494624,0.612903,0.420558,0.361137,0.375399,0.420558




--- Training Complete ---
Fine-tuned model saved to minilm-uchicago-msads-finetuned_v2


#### Eval finetuned model

In [23]:
print("\nEvaluating fine-tuned model...")
# Reload the saved model from the training output directory and score it
# with the same evaluator used for the baseline.
fine_tuned_model = SentenceTransformer(
    args.output_dir, device=device
)
finetuned_results = evaluator(fine_tuned_model)

print("\n--- Fine-Tuned Model Performance (ndcg@10) ---")
for dim in matryoshka_dimensions:
    key = f"dim_{dim}_cosine_ndcg@10"
    score = finetuned_results.get(key)
    # Applying ':.4f' to the 'N/A' fallback string raises ValueError,
    # so only real scores go through the float format.
    print(f"{key}: {score:.4f}" if score is not None else f"{key}: N/A")


Evaluating fine-tuned model...

--- Fine-Tuned Model Performance (ndcg@10) ---
dim_384_cosine_ndcg@10: 0.4440
dim_256_cosine_ndcg@10: 0.4357
dim_128_cosine_ndcg@10: 0.4358
dim_64_cosine_ndcg@10: 0.4115


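The fine-tuned model outperforms the baseline at every embedding dimension; for example, NDCG@10 at 384 dimensions rises from 0.2570 to 0.4440, and at 64 dimensions from 0.1768 to 0.4115.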

### Build RAG with newly trained embedding model

In [18]:
# --- REPLACEMENT FOR NOTEBOOK CELL 24 ---

from langchain_community.vectorstores.utils import filter_complex_metadata

# Define the path where the database will be saved
DB_PERSIST_PATH = "chroma_db_finetuned"

print(f"Loading fine-tuned embeddings from {finetuned_model_output_dir}...")

# Set device for embeddings
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available(): # For Apple Silicon
    device = "mps"
else:
    device = "cpu"

embedding_function = HuggingFaceEmbeddings(
    model_name=finetuned_model_output_dir,
    model_kwargs={'device': device},
    encode_kwargs={'normalize_embeddings': True}
)

# NOTE(review): filter_complex_metadata presumably drops non-scalar metadata
# values (e.g. 'keywords', 'semantic_tags', 'embedded_links'), so retrieved
# documents may no longer carry them. Confirm this is acceptable downstream.
print("Filtering complex metadata from nodes...")
filtered_nodes = filter_complex_metadata(nodes)

# Reuse each node's own id as the vector-store id. Without explicit ids,
# re-running this cell against the same persist_directory adds a second copy
# of every document; with them, a re-run overwrites the existing entries.
node_ids = [doc.metadata["id"] for doc in filtered_nodes]

print(f"Creating and saving vector store to: {DB_PERSIST_PATH}...")
# This command creates the DB and saves it to the persist_directory
vectorstore = Chroma.from_documents(
    documents=filtered_nodes,
    embedding=embedding_function,
    ids=node_ids,
    persist_directory=DB_PERSIST_PATH
)

print("Vector store built and saved locally.")

Loading fine-tuned embeddings from minilm-uchicago-msads-finetuned_v2...
Filtering complex metadata from nodes...
Creating and saving vector store to: chroma_db_finetuned...
Vector store built and saved locally.


In [15]:
# Cheap to re-run: only the retriever is rebuilt, not the vector store.
# Tune k_value (e.g. 5, 8, 10) to change how many chunks each query retrieves.
k_value = 10

retriever = vectorstore.as_retriever(search_kwargs=dict(k=k_value))
print(f"Retriever is ready with k={k_value}.")

Retriever is ready with k=10.


#### Defining the RAG Chain

In [21]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# 1. Define LLMs
# Both roles use the same deterministic (temperature 0) gpt-4o configuration.
GPT4O_SETTINGS = {"model_name": "gpt-4o", "temperature": 0}

llm_rag = ChatOpenAI(**GPT4O_SETTINGS)         # final answer generation
llm_correction = ChatOpenAI(**GPT4O_SETTINGS)  # correcting user typos

# 2. Document formatter
# Turns retrieved documents into the context text injected into the prompt.
def _parse_embedded_links(raw_links):
    """Normalise an `embedded_links` metadata value to a list of link dicts.

    Vector stores generally persist only scalar metadata, so the links may
    arrive as a JSON string instead of the original list. Accepts a list/tuple
    of dicts, a single dict, or a JSON string encoding either; anything else
    (including unparseable strings) yields an empty list. Non-dict entries are
    skipped.
    """
    if isinstance(raw_links, str):
        try:
            raw_links = json.loads(raw_links)
        except ValueError:
            # Best effort: unparseable link metadata must not break retrieval.
            return []
    if isinstance(raw_links, dict):
        raw_links = [raw_links]
    if not isinstance(raw_links, (list, tuple)):
        return []
    return [link for link in raw_links if isinstance(link, dict)]


def format_docs_with_links(docs):
    """
    Formats retrieved documents to include their source URL and
    any embedded links found in the metadata.

    Args:
        docs: Iterable of objects exposing `.page_content` (str) and
            `.metadata` (dict). Reads the optional metadata keys 'source'
            and 'embedded_links' (a list of {'text', 'href'} dicts, or a
            JSON string encoding such a list).

    Returns:
        One string with a "--- Context Chunk N (Source: ...) ---" block per
        document, blocks separated by a blank line. Documents that carry
        links get an "**Embedded Links Found:**" section of Markdown links.
        Returns "" when `docs` is empty.
    """
    formatted_chunks = []
    for i, doc in enumerate(docs):
        source = doc.metadata.get('source', 'Unknown Source')

        link_parts = []
        for link in _parse_embedded_links(doc.metadata.get('embedded_links')):
            text = link.get('text', 'link')
            href = link.get('href', '#')
            # Markdown link syntax, which the prompt tells the LLM to reuse.
            link_parts.append(f"• [{text}]({href})")

        # Only emit the labelled section when there is at least one link.
        links_string = ""
        if link_parts:
            links_string = "\n\n**Embedded Links Found:**\n" + "\n".join(link_parts)

        formatted_chunks.append(
            f"--- Context Chunk {i+1} (Source: {source}) ---\n"
            f"{doc.page_content}"
            f"{links_string}"
        )

    return "\n\n".join(formatted_chunks)

# 3. System prompt for the answer-generation model
# `{context}` is the only template placeholder in this string. Each numbered
# rule ends with "\n" so the model sees one rule per line.
system_prompt = (
    "You are an expert assistant for the University of Chicago's MS in "
    "Applied Data Science program. "
    "Use the following pieces of retrieved context to answer the question. "
    "Each context chunk includes its source URL and may also include a list "
    "of 'Embedded Links Found'.\n\n"

    "**Critical Rules for Answering:**\n"
    "1.  **Ground your answer in the context.** Quote or paraphrase the provided text.\n"
    "2.  **If the context chunk you use has an 'Embedded Links Found' section,** "
    "you MUST include the relevant link(s) in your answer. "
    "Format them as [Link Text](URL).\n"
    "3.  If your answer is a fact (like a tuition number), you do not need a link "
    "unless one is explicitly provided in the 'Embedded Links Found' section.\n"
    "4.  If you don't know the answer from the context, just say that you don't know.\n\n"

    "**Example Answer:**\n"
    "For questions regarding an application fee waiver, you should refer to "
    "the [Physical Sciences Division fee waiver policy](https://...#FeeWaiver).\n\n"

    "Context:\n{context}"
)

# Chat prompt: the system message carries the rules and `{context}`,
# the human message carries the user's `{question}`.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{question}"),
])

# 4. Final answer chain
# Fills the prompt, calls the answer model, and parses the reply to a string.
question_answer_chain = prompt | llm_rag | StrOutputParser()

# 5. Query-correction chain
# Instructs the model to return the user's query with typos fixed and
# nothing else (it must not answer the question).
correction_prompt_template = (
    "You are an expert query assistant. Your job is to correct any typos in a "
    "user's query. Do not answer the question, just output the corrected query. "
    "User query: {question}"
)
correction_prompt = ChatPromptTemplate.from_template(correction_prompt_template)

# Input: the raw query string. It is placed under the `question` key the
# prompt expects, sent to the correction model, and parsed back to a string.
correction_chain = (
    RunnableParallel(question=RunnablePassthrough())
    | correction_prompt
    | llm_correction
    | StrOutputParser()
)

# 6. Context retrieval with typo correction
# Input: the original query string. The chain
#   1. runs the correction chain to get a corrected query string,
#   2. searches the retriever with that corrected query,
#   3. formats the retrieved documents into one context string.
# The corrected query is the only value retrieval needs, so the correction
# chain is piped straight into the retriever with no intermediate fan-out.
context_retrieval_chain = (
    correction_chain
    | retriever
    | format_docs_with_links
)

# 7. Main RAG chain
# Input: the user's query string. `context` is built from the typo-corrected
# query, while `question` forwards the ORIGINAL, uncorrected query to the
# final prompt. `.assign` then adds the generated `answer` to the result.
rag_chain = RunnableParallel(
    {
        "context": context_retrieval_chain,
        "question": RunnablePassthrough(),
    }
).assign(answer=question_answer_chain)

print("RAG chain with typo correction is built and ready.")

RAG chain with typo correction is built and ready.


#### Run Example Queries

#### Eval Set

In [22]:
import textwrap

print("\n--- Running Comprehensive Evaluation Queries ---")

# Evaluation questions run through the RAG chain; the printed answers are
# meant to be compared by hand against the sample answers.
evaluation_queries = [
    "What is tuition cost for the program?",
    "What scholarships are available for the program?",
    "What are the minimum scores for the TOEFL and IELTS English Language Requirement?",
    "Is there an application fee waiver?",
    "What are the deadlines for the in-person program?",
    "How long will it take for me to receive a decision on my application?",
    "Can I set up an advising appointment with the enrollment management team?",
    "Where can I mail my official transcripts?",
    "Does the Master’s in Applied Data Science Online program provide visa sponsorship?",
    "How do I apply to the MBA/MS program?",
    "Is the MS in Applied Data Science program STEM/OPT eligible?",
    "How many courses must you complete to earn UChicago’s Master’s in Applied Data Science?"
]

print(f"Running {len(evaluation_queries)} evaluation queries...")
print("You can compare the 'Answer:' from the model to your sample answers to see how well it's working.")


def wrap_answer(answer, width=80):
    """Wrap `answer` to `width` columns without destroying its structure.

    Each input line is wrapped on its own, so paragraph breaks and list items
    survive (a single `textwrap.fill` over the whole text would merge them
    into one paragraph). Long words are never split, which keeps Markdown
    link URLs intact even when they exceed `width`.

    Args:
        answer: Text to wrap.
        width: Maximum line width for ordinary words.

    Returns:
        The wrapped text; "" for an empty answer.
    """
    return "\n".join(
        textwrap.fill(line, width, break_long_words=False, break_on_hyphens=False)
        for line in answer.splitlines()
    )


for query in evaluation_queries:
    print(f"\nQuery: {query}")

    # The chain's result is a mapping; the generated text is under 'answer'.
    response = rag_chain.invoke(query)

    print(f"Answer:\n{wrap_answer(response['answer'])}")
    print("-" * 80)

print("\n--- RAG Evaluation Complete ---")


--- Running Comprehensive Evaluation Queries ---
Running 12 evaluation queries...
You can compare the 'Answer:' from the model to your sample answers to see how well it's working.

Query: What is tuition cost for the program?
Answer:
The tuition for the MS in Applied Data Science program at the University of
Chicago is $6,384 per course. For the 12-course program, the total tuition is
$76,608, and for the 18-course program, it is $114,912. Additionally, there is a
non-refundable program enrollment deposit of $1,500, which is credited toward
your first quarter’s tuition balance.
--------------------------------------------------------------------------------

Query: What scholarships are available for the program?
Answer:
The MS in Applied Data Science program at the University of Chicago offers
partial tuition merit-based scholarships to top applicants. These scholarships
do not require a separate application, but it is recommended that candidates
submit their applications ahead of th