In [12]:
# Cell 1: Install dependencies
!pip install openai faiss-cpu pandas numpy --quiet



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
# Cell 2: Load your chunks
import pandas as pd
df = pd.read_csv("heading_sliding_chunks.csv")  # url, section, chunk_id, text
texts = df["text"].tolist()
total = len(texts)


In [16]:
# Cell 3: Initialize OpenAI client
from openai import OpenAI
import numpy as np

client = OpenAI(api_key="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")  # ← replace with your key


In [20]:
# Cell 4: Embed in batches with progress
batch_size = 64
embeddings = []
processed = 0

for i in range(0, total, batch_size):
    batch = texts[i : i + batch_size]
    resp = client.embeddings.create(
        input=batch,
        model="text-embedding-ada-002"
    )
    embeddings.extend([np.array(d.embedding) for d in resp.data])
    processed += len(batch)
    if processed % 2000 == 0 or processed == total:
        print(f"✅ Processed {processed}/{total} embeddings")

df["embedding"] = embeddings
df.to_pickle("chunks_with_openai_embeddings.pkl")
print("✅ All embeddings saved.")


✅ Processed 8000/48901 embeddings
✅ Processed 16000/48901 embeddings
✅ Processed 24000/48901 embeddings
✅ Processed 32000/48901 embeddings
✅ Processed 40000/48901 embeddings
✅ Processed 48000/48901 embeddings
✅ Processed 48901/48901 embeddings
✅ All embeddings saved.


In [22]:
# Cell 5: Build FAISS index
import faiss

# Normalize helper
def normalize(v): return v / np.linalg.norm(v)

emb_matrix = np.stack(df["embedding"].apply(normalize).values)
d = emb_matrix.shape[1]
index = faiss.IndexFlatIP(d)
index.add(emb_matrix)
print(f"✅ FAISS index built with {index.ntotal} vectors.")


✅ FAISS index built with 48901 vectors.


In [40]:
# Cell 6: Retrieval + RAG answer function
def retrieve_chunks(query, k=5):
    qv = normalize(np.array(client.embeddings.create(
        input=[query], model="text-embedding-ada-002"
    ).data[0].embedding))
    D, I = index.search(np.array([qv]), k)
    return df.iloc[I[0]][["url","section","text"]].to_dict("records")

def generate_answer(query):
    top = retrieve_chunks(query)
    context = "\n\n".join(
        f"{c['text']}\n(Source: {c['url']}, Section: {c['section']})"
        for c in top
    )
    prompt = (
        "You are a helpful UTD chatbot.\n\n"
        "Use the context below to answer the question. If listing items, format them as bullet points.\n\n"
        f"Context:\n{context}\n\n"
        f"Question:\n{query}\n\n"
        "Answer:"
    )
    resp = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role":"user","content":prompt}]
    )
    return resp.choices[0].message.content
top = retrieve_chunks("Deans Excellence Scholarship")
# Test it
print(generate_answer("Deans Excellence Scholarship?"))
print(top)

Students enrolling in the MS Business Analytics and Artificial Intelligence Cohort or MS Business Analytics and Artificial Intelligence Cohort Online programs at the University of Texas at Dallas are not eligible for the Dean’s Excellence Scholarship. These students should visit their program webpages or director for specific scholarship opportunities associated with those programs.
[{'url': 'https://jindal.utdallas.edu/consortium-online-graduate-business-education/founding-schools', 'section': 'North Carolina State University – Poole College of Management', 'text': 'Assistant Dean of Graduate Programs'}, {'url': 'https://jindal.utdallas.edu/consortium-online-graduate-business-education/founding-schools', 'section': 'Arizona State University – WP Carey School of Business', 'text': 'Assistant Dean of Graduate Programs'}, {'url': 'https://jindal.utdallas.edu/blog/jsom-jump-starts-fall-2023', 'section': 'Food, Fun and Friends', 'text': 'undergraduate dean.'}, {'url': 'https://jindal.utdal