In [34]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
def install_if_missing(package, import_name=None):
    """Install ``package`` with pip unless its module can already be imported.

    Args:
        package: Distribution name handed to ``pip install``
            (for example ``"langchain-community"``).
        import_name: Module name used for the import probe. When omitted, the
            distribution name is used with hyphens replaced by underscores,
            because a hyphenated name can never be imported as a module.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    import importlib
    import subprocess
    import sys

    module_name = import_name or package.replace("-", "_")
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Run pip through the current interpreter so the package is installed
        # into the environment this kernel is actually using.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the newly installed files.
        importlib.invalidate_caches()

# Install packages as needed.
# Distribution names containing hyphens are not valid module names, so the
# module to probe is named explicitly for those packages.
install_if_missing("pandas")
install_if_missing("numpy")
install_if_missing("langchain")
install_if_missing("langchain-community", "langchain_community")
install_if_missing("langchain-chroma", "langchain_chroma")
install_if_missing("langchain-openai", "langchain_openai")

import pandas as pd
import numpy as np

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from langchain import hub


import glob
import os

# ANSI escape sequences: BOLD switches bold text on, END resets formatting.
BOLD = "\033[1m"
END   = "\033[0m"

In [36]:
from getpass import getpass

# Prompt for each credential; getpass keeps the typed value out of the cell output.
OPENAI_KEY = getpass("Enter your OpenAI API key:")
LANGCHAIN_API_KEY = getpass("Enter your LangSmith API key:")
HUGGINGFACEHUB_API_TOKEN = getpass("Enter your HuggingFace API token:")

# Expose the credentials to the process environment in a single step.
os.environ.update(
    {
        "HUGGINGFACEHUB_API_TOKEN": HUGGINGFACEHUB_API_TOKEN,
        "LANGCHAIN_API_KEY": LANGCHAIN_API_KEY,
        "OPENAI_API_KEY": OPENAI_KEY,
    }
)

Enter your OpenAI API key:··········
Enter your LangSmith API key:··········
Enter your HuggingFace API token:··········


## Importing

In [None]:
def load_md_docs(base_folder: str, single_mode: bool) -> list[Document]:
    """
    Load markdown pages from ``base_folder`` as ``Document`` objects.

    single_mode=True : load only 012_%20.md
    single_mode=False: load every .md file in base_folder except 012_%20.md

    Each file is read as UTF-8. The returned documents carry the file's base
    name in ``metadata["source"]`` and are ordered by filename, so repeated
    runs produce the same document (and therefore chunk) order regardless of
    the order in which the filesystem lists the directory.

    Raises:
        OSError: If ``base_folder`` cannot be listed or a file cannot be opened
            (including ``FileNotFoundError`` when 012_%20.md is missing in
            single mode).
    """

    # the one file to isolate
    special_name = "012_%20.md"

    if single_mode:
        filenames = [special_name]
    else:
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        filenames = sorted(
            fn for fn in os.listdir(base_folder)
            if fn.endswith(".md") and fn != special_name
        )

    docs = []
    for filename in filenames:
        with open(os.path.join(base_folder, filename), "r", encoding="utf-8") as f:
            text = f.read()
        docs.append(Document(page_content=text, metadata={"source": filename}))
    return docs

# ─── USAGE ─────────────────────────────────────────────────────────────
# Folder on the mounted Drive that holds the markdown pages
base_folder = "/content/drive/MyDrive/Gen AI Shared/msads_pages/pages"

# True  -> load only 012_%20.md
# False -> load every other .md page
SINGLE_MODE = False
docs = load_md_docs(base_folder, SINGLE_MODE)

# Report how many pages were loaded, then list them one per line.
print("Loaded {} document(s):".format(len(docs)))
for loaded in docs:
    print(f" • {loaded.metadata['source']}")

Loaded 12 document(s):
 • 000_ms-in-applied-data-science.md
 • 001_in-person-program.md
 • 002_capstone-projects.md
 • 003_course-progressions.md
 • 004_how-to-apply.md
 • 005_events-deadlines.md
 • 006_our-students.md
 • 007_instructors-staff.md
 • 008_faqs.md
 • 009_career-outcomes.md
 • 010_online-program.md
 • 011_tuition-fees-aid.md


## Chunking

In [47]:
# Splitter configuration shared by the chunking step below.
# The statistics printed after splitting measure chunk length with len(), i.e. in characters.
# NOTE(review): add_start_index=True presumably stores each chunk's offset in its
# metadata — confirm against the LangChain text-splitter documentation.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

def add_url_to_chunks(docs, splitter):
    """Split every document and tag each resulting chunk with its source URL.

    The URL is taken from the first line, among the first five lines of the
    document text, that starts with "# Source: ". When no such line exists
    the chunks are tagged with "Unknown".

    Args:
        docs: Iterable of documents exposing ``page_content``.
        splitter: Object providing ``split_documents(list_of_docs)``.

    Returns:
        A single list with the chunks of all documents, in input order, each
        with ``metadata['url']`` set.
    """
    marker = "# Source: "
    tagged_chunks = []

    for document in docs:
        # Only the head of the document is inspected for the source marker.
        header_lines = document.page_content.split('\n')[:5]
        source_url = next(
            (
                candidate.replace(marker, "").strip()
                for candidate in header_lines
                if candidate.startswith(marker)
            ),
            "Unknown",
        )

        pieces = splitter.split_documents([document])
        for piece in pieces:
            piece.metadata['url'] = source_url

        tagged_chunks += pieces

    return tagged_chunks

# Use the SINGLE_MODE you set earlier (respects your toggle choice)
docs = load_md_docs(base_folder, SINGLE_MODE)
all_splits = add_url_to_chunks(docs, splitter)

# Every statistic below (average, min/max, first-chunk lookup) is undefined
# for an empty chunk list, so stop here with an explicit message instead of
# an opaque ZeroDivisionError / IndexError further down.
if not all_splits:
    raise ValueError(
        f"No chunks were produced from {len(docs)} document(s) in {base_folder!r}; "
        "check the folder contents and the SINGLE_MODE toggle."
    )

# Chunk lengths in characters
sizes = [len(c.page_content) for c in all_splits]
print(f"\nUsing Mode = {'ONLY 012_%20.md' if SINGLE_MODE else 'ALL except 012_%20.md'}")
print(f" • Documents loaded  : {len(docs)}")
print(f" • Total chunks      : {len(all_splits)}")
print(f" • Avg chunk length  : {sum(sizes)/len(sizes):.0f} chars")
print(f" • Chunk length range: {min(sizes)}–{max(sizes)} chars")

print(f"{BOLD}\nTotal number of chunks after splitting{END}: {len(all_splits)}")
print(f"{BOLD}Character count on first chunk{END}: {len(all_splits[0].page_content)}")
print(f"{BOLD}First chunk URL{END}: {all_splits[0].metadata.get('url', 'No URL')}")
print(f"{BOLD}Smallest chunk size{END}: {min(sizes)} characters")
print(f"{BOLD}Largest chunk size{END}: {max(sizes)} characters")


Using Mode = ALL except 012_%20.md
 • Documents loaded  : 12
 • Total chunks      : 331
 • Avg chunk length  : 708 chars
 • Chunk length range: 41–998 chars
[1m
Total number of chunks after splitting[0m: 331
[1mCharacter count on first chunk[0m: 885
[1mFirst chunk URL[0m: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/
[1mSmallest chunk size[0m: 41 characters
[1mLargest chunk size[0m: 998 characters


## Indexing

In [51]:
# Embed model + upload into DB

persist_directory = "/content/drive/MyDrive/Gen AI Shared/msads_vectorstore_FINAL"

# Give every chunk a stable id: "<source file>:<n-th chunk of that file>".
# Without explicit ids the chunks are stored under freshly generated ids on
# each run, so re-running this cell against the same persist_directory adds a
# second copy of every chunk instead of recreating the store.
# NOTE(review): this relies on the Chroma wrapper overwriting entries whose id
# already exists — confirm against the langchain_chroma documentation. Entries
# written by earlier runs under other ids are not removed; delete the
# directory once if it already contains such entries.
chunk_counts = {}
chunk_ids = []
for chunk in all_splits:
    source = chunk.metadata.get("source", "unknown")
    ordinal = chunk_counts.get(source, 0)
    chunk_counts[source] = ordinal + 1
    chunk_ids.append(f"{source}:{ordinal}")

# Recreate the vectorstore
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory=persist_directory
)

print(f"Created new vectorstore with {vectorstore._collection.count()} documents")

Created new vectorstore with 331 documents


In [53]:
import shutil

# Create zip file
source_path = "/content/drive/MyDrive/Gen AI Shared/msads_vectorstore_FINAL"
zip_path = "/content/msads_vectorstore_backup"

# make_archive appends the ".zip" suffix itself and returns the path of the
# archive it wrote; reuse that path everywhere below.
archive_path = shutil.make_archive(zip_path, 'zip', source_path)
print(f"✅ Created backup: {archive_path}")

# Check zip file size
zip_size = os.path.getsize(archive_path)
print(f"Zip file size: {zip_size / (1024*1024):.1f} MB")

# download from colab
from google.colab import files
# Use the full archive path: a bare filename is resolved against the current
# working directory and only works while that happens to be /content.
files.download(archive_path)

✅ Created backup: /content/msads_vectorstore_backup.zip
Zip file size: 1.5 MB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
################# RUN ON THE COMMAND LINE (shell commands below; not executed in this notebook)


# # Remove the old broken vectorstore first
# rm -rf msads_vectorstore

# # Move the new working one
# mv msads_vectorstore_backup msads_vectorstore


# python -c "
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_chroma import Chroma

# print('Loading embedding model...')
# embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# print('Loading vectorstore...')
# vectorstore = Chroma(persist_directory='msads_vectorstore', embedding_function=embedding_model)

# print(f'✅ Loaded {vectorstore._collection.count()} documents')

# print('Testing retrieval...')
# results = vectorstore.similarity_search('portfolio', k=2)
# print(f'✅ Retrieved {len(results)} documents')
# for i, doc in enumerate(results):
#     print(f'  Doc {i+1}: {doc.metadata.get(\"source\", \"Unknown\")}')
# "

# streamlit run streamlit_app.py

## Retrieve & Generate - Test

In [54]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [55]:
# Questions used to smoke-test the RAG chain.
test_queries = [
    "What is tuition cost for the program?",
    "What scholarships are available for the program?",
    "What are the minimum scores for the TOEFL and IELTS English Language Requirement?",
    "Is there an application fee waiver?",
    "What are the deadlines for the in-person program?",
    "How long will it take for me to receive a decision on my application?",
    "Can I set up an advising appointment with the enrollment management team?",
    "Where can I mail my official transcripts?",
    "Does the Master’s in Applied Data Science Online program provide visa sponsorship?",
    "How do I apply to the MBA/MS program?",
    "Is the MS in Applied Data Science program STEM/OPT eligible?",
    "How many courses must you complete to earn UChicago’s Master’s in Applied Data Science?",


    # ADDITIONAL QUESTIONS
    # One question written as two adjacent literals; the trailing space keeps
    # "roles" and "for" from being fused when Python concatenates them.
    "What are the career outcomes for both internships and full-time roles "
    "for this program (based on previous student's outcomes)? Where did you get this information?",
    "What are some of professors in this program?",
    "Who is this professor named Utku?",
    "Are transcripts required? If yes, which one?",
    "Where do i send my e-transcript?",
    "Do I need to send a Portfolio?"

]

# include LLM to return the source
'''def format_docs(docs):
    formatted = []
    for i, doc in enumerate(docs):
        url = doc.metadata.get("url", "Unknown")
        formatted.append(f"[Source {i+1} - {url}]:\n{doc.page_content.strip()}")
    return "\n\n".join(formatted)'''
def format_docs(docs):
    """Return the page_content of every document, joined by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Chat model used for answering and for the extraction-based compressor.
llm = ChatOpenAI(model="gpt-4", api_key=OPENAI_KEY, temperature=0)

# MMR search over the vectorstore, returning k=10 documents per query.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 10},
)

# LLM-backed compressor wrapped around the base retriever.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

# get RAG prompt template
'''prompt = ChatPromptTemplate.from_template("""
Answer the question based only on the provided context.

IMPORTANT: Always cite your sources using [source_name] notation from the context.

Context: {context}

Question: {question}

Answer with citations
""")'''
# Fetch the "rlm/rag-prompt" template through langchain's hub module.
prompt = hub.pull("rlm/rag-prompt")

# create RAG chain
'''rag_chain = (
    {"context": retriever | format_docs,
     "question": RunnablePassthrough()}
     | prompt
     | llm
     | StrOutputParser()
)'''

# RAG chain with sources
# The retrieved Document objects stay under "context" in the final result so
# the caller can list their sources, while the prompt is fed their text joined
# by format_docs instead of the raw list of Document objects.
answer_chain = (
    RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_sources = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(
    answer=answer_chain
)

In [56]:
print("TEST RESULTS:\n")
for i, query in enumerate(test_queries, 1):
    result = rag_chain_with_sources.invoke(query)

    print(f"QUERY {i}: {query}")
    print(f"ANSWER:\n{result['answer']}\n")

    # Source file names of the retrieved documents, de-duplicated while
    # keeping first-seen order (dict keys preserve insertion order).
    unique_urls = list(
        dict.fromkeys(
            doc.metadata.get("source", "Unknown") for doc in result["context"]
        )
    )

    if unique_urls:
        print("SOURCES:")
        print("\n".join(f"- {src}" for src in unique_urls))
    print(f"\n{'-' * 80}\n")

TEST RESULTS:

QUERY 1: What is tuition cost for the program?
ANSWER:
The tuition for the MS in Applied Data Science program is $6,384 per course, with a total tuition cost of $76,608. There is also a non-refundable program enrollment deposit of $1,500, which is credited toward your first quarter’s tuition balance. Please note that tuition is expected to increase 3-7% per year.

SOURCES:
- 011_tuition-fees-aid.md
- 001_in-person-program.md
- 008_faqs.md
- 004_how-to-apply.md
- 003_course-progressions.md

--------------------------------------------------------------------------------

QUERY 2: What scholarships are available for the program?
ANSWER:
The MS in Applied Data Science program offers merit-based scholarships and partial tuition scholarships to top applicants. These scholarships do not require a separate application. Two specific scholarships mentioned are The Data Science Institute Scholarship and the MS in Applied Data Science Alumni Scholarship.

SOURCES:
- 011_tuition-fee

---