In [1]:
from pathlib import Path
import os
from typing import List, Dict, Any

import pandas as pd
from dotenv import load_dotenv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
# Locate the project root: the first of the working directory, its parent, or
# its grandparent that contains both a "data" and a "src" entry.
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

for candidate in candidates:
    has_data = (candidate / "data").exists()
    if has_data and (candidate / "src").exists():
        project_root = candidate
        break
else:
    # No candidate matched: fall back to the parent of the working directory.
    project_root = cwd.parent

print("Detected project root:", project_root)

# Load environment variables from <project_root>/.env when that file is present.
env_path = project_root / ".env"
print("Looking for .env at:", env_path)

if not env_path.exists():
    print("NOTE: .env not found. This notebook does not require any API keys.")
else:
    load_dotenv(dotenv_path=env_path)


Detected project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Looking for .env at: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\.env


# Load data from Notebook 1

In [3]:
# Chunk exports expected under data/processed/; the Parquet copy is preferred
# and the CSV copy is only read when the Parquet file is absent.
DATA_DIR = project_root / "data"
PROCESSED_DIR = DATA_DIR / "processed"

chunks_parquet = PROCESSED_DIR / "lecture_chunks.parquet"
chunks_csv = PROCESSED_DIR / "lecture_chunks.csv"

have_parquet = chunks_parquet.exists()
if not have_parquet and not chunks_csv.exists():
    # Neither export is available, so there is nothing to load.
    raise FileNotFoundError(
        "Could not find lecture_chunks.parquet or lecture_chunks.csv in data/processed/. "
        "Please run Notebook 1 first to generate them."
    )

if have_parquet:
    chunks_df = pd.read_parquet(chunks_parquet)
    print(f"Loaded chunks from {chunks_parquet}")
else:
    chunks_df = pd.read_csv(chunks_csv)
    print(f"Loaded chunks from {chunks_csv}")

# Quick sanity check of what was loaded.
print("DataFrame shape:", chunks_df.shape)
display(chunks_df.head(5))


Loaded chunks from C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\lecture_chunks.parquet
DataFrame shape: (75, 6)


Unnamed: 0,text,course,lecture_id,source,page,chunk_id
0,8\nModelling Long-Run Relationships in Finance...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,0
1,and why it is essential that variables that ar...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,1
2,behaviour and properties\n. To offer one illus...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,2
3,t\n will not have a\nsmaller effect in time \n...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,3
4,"are totally unrelated. So, if standard regress...",default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,4


In [4]:
# Build the document list: missing chunk text becomes an empty string so that
# every row of chunks_df yields exactly one document.
corpus = (
    chunks_df["text"]
    .fillna("")
    .astype(str)
    .tolist()
)

# Vocabulary is capped at 5000 features built from unigrams and bigrams.
tfidf_settings = {"max_features": 5000, "ngram_range": (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vectorizer on the corpus and vectorize it in the same call.
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (75, 5000)


In [5]:
def semantic_search_local(query: str, k: int = 4) -> "tuple[np.ndarray, pd.DataFrame]":
    """
    Rank lecture chunks against a query using TF-IDF + cosine similarity.

    The query is vectorized with the notebook-level ``vectorizer`` and compared
    against every row of ``tfidf_matrix``; matching rows are then looked up
    positionally in ``chunks_df``.

    Args:
        query: Non-empty search string.
        k: Maximum number of results to return. Must be at least 1; it is
            capped at the number of chunks.

    Returns:
        A ``(top_indices, result_df)`` tuple:
          - ``top_indices``: positional indices into ``chunks_df`` of the
            top-k chunks, highest similarity first.
          - ``result_df``: one row per result with the columns ``rank``,
            ``similarity``, ``lecture_id``, ``page``, ``source`` and
            ``text_preview`` (first 300 characters, newlines replaced by
            spaces), for easy inspection.

    Raises:
        ValueError: If ``query`` is not a non-empty string, or if ``k`` is
            smaller than 1.

    Note:
        No minimum-similarity filter is applied, so the result can contain
        chunks whose similarity is 0.0. Check the ``similarity`` column before
        treating a row as a real match.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string.")

    # Reject non-positive k up front: the slice `[-0:]` selects the WHOLE
    # array and a negative k would silently drop leading elements instead of
    # limiting the result size.
    if k < 1:
        raise ValueError("k must be a positive integer.")

    query_vec = vectorizer.transform([query])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]  # shape: (n_chunks,)

    # Never ask for more results than there are chunks.
    k = min(k, len(sims))
    # argsort is ascending, so take the last k entries and reverse them to get
    # the best match first.
    top_indices = np.argsort(sims)[-k:][::-1]

    rows = []
    for rank, idx in enumerate(top_indices, start=1):
        row = chunks_df.iloc[idx]
        rows.append({
            "rank": rank,
            "similarity": float(sims[idx]),
            # Metadata columns are optional; fall back to neutral defaults.
            "lecture_id": row.get("lecture_id", ""),
            "page": row.get("page", None),
            "source": row.get("source", ""),
            # Single-line, truncated preview keeps the table readable.
            "text_preview": str(row["text"])[:300].replace("\n", " "),
        })

    result_df = pd.DataFrame(rows)
    return top_indices, result_df


In [6]:
# Smoke-test the search with a few sample queries, showing the top 3 hits each.
test_queries = [
    "linear regression",
    "gradient descent",
    "probability distribution",
]

separator = "=" * 80
for query_text in test_queries:
    print(separator)
    print("QUERY:", query_text)
    # Only the ranked table is needed here; the index array is discarded.
    result_df = semantic_search_local(query_text, k=3)[1]
    display(result_df)


QUERY: linear regression


Unnamed: 0,rank,similarity,lecture_id,page,source,text_preview
0,1,0.129972,sample_data,21,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,( 8.47) Taking everything except the residuals...
1,2,0.111606,sample_data,1,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,"are totally unrelated. So, if standard regress..."
2,3,0.100327,sample_data,21,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,where All that has been done is to take one ...


QUERY: gradient descent


Unnamed: 0,rank,similarity,lecture_id,page,source,text_preview
0,1,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,"y t − γ x t ), for this would imply that y ..."
1,2,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,Chapter 5 . The error correction model is some...
2,3,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,x t are cointegrated with cointegrating coeff...


QUERY: probability distribution


Unnamed: 0,rank,similarity,lecture_id,page,source,text_preview
0,1,0.158606,sample_data,12,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,( 8.37) The test statistics do not follow the ...
1,2,0.121645,sample_data,1,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,"Figure 8.1 . As Figure 8.1 shows, although o..."
2,3,0.062212,sample_data,6,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,"process). Thus the series, Δ y t would in thi..."


In [7]:
# Query used for the saved preview. Set this to a concept that actually appears
# in your notes; otherwise the similarities may all be 0.0.
sample_query = "LangGraph"
_, preview_df = semantic_search_local(sample_query, k=5)

display(preview_df)

# The search applies no similarity threshold, so a query with no match still
# yields k rows. Flag that case explicitly rather than saving it silently.
if preview_df.empty or not (preview_df["similarity"] > 0).any():
    print(
        f"WARNING: no chunk has a non-zero similarity to {sample_query!r}; "
        "the saved preview is not a meaningful ranking."
    )

# Persist the preview table next to the other processed artifacts.
preview_path = PROCESSED_DIR / "semantic_search_preview.csv"
preview_df.to_csv(preview_path, index=False, encoding="utf-8")
print(f"Saved semantic search preview to {preview_path}")


Unnamed: 0,rank,similarity,lecture_id,page,source,text_preview
0,1,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,"y t − γ x t ), for this would imply that y ..."
1,2,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,Chapter 5 . The error correction model is some...
2,3,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,x t are cointegrated with cointegrating coeff...
3,4,0.0,sample_data,24,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,everything in the equation cancels. Model equ...
4,5,0.0,sample_data,23,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,"solution. For example, consider two series, ..."


Saved semantic search preview to C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\semantic_search_preview.csv
