In [1]:
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
from dotenv import load_dotenv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
# Locate the project root: the first of (cwd, its parent, its grandparent) that
# contains both a data/ and a src/ folder.
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

for c in candidates:
    if all((c / marker).exists() for marker in ("data", "src")):
        project_root = c
        break
else:
    # No candidate matched: fall back to the parent of the working directory.
    project_root = cwd.parent

print("Detected project root:", project_root)

# Load environment variables from the project's .env file when one is present.
env_path = project_root / ".env"
if env_path.exists():
    load_dotenv(env_path)

DATA_DIR = project_root / "data"
PROCESSED_DIR = DATA_DIR / "processed"

chunks_parquet = PROCESSED_DIR / "lecture_chunks.parquet"
chunks_csv = PROCESSED_DIR / "lecture_chunks.csv"

# Try the parquet export first, then the CSV export; the first file that exists wins.
for chunk_path, read_chunks in (
    (chunks_parquet, pd.read_parquet),
    (chunks_csv, pd.read_csv),
):
    if chunk_path.exists():
        chunks_df = read_chunks(chunk_path)
        print(f"Loaded chunks from {chunk_path}")
        break
else:
    # Neither export is on disk, so there is nothing to search over.
    raise FileNotFoundError(
        "Could not find lecture_chunks.parquet or lecture_chunks.csv in data/processed/. "
        "Run Notebook 1 first."
    )

print("Chunks DataFrame shape:", chunks_df.shape)
display(chunks_df.head(3))


Detected project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Loaded chunks from C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\lecture_chunks.parquet
Chunks DataFrame shape: (75, 6)


Unnamed: 0,text,course,lecture_id,source,page,chunk_id
0,8\nModelling Long-Run Relationships in Finance...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,0
1,and why it is essential that variables that ar...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,1
2,behaviour and properties\n. To offer one illus...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,2


In [3]:
# One document per chunk; a missing text is indexed as an empty document.
corpus = list(chunks_df["text"].fillna("").astype(str))

# Unigram + bigram TF-IDF features, capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (75, 5000)


In [4]:
def semantic_search_local(query: str, k: int = 4):
    """
    Rank the lecture chunks by TF-IDF cosine similarity to `query`.

    Relies on the notebook-level `vectorizer`, `tfidf_matrix` and `chunks_df`.

    Args:
        query: Non-empty search string.
        k: Maximum number of hits to return. Values above the number of chunks
            are capped; zero or negative values yield an empty result.

    Returns:
        A tuple `(top_indices, result_df)`:
        - `top_indices`: positional row indices into `chunks_df`, ordered from
          most to least similar.
        - `result_df`: one row per hit with the columns `rank`, `similarity`,
          `lecture_id`, `page`, `source`, `chunk_id` and `text`.

    Raises:
        ValueError: If `query` is not a string or is blank.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string.")

    query_vec = vectorizer.transform([query])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]

    # Clamp k into [0, len(sims)]. Slicing the descending order with [:k] avoids
    # the `[-0:]` pitfall, where k == 0 would select every chunk instead of none.
    k = max(0, min(k, len(sims)))
    top_indices = np.argsort(sims)[::-1][:k]

    rows = []
    for rank, idx in enumerate(top_indices, start=1):
        row = chunks_df.iloc[idx]
        rows.append({
            "rank": rank,
            "similarity": float(sims[idx]),
            # .get() supplies a default when an optional metadata column is absent.
            "lecture_id": row.get("lecture_id", ""),
            "page": row.get("page", None),
            "source": row.get("source", ""),
            "chunk_id": row.get("chunk_id", None),
            "text": str(row["text"]),
        })

    # Explicit columns keep the schema stable even when there are no hits.
    result_df = pd.DataFrame(
        rows,
        columns=["rank", "similarity", "lecture_id", "page", "source", "chunk_id", "text"],
    )
    return top_indices, result_df


In [5]:
def tool_search_lectures(topic: str, k: int = 5) -> List[Dict[str, Any]]:
    """
    Tool: search_lectures
    Description: Run `semantic_search_local` for `topic` and return its top-k
    hits as a list of plain dict records (one per chunk).
    """
    hits_df = semantic_search_local(topic, k=k)[1]  # element 0 holds the raw indices
    return hits_df.to_dict(orient="records")


In [6]:
def tool_list_topics(min_count: int = 3) -> List[Dict[str, Any]]:
    """
    Tool: list_topics
    Description: Count the chunks belonging to each lecture_id in `chunks_df`.

    Returns one record per lecture_id with the keys "lecture_id" and
    "num_chunks". When `min_count` is not None, lectures with fewer than
    `min_count` chunks are left out.
    """
    per_lecture = (
        chunks_df["lecture_id"]
        .value_counts()
        .reset_index()
        .set_axis(["lecture_id", "num_chunks"], axis=1)
    )

    # No threshold requested: report every lecture.
    if min_count is None:
        return per_lecture.to_dict(orient="records")

    has_enough = per_lecture["num_chunks"] >= min_count
    return per_lecture[has_enough].to_dict(orient="records")


In [7]:
def tool_get_source_excerpt(lecture_id: str, page: int, n_chars: int = 300) -> str:
    """
    Tool: get_source_excerpt
    Description: Return a short excerpt for a given lecture_id + page combination.

    The excerpt is taken from the first row of `chunks_df` whose `lecture_id`
    and `page` both match. Newlines are replaced by spaces and the text is
    stripped before truncation.

    Args:
        lecture_id: Value to match against the `lecture_id` column.
        page: Value to match against the `page` column.
        n_chars: Maximum excerpt length; negative values are treated as 0.

    Returns:
        The excerpt, followed by "..." when the text was longer than the limit.
        An empty string when no row matches or the matching text is missing.
    """
    subset = chunks_df[
        (chunks_df["lecture_id"] == lecture_id) &
        (chunks_df["page"] == page)
    ]

    if subset.empty:
        return ""

    raw_text = subset.iloc[0]["text"]
    # A missing text would otherwise be stringified to the literal "nan".
    if pd.isna(raw_text):
        return ""

    text = str(raw_text).replace("\n", " ").strip()
    # A negative limit would slice from the end of the text instead of truncating it.
    limit = max(n_chars, 0)
    return text[:limit] + ("..." if len(text) > limit else "")


In [8]:
def call_tool(tool_name: str, **kwargs) -> Dict[str, Any]:
    """
    Dispatch a tool call by name and wrap the outcome in a uniform envelope.

    tool_name: one of "search_lectures", "list_topics", "get_source_excerpt".
    kwargs: tool-specific arguments; anything omitted falls back to a default.

    Returns a dict with the keys "tool" (the name), "args" (the arguments
    actually used after defaults/int conversion) and "result" (the tool output).
    Raises ValueError for an unrecognised tool name.
    """
    if tool_name == "search_lectures":
        topic = kwargs.get("topic", "")
        k = int(kwargs.get("k", 5))
        used_args = {"topic": topic, "k": k}
        outcome = tool_search_lectures(topic, k=k)
    elif tool_name == "list_topics":
        min_count = int(kwargs.get("min_count", 3))
        used_args = {"min_count": min_count}
        outcome = tool_list_topics(min_count=min_count)
    elif tool_name == "get_source_excerpt":
        lecture_id = kwargs.get("lecture_id", "")
        page = kwargs.get("page", None)
        n_chars = int(kwargs.get("n_chars", 300))
        used_args = {"lecture_id": lecture_id, "page": page, "n_chars": n_chars}
        outcome = tool_get_source_excerpt(lecture_id, page, n_chars=n_chars)
    else:
        raise ValueError(f"Unknown tool name: {tool_name}")

    return {"tool": tool_name, "args": used_args, "result": outcome}


In [9]:
# Smoke-test the list_topics tool: show at most the first five lecture records.
topics_resp = call_tool("list_topics", min_count=1)
print("Example topics / lectures:\n")
first_topics = topics_resp["result"][:5]
for row in first_topics:
    print(row)


Example topics / lectures:

{'lecture_id': 'sample_data', 'num_chunks': 75}


In [10]:
# Smoke-test the search_lectures tool on a sample topic.
test_topic = "probability distribution"  # adjust to your notes
search_resp = call_tool("search_lectures", topic=test_topic, k=3)
search_df = pd.DataFrame(search_resp["result"])

print(f"\nSearch tool result for topic='{test_topic}':\n")
display(search_df)



Search tool result for topic='probability distribution':



Unnamed: 0,rank,similarity,lecture_id,page,source,chunk_id,text
0,1,0.158606,sample_data,12,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,30,(\n8.37)\nThe test statistics do not follow th...
1,2,0.121645,sample_data,1,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,5,"Figure 8.1\n.\nAs \nFigure 8.1\n shows, althou..."
2,3,0.062212,sample_data,6,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,16,"process). Thus the series, ﾎ能 y\nt\n would in ..."


In [12]:
# Fetch the page excerpt behind the best-ranked search hit, if there is one.
if len(search_resp["result"]) > 0:
    first = search_resp["result"][0]
    lec_id, page = first["lecture_id"], first["page"]
    excerpt_resp = call_tool(
        "get_source_excerpt",
        lecture_id=lec_id,
        page=page,
        n_chars=300,
    )
    print("\nFirst source excerpt:\n")
    print(excerpt_resp["result"])




First source excerpt:

( 8.37) The test statistics do not follow the usual  t -distribution under the null hypothesis, since the null is one of non-stationarity, but rather they follow a non-standard distribution. Critical values are derived from simulations experiments in, for example, Fuller ( 1976 ); see also  Chapter ...


In [13]:
def tool_guided_question_gen(topic: str, k: int = 4) -> Dict[str, Any]:
    """
    Gather retrieval context for a topic through the search_lectures tool.

    Returns a dict with:
    - "topic": the topic that was searched,
    - "context_chunks": the chunk records returned by the tool,
    - "n_context": how many records were returned.
    """
    context_records = call_tool("search_lectures", topic=topic, k=k)["result"]
    return dict(
        topic=topic,
        context_chunks=context_records,
        n_context=len(context_records),
    )

# Preview the context bundle for a sample topic; the trailing expression renders the table.
preview = tool_guided_question_gen(topic="probability distribution", k=3)
print(preview["topic"], "=>", preview["n_context"], "context chunks")
pd.DataFrame(preview["context_chunks"])


probability distribution => 3 context chunks


Unnamed: 0,rank,similarity,lecture_id,page,source,chunk_id,text
0,1,0.158606,sample_data,12,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,30,(\n8.37)\nThe test statistics do not follow th...
1,2,0.121645,sample_data,1,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,5,"Figure 8.1\n.\nAs \nFigure 8.1\n shows, althou..."
2,3,0.062212,sample_data,6,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,16,"process). Thus the series, ﾎ能 y\nt\n would in ..."
