In [1]:
from pathlib import Path
from typing import List, Optional
import json

import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field


In [2]:
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

project_root = None
for c in candidates:
    if (c / "data").exists() and (c / "src").exists():
        project_root = c
        break

if project_root is None:
    project_root = cwd.parent  # fallback

print("Detected project root:", project_root)

env_path = project_root / ".env"
if env_path.exists():
    load_dotenv(env_path)
else:
    print("NOTE: .env not required for this notebook, but loaded if present.")

DATA_DIR = project_root / "data"
PROCESSED_DIR = DATA_DIR / "processed"

chunks_parquet = PROCESSED_DIR / "lecture_chunks.parquet"
chunks_csv = PROCESSED_DIR / "lecture_chunks.csv"

if chunks_parquet.exists():
    chunks_df = pd.read_parquet(chunks_parquet)
    print(f"Loaded chunks from {chunks_parquet}")
elif chunks_csv.exists():
    chunks_df = pd.read_csv(chunks_csv)
    print(f"Loaded chunks from {chunks_csv}")
else:
    raise FileNotFoundError(
        "Could not find lecture_chunks.parquet or lecture_chunks.csv in data/processed/. "
        "Run Notebook 1 first."
    )

print("Chunks DataFrame shape:", chunks_df.shape)
display(chunks_df.head(3))


Detected project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Loaded chunks from C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\lecture_chunks.parquet
Chunks DataFrame shape: (75, 6)


Unnamed: 0,text,course,lecture_id,source,page,chunk_id
0,8\nModelling Long-Run Relationships in Finance...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,0
1,and why it is essential that variables that ar...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,1
2,behaviour and properties\n. To offer one illus...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,2


# FROM 2ND STEP FILE

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
corpus = chunks_df["text"].fillna("").astype(str).tolist()
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape (rebuilt):", tfidf_matrix.shape)


TF-IDF matrix shape (rebuilt): (75, 5000)


In [6]:
def semantic_search_local(query: str, k: int = 4):
    """
    Same semantic search helper as in Notebook 2.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string.")

    query_vec = vectorizer.transform([query])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]

    k = min(k, len(sims))
    top_indices = np.argsort(sims)[-k:][::-1]

    rows = []
    for rank, idx in enumerate(top_indices, start=1):
        row = chunks_df.iloc[idx]
        rows.append({
            "rank": rank,
            "similarity": float(sims[idx]),
            "lecture_id": row.get("lecture_id", ""),
            "page": row.get("page", None),
            "source": row.get("source", ""),
            "text": str(row["text"]),
        })

    result_df = pd.DataFrame(rows)
    return top_indices, result_df


In [7]:
class MCQQuestion(BaseModel):
    """Single multiple-choice question with one correct option."""
    id: str = Field(description="Unique ID for this question")
    stem: str = Field(description="The question text the student sees")
    options: List[str] = Field(description="List of answer choices")
    correct_option_index: int = Field(
        description="Index of the correct option in the options list (0-based)"
    )
    difficulty: str = Field(
        description="One of: 'easy', 'medium', 'hard'"
    )
    topic: str = Field(
        description="Short topic label, e.g., 'linear_regression' or 'probability'"
    )
    source_excerpt: str = Field(
        description="Short excerpt from the lecture text that justifies the question"
    )
    source_doc_id: str = Field(
        description="Identifier of source doc/chunk, e.g., 'lecture_id:page:chunk_id'"
    )


class ShortAnswerQuestion(BaseModel):
    """Short-answer question and ideal answer."""
    id: str
    question: str
    ideal_answer: str
    difficulty: str
    topic: str
    source_excerpt: str
    source_doc_id: str


class ConceptMapEdge(BaseModel):
    """Directed edge between two related concepts for concept-maps."""
    id: str
    source_concept: str = Field(description="Starting concept (node A)")
    target_concept: str = Field(description="Related concept (node B)")
    relation: str = Field(
        description="Short phrase describing relation, e.g., 'causes', 'depends on'"
    )
    topic: str
    source_excerpt: str
    source_doc_id: str


class QuestionBundle(BaseModel):
    """Container for batches of generated questions from one prompt."""
    mcqs: List[MCQQuestion] = []
    short_answers: List[ShortAnswerQuestion] = []
    concept_edges: List[ConceptMapEdge] = []


In [8]:
def build_source_doc_id(row) -> str:
    lecture_id = str(row.get("lecture_id", ""))
    page = row.get("page", None)
    chunk_id = row.get("chunk_id", None)
    return f"{lecture_id}:page={page}:chunk={chunk_id}"


In [9]:
def extract_excerpt(text: str, max_chars: int = 400) -> str:
    text = text.replace("\n", " ").strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 3] + "..."


In [11]:
def build_mcq_prompt(topic: str, k_context: int = 4) -> str:
    """
    Build an MCQ-generation prompt using top-k retrieved chunks for a topic.
    """
    _, ctx_df = semantic_search_local(topic, k=k_context)

    context_blocks = []
    for _, row in ctx_df.iterrows():
        source_id = build_source_doc_id(row)
        excerpt = extract_excerpt(row["text"])
        block = f"[SOURCE_ID: {source_id}]\n{excerpt}"
        context_blocks.append(block)

    context_text = "\n\n".join(context_blocks)

    prompt = f"""
You are an AI tutor helping a teacher create exam-style multiple-choice questions.

Use ONLY the information in the CONTEXT below. Do not invent facts.

CONTEXT:
{context_text}

TASK:
Generate 3 clear, unambiguous multiple-choice questions about the topic: "{topic}".

Rules:
- Each question must have exactly 4 options.
- Exactly ONE option must be correct.
- Options must be mutually exclusive; avoid "all of the above".
- Tag each question with difficulty: "easy", "medium", or "hard".
- For each question, include:
  - a unique id string
  - the question stem
  - the 4 options
  - correct_option_index (0, 1, 2, or 3)
  - topic (short topic label)
  - source_excerpt: a short quotation or paraphrase from CONTEXT that justifies the question
  - source_doc_id: the SOURCE_ID of the chunk you used

Output format:
Return a JSON object that matches this Pydantic model:

{{
  "mcqs": [
    {{
      "id": "string",
      "stem": "string",
      "options": ["string", "string", "string", "string"],
      "correct_option_index": 0,
      "difficulty": "easy|medium|hard",
      "topic": "string",
      "source_excerpt": "string",
      "source_doc_id": "string"
    }}
  ],
  "short_answers": [],
  "concept_edges": []
}}

Return ONLY valid JSON, no extra text.
"""
    return prompt.strip()


In [12]:
def build_short_answer_prompt(topic: str, k_context: int = 4) -> str:
    _, ctx_df = semantic_search_local(topic, k=k_context)

    context_blocks = []
    for _, row in ctx_df.iterrows():
        source_id = build_source_doc_id(row)
        excerpt = extract_excerpt(row["text"])
        block = f"[SOURCE_ID: {source_id}]\n{excerpt}"
        context_blocks.append(block)

    context_text = "\n\n".join(context_blocks)

    prompt = f"""
You are an AI tutor generating short-answer questions for students.

Use ONLY the information in the CONTEXT below.

CONTEXT:
{context_text}

TASK:
Generate 3 short-answer questions about the topic: "{topic}".

Rules:
- Each question should be answerable in 1-3 sentences.
- Avoid "prove that" or very open-ended questions.
- For each question, include:
  - id
  - question
  - ideal_answer
  - difficulty ("easy", "medium", "hard")
  - topic
  - source_excerpt
  - source_doc_id

Output format:
Return a JSON object matching this structure:

{{
  "mcqs": [],
  "short_answers": [
    {{
      "id": "string",
      "question": "string",
      "ideal_answer": "string",
      "difficulty": "easy|medium|hard",
      "topic": "string",
      "source_excerpt": "string",
      "source_doc_id": "string"
    }}
  ],
  "concept_edges": []
}}

Return ONLY valid JSON.
"""
    return prompt.strip()


In [13]:
def build_concept_map_prompt(topic: str, k_context: int = 4) -> str:
    _, ctx_df = semantic_search_local(topic, k=k_context)

    context_blocks = []
    for _, row in ctx_df.iterrows():
        source_id = build_source_doc_id(row)
        excerpt = extract_excerpt(row["text"])
        block = f"[SOURCE_ID: {source_id}]\n{excerpt}"
        context_blocks.append(block)

    context_text = "\n\n".join(context_blocks)

    prompt = f"""
You are an AI assistant extracting concept-map relations from lecture text.

Use ONLY the information in the CONTEXT below.

CONTEXT:
{context_text}

TASK:
Identify 5 important relationships between concepts related to the topic "{topic}".

Each relationship should be represented as a directed edge:

  source_concept --relation--> target_concept

Rules:
- Concepts should be short noun phrases (e.g., "loss function", "gradient descent").
- relation should be a short phrase like "depends on", "is defined as", "causes".
- For each edge, include:
  - id
  - source_concept
  - target_concept
  - relation
  - topic
  - source_excerpt
  - source_doc_id

Output format:
Return a JSON object matching this structure:

{{
  "mcqs": [],
  "short_answers": [],
  "concept_edges": [
    {{
      "id": "string",
      "source_concept": "string",
      "target_concept": "string",
      "relation": "string",
      "topic": "string",
      "source_excerpt": "string",
      "source_doc_id": "string"
    }}
  ]
}}

Return ONLY valid JSON.
"""
    return prompt.strip()


In [14]:
sample_topic = "types of non-stationarity"  

mcq_prompt = build_mcq_prompt(sample_topic, k_context=3)
short_prompt = build_short_answer_prompt(sample_topic, k_context=3)
concept_prompt = build_concept_map_prompt(sample_topic, k_context=3)

print("=== MCQ PROMPT PREVIEW ===")
print(mcq_prompt[:2000]) 

print("\n\n=== SHORT-ANSWER PROMPT PREVIEW ===")
print(short_prompt[:1000])

print("\n\n=== CONCEPT-MAP PROMPT PREVIEW ===")
print(concept_prompt[:1000])


=== MCQ PROMPT PREVIEW ===
You are an AI tutor helping a teacher create exam-style multiple-choice questions.

Use ONLY the information in the CONTEXT below. Do not invent facts.

CONTEXT:
[SOURCE_ID: sample_data:page=3:chunk=None]
Figure 8.2   Value of  t -ratio of slope coefficient for 1,000 sets of regressions of a non-stationary variable on another independent non- stationary variable 8.1.2 Two Types of Non-Stationarity There are two models that have been frequently used to characterise the non-stationarity, the  random walk model with drift ( 8.1) and the  trend-stationary process  – so called because it is stationar...

[SOURCE_ID: sample_data:page=5:chunk=None]
some starting value of  y 0 . This is known as the  unit root case , for the root of the characteristic equation would be unity. (3) ϕ  > 1. Now given shocks become more influential as time goes on, since if  ϕ  > 1,  ϕ 3  >  ϕ 2  >  ϕ , etc. This is the  explosive case  which, for the reasons listed above, will not be co

In [15]:
fake_json = {
    "mcqs": [
        {
            "id": "mcq_1",
            "stem": "Which of the following best describes a probability distribution?",
            "options": [
                "A function that assigns probabilities to events",
                "A table of raw data",
                "A deterministic algorithm",
                "A measure of central tendency"
            ],
            "correct_option_index": 0,
            "difficulty": "easy",
            "topic": "probability_distribution",
            "source_excerpt": "A probability distribution assigns probabilities to each possible outcome.",
            "source_doc_id": "sample_data:page=3:chunk=5"
        }
    ],
    "short_answers": [],
    "concept_edges": []
}

bundle = QuestionBundle(**fake_json)
print("Parsed MCQ count:", len(bundle.mcqs))
print("First MCQ stem:", bundle.mcqs[0].stem)
print("First MCQ correct option index:", bundle.mcqs[0].correct_option_index)


Parsed MCQ count: 1
First MCQ stem: Which of the following best describes a probability distribution?
First MCQ correct option index: 0
