In [5]:
import pandas as pd
from pathlib import Path


In [6]:
from pathlib import Path
from typing import List, Dict, Any, Optional
import json

import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field, ValidationError

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from typing_extensions import TypedDict, Annotated
from operator import add

from langgraph.graph import StateGraph, START, END


In [7]:
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

project_root = None
for c in candidates:
    if (c / "data").exists() and (c / "src").exists():
        project_root = c
        break

if project_root is None:
    project_root = cwd.parent

print("Detected project root:", project_root)

env_path = project_root / ".env"
if env_path.exists():
    load_dotenv(env_path)
else:
    print("NOTE: .env loaded if present; not required in this notebook.")

DATA_DIR = project_root / "data"
PROCESSED_DIR = DATA_DIR / "processed"

chunks_parquet = PROCESSED_DIR / "lecture_chunks.parquet"
chunks_csv = PROCESSED_DIR / "lecture_chunks.csv"

if chunks_parquet.exists():
    chunks_df = pd.read_parquet(chunks_parquet)
    print(f"Loaded chunks from {chunks_parquet}")
elif chunks_csv.exists():
    chunks_df = pd.read_csv(chunks_csv)
    print(f"Loaded chunks from {chunks_csv}")
else:
    raise FileNotFoundError(
        "Could not find lecture_chunks.parquet or lecture_chunks.csv in data/processed/. "
        "Run Notebook 1 first."
    )

print("Chunks DataFrame shape:", chunks_df.shape)
display(chunks_df.head(3))


Detected project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Loaded chunks from C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\lecture_chunks.parquet
Chunks DataFrame shape: (75, 6)


Unnamed: 0,text,course,lecture_id,source,page,chunk_id
0,8\nModelling Long-Run Relationships in Finance...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,0
1,and why it is essential that variables that ar...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,1
2,behaviour and properties\n. To offer one illus...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,2


# REUSING SCHEMAS FROM PREVIOUS NOTEBOOK

In [8]:
class MCQQuestion(BaseModel):
    id: str
    stem: str
    options: List[str]
    correct_option_index: int
    difficulty: str
    topic: str
    source_excerpt: str
    source_doc_id: str


class ShortAnswerQuestion(BaseModel):
    id: str
    question: str
    ideal_answer: str
    difficulty: str
    topic: str
    source_excerpt: str
    source_doc_id: str


class ConceptMapEdge(BaseModel):
    id: str
    source_concept: str
    target_concept: str
    relation: str
    topic: str
    source_excerpt: str
    source_doc_id: str


class QuestionBundle(BaseModel):
    mcqs: List[MCQQuestion] = []
    short_answers: List[ShortAnswerQuestion] = []
    concept_edges: List[ConceptMapEdge] = []


In [9]:
corpus = chunks_df["text"].fillna("").astype(str).tolist()
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (75, 5000)


In [10]:
def semantic_search_local(query: str, k: int = 4):
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string.")

    query_vec = vectorizer.transform([query])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]

    k = min(k, len(sims))
    top_indices = np.argsort(sims)[-k:][::-1]

    rows = []
    for rank, idx in enumerate(top_indices, start=1):
        row = chunks_df.iloc[idx]
        rows.append({
            "rank": rank,
            "similarity": float(sims[idx]),
            "lecture_id": row.get("lecture_id", ""),
            "page": row.get("page", None),
            "source": row.get("source", ""),
            "chunk_id": row.get("chunk_id", None),
            "text": str(row["text"]),
        })

    result_df = pd.DataFrame(rows)
    return top_indices, result_df


In [11]:
def build_source_doc_id(row) -> str:
    lecture_id = str(row.get("lecture_id", ""))
    page = row.get("page", None)
    chunk_id = row.get("chunk_id", None)
    return f"{lecture_id}:page={page}:chunk={chunk_id}"


def extract_excerpt(text: str, max_chars: int = 300) -> str:
    text = text.replace("\n", " ").strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 3] + "..."


In [12]:
def build_mcq_prompt_from_df(topic: str, ctx_df: pd.DataFrame) -> str:
    context_blocks = []
    for _, row in ctx_df.iterrows():
        source_id = build_source_doc_id(row)
        excerpt = extract_excerpt(row["text"])
        block = f"[SOURCE_ID: {source_id}]\n{excerpt}"
        context_blocks.append(block)

    context_text = "\n\n".join(context_blocks)

    prompt = f"""
You are an AI tutor helping a teacher create exam-style multiple-choice questions.

Use ONLY the information in the CONTEXT below.

CONTEXT:
{context_text}

TASK:
Generate 3 clear, unambiguous multiple-choice questions about the topic: "{topic}".

Rules:
- Each question has exactly 4 options and exactly one correct option.
- Options are mutually exclusive.
- For each question, include id, stem, options, correct_option_index, difficulty, topic, source_excerpt, source_doc_id.

Return ONLY valid JSON matching this structure:

{{
  "mcqs": [...],
  "short_answers": [],
  "concept_edges": []
}}
"""
    return prompt.strip()


In [13]:
class QGenState(TypedDict):
    topic: str
    mode: str                         
    n_target: int                      
    retrieved_context: List[Dict[str, Any]]
    last_prompt: str
    last_raw_output: str
    questions_mcq: Annotated[List[Dict[str, Any]], add]
    errors: Annotated[List[str], add]
    done: bool


In [14]:
def retrieve_node(state: QGenState) -> QGenState:
    topic = state["topic"]
    _, ctx_df = semantic_search_local(topic, k=4)

    state["retrieved_context"] = ctx_df.to_dict(orient="records")
    return state


In [15]:
def build_prompt_node(state: QGenState) -> QGenState:
    topic = state["topic"]
    mode = state["mode"]

    ctx_df = pd.DataFrame(state["retrieved_context"])
    if ctx_df.empty:
        state["errors"].append("No context retrieved for topic.")
        state["last_prompt"] = ""
        return state

    if mode == "mcq":
        prompt = build_mcq_prompt_from_df(topic, ctx_df)
    else:
        prompt = f"(Mode {mode} not yet implemented.)"

    state["last_prompt"] = prompt
    return state


In [16]:
def dummy_llm_node(state: QGenState) -> QGenState:
    """
    Placeholder 'LLM' that returns a fixed, valid JSON bundle.
    Replace later with a real Perplexity/LLM call + structured output.
    """
    topic = state["topic"]

    fake_json = {
        "mcqs": [
            {
                "id": f"{topic}_mcq_1",
                "stem": f"Dummy question about {topic} (1)",
                "options": ["Option A", "Option B", "Option C", "Option D"],
                "correct_option_index": 0,
                "difficulty": "easy",
                "topic": topic,
                "source_excerpt": "Dummy excerpt 1.",
                "source_doc_id": "dummy:page=0:chunk=0"
            },
            {
                "id": f"{topic}_mcq_2",
                "stem": f"Dummy question about {topic} (2)",
                "options": ["Option A", "Option B", "Option C", "Option D"],
                "correct_option_index": 1,
                "difficulty": "medium",
                "topic": topic,
                "source_excerpt": "Dummy excerpt 2.",
                "source_doc_id": "dummy:page=0:chunk=1"
            },
        ],
        "short_answers": [],
        "concept_edges": []
    }

    state["last_raw_output"] = json.dumps(fake_json)
    return state


In [17]:
def validate_node(state: QGenState) -> QGenState:
    raw = state.get("last_raw_output", "")
    if not raw:
        state["errors"].append("No raw output to validate.")
        return state

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        state["errors"].append(f"JSON decode error: {e}")
        return state

    try:
        bundle = QuestionBundle(**data)
    except ValidationError as e:
        state["errors"].append(f"Pydantic validation error: {e}")
        return state

    for mcq in bundle.mcqs:
        state["questions_mcq"].append(mcq.model_dump())

    return state


In [18]:
def decide_next_node(state: QGenState) -> str:
    """
    Decide whether we have enough questions.
    Return 'retrieve' to loop or END to finish.
    """
    n_target = state["n_target"]
    current = len(state.get("questions_mcq", []))

    if current >= n_target:
        state["done"] = True
        return END

    if current > 0:
        state["done"] = True
        return END

    return "retrieve"


In [19]:
graph_builder = StateGraph(QGenState)

graph_builder.add_node("retrieve", retrieve_node)
graph_builder.add_node("build_prompt", build_prompt_node)
graph_builder.add_node("llm_call", dummy_llm_node)
graph_builder.add_node("validate", validate_node)

graph_builder.add_edge(START, "retrieve")
graph_builder.add_edge("retrieve", "build_prompt")
graph_builder.add_edge("build_prompt", "llm_call")
graph_builder.add_edge("llm_call", "validate")

graph_builder.add_conditional_edges(
    "validate",
    decide_next_node,
    {
        "retrieve": "retrieve",
        END: END
    }
)

graph_builder.set_entry_point("retrieve")

qgen_app = graph_builder.compile()
print("LangGraph question-generation app compiled.")


LangGraph question-generation app compiled.


In [20]:
initial_state: QGenState = {
    "topic": "probability distribution",  # adjust to your notes
    "mode": "mcq",
    "n_target": 3,
    "retrieved_context": [],
    "last_prompt": "",
    "last_raw_output": "",
    "questions_mcq": [],
    "errors": [],
    "done": False,
}

result_state = qgen_app.invoke(initial_state)

print("Done flag:", result_state["done"])
print("Errors:", result_state["errors"])
print("Number of MCQs generated:", len(result_state["questions_mcq"]))


Done flag: False
Errors: []
Number of MCQs generated: 4


In [21]:
pd.DataFrame(result_state["questions_mcq"])


Unnamed: 0,id,stem,options,correct_option_index,difficulty,topic,source_excerpt,source_doc_id
0,probability distribution_mcq_1,Dummy question about probability distribution (1),"[Option A, Option B, Option C, Option D]",0,easy,probability distribution,Dummy excerpt 1.,dummy:page=0:chunk=0
1,probability distribution_mcq_2,Dummy question about probability distribution (2),"[Option A, Option B, Option C, Option D]",1,medium,probability distribution,Dummy excerpt 2.,dummy:page=0:chunk=1
2,probability distribution_mcq_1,Dummy question about probability distribution (1),"[Option A, Option B, Option C, Option D]",0,easy,probability distribution,Dummy excerpt 1.,dummy:page=0:chunk=0
3,probability distribution_mcq_2,Dummy question about probability distribution (2),"[Option A, Option B, Option C, Option D]",1,medium,probability distribution,Dummy excerpt 2.,dummy:page=0:chunk=1


In [22]:
questions_df = pd.DataFrame(result_state["questions_mcq"])
eval_input_path = PROCESSED_DIR / "langgraph_mcqs_dummy.csv"
questions_df.to_csv(eval_input_path, index=False, encoding="utf-8")
print("Saved MCQs for evaluation to:", eval_input_path)


Saved MCQs for evaluation to: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\langgraph_mcqs_dummy.csv
