In [2]:
from pathlib import Path
import os
import json
from typing import List, Dict, Any

import pandas as pd
from dotenv import load_dotenv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import requests


In [3]:
# --- Locate the project root -------------------------------------------------
# Probe the working directory and its two nearest ancestors; the first one that
# contains both a "data" and a "src" folder is taken as the project root.
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

for candidate in candidates:
    if (candidate / "data").exists() and (candidate / "src").exists():
        project_root = candidate
        break
else:
    # No candidate matched: fall back to the parent of the working directory.
    project_root = cwd.parent

print("Project root:", project_root)

# --- Load environment variables from .env (if present) -----------------------
env_path = project_root / ".env"
if not env_path.exists():
    print("WARNING: .env not found; set PERPLEXITY_API_KEY in environment manually.")
else:
    load_dotenv(env_path)

# --- Load the lecture chunks -------------------------------------------------
DATA_DIR = project_root / "data"
PROCESSED_DIR = DATA_DIR / "processed"

chunks_parquet = PROCESSED_DIR / "lecture_chunks.parquet"
chunks_csv = PROCESSED_DIR / "lecture_chunks.csv"

# Parquet is tried first, CSV second; the first file that exists wins.
for chunk_path, reader in (
    (chunks_parquet, pd.read_parquet),
    (chunks_csv, pd.read_csv),
):
    if chunk_path.exists():
        chunks_df = reader(chunk_path)
        break
else:
    raise FileNotFoundError("Run Notebook 1 to create lecture_chunks first.")

print("Chunks shape:", chunks_df.shape)
display(chunks_df.head(3))


Project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Chunks shape: (75, 6)


Unnamed: 0,text,course,lecture_id,source,page,chunk_id
0,8\nModelling Long-Run Relationships in Finance...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,0
1,and why it is essential that variables that ar...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,0,1
2,behaviour and properties\n. To offer one illus...,default_course,sample_data,C:\Users\Admin\OneDrive\Desktop\Capstone-MAT49...,1,2


In [4]:
# Build the retrieval index: one TF-IDF row per lecture chunk, using unigrams
# and bigrams with the vocabulary capped at 5000 features.
corpus = (
    chunks_df["text"]
    .fillna("")      # missing text becomes an empty document
    .astype(str)
    .tolist()
)
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
tfidf_matrix = vectorizer.fit_transform(corpus)
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (75, 5000)


In [5]:
def semantic_search_local(query: str, k: int = 4) -> pd.DataFrame:
    """Return the ``k`` lecture chunks most similar to ``query``.

    Similarity is the cosine similarity between the TF-IDF vector of ``query``
    and every row of the notebook-level ``tfidf_matrix``; rows are looked up
    in the notebook-level ``chunks_df`` by position.

    Args:
        query: Free-text search string.
        k: Maximum number of chunks to return. Values larger than the corpus
            are clamped; values ``<= 0`` yield an empty result.

    Returns:
        A DataFrame ordered best match first with the columns ``rank``
        (1-based), ``similarity``, ``lecture_id``, ``page``, ``source``,
        ``chunk_id`` and ``text``. The columns are present even when the
        result is empty.
    """
    columns = ["rank", "similarity", "lecture_id", "page", "source", "chunk_id", "text"]

    # Guard first: with k == 0 the slice ``[-0:]`` used below would select
    # *every* chunk, and a negative k would select an arbitrary tail.
    if k <= 0:
        return pd.DataFrame(columns=columns)

    query_vec = vectorizer.transform([query])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]
    k = min(k, len(sims))

    # argsort is ascending, so take the last k indices and reverse them.
    top_indices = np.argsort(sims)[-k:][::-1] if k > 0 else []

    rows = []
    for rank, idx in enumerate(top_indices, start=1):
        row = chunks_df.iloc[idx]
        rows.append({
            "rank": rank,
            "similarity": float(sims[idx]),
            "lecture_id": row.get("lecture_id", ""),
            "page": row.get("page", None),
            "source": row.get("source", ""),
            "chunk_id": row.get("chunk_id", None),
            "text": str(row["text"]),
        })

    return pd.DataFrame(rows, columns=columns)


In [6]:
def build_source_doc_id(row) -> str:
    """Compose a citation id of the form ``<lecture_id>:page=<page>:chunk=<chunk_id>``.

    ``row`` is any object exposing ``.get(key, default)``. A missing
    ``lecture_id`` renders as an empty string; a missing ``page`` or
    ``chunk_id`` renders as ``None``.
    """
    segments = (
        str(row.get("lecture_id", "")),
        f"page={row.get('page', None)}",
        f"chunk={row.get('chunk_id', None)}",
    )
    return ":".join(segments)


def extract_excerpt(text: str, max_chars: int = 400) -> str:
    """Flatten ``text`` to one line and cap it at ``max_chars`` characters.

    Newlines are replaced by single spaces and surrounding whitespace is
    stripped. Text longer than ``max_chars`` is cut and suffixed with
    ``"..."`` so that the total length equals ``max_chars``.

    Args:
        text: Source text.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The flattened text, truncated if needed. When ``max_chars`` is too
        small to fit the ellipsis the text is hard-cut without one; a
        non-positive ``max_chars`` yields ``""``.
    """
    ellipsis = "..."
    flat = text.replace("\n", " ").strip()

    if max_chars <= 0:
        return ""
    if len(flat) <= max_chars:
        return flat
    # Without this branch ``max_chars - 3`` goes negative and the slice keeps
    # almost the whole string, producing a result longer than ``max_chars``.
    if max_chars < len(ellipsis):
        return flat[:max_chars]
    return flat[: max_chars - len(ellipsis)] + ellipsis


In [7]:
from pydantic import BaseModel, Field, ValidationError

class MCQQuestion(BaseModel):
    """One multiple-choice question as requested from the LLM in the MCQ prompt."""

    # Identifier string for the question (the prompt asks for a unique one).
    id: str
    # Question text.
    stem: str
    # Answer choices. The prompt asks for exactly 4.
    # NOTE(review): the length is not enforced by this model.
    options: List[str]
    # Position of the correct answer; the prompt asks for 0, 1, 2, or 3.
    # NOTE(review): the range is not validated against `options` here.
    correct_option_index: int
    # The prompt asks for "easy", "medium", or "hard"; any string is accepted.
    difficulty: str
    # Short topic label.
    topic: str
    # Short quote/paraphrase from the context supporting the question.
    source_excerpt: str
    # The SOURCE_ID of the context block the question was drawn from.
    source_doc_id: str

class QuestionBundle(BaseModel):
    """Top-level JSON object expected from the LLM; every section defaults to empty."""

    # Validated multiple-choice questions.
    mcqs: List[MCQQuestion] = []
    # Free-form dicts; no schema is enforced for these two sections.
    short_answers: List[Dict[str, Any]] = []
    concept_edges: List[Dict[str, Any]] = []


In [8]:
def build_mcq_prompt_for_llm(topic: str, k_context: int = 4, n_questions: int = 3) -> str:
    """Build the LLM prompt that asks for MCQs grounded in retrieved lecture chunks.

    The ``k_context`` most similar chunks are fetched with
    ``semantic_search_local`` and each is rendered as a
    ``[SOURCE_ID: ...]`` block followed by a shortened excerpt.

    Args:
        topic: Topic the questions should cover; also used as the search query.
        k_context: Number of lecture chunks to include as context.
        n_questions: Number of questions to request (defaults to 3).

    Returns:
        The prompt text with surrounding whitespace removed.

    Raises:
        ValueError: If ``topic`` is blank, ``n_questions`` is below 1, or the
            search returns no context.
    """
    # A blank topic gives an all-zero query vector, so retrieval would return
    # arbitrary chunks; reject it before searching.
    if not topic or not topic.strip():
        raise ValueError("Topic must be a non-empty string.")
    if n_questions < 1:
        raise ValueError("n_questions must be at least 1.")

    ctx_df = semantic_search_local(topic, k=k_context)

    if ctx_df.empty:
        raise ValueError("No context found for this topic.")

    # One block per retrieved chunk, tagged so the model can cite its source.
    context_blocks = [
        f"[SOURCE_ID: {build_source_doc_id(row)}]\n{extract_excerpt(row['text'])}"
        for _, row in ctx_df.iterrows()
    ]
    context_text = "\n\n".join(context_blocks)

    # Doubled braces below are literal braces in the rendered JSON skeleton.
    prompt = f"""
You are an AI tutor helping a teacher create exam-style multiple-choice questions.

Use ONLY the information in the CONTEXT below. Do not invent facts.

CONTEXT:
{context_text}

TASK:
Generate {n_questions} clear, unambiguous multiple-choice questions about the topic: "{topic}".

Rules:
- Each question has exactly 4 options and exactly one correct option.
- Options are mutually exclusive and not overlapping.
- For each question, include:
  - id (unique string)
  - stem (question text)
  - options (list of 4 strings)
  - correct_option_index (0, 1, 2, or 3)
  - difficulty ("easy", "medium", or "hard")
  - topic (short topic label)
  - source_excerpt (short quote/paraphrase from CONTEXT)
  - source_doc_id (the SOURCE_ID you used)

Output format:
Return ONLY a JSON object with this structure:

{{
  "mcqs": [
    {{
      "id": "string",
      "stem": "string",
      "options": ["string", "string", "string", "string"],
      "correct_option_index": 0,
      "difficulty": "easy|medium|hard",
      "topic": "string",
      "source_excerpt": "string",
      "source_doc_id": "string"
    }}
  ],
  "short_answers": [],
  "concept_edges": []
}}

Return ONLY valid JSON, no extra commentary.
"""
    return prompt.strip()


In [10]:
# Fail fast if the API key is unavailable: later cells send it as a bearer token.
pplx_key = os.environ.get("PERPLEXITY_API_KEY")
if pplx_key is None or pplx_key == "":
    raise EnvironmentError("PERPLEXITY_API_KEY not set in environment or .env.")


In [12]:
def raw_perplexity_chat(
    prompt: str,
    model: str = "sonar-pro",
    max_tokens: int = 1024,
    timeout: float = 60,
) -> str:
    """
    Call Perplexity chat completions API via HTTP and return the message content string.

    The request is authenticated with the notebook-level ``pplx_key`` and
    sends ``prompt`` as a single user message.

    Args:
        prompt: Text of the user message.
        model: Model name placed in the request payload.
        max_tokens: ``max_tokens`` value placed in the request payload.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        ``choices[0].message.content`` from the JSON response.

    Raises:
        ValueError: If ``prompt`` is blank (checked before any network call).
        requests.HTTPError: Propagated from ``raise_for_status`` on an error status.
    """
    # Reject blank prompts up front rather than spending a request on them.
    if not prompt or not prompt.strip():
        raise ValueError("Prompt must be a non-empty string.")

    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {pplx_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    }

    resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    # Only the first choice is used.
    return data["choices"][0]["message"]["content"]


In [17]:
def _extract_json_object(raw: str) -> str:
    """Return the outermost ``{...}`` span of ``raw``, or ``raw`` stripped if there is none."""
    text = raw.strip()
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start : end + 1]
    return text


def call_perplexity_for_mcqs(topic: str, k_context: int = 4) -> QuestionBundle:
    """Generate MCQs for ``topic`` via Perplexity and validate them.

    Builds the prompt with ``build_mcq_prompt_for_llm``, sends it with
    ``raw_perplexity_chat``, parses the reply as JSON and validates it
    against ``QuestionBundle``.

    Args:
        topic: Topic to generate questions about.
        k_context: Number of lecture chunks to include as context.

    Returns:
        The validated ``QuestionBundle``.

    Raises:
        RuntimeError: If the reply does not contain parseable JSON.
        ValidationError: If the parsed JSON does not match the schema.
    """
    prompt = build_mcq_prompt_for_llm(topic, k_context=k_context)
    content = raw_perplexity_chat(prompt)

    # Slice out the JSON object itself. This copes with markdown code fences,
    # a "json" language tag, and any text before or after the object, where
    # stripping backticks only worked for a reply that was exactly one fence.
    payload = _extract_json_object(content)

    try:
        data = json.loads(payload)
    except json.JSONDecodeError as e:
        print("Raw model output (truncated):")
        print(content.strip()[:1000])
        raise RuntimeError(f"Failed to parse JSON from model: {e}") from e

    try:
        # model_validate reports non-dict JSON (e.g. a top-level list) as a
        # ValidationError, where ``QuestionBundle(**data)`` raised TypeError.
        bundle = QuestionBundle.model_validate(data)
    except ValidationError:
        print("Parsed JSON but failed schema validation:")
        print(json.dumps(data, indent=2)[:1000])
        raise

    return bundle


In [18]:
# Topic to generate questions for; it doubles as the retrieval query inside
# call_perplexity_for_mcqs.
topic = "non-stationarity"  

# Performs a live API call: retrieves 4 context chunks, prompts the model, and
# returns the validated QuestionBundle.
bundle = call_perplexity_for_mcqs(topic, k_context=4)
print(f"Received {len(bundle.mcqs)} MCQs from Perplexity.")


Received 3 MCQs from Perplexity.


In [19]:
# Convert each validated MCQ model into a plain dict, then tabulate them
# (one row per question) for inspection.
mcq_records = [q.model_dump() for q in bundle.mcqs]
real_mcq_df = pd.DataFrame(mcq_records)
display(real_mcq_df)


Unnamed: 0,id,stem,options,correct_option_index,difficulty,topic,source_excerpt,source_doc_id
0,nonstationarity_001,Which of the following is NOT one of the two f...,"[Random walk model with drift, Trend-stationar...",3,medium,Non-stationarity models,Two models that have been frequently used to c...,sample_data:page=3:chunk=8
1,nonstationarity_002,"In the context of characteristic equations, wh...",[When the root of the characteristic equation ...,1,hard,Unit roots and stochastic non-stationarity,"This is known as the unit root case, for the r...",sample_data:page=5:chunk=12
2,nonstationarity_003,When a regression includes a linear trend term...,[Stochastic non-stationarity with a random wal...,1,medium,Trend-stationary processes and detrending,If it is believed that only this class of non-...,sample_data:page=5:chunk=13


In [20]:
# Turn the topic into a single safe file-name component. Replacing only spaces
# let characters such as "/", "\\" or ":" through, which either break the write
# or redirect it to another directory. Letters, digits, "-", "_" and "." are
# kept; everything else (spaces included) becomes "_".
safe_topic = "".join(
    ch if (ch.isalnum() or ch in "-_.") else "_"
    for ch in topic
)
out_path = PROCESSED_DIR / f"real_mcqs_{safe_topic}.csv"
# NOTE(review): the `options` column holds Python lists, which CSV stores as
# their string representation; parse them back explicitly when reloading.
real_mcq_df.to_csv(out_path, index=False, encoding="utf-8")
print("Saved real MCQs to:", out_path)


Saved real MCQs to: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\real_mcqs_non-stationarity.csv
