In [1]:
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv


In [2]:
# Locate the project root: the first of cwd / parent / grandparent that
# contains both a "data" and a "src" directory. If none qualifies, fall back
# to the parent of the current working directory.
cwd = Path.cwd()
candidates = [cwd, cwd.parent, cwd.parent.parent]

project_root = next(
    (
        folder
        for folder in candidates
        if (folder / "data").exists() and (folder / "src").exists()
    ),
    cwd.parent,
)
print("Detected project root:", project_root)

# A .env file is optional; it is only passed to load_dotenv when present.
env_path = project_root / ".env"
if env_path.exists():
    load_dotenv(env_path)

DATA_DIR = project_root / "data"
PROCESSED_DIR = DATA_DIR / "processed"
clean_mcq_csv = PROCESSED_DIR / "mcq_clean_only.csv"

# Fail early with an actionable message when the input file is missing.
if not clean_mcq_csv.exists():
    raise FileNotFoundError(
        f"{clean_mcq_csv} not found. Run Notebook 6 to create clean MCQs first."
    )

mcq_df = pd.read_csv(clean_mcq_csv)
print("Loaded clean MCQs:", clean_mcq_csv)
print("Shape:", mcq_df.shape)
display(mcq_df.head(5))


Detected project root: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496
Loaded clean MCQs: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\mcq_clean_only.csv
Shape: (4, 11)


Unnamed: 0,id,stem,options,correct_option_index,difficulty,topic,source_excerpt,source_doc_id,has_issue,issues,num_options
0,probability distribution_mcq_1,Dummy question about probability distribution (1),"['Option A', 'Option B', 'Option C', 'Option D']",0,easy,probability distribution,Dummy excerpt 1.,dummy:page=0:chunk=0,False,,4
1,probability distribution_mcq_2,Dummy question about probability distribution (2),"['Option A', 'Option B', 'Option C', 'Option D']",1,medium,probability distribution,Dummy excerpt 2.,dummy:page=0:chunk=1,False,,4
2,probability distribution_mcq_1,Dummy question about probability distribution (1),"['Option A', 'Option B', 'Option C', 'Option D']",0,easy,probability distribution,Dummy excerpt 1.,dummy:page=0:chunk=0,False,,4
3,probability distribution_mcq_2,Dummy question about probability distribution (2),"['Option A', 'Option B', 'Option C', 'Option D']",1,medium,probability distribution,Dummy excerpt 2.,dummy:page=0:chunk=1,False,,4


In [4]:
def explode_options(row) -> dict:
    """
    Split a row's 'options' value into four columns, option_a..option_d.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row.get("options", default)``. The value may be:
        * a string holding the repr of a list, e.g. "['A', 'B', 'C', 'D']"
          (this is what a list column looks like after a CSV round trip);
        * any other string, which is treated as a single option;
        * a list-like of options;
        * None / NaN (an empty CSV cell), which is treated as "no options".

    Returns
    -------
    dict
        Keys "option_a", "option_b", "option_c", "option_d". Missing options
        are filled with "" and anything beyond the fourth option is dropped.
    """
    import ast  # local import so this cell does not depend on the import cell

    options = row.get("options", "")

    if options is None or (isinstance(options, float) and options != options):
        # Empty cell: pandas yields float NaN (NaN != NaN), which list() cannot
        # iterate. Treat it as an empty option list instead of crashing.
        options_list = []
    elif isinstance(options, str):
        text = options.strip()
        if text.startswith("[") and text.endswith("]"):
            try:
                # Parse the list repr properly so that commas, apostrophes and
                # escaped quotes inside an option do not split or corrupt it.
                parsed = ast.literal_eval(text)
            except (ValueError, TypeError, SyntaxError):
                parsed = None

            if isinstance(parsed, (list, tuple)):
                options_list = [str(item) for item in parsed]
            else:
                # Not a valid Python literal (e.g. "[A, B, C]"): fall back to
                # the simple comma split with quote stripping.
                options_list = [
                    item.strip().strip("'").strip('"')
                    for item in text[1:-1].split(",")
                ]
        else:
            options_list = [options]
    else:
        options_list = list(options)

    # Pad to four entries, then truncate to exactly four.
    options_list = (options_list + [""] * 4)[:4]

    return {
        "option_a": options_list[0],
        "option_b": options_list[1],
        "option_c": options_list[2],
        "option_d": options_list[3],
    }


In [5]:
# One flat record per MCQ: the descriptive columns first, followed by the
# four option_a..option_d columns produced by explode_options.
records = [
    {
        "id": row.get("id", ""),
        "stem": row.get("stem", ""),
        "correct_option_index": int(row.get("correct_option_index", 0)),
        "difficulty": row.get("difficulty", ""),
        "topic": row.get("topic", ""),
        "source_doc_id": row.get("source_doc_id", ""),
        "source_excerpt": row.get("source_excerpt", ""),
        **explode_options(row),
    }
    for _, row in mcq_df.iterrows()
]

teacher_df = pd.DataFrame(records)
display(teacher_df.head(5))


Unnamed: 0,id,stem,correct_option_index,difficulty,topic,source_doc_id,source_excerpt,option_a,option_b,option_c,option_d
0,probability distribution_mcq_1,Dummy question about probability distribution (1),0,easy,probability distribution,dummy:page=0:chunk=0,Dummy excerpt 1.,Option A,Option B,Option C,Option D
1,probability distribution_mcq_2,Dummy question about probability distribution (2),1,medium,probability distribution,dummy:page=0:chunk=1,Dummy excerpt 2.,Option A,Option B,Option C,Option D
2,probability distribution_mcq_1,Dummy question about probability distribution (1),0,easy,probability distribution,dummy:page=0:chunk=0,Dummy excerpt 1.,Option A,Option B,Option C,Option D
3,probability distribution_mcq_2,Dummy question about probability distribution (2),1,medium,probability distribution,dummy:page=0:chunk=1,Dummy excerpt 2.,Option A,Option B,Option C,Option D


In [6]:
# Write the teacher-facing question bank next to the other processed files.
teacher_out = PROCESSED_DIR.joinpath("question_bank_teacher_view.csv")
teacher_df.to_csv(path_or_buf=teacher_out, index=False, encoding="utf-8")
print("Saved teacher question bank CSV to:", teacher_out)


Saved teacher question bank CSV to: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\question_bank_teacher_view.csv


In [7]:
def make_anki_mcq_front_back(row) -> dict:
    """
    Build front/back text for an Anki Basic (Front/Back) note.

    Front: the question stem followed by the non-empty options, labelled
    A-D, joined with HTML ``<br>`` line breaks.
    Back: "Correct answer: <text>" followed by the topic, difficulty and
    source excerpt when those fields are non-empty, also joined with ``<br>``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Read via ``row.get`` for the keys "stem", "options",
        "correct_option_index", "topic", "difficulty" and "source_excerpt".

    Returns
    -------
    dict
        ``{"front": str, "back": str}``.
    """

    def _clean_text(value) -> str:
        # Empty CSV cells arrive as float NaN, and str(NaN) is the literal
        # "nan", which would otherwise leak onto the card (e.g. "Topic: nan").
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value).strip()

    option_labels = ["A", "B", "C", "D"]
    key_map = ["option_a", "option_b", "option_c", "option_d"]

    opts = explode_options(row)

    front_lines = [_clean_text(row.get("stem", ""))]
    for label, key in zip(option_labels, key_map):
        if opts[key]:
            front_lines.append(f"{label}. {opts[key]}")

    # The text contains HTML line breaks, so it is meant to be rendered as HTML.
    front = "<br>".join(front_lines)

    try:
        correct_idx = int(row.get("correct_option_index", 0))
    except (TypeError, ValueError, OverflowError):
        # Missing or non-numeric index (None, NaN, text, inf): keep the
        # existing best-effort default of the first option.
        correct_idx = 0

    if 0 <= correct_idx < len(key_map):
        correct_text = opts[key_map[correct_idx]]
    else:
        correct_text = ""

    explanation_parts = [f"Correct answer: {correct_text}"]
    topic = _clean_text(row.get("topic", ""))
    if topic:
        explanation_parts.append(f"Topic: {topic}")
    diff = _clean_text(row.get("difficulty", ""))
    if diff:
        explanation_parts.append(f"Difficulty: {diff}")
    src = _clean_text(row.get("source_excerpt", ""))
    if src:
        explanation_parts.append(f"Source: {src}")

    back = "<br>".join(explanation_parts)

    return {
        "front": front,
        "back": back,
    }


In [8]:
# One {"front", "back"} record per MCQ, in the same order as mcq_df.
anki_records = [make_anki_mcq_front_back(row) for _, row in mcq_df.iterrows()]

anki_df = pd.DataFrame(anki_records)
display(anki_df.head(5))


Unnamed: 0,front,back
0,Dummy question about probability distribution ...,Correct answer: Option A<br>Topic: probability...
1,Dummy question about probability distribution ...,Correct answer: Option B<br>Topic: probability...
2,Dummy question about probability distribution ...,Correct answer: Option A<br>Topic: probability...
3,Dummy question about probability distribution ...,Correct answer: Option B<br>Topic: probability...


In [9]:
# Write the front/back pairs to the processed-data directory.
# NOTE(review): the file is written with a "front,back" header row (pandas
# default) - confirm the Anki import step skips or tolerates that first line.
anki_out = PROCESSED_DIR.joinpath("anki_mcq_front_back.csv")
anki_df.to_csv(path_or_buf=anki_out, index=False, encoding="utf-8")
print("Saved Anki-ready CSV to:", anki_out)


Saved Anki-ready CSV to: C:\Users\Admin\OneDrive\Desktop\Capstone-MAT496\data\processed\anki_mcq_front_back.csv
