In [None]:
!pip -q install pymupdf faiss-cpu openai tiktoken fastapi uvicorn python-multipart pydantic pyngrok pandas numpy rank-bm25

In [2]:
import os
# The real key was redacted from this notebook. Replace the placeholder below before
# running any cell that constructs an OpenAI client from OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = "your-api-key"

In [None]:
from google.colab import files

# Relies on `os` having been imported by an earlier cell.
# Make sure the upload destination exists before anything is written into it.
os.makedirs("/content/pdfs", exist_ok=True)
print("Select one or more NCERT PDFs...")
# The upload result is iterated below as (name, data) pairs.
uploaded = files.upload()
# Persist every uploaded payload under /content/pdfs, keeping its reported name.
for name, data in uploaded.items():
    with open(f"/content/pdfs/{name}", "wb") as f:
        f.write(data)
print("✅ Uploaded to /content/pdfs")

In [None]:
# --- Multi-subject KB builder with robust JSONL writing & validation ---

import os, re, json, hashlib
import fitz  # PyMuPDF
import numpy as np
import faiss
from rank_bm25 import BM25Okapi
from openai import OpenAI

# Root folder for all generated knowledge-base artifacts.
KB_DIR       = "/content/kb"
# Folder scanned by build_kb() for input PDFs.
PDF_DIR      = "/content/pdfs"
# One JSON object per chunk (metadata + text); META_TMP is the staging file
# that build_kb() writes first and then moves over META_JSONL.
META_JSONL   = f"{KB_DIR}/meta.jsonl"
META_TMP     = f"{KB_DIR}/meta.tmp.jsonl"
# Serialized FAISS index of the chunk embeddings.
INDEX_PATH   = f"{KB_DIR}/index.faiss"
# Raw chunk texts plus their whitespace-tokenized form for BM25.
BM25_PATH    = f"{KB_DIR}/bm25.json"
# Mapping of detected subject -> sorted list of grades.
SUBJECTS_JSON= f"{KB_DIR}/subjects.json"
# NOTE(review): the next two paths repeat the "/content/kb" literal instead of
# deriving from KB_DIR; they must be edited by hand if KB_DIR ever changes.
TOPICS_JSON = "/content/kb/topics.json"
PROGRESS_JSON = "/content/kb/progress.json"

# Fails fast with KeyError if OPENAI_API_KEY is not set in the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Canonical subject name -> keywords that indicate that subject.
# parse_grade_subject_from_filename() walks this dict in insertion order and uses
# plain substring tests against the lower-cased filename, so the FIRST subject with
# any matching keyword wins (e.g. a name containing "science" resolves to "science"
# before "social science" or "computer science" are ever checked).
# NOTE(review): very short keys such as "it", "ip", "cs" and "bio" will also match
# inside unrelated words under substring matching — confirm this is acceptable.
SUBJECT_ALIASES = {
    "science": ["science","physics","chemistry","biology","bio"],
    "mathematics": ["math","mathematics","algebra","geometry","trigonometry","statistics","probability","arith"],
    "english": ["english","literature","grammar","poem","prose"],
    "social science": ["social science","history","geography","civics","political science","economics","democratic","resources"],
    "hindi": ["hindi"],
    "computer science": ["computer","informatics","cs","ip","information technology","it"],
    "biology": ["biology","bio","life processes","cell","heredity","evolution","reproduction"],
    "physics": ["physics","electricity","magnetism","motion","force","work","energy","light","sound","wave"],
    "chemistry": ["chemistry","chemical","reaction","matter","atom","mole","periodic","compound","mixture"]
}

def _safe_text(s: str) -> str:
    if not s:
        return ""
    # strip NULs & normalize weird whitespace; keep \n
    s = s.replace("\x00", "")
    s = s.encode("utf-8", "replace").decode("utf-8", "replace")
    # collapse repeated spaces while preserving newlines
    s = re.sub(r"[ \t\r\f\v]+", " ", s)
    # remove stray control chars (except \n and \t)
    s = re.sub(r"[\x01-\x08\x0b-\x0c\x0e-\x1f\x7f]", "", s)
    return s.strip()

def parse_grade_subject_from_filename(fname: str):
    """Derive (grade, subject, chapter) from a PDF filename.

    - chapter: the third underscore-separated part of the stem, lower-cased.
      When the stem has fewer than three parts the whole lower-cased stem is
      used instead of raising IndexError.
    - grade: the integer following "class" or "grade" in the name, else None.
    - subject: the first SUBJECT_ALIASES entry (dict order) with a keyword that
      occurs as a substring of the normalized name; otherwise the first run of
      letters in the name, or "general" when there is none.
    """
    stem = os.path.splitext(fname)[0]
    parts = stem.split("_")
    # Guard the positional lookup: names such as "science.pdf" have no third part.
    chapter = (parts[2] if len(parts) > 2 else stem).lower()
    base = stem.replace("_", " ").lower()
    m = re.search(r"(class|grade)\s*([0-9]{1,2})", base)
    grade = int(m.group(2)) if m else None
    subject = None
    for subj, keys in SUBJECT_ALIASES.items():
        if any(k in base for k in keys):
            subject = subj
            break
    if subject is None:
        m2 = re.search(r"([a-z]+)", base)
        subject = m2.group(1) if m2 else "general"
    return grade, subject, chapter

def iter_pdf_pages(pdf_path):
    """Yield (1-based page number, cleaned page text) for every page of a PDF.

    The document handle is closed when the generator is exhausted, closed early,
    or aborted by an exception, so file descriptors do not accumulate while many
    PDFs are ingested.
    """
    doc = fitz.open(pdf_path)
    try:
        for pno in range(len(doc)):
            page = doc[pno]
            text = page.get_text("text") or ""
            yield pno+1, _safe_text(text)
    finally:
        doc.close()

def heading_guess(page_text):
    """Guess a (chapter line, title) pair from the top of a page.

    Only the first ten non-blank lines are inspected for a line that starts with
    "chapter <number>" (case-insensitive). When one is found it is returned
    together with the following non-blank line, truncated to 120 characters
    (empty if there is no following line). Otherwise the chapter is "" and the
    title is the first non-blank line (truncated to 120 characters), or "" for a
    page with no text.
    """
    stripped = (raw.strip() for raw in page_text.split("\n"))
    lines = [ln for ln in stripped if ln]
    chapter_re = re.compile(r"(?i)^chapter\s+\d+\b")

    for pos in range(min(10, len(lines))):
        if chapter_re.match(lines[pos]):
            following = lines[pos + 1][:120] if pos + 1 < len(lines) else ""
            return lines[pos], following

    if lines:
        return "", lines[0][:120]
    return "", ""

def chunk_page(text, max_chars=1200, overlap=100):
    """Split page text into paragraph-aligned chunks with a leading overlap.

    Paragraphs (separated by blank lines) are packed greedily into chunks of at
    most `max_chars` characters; a single paragraph longer than `max_chars` is
    kept whole as its own chunk. Every chunk after the first is prefixed with the
    last `overlap` characters of the previous chunk and a newline.

    `overlap <= 0` disables the overlap. Without that guard `chunk[-0:]` would
    evaluate to the *entire* previous chunk, duplicating it in front of the next.

    Returns a list of strings (empty for blank input).
    """
    # split on blank lines, rebuild chunks under max_chars
    paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(buf) + len(p) + 2 <= max_chars:
            buf = (buf + "\n\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)

    out = []
    for i, c in enumerate(chunks):
        prev_tail = chunks[i-1][-overlap:] if (i > 0 and overlap > 0) else ""
        out.append((prev_tail + "\n" + c).strip())
    return out

def embed_texts(texts, batch_size=512, model="text-embedding-3-large"):
    """Embed `texts` with the OpenAI embeddings endpoint and L2-normalize each row.

    Args:
        texts: sequence of strings to embed.
        batch_size: number of texts sent per request (defaults to the previously
            hard-coded 512).
        model: embedding model name (defaults to the previously hard-coded model).

    Returns:
        float32 array with one unit-length row per input text, so that an
        inner-product index behaves as cosine similarity.

    Raises:
        ValueError: if `texts` is empty or `batch_size` is not positive. The
            original code failed on empty input too, but with np.vstack's
            less helpful message.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("embed_texts() received an empty list of texts; nothing to embed.")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    vecs = []
    for i in range(0, len(texts), batch_size):
        resp = client.embeddings.create(model=model, input=texts[i:i+batch_size])
        arr = np.array([d.embedding for d in resp.data], dtype="float32")
        # the small epsilon avoids division by zero for an all-zero vector
        arr /= (np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12)
        vecs.append(arr.astype("float32"))
    return np.vstack(vecs)

def build_kb(pdf_dir=PDF_DIR):
    """Build the on-disk knowledge base from every PDF in `pdf_dir`.

    Writes, under KB_DIR:
      - INDEX_PATH:    FAISS inner-product index over normalized chunk embeddings
      - META_JSONL:    one JSON object per chunk (metadata + "text"), in index order
      - BM25_PATH:     the chunk texts and their lower-cased whitespace tokens
      - SUBJECTS_JSON: subject -> sorted grades detected from the filenames

    Raises:
        RuntimeError: if no chunks were produced, or if the written META_JSONL
            fails to parse back line by line.
    """
    os.makedirs(KB_DIR, exist_ok=True)

    metadatas, texts = [], []
    subjects_registry = {}

    # Collect chunks + metadata
    for fname in sorted(os.listdir(pdf_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(pdf_dir, fname)
        book = os.path.splitext(fname)[0]
        grade, subject, ch = parse_grade_subject_from_filename(fname)
        subjects_registry.setdefault(subject, set()).add(grade)

        for page_no, page_text in iter_pdf_pages(path):
            # Only the guessed title is stored; the chapter comes from the filename.
            _, sec = heading_guess(page_text)
            for c in chunk_page(page_text):
                metadatas.append({
                    "book": book,
                    "grade": grade,
                    "subject": subject,
                    "chapter": ch,
                    "section": sec,
                    "page": page_no
                })
                texts.append(c)

    if not texts:
        raise RuntimeError("No chunks created. Did you upload PDFs?")

    # Embeddings + FAISS
    print(f"Embedding {len(texts)} chunks…")
    X = embed_texts(texts)
    dim = X.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(X)
    faiss.write_index(index, INDEX_PATH)

    # Stage the metadata in a temp file, then swap it into place in one step.
    with open(META_TMP, "w", encoding="utf-8", newline="\n") as f:
        for m, t in zip(metadatas, texts):
            m2 = {**m, "text": t}
            f.write(json.dumps(m2, ensure_ascii=False) + "\n")

    # os.replace overwrites an existing target atomically; the previous
    # remove-then-rename sequence left a window in which META_JSONL did not exist.
    os.replace(META_TMP, META_JSONL)

    # Read the file back to prove every line is valid JSON.
    valid_count = 0
    with open(META_JSONL, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            s = line.strip()
            if not s:
                continue
            try:
                json.loads(s)
                valid_count += 1
            except json.JSONDecodeError as e:
                snippet = s[max(0, e.pos-80):e.pos+80]
                raise RuntimeError(f"Invalid JSON at line {i}: {e.msg} (pos {e.pos}).\nSnippet: {snippet}") from e
    print(f"meta.jsonl written & validated ({valid_count} lines).")

    # BM25 inputs: raw docs plus a simple lower-cased whitespace tokenization.
    tokenized = [t.lower().split() for t in texts]
    bm25 = {"docs": texts, "tokenized": tokenized}
    with open(BM25_PATH, "w", encoding="utf-8") as f:
        json.dump(bm25, f)

    # Subject registry; grades that could not be parsed (None) are dropped.
    sr = {k: sorted([g for g in v if g is not None]) for k, v in subjects_registry.items()}
    with open(SUBJECTS_JSON, "w", encoding="utf-8") as f:
        json.dump(sr, f, indent=2)

    print(f"KB ready: {len(texts)} chunks; FAISS+BM25 built.")
    print("Detected subjects/grades:", sr)
build_kb()

In [None]:
import json, os, numpy as np, faiss, math, re
from rank_bm25 import BM25Okapi
from openai import OpenAI

# Unlike the builder cell, .get() is used here, so a missing key yields api_key=None
# rather than a KeyError at this line.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Vector index over the chunk embeddings; row i is looked up as meta[i] below.
index = faiss.read_index("/content/kb/index.faiss")
meta = []
with open("/content/kb/meta.jsonl", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # tolerate blank lines in the JSONL file
        if not line:
            continue
        meta.append(json.loads(line))

# Rebuild the BM25 model from the pre-tokenized documents stored on disk.
with open("/content/kb/bm25.json", encoding="utf-8") as f:
    bm25_raw = json.load(f)
bm25 = BM25Okapi([d for d in bm25_raw["tokenized"]])

# SUBJECTS is iterated by guess_subject() as subject -> grades.
with open("/content/kb/subjects.json", encoding="utf-8") as f:
    SUBJECTS = json.load(f)

def embed_query(q: str) -> np.ndarray:
    """Embed one query string and return it as a unit-length (1, dim) float32 row."""
    response = client.embeddings.create(model="text-embedding-3-large", input=[q])
    vec = np.array(response.data[0].embedding, dtype="float32")
    # epsilon keeps the division defined for an all-zero embedding
    unit = vec / (np.linalg.norm(vec) + 1e-12)
    return unit.reshape(1, -1)

def guess_subject(query: str):
    """Pick the most likely subject for `query`, or None when nothing matches.

    Each subject listed in SUBJECTS earns one point for:
      - every word of its own name that occurs in the query (substring test),
      - every additional alias from SUBJECT_ALIASES that starts a word in the query,
      - a fixed cue list for the science family and for mathematics.
    The first subject with the highest positive score wins.

    The alias lookup used to go through `locals()`, which never contains the
    module-level SUBJECT_ALIASES, and it collected dict keys rather than alias
    values, so aliases were silently ignored. They are now read from module
    globals; if the builder cell was not run the table is simply empty.
    """
    q = query.lower()
    alias_table = globals().get("SUBJECT_ALIASES", {})
    science_cues = ["photosynthesis","cell","energy","acid","base","motion","light","electric","magnet"]
    math_cues = ["theorem","solve","prove","equation","triangle","probability","mean","median","mode","lcm","hcf"]

    best, best_score = None, 0
    for subj, grades in SUBJECTS.items():
        name_words = subj.split()
        score = sum(1 for k in name_words if k in q)

        # Aliases are anchored at a word start so very short ones ("it", "ip", "cs")
        # do not fire inside unrelated words, while prefixes ("math", "bio") still
        # match their longer forms. Aliases that repeat the subject name are skipped
        # to avoid counting the same evidence twice.
        for alias in dict.fromkeys(alias_table.get(subj, [])):
            if alias == subj or alias in name_words:
                continue
            if re.search(r"\b" + re.escape(alias), q):
                score += 1

        if subj in ("science","biology","physics","chemistry") and any(w in q for w in science_cues):
            score += 1
        if subj in ("mathematics",) and any(w in q for w in math_cues):
            score += 1
        if score > best_score:
            best, best_score = subj, score
    return best if best_score > 0 else None

def hybrid_retrieve(question: str, subject: str=None, grade: int=None, k_embed=40, k_bm25=40, top=8):
    """Retrieve the best chunks for `question` by fusing vector and BM25 scores.

    Args:
        question: the user query.
        subject, grade: optional hints; matching chunks get a multiplicative boost
            (x1.25 for subject, x1.15 for grade).
        k_embed, k_bm25: number of candidates taken from each retriever.
        top: maximum number of results returned.

    Returns:
        List of (meta_row, fused_score), best first, with at most one chunk per
        (book, page).
    """
    qv = embed_query(question)
    D, I = index.search(qv, k_embed)
    # When the index holds fewer than k_embed vectors the id array is padded with
    # -1; used as a list index that would silently pick the LAST meta row.
    cand_embed = [
        (int(i), float(D[0][j]))
        for j, i in enumerate(I[0])
        if 0 <= int(i) < len(meta)
    ]

    scores = bm25.get_scores(question.lower().split())
    top_bm25_ids = np.argsort(scores)[::-1][:k_bm25]
    cand_bm25 = [(int(i), float(scores[i])) for i in top_bm25_ids if int(i) < len(meta)]

    # Union of both candidate sets, keeping the best score seen per retriever.
    cand_map = {}
    for i, s in cand_embed:
        cand_map.setdefault(i, {"embed": 0.0, "bm25": 0.0})
        cand_map[i]["embed"] = max(cand_map[i]["embed"], s)
    for i, s in cand_bm25:
        cand_map.setdefault(i, {"embed": 0.0, "bm25": 0.0})
        cand_map[i]["bm25"] = max(cand_map[i]["bm25"], s)

    # Normalizer for BM25; always defined, and never zero or negative.
    max_bm25 = max((v["bm25"] for v in cand_map.values()), default=0.0)
    if max_bm25 <= 0:
        max_bm25 = 1.0

    fused = []
    for i, sc in cand_map.items():
        m = meta[i]
        subj_boost = 1.0
        if subject and m.get("subject") == subject:
            subj_boost *= 1.25
        if grade and m.get("grade") == grade:
            subj_boost *= 1.15

        txt = m["text"].lower()
        # Slightly prefer chunks that look like study material.
        edu_boost = 1.1 if any(k in txt for k in ["definition","example","key points","summary","exercise"]) else 1.0
        f_score = 0.65*sc["embed"] + 0.35*(sc["bm25"]/max_bm25)
        fused.append((i, f_score*subj_boost*edu_boost))

    fused.sort(key=lambda x: x[1], reverse=True)
    picked = []
    seen_pages = set()
    for i, s in fused:
        m = meta[i]
        key = (m["book"], m["page"])
        if key in seen_pages:
            continue
        seen_pages.add(key)
        picked.append((m, s))
        if len(picked) >= top:
            break
    return picked

# System prompt sent by smart_answer(): restricts the model to the supplied context
# and fixes a four-part answer layout with inline citations.
SYSTEM = (
    "You are a strict NCERT tutor. Use ONLY the provided context. "
    "Format your response as:\n"
    "1) Short definition/overview\n"
    "2) Key points (bulleted)\n"
    "3) Example or short derivation (if context contains one)\n"
    "4) Final takeaway\n"
    "Always include explicit NCERT citations after relevant paragraphs like (Book, Ch X, p.Y)."
)

def smart_answer(question: str, contexts_with_scores, dontknow_threshold=0.35):
    """Answer `question` from the retrieved chunks using the chat model.

    Args:
        question: the user query.
        contexts_with_scores: list of (meta_row, score) as returned by
            hybrid_retrieve(); each meta_row must contain "text".
        dontknow_threshold: if the mean score is below this value, a low-confidence
            note is placed at the start of the user message.

    Returns:
        The model's answer text, or a fixed "I don't know" message when no
        context was retrieved (no API call is made in that case).

    The prompt separators used to be written as "\\\\n", which put a literal
    backslash followed by "n" into the prompt instead of a line break; real
    newlines are used now.
    """
    if not contexts_with_scores:
        return "I don’t know. I couldn’t find this in the NCERT materials you uploaded."

    avg_score = sum(s for _, s in contexts_with_scores) / len(contexts_with_scores)

    blocks = []
    for c, _ in contexts_with_scores:
        header = f'[{c.get("subject","")}, {c.get("book","")}, {c.get("chapter","")}, p.{c.get("page","")}]'
        # cap each chunk so the prompt stays bounded
        blocks.append(header + "\n" + c["text"][:1600])
    ctx_text = "\n\n---\n\n".join(blocks)

    if avg_score < dontknow_threshold:
        prefix = "I’m not fully confident this is covered in your NCERT set. Here’s my best effort using the closest matches.\n\n"
    else:
        prefix = ""

    user = f"{prefix}Context:\n{ctx_text}\n\nQuestion: {question}\nAnswer following the required format with citations."
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"system","content":SYSTEM},
                  {"role":"user","content":user}],
        temperature=0
    )
    return resp.choices[0].message.content

def smart_answer_ext(question: str, subject: str=None, grade: int=None):
    """Ask the chat model for broader, beyond-syllabus context on `question`.

    Args:
        question: the user query, sent as the user message.
        subject: accepted for signature parity with the other answer helpers;
            it is not currently included in the prompt.
        grade: target grade. When it is None the prompt refers to a generic
            "school student" instead of rendering "None-th grade student".

    Returns:
        The model's reply text.
    """
    audience = f"{grade}-th grade student" if grade is not None else "school student"
    ext_prompt = (
        f"Provide broader, real-world context for this question beyond the NCERT syllabus. Keep it factual and suitable for {audience}"
    )
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role":"system","content":ext_prompt},
            {"role":"user","content":question}
        ],
        temperature=0.7
    )
    return resp.choices[0].message.content

def chat_router(question: str, subject: str=None, grade: int=None):
    """Run the full chat pipeline for one question.

    Uses the caller's subject when given, otherwise guesses one; retrieves
    context, produces the context-grounded answer and the extra background text.

    Returns:
        (answer, citations, subject_used, additional) where citations is a list
        of {"book", "subject", "chapter", "page"} dicts, one per retrieved chunk.
    """
    subj = subject if subject else guess_subject(question)
    picks = hybrid_retrieve(question, subject=subj, grade=grade)
    answer = smart_answer(question, picks)
    additional = smart_answer_ext(question, subject=subj, grade=grade)

    cite_fields = ("book", "subject", "chapter", "page")
    cites = []
    for row, _score in picks:
        cites.append({field: row.get(field) for field in cite_fields})
    return answer, cites, subj, additional

print("Multi-subject hybrid retrieval ready. Try chat_router('Explain photosynthesis for Class 7')")

In [None]:
import json, collections

# Load every KB row. The file is opened with a context manager so the handle is
# closed deterministically, and blank lines are skipped instead of crashing json.loads.
with open("/content/kb/meta.jsonl", encoding="utf-8") as meta_file:
    rows = [json.loads(l) for l in meta_file if l.strip()]

# subject -> grade -> book -> chapter -> [(section, page), ...]
tree = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(list))))
for r in rows:
    subj = r.get("subject") or "general"
    grd = str(r.get("grade") or "NA")
    book = r.get("book")
    chap = r.get("chapter") or "Chapter"
    sec  = r.get("section") or "Section"
    tree[subj][grd][book][chap].append((sec, r.get("page",0)))

# Flatten the nested mapping into the list-of-dicts layout written to TOPICS_JSON.
out = []
for subj, grades in tree.items():
    g_list = []
    for grd, books in grades.items():
        b_list = []
        for book, chaps in books.items():
            c_list = []
            for chap, entries in chaps.items():
                # Group the pages on which each section title was seen.
                sec_pages = collections.defaultdict(list)
                for sec, page in entries:
                    sec_pages[sec].append(page)
                s_list = []
                for sec, pages in sec_pages.items():
                    ps = sorted(set(pages))
                    # "first-last" span of the pages; gaps inside the span are not shown
                    pages_str = f"{ps[0]}-{ps[-1]}" if ps else ""
                    s_list.append({"id": f"{book}::{chap}::{sec}", "title": sec, "pages": pages_str})
                c_list.append({"id": f"{book}::{chap}", "title": chap, "sections": s_list})
            b_list.append({"book": book, "chapters": c_list})
        g_list.append({"grade": grd, "books": b_list})
    out.append({"subject": subj, "grades": g_list})

with open(TOPICS_JSON, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

print("Topics/Subjects index written to", TOPICS_JSON)


In [None]:
import math, json
from datetime import date, timedelta

def estimate_minutes_for_text(txt: str) -> int:
    """Estimate study minutes for a chunk of text.

    The whitespace-delimited word count is floored at 50, converted at 40 words
    per minute (rounded up) plus 2 minutes, and the result is floored at 6.
    """
    word_count = len(txt.split())
    if word_count < 50:
        word_count = 50
    minutes = math.ceil(word_count / 40.0) + 2
    return minutes if minutes > 6 else 6

# Parse every line of the metadata file into a row dict.
with open(META_JSONL, encoding="utf-8") as f:
    rows = list(map(json.loads, f))

# Aggregate chunks into (book, chapter, section) buckets, collecting the pages
# they span and summing their estimated study minutes.
sections_map = {}
for r in rows:
    key = (r["book"], r.get("chapter",""), r.get("section",""))
    if key not in sections_map:
        sections_map[key] = {
            "book": key[0],
            "chapter": key[1],
            "section": key[2],
            "pages": set(),
            "minutes_est": 0,
        }
    bucket = sections_map[key]
    bucket["pages"].add(r["page"])
    bucket["minutes_est"] += estimate_minutes_for_text(r["text"])

# Finalize each bucket: page set -> "first-last" string, plus a stable id.
sections = []
for (book, chap, sec), v in sections_map.items():
    ordered_pages = sorted(v["pages"])
    v["pages"] = f"{ordered_pages[0]}-{ordered_pages[-1]}" if ordered_pages else ""
    v["id"] = f"{book}::{chap}::{sec}"
    sections.append(v)

# build_plan() consumes `sections` in this (book, chapter, section) order.
sections.sort(key=lambda x: (x["book"], x["chapter"], x["section"]))

def build_plan(start: str, deadline: str, hours_per_day: float):
    """Spread the module-level `sections` over the days from `start` to `deadline`.

    Args:
        start, deadline: ISO dates (YYYY-MM-DD), both inclusive.
        hours_per_day: daily study budget; converted to whole minutes.

    Returns:
        {"days": [...], "reviews": [...]}.
        Each day is {"date", "capacity", "items"}; note that in the returned
        structure "capacity" holds the minutes actually scheduled on that day.
        Each review is {"date", "section_id", "minutes"}, generated 1, 3 and 7
        days after every scheduled item (review dates may fall after `deadline`).

    Raises:
        ValueError: for malformed dates, or a negative `hours_per_day`. A negative
            budget previously produced negative slots that grew the remaining
            work on every iteration, so the scheduling loop never terminated.
    """
    start_d = date.fromisoformat(start)
    end_d = date.fromisoformat(deadline)
    minutes_per_day = int(hours_per_day*60)
    if minutes_per_day < 0:
        raise ValueError("hours_per_day must not be negative.")

    days = []
    d = start_d
    while d <= end_d:
        days.append({"date": d.isoformat(), "capacity": minutes_per_day, "items": []})
        d += timedelta(days=1)

    # Greedy fill: keep placing the current section on day i until that day is
    # full, then move on; sections may be split across consecutive days.
    i = 0
    for sec in sections:
        remaining = sec["minutes_est"]
        while remaining > 0 and i < len(days):
            slot = min(remaining, days[i]["capacity"])
            if slot == 0:
                i += 1
                continue
            days[i]["items"].append({
                "section_id": sec["id"],
                "book": sec["book"],
                "chapter": sec["chapter"],
                "minutes": slot
            })
            days[i]["capacity"] -= slot
            remaining -= slot

    # Spaced reviews. With nothing scheduled the loops simply yield an empty list,
    # so no separate "was anything scheduled" guard is needed.
    reviews = []
    for day in days:
        for item in day["items"]:
            for gap in (1, 3, 7):
                try_date = (date.fromisoformat(day["date"]) + timedelta(days=gap)).isoformat()
                reviews.append({"date": try_date, "section_id": item["section_id"], "minutes": max(6, item["minutes"]//6)})

    return {"days": [{k: (v if k!="capacity" else minutes_per_day - v) for k,v in d.items()} for d in days], "reviews": reviews}

print("✅ Planner ready. Example: build_plan('2025-09-16','2025-10-15', 2.0)")


In [20]:
import os, re, json, math, collections
from typing import List, Dict, Any, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from json import JSONDecoder, JSONDecodeError # Import for load_meta_jsonl

# --- Robust meta loader ---
def load_meta_jsonl(path="/content/kb/meta.jsonl"):
    """Load the metadata file robustly and return the decoded JSON values.

    - Proper JSONL (more than one non-blank line): each line is parsed on its
      own; an invalid line is reported and skipped.
    - A single line holding several JSON documents back-to-back (``}{``) or
      separated by whitespace: the documents are decoded sequentially.
    - NUL characters are stripped and blank segments ignored.

    Differences from the earlier implementation, all of which fix data loss:
      * lines are split on "\\n" only. ``str.splitlines`` also breaks on
        characters such as U+2028, which are legal inside JSON strings and would
        cut a valid record in half;
      * the global ``"}{" -> "}\\n{"`` rewrite is gone. The sequential decoder
        already handles adjacent objects, and the rewrite corrupted any string
        value that happened to contain ``}{``;
      * after an undecodable segment the scan resumes at the next ``{`` and emits
        one warning, instead of advancing a single character per failure.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        buf = f.read()

    # Remove NULs that may appear from odd PDFs
    buf = buf.replace("\x00", "")

    objs = []

    # Fast path: proper JSONL, one document per line.
    lines = buf.split("\n")
    if sum(1 for line in lines if line.strip()) > 1:
        for ln, line in enumerate(lines, 1):
            s = line.strip()
            if not s:
                continue
            try:
                objs.append(json.loads(s))
            except JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON line {ln} in {path}: {e}")
        return objs

    # Streaming decode: parse sequential JSON documents out of one string.
    dec = JSONDecoder()
    idx = 0
    n = len(buf)
    while idx < n:
        # skip whitespace between documents
        while idx < n and buf[idx].isspace():
            idx += 1
        if idx >= n:
            break
        try:
            obj, end = dec.raw_decode(buf, idx)
        except JSONDecodeError as e:
            context = buf[max(0, idx-140):min(n, idx+140)]
            print(f"Warning: Skipping invalid JSON segment at pos {idx} in {path}. Context: {context}. Error: {e}")
            # Resynchronize on the next possible object start; stop if none is left.
            nxt = buf.find("{", idx + 1)
            if nxt == -1:
                break
            idx = nxt
            continue

        objs.append(obj)
        idx = end
    return objs


# Location of the chunk metadata; this cell refuses to run until the KB exists.
META_PATH = "/content/kb/meta.jsonl"
assert os.path.exists(META_PATH), "meta.jsonl not found. Build the KB first."

# Nothing is cached: the file is re-read on every call so a fresh ingest is visible.
def get_meta_rows():
    """Return the rows currently stored in META_PATH."""
    current_rows = load_meta_jsonl(META_PATH)
    return current_rows


# --- Utilities for topic extraction ---
# Word-bounded cue phrases; a line matching any of them is treated as a signal line
# by cue_heading_boosts(). The alternatives are OR-ed into one case-insensitive regex.
CUE_PATTERNS = [
    r"\b(key points?|summary|in a nutshell|important|remember|definition|define|note|quick recap)\b",
    r"\b(exercise|questions|mcq|short answer|long answer|try yourself)\b",
    r"\b(activity\s*\d*|project|experiment|investigate)\b",
    r"\b(objectives?|learning outcomes?)\b",
    r"\b(fascinating facts?|holistic lens|know a scientist)\b",
]
CUE_RE = re.compile("|".join(CUE_PATTERNS), re.I)

# Numbered heading line such as "3", "3.1" or "3.1.2.4", optionally followed by one
# of ":", "-" or ")", then a title that starts with an upper-case ASCII letter and is
# 4-201 characters long. Group 1 captures the title.
HEADING_LINE_RE = re.compile(r"^\s*(?:\d+(?:\.\d+){0,3})\s*[:\-\)]?\s*([A-Z][^\n]{3,200})$")

# broad stopwords; keep domain-neutral
# clean_phrase() rejects any phrase containing one of these tokens.
GENERIC_STOP = set("""chapter exercise question questions answer answers points summary topic topics figure table example examples
activity project experiment objectives outcome outcomes learning page pages section subsection student students teacher teachers
colour color paper group let us observe discuss try find out make list see also aim material apparatus method conclusion
""".split())

def compress_ranges(pages: List[int]) -> str:
    """Render page numbers compactly, e.g. [1, 2, 3, 5] -> "1-3,5".

    Duplicates are ignored and the input need not be sorted. Empty input gives "".
    """
    if not pages:
        return ""

    ordered = sorted(set(pages))
    # Each run is a mutable [first, last] pair of consecutive pages.
    runs = [[ordered[0], ordered[0]]]
    for page in ordered[1:]:
        if page == runs[-1][1] + 1:
            runs[-1][1] = page
        else:
            runs.append([page, page])

    return ",".join(f"{lo}" if lo == hi else f"{lo}-{hi}" for lo, hi in runs)

def extract_chapter_and_book(rows, subject, grade, book, chapter):
    """Collect the texts needed to contrast one chapter against its wider corpus.

    Rows are first narrowed to those whose "subject" and "grade" equal the given
    values. From that subset:
      - book_blobs: the "text" of every row,
      - ch_blobs / ch_pages: the "text" and "page" of the rows whose "chapter"
        (None treated as "") equals `chapter`.

    NOTE(review): `book` is accepted but is not used by any of the filters here —
    confirm whether rows should also be restricted by book.

    Returns:
        (ch_blobs, ch_pages, book_blobs)
    """
    in_scope = [r for r in rows if r.get("subject") == subject and r.get("grade") == grade]
    book_blobs = [r.get("text", "") for r in in_scope]

    in_chapter = [r for r in in_scope if (r.get("chapter") or "") == chapter]
    ch_blobs = [r.get("text", "") for r in in_chapter]
    ch_pages = [r.get("page", 0) for r in in_chapter]
    return ch_blobs, ch_pages, book_blobs

def split_lines(text):
    """Return the non-blank lines of `text`, each stripped of surrounding whitespace."""
    stripped = map(str.strip, text.split("\n"))
    return [ln for ln in stripped if ln]

def cue_heading_boosts(blobs: List[str]):
    """Score phrases that look structurally important inside the given text blobs.

    Per blob, three signals are accumulated:
      - numbered heading lines (HEADING_LINE_RE): +2.0 for the normalized title,
        unless it is empty, a GENERIC_STOP entry or purely digits;
      - other lines containing a cue phrase (CUE_RE): +1.0 for the normalized line;
      - runs of Capitalized words anywhere in the blob: +0.6 when the normalized
        run has at least two words.

    Returns:
        (term_boost, page_hits): a Counter of phrase -> accumulated weight, and a
        mapping of phrase -> set of blob indices in which it was seen.
    """
    def norm_phrase(s):
        # lower-case, keep [a-z0-9 -] only, squeeze whitespace
        kept = re.sub(r"[^a-z0-9\s\-]", " ", s.lower())
        return re.sub(r"\s+", " ", kept).strip()

    term_boost = collections.Counter()
    page_hits = collections.defaultdict(set)

    def credit(phrase, weight, blob_index):
        term_boost[phrase] += weight
        page_hits[phrase].add(blob_index)

    capitalized_run = re.compile(r"(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,6})")

    for bi, t in enumerate(blobs):
        for ln in split_lines(t):
            heading = HEADING_LINE_RE.match(ln)
            if heading is not None:
                # A heading line is never also evaluated as a cue line.
                phrase = norm_phrase(heading.group(1))
                if phrase and phrase not in GENERIC_STOP and not phrase.isdigit():
                    credit(phrase, 2.0, bi)
            elif CUE_RE.search(ln):
                credit(norm_phrase(ln), 1.0, bi)

        # Capitalized multiword spans
        for span in capitalized_run.finditer(t):
            phrase = norm_phrase(span.group(0))
            if phrase and len(phrase.split()) >= 2:
                credit(phrase, 0.6, bi)

    return term_boost, page_hits

def clean_phrase(p):
    """Normalize a candidate topic phrase, or return None when it should be dropped.

    The phrase is lower-cased and reduced to letters, spaces and hyphens. It is
    rejected when it
      - contains any GENERIC_STOP token,
      - is a single word outside a small allow-list of domain terms,
      - starts with an instruction verb ("let", "make", "see", ...), or
      - is shorter than 5 characters after normalization.
    """
    letters_only = re.sub(r"[^a-z\s\-]", " ", p.lower())
    toks = re.sub(r"\s+", " ", letters_only).strip().split()

    if not GENERIC_STOP.isdisjoint(toks):
        return None

    allowed_unigrams = {"litmus","indicator","neutralisation","acidic","basic","neutral"}
    if len(toks) == 1 and toks[0] not in allowed_unigrams:
        return None

    instruction_verbs = {"let","make","see","find","observe","discuss","try"}
    if toks and toks[0] in instruction_verbs:
        return None

    joined = " ".join(toks)
    return joined if len(joined) >= 5 else None

def tfidf_phrases(ch_blobs: List[str], book_blobs: List[str], top_k=40):
    """Rank 2-4 word phrases of a chapter by TF-IDF against the wider corpus.

    A two-document corpus is built (document 0 = chapter, document 1 = book).
    The chapter's non-zero features are sorted by weight, the best
    max(top_k, 60) are passed through clean_phrase(), and the survivors are
    returned as (phrase, score) tuples. If the vectorizer prunes every term a
    warning is printed and [] is returned; other ValueErrors propagate.
    """
    vectorizer_options = dict(
        ngram_range=(2,4),
        min_df=0.01,
        max_df=0.99,
        stop_words="english",
        token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z\-]+\b",
    )
    vect = TfidfVectorizer(**vectorizer_options)
    corpus = ["\n".join(ch_blobs), "\n".join(book_blobs)]

    try:
        X = vect.fit_transform(corpus)
    except ValueError as e:
        if "After pruning, no terms remain" not in str(e):
            raise
        print("Warning: TF-IDF found no terms for chapter. Consider adjusting min_df/max_df further or check input text.")
        return []

    ch_vec = X[0].toarray()[0]
    feats = vect.get_feature_names_out()
    ranked = sorted(
        ((feats[i], ch_vec[i]) for i in ch_vec.nonzero()[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    limit = max(top_k, 60)
    cleaned = ((clean_phrase(term), sc) for term, sc in ranked[:limit])
    return [(cp, float(sc)) for cp, sc in cleaned if cp]

def pages_for_phrase(phrase: str, blobs: List[str]):
    """Return the indices of the blobs that contain `phrase` as a whole-word,
    case-insensitive match (the phrase is treated literally, not as a regex)."""
    matcher = re.compile(r"\b" + re.escape(phrase) + r"\b", re.I)
    return [pos for pos, blob in enumerate(blobs) if matcher.search(blob)]

def robust_tfidf_terms(ch_blobs: List[str], book_blobs: List[str], want=60):
    """TF-IDF phrase extraction that retries with progressively looser settings.

    Three vectorizer configurations are tried in order (phrases with English stop
    words removed, then 1-3 grams without stop-word removal, then a broader token
    pattern). The first configuration that yields at least one phrase surviving
    clean_phrase() wins; its (phrase, score) list is returned. A configuration
    that raises ValueError or produces no features is skipped. Returns [] when
    every attempt fails, leaving the cue/heading logic to supply candidates.
    """
    corpus = ["\n".join(ch_blobs), "\n".join(book_blobs)]
    alpha_tokens = r"(?u)\b[a-zA-Z][a-zA-Z\-]+\b"
    word_tokens = r"(?u)\b\w[\w\-]+\b"
    attempts = (
        dict(ngram_range=(2,4), stop_words="english", max_df=0.95, min_df=1, token_pattern=alpha_tokens),
        dict(ngram_range=(1,3), stop_words=None, max_df=1.0, min_df=1, token_pattern=alpha_tokens),
        dict(ngram_range=(1,3), stop_words=None, max_df=1.0, min_df=1, token_pattern=word_tokens),
    )
    limit = max(want, 40)

    for cfg in attempts:
        try:
            vect = TfidfVectorizer(**cfg)
            X = vect.fit_transform(corpus)
            if X.shape[1] == 0:
                continue
            ch_vec = X[0].toarray()[0]
            feats = vect.get_feature_names_out()
            # only features actually present in the chapter document
            ranked = sorted(
                ((feats[i], ch_vec[i]) for i in ch_vec.nonzero()[0]),
                key=lambda pair: pair[1],
                reverse=True,
            )
            cleaned = ((clean_phrase(term), sc) for term, sc in ranked[:limit])
            out = [(cp, float(sc)) for cp, sc in cleaned if cp]
            if out:
                return out
        except ValueError:
            continue

    return []

def merge_scores(tfidf_terms, boost_counter, boost_pages, blobs):
    """Combine TF-IDF phrases with cue/heading boosts into one candidate table.

    - every TF-IDF phrase starts with its TF-IDF score and the blobs it occurs in;
    - every boosted phrase that survives clean_phrase() adds 0.2 x its boost and
      merges its blob indices (creating the candidate if needed);
    - finally each score is multiplied by (1 + 0.05 x number of distinct blobs).

    Returns:
        dict phrase -> {"score": float, "blob_indices": list[int]}
    """
    cand = {
        phrase: {"score": score, "blob_indices": pages_for_phrase(phrase, blobs)}
        for phrase, score in tfidf_terms
    }

    blob_count = len(blobs)
    for raw, boost in boost_counter.items():
        phrase = clean_phrase(raw)
        if not phrase:
            continue
        hits = [h for h in boost_pages.get(raw, set()) if h < blob_count]
        if phrase not in cand:
            cand[phrase] = {"score": 0.0, "blob_indices": hits}
        entry = cand[phrase]
        entry["score"] += float(boost) * 0.2
        entry["blob_indices"] = sorted(set(entry["blob_indices"]) | set(hits))

    # Coverage bonus: phrases spread over more blobs rank higher.
    for entry in cand.values():
        coverage = len(set(entry["blob_indices"]))
        entry["score"] *= (1.0 + 0.05 * coverage)
    return cand

def pick_topics(cand: Dict[str, Dict[str, Any]], blobs: List[str], top_n: int):
    """Select the final topic list from scored candidates.

    Candidates are visited best-score first. A term is skipped when it is a
    substring of an already chosen term that is more than two characters longer.
    Selection stops after 2 x top_n picks, and the first top_n of those are
    returned as dicts with:
      - "topic": the term with its first word capitalized,
      - "score": rounded to 4 decimals,
      - "blob_indices": as stored on the candidate,
      - "estimated_minutes": max(6, ceil(words / 260) + 3) over the referenced blobs.
    """
    ranked = sorted(cand.items(), key=lambda kv: kv[1]["score"], reverse=True)

    picked = []
    for term, info in ranked:
        subsumed = False
        for chosen, _ in picked:
            if len(chosen) > len(term) + 2 and term in chosen:
                subsumed = True
                break
        if subsumed:
            continue
        picked.append((term, info))
        if len(picked) >= top_n*2:
            break

    blob_count = len(blobs)
    final = []
    for term, info in picked[:top_n]:
        idxs = info["blob_indices"]
        words = 0
        for i in idxs:
            if i < blob_count:
                words += len(blobs[i].split())
        est = max(6, math.ceil(words/260.0) + 3)

        parts = term.split()
        title = " ".join([parts[0].capitalize()] + parts[1:]) if parts else ""
        final.append({
            "topic": title,
            "score": round(info["score"], 4),
            "blob_indices": idxs,
            "estimated_minutes": int(est)
        })
    return final


def _llm_explain(topics, subject, grade, book, chapter):
    try:
        from openai import OpenAI
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    except Exception:
        return topics
    if not topics:
        return topics
    bullets = "\n".join(f"- {t['topic']}" for t in topics)
    system = ("You help a student prioritize a chapter. For each topic name, "
              "write ONE concise sentence on why it's important within the chapter scope. "
              "Return JSON list of {topic, why_important}. No new topics.")
    user = f"Subject: {subject}, Grade: {grade}, Book: {book}\nChapter: {chapter}\nTopics:\n{bullets}\n"
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role":"system","content":system},{"role":"user","content":user}],
            temperature=0
        )
        data = json.loads(resp.choices[0].message.content)
        mp = {d["topic"].strip().lower(): d["why_important"] for d in data if "topic" in d and "why_important" in d}
        for t in topics:
            k = t["topic"].lower()
            if k in mp:
                t["why_important"] = mp[k][:240]
    except Exception:
        pass
    return topics

In [None]:
import nest_asyncio
import uvicorn, json, threading, time, os
from pydantic import BaseModel
from fastapi import FastAPI, HTTPException, Query, Path

app = FastAPI(title="NCERT Tutor API")

# Request body for POST /chat.
class ChatReq(BaseModel):
    # the student's question, forwarded to chat_router()
    question: str
    # optional subject hint, forwarded to chat_router()
    subject: str | None = None
    # optional grade hint, forwarded to chat_router()
    grade: int | None = None

@app.post("/chat")
def chat(req: ChatReq):
    # Delegate to the chat pipeline and relabel its 4-tuple for the JSON response.
    answer, citations, detected_subject, extra = chat_router(
        req.question,
        subject=req.subject,
        grade=req.grade,
    )
    return {
        "subject_detected": detected_subject,
        "answer": answer,
        "citations": citations,
        "more info": extra,
    }

# Request body for POST /plan.
class PlanReq(BaseModel):
    # first study day; parsed with date.fromisoformat() in plan_api()
    start: str
    # last study day (inclusive); parsed with date.fromisoformat() in plan_api()
    deadline: str
    # daily study budget in hours; converted to whole minutes in plan_api()
    hours_per_day: float
    # optional filters: only rows with this subject / grade are planned
    subject: str | None = None
    grade: int | None = None

# Request body for POST /important-topics.
class ImportantTopicsReq(BaseModel):
    # subject, grade, book and chapter are passed to extract_chapter_and_book()
    subject: str
    grade: int
    book: str
    chapter: str
    # number of topics to return
    top_n: int = 8
    explain: bool = False  # if True, add one-line 'why_important' via LLM

@app.post("/plan")
def plan_api(req: PlanReq):
    # Build a day-by-day study plan (plus spaced reviews) from the KB rows that
    # match the optional subject / grade filters.
    #
    # Responds with HTTP 400 when the dates are not ISO formatted or when
    # hours_per_day is negative or not finite. A negative budget previously made
    # every slot negative, which increased the remaining work on each pass and
    # left the scheduling loop spinning forever.
    import math
    from datetime import date, timedelta

    # Validate the request before touching the disk.
    try:
        start_d = date.fromisoformat(req.start)
        end_d = date.fromisoformat(req.deadline)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"start and deadline must be ISO dates (YYYY-MM-DD): {exc}")
    if not math.isfinite(req.hours_per_day) or req.hours_per_day < 0:
        raise HTTPException(status_code=400, detail="hours_per_day must be a finite, non-negative number.")
    minutes_per_day = int(req.hours_per_day*60)

    # Context manager closes the handle; blank lines are skipped instead of crashing.
    with open("/content/kb/meta.jsonl", encoding="utf-8") as f:
        rows = [json.loads(l) for l in f if l.strip()]

    def estimate_minutes(txt):
        tokens = max(50, len(txt.split()))
        return max(6, math.ceil(tokens/40.0) + 2)

    # Aggregate the matching chunks per (book, chapter, section).
    agg = {}
    for r in rows:
        if req.subject and (r.get("subject") != req.subject):
            continue
        if req.grade and (r.get("grade") != req.grade):
            continue
        key = (r["book"], r.get("chapter",""), r.get("section",""))
        a = agg.setdefault(key, {"book": r["book"], "chapter": r.get("chapter",""), "section": r.get("section",""), "pages": set(), "minutes_est":0})
        a["pages"].add(r.get("page",0))
        a["minutes_est"] += estimate_minutes(r["text"])

    sections = []
    for (book,chap,sec), v in agg.items():
        ps = sorted(v["pages"])
        v["pages"] = f"{ps[0]}-{ps[-1]}" if ps else ""
        v["id"] = f"{book}::{chap}::{sec}"
        sections.append(v)
    sections = sorted(sections, key=lambda x: (x["book"], x["chapter"], x["section"]))

    days = []
    d = start_d
    while d <= end_d:
        days.append({"date": d.isoformat(), "capacity": minutes_per_day, "items": []})
        d += timedelta(days=1)

    # Greedy fill: a section keeps consuming the current day until it is full,
    # then spills into the next one.
    i = 0
    for sec in sections:
        remaining = sec["minutes_est"]
        while remaining > 0 and i < len(days):
            slot = min(remaining, days[i]["capacity"])
            if slot == 0:
                i += 1
                continue
            days[i]["items"].append({"section_id": sec["id"], "book": sec["book"], "chapter": sec["chapter"], "minutes": slot})
            days[i]["capacity"] -= slot
            remaining -= slot

    # Spaced reviews 1, 3 and 7 days after each scheduled item.
    reviews = []
    for day in days:
        for item in day["items"]:
            for gap in (1,3,7):
                try_date = (date.fromisoformat(day["date"]) + timedelta(days=gap)).isoformat()
                reviews.append({"date": try_date, "section_id": item["section_id"], "minutes": max(6, item["minutes"]//6)})

    days_out = []
    for d in days:
        used = minutes_per_day - d["capacity"]
        days_out.append({"date": d["date"], "used_minutes": used, "items": d["items"]})

    return {"subject": req.subject, "grade": req.grade, "days": days_out, "reviews": reviews}

@app.get("/subjects")
def subjects_api():
    """Return the parsed contents of the subjects JSON file.

    Returns:
        Whatever JSON value is stored in /content/kb/subjects.json.

    Raises:
        HTTPException: 404 when the file does not exist, so the client gets a
            JSON error body instead of an unhandled FileNotFoundError (HTTP 500).
    """
    try:
        with open("/content/kb/subjects.json", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=404,
            detail="subjects.json not found. Build the knowledge base first.",
        ) from exc

@app.get("/topics")
def topics_api():
    """Return the parsed contents of the topics JSON file.

    Returns:
        Whatever JSON value is stored in /content/kb/topics.json.

    Raises:
        HTTPException: 404 when the file does not exist, so the client gets a
            JSON error body instead of an unhandled FileNotFoundError (HTTP 500).
    """
    try:
        with open("/content/kb/topics.json", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=404,
            detail="topics.json not found. Build the knowledge base first.",
        ) from exc

@app.post("/important-topics")
def important_topics_api(req: ImportantTopicsReq):
    # Builds a ranked topic list for one chapter. Candidates come from
    # merge_scores() when robust_tfidf_terms() returns anything, otherwise
    # from the cue/heading boosts alone. Responds 404 when no chapter text
    # matches the requested subject/grade/book/chapter.
    meta_rows = get_meta_rows()
    ch_blobs, ch_pages, book_blobs = extract_chapter_and_book(
        meta_rows, req.subject, req.grade, req.book, req.chapter
    )
    if not ch_blobs:
        raise HTTPException(status_code=404, detail="No matching chapter content found. Check subject/grade/book/chapter values.")

    # Ask for many more terms than will be returned (at least 60).
    wanted_terms = max(60, req.top_n*8)
    tfidf_terms = robust_tfidf_terms(ch_blobs, book_blobs, want=wanted_terms)
    boosts, boost_pages = cue_heading_boosts(ch_blobs)

    if tfidf_terms:
        cand = merge_scores(tfidf_terms, boosts, boost_pages, ch_blobs)
    else:
        # TF-IDF produced nothing: build the candidates purely from the boosts.
        cand = {}
        blob_count = len(ch_blobs)
        for raw_phrase, boost in boosts.items():
            phrase = clean_phrase(raw_phrase)
            if phrase:
                in_range = [idx for idx in boost_pages.get(raw_phrase, set()) if idx < blob_count]
                cand[phrase] = {"score": float(boost), "blob_indices": in_range}

    topics = pick_topics(cand, ch_blobs, req.top_n)

    # Swap each topic's internal blob indices for page references.
    page_count = len(ch_pages)
    for topic in topics:
        blob_idxs = topic.pop("blob_indices", [])
        topic["pages"] = compress_ranges([ch_pages[i] for i in blob_idxs if i < page_count])
        # Normalise a missing or empty explanation to None.
        topic["why_important"] = topic.get("why_important") or None

    method = "robust-tfidf+cue-fallback"
    if req.explain:
        topics = _llm_explain(topics, req.subject, req.grade, req.book, req.chapter)
        method += "+llm"

    return {
        "subject": req.subject,
        "grade": req.grade,
        "book": req.book,
        "chapter": req.chapter,
        "method": method,
        "topics": topics
    }

# In-memory progress log: starts empty unless a progress file already exists
# on disk, in which case that file's parsed JSON is used as-is.
if not os.path.exists(PROGRESS_JSON):
    progress_store = {"completed": []}
else:
    with open(PROGRESS_JSON, encoding="utf-8") as progress_file:
        progress_store = json.load(progress_file)

# Request body for POST /progress: one "section completed" event.
# All three fields are declared as plain strings; nothing here checks their format.
class ProgressUpdate(BaseModel):
    user_id: str
    # Presumably a "book::chapter::section" id like the ones the plan endpoint
    # builds for its items — TODO confirm against the client.
    section_id: str
    completed_on: str  # ISO date (typed as str, so the date format is not enforced here)

# FastAPI runs plain `def` endpoints in a worker thread pool, so two /progress
# requests can overlap; this lock serialises the append + file rewrite.
_progress_lock = threading.Lock()

@app.post("/progress")
def progress_update(p: ProgressUpdate):
    """Record one completed-section event and persist the whole progress log.

    Args:
        p: The event to append; stored as the dict returned by `p.model_dump()`.

    Returns:
        `{"ok": True, "completed_count": <number of stored events>}`.
    """
    with _progress_lock:
        # Tolerate a progress file that was loaded without a "completed" list.
        completed = progress_store.setdefault("completed", [])
        completed.append(p.model_dump())
        # Write to a sibling temp file and swap it in, so a crash mid-write
        # cannot leave PROGRESS_JSON truncated (it is parsed with json.load
        # when the server cell starts).
        tmp_path = f"{PROGRESS_JSON}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(progress_store, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, PROGRESS_JSON)
        completed_count = len(completed)
    return {"ok": True, "completed_count": completed_count}

def run_server(host: str = "0.0.0.0", port: int = 8000, log_level: str = "warning"):
    """Serve the FastAPI `app` with uvicorn; does not return while the server runs.

    The defaults reproduce the previously hard-coded settings, so calling
    `run_server()` with no arguments (as the background thread does) behaves
    exactly as before.

    Args:
        host: Interface to bind to.
        port: TCP port to listen on.
        log_level: uvicorn log level name.
    """
    # Patch asyncio so uvicorn can start inside the notebook, where an event
    # loop is already running.
    nest_asyncio.apply()
    uvicorn.run(app, host=host, port=port, log_level=log_level)

# Launch the API in a daemon thread so this cell returns immediately.
# The message below is printed right after the thread starts; it does not wait
# for the server to begin accepting connections.
server_thread = threading.Thread(
    target=run_server,
    daemon=True,
)
server_thread.start()
print("FastAPI started on http://127.0.0.1:8000")

In [None]:
import requests, json, os
base = "http://127.0.0.1:8000"
# Smoke test for the subjects endpoint: the URL must match the label printed below.
r = requests.get(base + "/subjects")
# Fail loudly on an HTTP error instead of counting the keys of an error body.
r.raise_for_status()
print("Subjects found:", len(r.json()))

In [None]:
# Send one sample question to /chat and pretty-print the JSON reply.
chat_payload = {"question":"What is Myopia?"}
r = requests.post(base + "/chat", json=chat_payload)
print(json.dumps(r.json(), ensure_ascii=False, indent=2))

In [None]:
import requests, json
# Ask for the top 8 topics of one chapter, with the explain step switched on.
important_topics_payload = {
    "subject": "science",
    "grade": 10,
    "book": "science_grade10_10",  # book name as it appears in meta.jsonl
    "chapter": "10",               # exact chapter string from meta
    "top_n": 8,
    "explain": True,
}
r = requests.post(base + "/important-topics", json=important_topics_payload)
print(json.dumps(r.json(), ensure_ascii=False, indent=2))

In [None]:
# Fetch the full topics payload from the local server and print it as parsed JSON.
base = "http://127.0.0.1:8000"
topics_url = base + "/topics"
r = requests.get(topics_url)
print("Topics found:", r.json())

In [None]:
# Request a study plan for a fixed date window at 1.5 hours per day.
r = requests.post(base + "/plan", json={"start":"2025-09-16","deadline":"2025-10-01","hours_per_day":1.5})
# Print only the top-level keys, as the label says, rather than dumping the
# whole response body.
print("Plan keys:", list(r.json().keys()))