<details>
<summary>FIXES</summary>

```python
import ssl, nltk, os
ssl._create_default_https_context = ssl._create_unverified_context  # TEMP: bypass SSL

nltk.download('punkt_tab')

for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger", "wordnet", "omw-1.4"]:
    nltk.download(pkg)

print("‚úÖ NLTK data downloaded. You can restart the kernel and remove the SSL bypass line.")

import ssl, certifi, urllib.request
print("certifi:", certifi.where())
print("ssl cafile:", ssl.get_default_verify_paths().cafile)
urllib.request.urlopen("https://www.python.org").read(1)  # should not raise
# ERROR [1]
# ERROR [1]: Python is still using the system’s default SSL context, so the request isn’t picking up certifi’s CA bundle. Do one of these (fastest first):

### A) Quick kernel patch (works immediately in this notebook)
# Run this **in a cell once**, then try the urllib/NLTK downloads again in the *same* kernel:
# Then your NLTK bootstrap/download calls should succeed.

import os, ssl, certifi

# Route ALL new HTTPS contexts to certifi's CA bundle
os.environ["SSL_CERT_FILE"] = certifi.where()
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
ssl._create_default_https_context = lambda *a, **k: ssl.create_default_context(cafile=certifi.where())

# **Test:**
import urllib.request
urllib.request.urlopen("https://www.python.org").read(1)  # should NOT raise now

### B) Make it persistent for this venv (recommended)

# Append these lines to **`.venv/bin/activate`**:
export SSL_CERT_FILE="$(python -c 'import certifi; print(certifi.where())')"
export REQUESTS_CA_BUNDLE="$SSL_CERT_FILE"

# Then restart:
deactivate 2>/dev/null || true
source .venv/bin/activate
python - <<'PY'
import ssl, urllib.request, certifi
print("certifi:", certifi.where())
print("cafile:", ssl.get_default_verify_paths().cafile)
urllib.request.urlopen("https://www.python.org").read(1)
print("OK")
PY

### C) Run the bundled certificate installer (if you have the python.org build)

# Your path shows a **Frameworks** install (`/Library/Frameworks/...`), which usually also installs this script
# open "/Applications/Python 3.12/Install Certificates.command"

# If unsure of the exact version folder, let macOS find it:
# !find /Applications -maxdepth 2 -name "Install Certificates.command" -print
# then:
# open "/Applications/Python 3.XX/Install Certificates.command"
# After it runs, restart the terminal, reactivate the venv, and the SSL error should be gone without any code patches.
# If you still hit issues after A)  hard-wire the NLTK downloader to your `~/nltk_data` directory as a fallback

import os, ssl, certifi

# Route ALL new HTTPS contexts to certifi's CA bundle
os.environ["SSL_CERT_FILE"] = certifi.where()
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
ssl._create_default_https_context = lambda *a, **k: ssl.create_default_context(cafile=certifi.where())
import urllib.request
urllib.request.urlopen("https://www.python.org").read(1)
# 📚 Inline Sectioned Glossary Builder — with Noun-Phrase Bigrams (e.g., "customer churn", "monthly charges")
#    - Captures unigrams (NN/JJ) and NP bigrams like "customer churn"
#    - Looks up definitions (tries underscore form for WordNet)

# --- NLTK bootstrap: handle macOS SSL + local cache ---
# import os, ssl, nltk
# NLTK_DIR = os.path.expanduser("~/nltk_data")
# os.makedirs(NLTK_DIR, exist_ok=True)
# # Make sure NLTK looks here first
# if NLTK_DIR not in nltk.data.path:
#     nltk.data.path.insert(0, NLTK_DIR)

# # TEMP: bypass SSL verification only for these downloads
# try:
#     _orig_ctx = ssl._create_default_https_context
#     ssl._create_default_https_context = ssl._create_unverified_context
# except Exception:
#     _orig_ctx = None

# # Download required packages if missing, into ~/nltk_data
# for pkg, kind in [
#     ("punkt", "tokenizers"), ("punkt_tab", "tokenizers"),
#     ("averaged_perceptron_tagger", "taggers"),
#     ("wordnet", "corpora"), ("omw-1.4", "corpora")
# ]:
#     try:
#         nltk.data.find(f"{kind}/{pkg}")
#     except LookupError:
#         nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)

# # Restore SSL context
# if _orig_ctx:
#     ssl._create_default_https_context = _orig_ctx

###

# --- Use certifi CA bundle for all HTTPS in this kernel ---
import os, ssl, certifi, nltk
os.environ["SSL_CERT_FILE"] = certifi.where()      # honored by Python ssl
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where() # for requests, if used

# ensure new contexts use certifi's bundle
ssl._create_default_https_context = \
    (lambda *a, **kw: ssl.create_default_context(cafile=certifi.where()))

# now downloads should succeed:
for pkg, kind in [
    ("punkt", "tokenizers"), ("punkt_tab", "tokenizers"),
    ("averaged_perceptron_tagger", "taggers"),
    ("wordnet", "corpora"), ("omw-1.4", "corpora")
]:
    try:
        nltk.data.find(f"{kind}/{pkg}")
    except LookupError:
        nltk.download(pkg, quiet=False)  # first run may take a minute
print("‚úÖ NLTK resources ready")

###

import json, re
from pathlib import Path
from collections import Counter, defaultdict
import nltk

# Ensure required NLTK data is present, downloading only what is missing.
# Each resource is probed under the category it installs into: the POS tagger
# lives under "taggers/", so probing "corpora/" for it never succeeds and
# triggers a download attempt on every run. Recent NLTK releases also look for
# "punkt_tab" and the "_eng" tagger, so both the new and legacy names are covered.
for pkg, kind in [
    ("punkt", "tokenizers"), ("punkt_tab", "tokenizers"),
    ("averaged_perceptron_tagger", "taggers"),
    ("averaged_perceptron_tagger_eng", "taggers"),
    ("wordnet", "corpora"), ("omw-1.4", "corpora"),
]:
    try:
        nltk.data.find(f"{kind}/{pkg}")
    except LookupError:
        nltk.download(pkg, quiet=True)

from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# ------------ CONFIG ------------
NOTEBOOK_PATH = Path("01_EDA.ipynb")
TOP_K_PER_SECTION = 80
MIN_FREQ_UNI = 2                  # min unigram frequency
MIN_FREQ_BI  = 2                  # min bigram frequency
ALLOW_POS_UNI = {"NN","NNS","NNP","NNPS","JJ"}  # nouns + adjectives
MIN_LEN = 3
INCLUDE_BIGRAMS = True
SAVE_JSON = Path("outputs/notebook_glossary_by_section.json")
SAVE_JSON.parent.mkdir(parents=True, exist_ok=True)
# ---------------------------------

# Detect section headings / comment headers
SEC_RE = re.compile(r"^\s{0,3}#{1,6}\s*(?:(?P<num>(?:\d+\.)*\d+)\s*)?(?P<title>.*)$")
COMMENT_SEC_RE = re.compile(r"^\s*#\s*(?:(?P<num>(?:\d+\.)*\d+)\s*)?(?P<title>.*)$")

lem = WordNetLemmatizer()

def map_pos(tag):
    """Translate a Penn Treebank tag into a WordNet POS letter.

    Noun tags (``NN*``) map to ``"n"`` and adjective tags (``JJ*``) map to
    ``"a"``; any other tag yields ``None``.
    """
    for prefix, wn_letter in (("NN", "n"), ("JJ", "a")):
        if tag.startswith(prefix):
            return wn_letter
    return None

def normalize_text(text: str) -> str:
    """Blank out markup characters before tokenization.

    Runs of backticks, asterisks, underscores and angle brackets collapse to a
    single space; every character in U+2000..U+206F becomes a space.
    """
    without_markup = re.sub(r"[`*_<>]+", " ", text)
    return re.sub(r"[\u2000-\u206F]", " ", without_markup)

def extract_tokens(text: str):
    """Tokenize *text* and POS-tag every token containing an ASCII letter.

    Returns the ``(token, tag)`` pairs produced by ``pos_tag``. No POS
    filtering happens here, so the same result can feed both the unigram
    filter and the bigram logic.
    """
    alphabetic = [tok for tok in word_tokenize(text) if re.search(r"[A-Za-z]", tok)]
    return pos_tag(alphabetic)

def lemmatize_if_needed(tok: str, tag: str) -> str:
    """Clean *tok* and lemmatize it when its tag maps to a WordNet POS.

    Everything except ASCII letters and hyphens is removed and the result is
    lower-cased. Returns ``""`` when fewer than ``MIN_LEN`` characters remain;
    otherwise the lemma for noun/adjective tags, or the cleaned token as-is.
    """
    cleaned = re.sub(r"[^A-Za-z\-]", "", tok).lower()
    if len(cleaned) < MIN_LEN:
        return ""
    wn_pos = map_pos(tag)
    if not wn_pos:
        return cleaned
    return lem.lemmatize(cleaned, wn_pos)

def noun_phrase_bigrams(tagged):
    """
    Collect adjacent token pairs shaped like a noun phrase:
      (Adj|Noun) + Noun
      e.g., 'monthly charges', 'customer churn'

    The first word is cleaned and lower-cased; the second (head noun) goes
    through ``lemmatize_if_needed``. Pairs where either word ends up shorter
    than ``MIN_LEN`` are dropped. Returns "first second" strings in input order.
    """
    phrases = []
    for (first, first_tag), (second, second_tag) in zip(tagged, tagged[1:]):
        head_is_noun = second_tag.startswith("NN")
        modifier_ok = first_tag.startswith(("JJ", "NN"))
        if not (head_is_noun and modifier_ok):
            continue
        modifier = re.sub(r"[^A-Za-z\-]", "", first).lower()
        head = lemmatize_if_needed(second, second_tag)  # "" when too short
        if len(modifier) >= MIN_LEN and len(head) >= MIN_LEN:
            phrases.append(f"{modifier} {head}")
    return phrases

def define_term(term: str) -> str:
    """
    Return a WordNet definition for *term*, or "(definition not found)".

    The underscore-joined spelling is tried first (for multi-word terms such
    as 'customer_churn'), then the term as given. For the first spelling that
    has synsets, a noun sense wins, then an adjective sense, then whichever
    sense WordNet lists first.
    """
    for candidate in (term.replace(" ", "_"), term):
        senses = wn.synsets(candidate)
        if not senses:
            continue
        nouns = [s for s in senses if s.pos() == 'n']
        adjectives = [s for s in senses if s.pos() == 'a']
        return (nouns + adjectives + senses)[0].definition()
    return "(definition not found)"

# --- 1) Read notebook & gather text chunks by section
nb = json.loads(NOTEBOOK_PATH.read_text(encoding="utf-8"))
sections_text = defaultdict(list)
current_key = "0.0 Unsectioned"

def make_key(num: str|None, title: str) -> str:
    title = (title or "").strip()
    return f"{num} {title}".strip() if num else (title or "Unsectioned")

# Walk the notebook cells in order and file each cell's text under the section
# key in effect at that point. `current_key` persists across cells, so text
# that follows a heading stays in that section until the next key change.
for cell in nb.get("cells", []):
    ctype = cell.get("cell_type")
    src_list = cell.get("source", [])
    src = "".join(src_list)

    if ctype == "markdown":
        set_key = None
        # Only the first line matching SEC_RE switches the section for this cell.
        for line in src.splitlines():
            m = SEC_RE.match(line)
            if m:
                set_key = make_key(m.group("num"), m.group("title"))
                current_key = set_key
                break
        # The whole markdown cell (heading line included) is stored under the current key.
        sections_text[current_key].append(src)

    elif ctype == "code":
        # top-of-cell comments as potential section headers
        # NOTE(review): iterating `src_list` assumes "source" is a list of lines;
        # a single-string source would be walked character by character — confirm.
        lines = src_list
        comments = []
        comment_key = None
        # Scan the leading comment block; blank lines are skipped and the first
        # non-comment, non-blank line ends the scan.
        for line in lines:
            if line.strip().startswith("#"):
                comments.append(re.sub(r"^#+\s?", "", line.strip()))
                if comment_key is None:
                    # NOTE(review): the number group in COMMENT_SEC_RE is optional, so the
                    # first comment line of a cell becomes a section key even when it is
                    # not numbered — confirm this is intended.
                    m2 = COMMENT_SEC_RE.match(line)
                    if m2:
                        comment_key = make_key(m2.group("num"), m2.group("title"))
            elif line.strip() == "":
                continue
            else:
                break
        # Switch section first so this cell's own comments land in the new section.
        if comment_key:
            current_key = comment_key
        if comments:
            sections_text[current_key].append("\n".join(comments))
        # optional: top-of-cell docstring
        # The named group "q" makes the closing quotes match the opening style.
        m3 = re.match(r'\s*(?P<q>"""|\'\'\')(?P<doc>.*?)(?P=q)', src, flags=re.DOTALL)
        if m3:
            sections_text[current_key].append(m3.group("doc"))

# --- 2) Build per-section term frequencies (unigrams + bigrams)
by_section_defs = {}
global_counter = Counter()

for sec, chunks in sections_text.items():
    text = normalize_text("\n\n".join(chunks))
    tagged = extract_tokens(text)

    # Unigrams (nouns/adjectives)
    uni_terms = []
    for tok, tag in tagged:
        if tag in ALLOW_POS_UNI:
            lemmed = lemmatize_if_needed(tok, tag)
            if lemmed:
                uni_terms.append(lemmed)

    uni_freq = Counter(uni_terms)
    uni_kept = [(t, c) for t, c in uni_freq.items() if c >= MIN_FREQ_UNI]

    # Bigrams (NP patterns)
    bi_kept = []
    if INCLUDE_BIGRAMS:
        bi_terms = noun_phrase_bigrams(tagged)
        bi_freq = Counter(bi_terms)
        bi_kept = [(t, c) for t, c in bi_freq.items() if c >= MIN_FREQ_BI]

    # Merge & cap by TOP_K_PER_SECTION (prioritize bigrams, then unigrams)
    merged = sorted(bi_kept, key=lambda x: (-x[1], x[0])) \
           + sorted(uni_kept, key=lambda x: (-x[1], x[0]))
    merged = merged[:TOP_K_PER_SECTION]

    # Definitions + frequencies
    sec_map = {}
    for term, count in merged:
        defn = define_term(term)
        sec_map[term] = {"definition": defn, "frequency": count}
        global_counter[term] += count

    by_section_defs[sec] = sec_map

# --- 3) Build overall top terms
overall_top = {}
for term, count in global_counter.most_common(200):
    overall_top[term] = {"definition": define_term(term), "frequency": count}

# --- 4) Save JSON
payload = {
    "source_notebook": str(NOTEBOOK_PATH),
    "min_freq_unigram": MIN_FREQ_UNI,
    "min_freq_bigram": MIN_FREQ_BI,
    "top_k_per_section": TOP_K_PER_SECTION,
    "include_bigrams": INCLUDE_BIGRAMS,
    "by_section": by_section_defs,
    "overall_top_terms": overall_top
}
SAVE_JSON.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

# --- 5) Preview
print(f"‚úÖ Glossary (unigrams + bigrams) built from {NOTEBOOK_PATH.name}")
print(f"üíæ Saved ‚Üí {SAVE_JSON}\n")

def preview(sec_key, n=12):
    """Print up to *n* terms of one section, most frequent first (ties alphabetical)."""
    entries = by_section_defs.get(sec_key, {})
    ranked = sorted(entries.items(), key=lambda kv: (-kv[1]['frequency'], kv[0]))
    shown_items = ranked[:n]
    print(f"--- {sec_key} (showing {len(shown_items)}/{len(entries)}) ---")
    for rank, (term, meta) in enumerate(shown_items, start=1):
        print(f"{rank:>2}. {term} [{meta['frequency']}]: {meta['definition']}")
    print()

shown = 0
for sec_key in sorted(by_section_defs.keys(), key=lambda k: (k.split()[0], k)):
    preview(sec_key, n=10)
    shown += 1
    if shown >= 2:
        break

print("--- Overall Top Terms (first 15) ---")
for i, (t, meta) in enumerate(list(overall_top.items())[:15], 1):
    print(f"{i:>2}. {t} [{meta['frequency']}]: {meta['definition']}")

---

# ✅ Force HTTPS to use certifi in THIS kernel
import os, ssl, certifi, urllib.request, sys

print("Python:", sys.executable)
print("certifi bundle:", certifi.where())

# 1) Make every new SSL context use certifi's CA bundle
ssl._create_default_https_context = lambda *a, **k: ssl.create_default_context(cafile=certifi.where())

# 2) Install a global urllib opener that uses that context
_ctx = ssl.create_default_context(cafile=certifi.where())
_opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=_ctx))
urllib.request.install_opener(_opener)

# 3) (optional) also set env vars some libs honor
os.environ["SSL_CERT_FILE"] = certifi.where()
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()

# 4) Test a real HTTPS fetch
print("Fetching 1 byte from python.org over HTTPS‚Ä¶")
print(urllib.request.urlopen("https://www.python.org", timeout=10).read(1))
print("‚úÖ HTTPS OK")

---
---

# Make it persistent (no more patches in notebooks)
# Create a sitecustomize.py inside your venv so every Python process in that venv uses certifi:

# python - <<'PY'

import certifi, sys, pathlib, textwrap
site_dir = next(p for p in sys.path if p.endswith("site-packages"))
target = pathlib.Path(site_dir) / "sitecustomize.py"
code = textwrap.dedent(f"""
import os, ssl, certifi
os.environ.setdefault("SSL_CERT_FILE", certifi.where())
os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
ssl._create_default_https_context = lambda *a, **k: ssl.create_default_context(cafile=certifi.where())
""")
target.write_text(code)
print("Wrote", target)
# PY

# Then restart your terminal/kernel and retest the tiny fetch:
import urllib.request
urllib.request.urlopen("https://www.python.org").read(1)

---
---

import nltk, ssl, certifi
# contexts created inside nltk should now inherit the patched default
for pkg, kind in [
    ("punkt", "tokenizers"), ("punkt_tab", "tokenizers"),
    ("averaged_perceptron_tagger", "taggers"),
    ("wordnet", "corpora"), ("omw-1.4", "corpora")
]:
    try:
        nltk.data.find(f"{kind}/{pkg}")
        print(f"‚úì {pkg} already present")
    except LookupError:
        print(f"‚Üì downloading {pkg} ‚Ä¶")
        nltk.download(pkg, quiet=False)
print("‚úÖ NLTK resources ready")

---
---

You’ve got the **new NLTK resource name** error. Recent NLTK versions look for
`taggers/averaged_perceptron_tagger_eng` (not the old `…_tagger`). Fix it by (1) forcing HTTPS to use **certifi** (so downloads work) and (2) downloading **both** the new and legacy tagger names.

### Drop-in cell to run **before** your glossary code

```python
# --- Make NLTK downloads work + fetch the right tagger resources ---
import os, ssl, certifi, nltk

# Route HTTPS to certifi bundle (works inside this kernel)
os.environ["SSL_CERT_FILE"] = certifi.where()
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
ssl._create_default_https_context = lambda *a, **k: ssl.create_default_context(cafile=certifi.where())

# Ensure a local cache dir is used (so it won't re-download every time)
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DIR)

# Try to find or download required packages
def need(path):
    """Return True when NLTK cannot locate *path* in its data directories."""
    try:
        nltk.data.find(path)
    except LookupError:
        return True
    return False

to_get = []
if need("tokenizers/punkt"): to_get += ["punkt"]
# some installs want punkt_tab as well
if need("tokenizers/punkt_tab"): to_get += ["punkt_tab"]
# POS tagger: new & legacy names (grab both to be safe)
if need("taggers/averaged_perceptron_tagger_eng"): to_get += ["averaged_perceptron_tagger_eng"]
if need("taggers/averaged_perceptron_tagger"):     to_get += ["averaged_perceptron_tagger"]
# WordNet for definitions
if need("corpora/wordnet"): to_get += ["wordnet"]
if need("corpora/omw-1.4"): to_get += ["omw-1.4"]

if to_get:
    print("Downloading NLTK data:", ", ".join(to_get))
    for pkg in to_get:
        nltk.download(pkg, download_dir=NLTK_DIR, quiet=False)
else:
    print("‚úÖ All required NLTK resources already present")
```

Now re-run your glossary cell. The `pos_tag` call should succeed because the **ENG** tagger is present.

---

### If downloads still fail (corporate proxy / SSL weirdness)

Use the **no-internet fallback** for bigrams (no POS tagger, no WordNet). It’s surprisingly decent for study notes:

```python
import re, json
from collections import Counter, defaultdict
from pathlib import Path

NOTEBOOK_PATH = Path("../../../Level_3/notebooks/01_EDA.ipynb").resolve()

def simple_tokens(t):
    """Return lower-cased words of two or more characters (a letter, then letters/hyphens)."""
    words = re.findall(r"[A-Za-z][A-Za-z\-]+", t)
    return list(map(str.lower, words))

def simple_bigrams(tokens):
    """Pair up neighbouring tokens after dropping stopwords and words under 3 characters.

    Neighbours are taken from the filtered sequence, so words separated only
    by dropped tokens still form a bigram.
    """
    # keep (adj|noun)-ish approximations: exclude common stopwords
    stopwords = {
        "a", "an", "the", "of", "to", "and", "or", "in", "on", "for", "with",
        "from", "by", "as", "at", "is", "are", "was", "were", "be", "been",
        "being", "this", "that", "these", "those", "it", "its",
    }
    content = [tok for tok in tokens if len(tok) >= 3 and tok not in stopwords]
    return [f"{left} {right}" for left, right in zip(content, content[1:])]

nb = json.loads(NOTEBOOK_PATH.read_text(encoding="utf-8"))
sections = defaultdict(list)
sec = "Unsectioned"
for cell in nb.get("cells", []):
    src = "".join(cell.get("source", []))
    if cell.get("cell_type") == "markdown":
        # crude section split: if a heading appears, switch section
        for line in src.splitlines():
            if line.lstrip().startswith("#"):
                sec = re.sub(r"^\s*#+\s*", "", line).strip() or "Unsectioned"
                break
        sections[sec].append(src)
    else:
        sections[sec].append(src)

gloss = {}
for sec, chunks in sections.items():
    text = "\n\n".join(chunks)
    toks = simple_tokens(text)
    uni = Counter(toks)
    bi  = Counter(simple_bigrams(toks))
    # keep top items; no definitions in offline mode
    terms = {}
    for t, c in bi.most_common(60):
        terms[t] = {"definition": "(offline mode ‚Äî no WordNet)", "frequency": c}
    for t, c in uni.most_common(60):
        terms.setdefault(t, {"definition": "(offline mode ‚Äî no WordNet)", "frequency": c})
    gloss[sec] = terms

out = Path("outputs/notebook_glossary_by_section.json")
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps({"by_section": gloss}, indent=2), encoding="utf-8")
print("‚úÖ Offline glossary saved ‚Üí", out)
```

This lets you keep moving; when your SSL is sorted, switch back to the NLTK-powered version for POS-aware bigrams and definitions.

```


In [None]:
# --- Make NLTK downloads work + fetch the right tagger resources ---
import os, ssl, certifi, nltk

# Route HTTPS to certifi bundle (works inside this kernel)
os.environ["SSL_CERT_FILE"] = certifi.where()
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
ssl._create_default_https_context = lambda *a, **k: ssl.create_default_context(cafile=certifi.where())

# Ensure a local cache dir is used (so it won't re-download every time)
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DIR)

# Try to find or download required packages
def need(path):
    """Report whether an NLTK resource is missing: True if lookup fails, False if found."""
    try:
        nltk.data.find(path)
    except LookupError:
        return True
    else:
        return False

to_get = []
if need("tokenizers/punkt"): to_get += ["punkt"]
# some installs want punkt_tab as well
if need("tokenizers/punkt_tab"): to_get += ["punkt_tab"]
# POS tagger: new & legacy names (grab both to be safe)
if need("taggers/averaged_perceptron_tagger_eng"): to_get += ["averaged_perceptron_tagger_eng"]
if need("taggers/averaged_perceptron_tagger"):     to_get += ["averaged_perceptron_tagger"]
# WordNet for definitions
if need("corpora/wordnet"): to_get += ["wordnet"]
if need("corpora/omw-1.4"): to_get += ["omw-1.4"]

if to_get:
    print("Downloading NLTK data:", ", ".join(to_get))
    for pkg in to_get:
        nltk.download(pkg, download_dir=NLTK_DIR, quiet=False)
else:
    print("‚úÖ All required NLTK resources already present")


In [None]:
# 📚 Inline Glossary Builder — Jupyter Study Tool: V1
# - Extracts important words from your notebook and
# - builds a dictionary of definitions using NLTK WordNet.

# 🔹 Ensure required NLTK data
# 💡 Quick repair inside your current notebook
# Recent NLTK versions split the tokenizer data into two resources:
# 1. `"punkt"` → old base tokenizer
# 2. `"punkt_tab"` → new tokenization tables required by recent versions
# Add `"punkt_tab"` to the download list at the top of the script,
# or just run this **once** in a separate Jupyter cell:
# import nltk
# nltk.download("punkt_tab")
# Then re-run the glossary cell.
# That adds the new tokenizer tables to your NLTK data directory, and the rest of the script will work fine.

# %pip install nltk
import json, re
from pathlib import Path
from collections import Counter
import nltk

# Ensure required NLTK data is present, downloading only what is missing.
# Each resource is probed under the category it installs into: the POS tagger
# lives under "taggers/", so probing "corpora/" for it never succeeds and
# triggers a download attempt on every run. Recent NLTK releases also need
# "punkt_tab" and the "_eng" tagger, so both new and legacy names are covered.
for pkg, kind in [
    ("punkt", "tokenizers"), ("punkt_tab", "tokenizers"),
    ("averaged_perceptron_tagger", "taggers"),
    ("averaged_perceptron_tagger_eng", "taggers"),
    ("wordnet", "corpora"), ("omw-1.4", "corpora"),
]:
    try:
        nltk.data.find(f"{kind}/{pkg}")
    except LookupError:
        nltk.download(pkg, quiet=True)

from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# --- CONFIG ---

# 1) Correct path reference:
# NOTEBOOK_PATH = Path(__file__).resolve().parents[3] / "Level_3" / "notebooks" / "01_EDA.ipynb"

# 2) If you’re running it interactively in Jupyter (so __file__ isn’t defined), use:
from pathlib import Path
NOTEBOOK_PATH = Path.cwd().resolve().parents[3]/ "Telco" / "Level_3" / "notebooks" / "01_EDA.ipynb"

# 3) If you just want a clean relative path from the glossary notebook’s folder:
# NOTEBOOK_PATH = Path("../../../Level_3/notebooks/01_EDA.ipynb").resolve()

#4) incorrect?
# NOTEBOOK_PATH = Path("./Level_3/notebooks/01_EDA.ipynb")

TOP_K = 1000                             # how many terms to keep
MIN_FREQ = 2                            # minimum frequency
ALLOW_POS = {"NN","NNS","NNP","NNPS","JJ"}  # nouns & adjectives only
# ---------------

# --- 1Ô∏è‚É£ Extract text from notebook ---
nb = json.loads(NOTEBOOK_PATH.read_text(encoding="utf-8"))
texts = []
for cell in nb["cells"]:
    if cell["cell_type"] == "markdown":
        texts.append("".join(cell["source"]))
    elif cell["cell_type"] == "code":
        # capture top comments
        lines = cell["source"]
        comments = []
        for line in lines:
            if line.strip().startswith("#"):
                comments.append(line.strip("#").strip())
            elif line.strip() == "":
                continue
            else:
                break
        if comments:
            texts.append("\n".join(comments))

text = "\n".join(texts)

# --- 2Ô∏è‚É£ Tokenize and filter important words ---
lem = WordNetLemmatizer()
tokens = [t for t in word_tokenize(text) if re.search(r"[A-Za-z]", t)]
tagged = pos_tag(tokens)

def map_pos(tag):
    """Return the WordNet POS letter for a Penn Treebank tag: "n" for NN*, "a" for JJ*, else None."""
    is_noun = tag.startswith("NN")
    is_adjective = tag.startswith("JJ")
    if is_noun:
        return "n"
    return "a" if is_adjective else None

terms = []
for word, tag in tagged:
    if tag not in ALLOW_POS or len(word) < 3: 
        continue
    pos = map_pos(tag)
    word = word.lower()
    lemma = lem.lemmatize(word, pos) if pos else word
    terms.append(lemma)

# --- 3Ô∏è‚É£ Count and select top terms ---
freq = Counter(terms)
top_terms = [w for w, c in freq.items() if c >= MIN_FREQ]
top_terms = sorted(top_terms, key=lambda x: (-freq[x], x))[:TOP_K]

# --- 4Ô∏è‚É£ Lookup definitions from WordNet ---
glossary = {}
for term in top_terms:
    syns = wn.synsets(term)
    if syns:
        glossary[term] = syns[0].definition()
    else:
        glossary[term] = "(definition not found)"

# --- 5Ô∏è‚É£ Display glossary preview ---
print(f"‚úÖ Extracted {len(glossary)} terms from {NOTEBOOK_PATH.name}")
for i, (k, v) in enumerate(list(glossary.items())[:15], 1):
    print(f"{i:>2}. {k}: {v}")

# --- 6Ô∏è‚É£ Optional: save as JSON for reuse ---
out_path = Path("notebook_glossary.json")
out_path.write_text(json.dumps(glossary, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"\nüíæ Saved glossary ‚Üí {out_path}")


In [None]:
# Inline glossary builder - grouped by notebook section - V2
# (e.g., “2.1 Missing Scan”, “2.2 Constant Columns”).
# It scans markdown headings and top-of-cell comments to track the current section,
# extracts key terms per section,
# pulls **WordNet** definitions, and saves a nested JSON.

# TODO: make it runnable both ways ‚Äî as a CLI tool and as an importable module (with a main() guard and docstring header)
# ==========================================================
# 📚 Inline Sectioned Glossary Builder — Jupyter Study Tool
#    Groups terms by notebook sections like "2.1 …", "2.2 …"
# ==========================================================

import json, re
from pathlib import Path
from collections import Counter, defaultdict
import nltk
nltk.download("punkt_tab")

# üîπ Ensure required NLTK data (quiet, idempotent)
# for pkg in ["punkt", "averaged_perceptron_tagger", "wordnet", "omw-1.4"]:
#     try:
#         nltk.data.find(f"tokenizers/{pkg}") if pkg=="punkt" else nltk.data.find(f"corpora/{pkg}")
#     except LookupError:
#         nltk.download(pkg, quiet=True)

# Ensure required NLTK data is present, downloading only what is missing.
# Each resource is probed under the category it installs into: the POS tagger
# lives under "taggers/", so probing "corpora/" for it never succeeds and
# triggers a download attempt on every run. The "_eng" tagger name used by
# recent NLTK releases is fetched alongside the legacy one.
for pkg, kind in [
    ("punkt", "tokenizers"), ("punkt_tab", "tokenizers"),
    ("averaged_perceptron_tagger", "taggers"),
    ("averaged_perceptron_tagger_eng", "taggers"),
    ("wordnet", "corpora"), ("omw-1.4", "corpora"),
]:
    try:
        nltk.data.find(f"{kind}/{pkg}")
    except LookupError:
        nltk.download(pkg, quiet=True)

from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# ------------ CONFIG ------------
NOTEBOOK_PATH = Path("01_EDA.ipynb")         # ‚Üê your notebook file
TOP_K_PER_SECTION = 60                       # max terms per section
MIN_FREQ = 2                                 # min frequency per section
ALLOW_POS = {"NN","NNS","NNP","NNPS","JJ"}   # nouns + adjectives
MIN_LEN = 3
SAVE_JSON = Path("outputs/notebook_glossary_by_section.json")
SAVE_JSON.parent.mkdir(parents=True, exist_ok=True)
# ---------------------------------

# Detect section from a line like:
#   "# 2.1 Missing / Null / Blank Scan"
#   "## 2.0.1 🧩🔒 Dataset Guard …"
#   "# 3️⃣ DESCRIPTIVE STATISTICS" (fallback keeps emoji titles too)
SEC_RE = re.compile(
    r"^\s{0,3}#{1,6}\s*(?P<num>(?:\d+\.)*\d+)\s*(?P<title>.*)$"
)

# Also detect from top-of-cell comments:
#   "# 2.1 Missing …"
COMMENT_SEC_RE = re.compile(
    r"^\s*#\s*(?P<num>(?:\d+\.)*\d+)\s*(?P<title>.*)$"
)

def normalize_terms(text):
    """Return the cleaned, lemmatized noun/adjective terms found in *text*.

    Markup characters are blanked out, the text is tokenized and POS-tagged,
    and only tokens whose tag is in ``ALLOW_POS`` survive. Each survivor is
    reduced to ASCII letters and hyphens, lower-cased, dropped if shorter than
    ``MIN_LEN``, and lemmatized with the WordNet POS matching its tag.
    """
    # strip markup-y artifacts before tokenizing
    for pattern in (r"[`*_<>]+", r"[\u2000-\u206F]"):
        text = re.sub(pattern, " ", text)
    candidates = [tok for tok in word_tokenize(text) if re.search(r"[A-Za-z]", tok)]
    tagged_tokens = pos_tag(candidates)

    lemmatizer = WordNetLemmatizer()
    wn_pos_by_prefix = (("NN", "n"), ("JJ", "a"))
    collected = []
    for token, tag in tagged_tokens:
        if tag not in ALLOW_POS:
            continue
        cleaned = re.sub(r"[^A-Za-z\-]", "", token).lower()
        if len(cleaned) < MIN_LEN:
            continue
        wn_pos = next(
            (letter for prefix, letter in wn_pos_by_prefix if tag.startswith(prefix)),
            None,
        )
        collected.append(lemmatizer.lemmatize(cleaned, wn_pos) if wn_pos else cleaned)
    return collected

# --- 1) Read notebook & walk cells, maintaining current section
nb = json.loads(NOTEBOOK_PATH.read_text(encoding="utf-8"))
sections_text = defaultdict(list)   # section_key -> list of text chunks
current_key = "0.0 Unsectioned"

def make_key(num: str, title: str) -> str:
    """Build the section key used to group notebook text.

    Args:
        num: Dotted section number such as "2.1"; may be empty or None.
        title: Heading text following the number; surrounding whitespace is
            ignored and None is treated as empty.

    Returns:
        "<num> <title>" for numbered sections (just "<num>" when the title is
        blank, so keys never carry a trailing space), otherwise the title, or
        "Unsectioned" when both are empty.
    """
    title = (title or "").strip()
    return f"{num} {title}".strip() if num else (title or "Unsectioned")

# Walk the notebook cells in order and file each cell's text under the section
# key in effect at that point. `current_key` persists across cells, so text
# stays in a section until a later numbered heading/comment changes it.
for cell in nb.get("cells", []):
    ctype = cell.get("cell_type")
    src_list = cell.get("source", [])
    src = "".join(src_list)

    # Check for a heading in markdown
    if ctype == "markdown":
        # look line-by-line for the first heading
        heading_key = None
        for line in src.splitlines():
            m = SEC_RE.match(line)
            if m:
                heading_key = make_key(m.group("num"), m.group("title"))
                current_key = heading_key
                break
        # record full markdown text under the *current* section
        sections_text[current_key].append(src)

    elif ctype == "code":
        # Try to capture a section from top-of-cell comments (first contiguous comment block).
        # Split the joined text rather than iterating "source" directly, so a cell whose
        # source is stored as one string is still processed line by line.
        lines = src.splitlines()
        comments = []
        comment_section_key = None
        for line in lines:
            stripped = line.strip()
            if stripped.startswith("#"):
                comments.append(re.sub(r"^#+\s?", "", stripped))
                # the first comment line carrying a section number defines the section
                if comment_section_key is None:
                    m2 = COMMENT_SEC_RE.match(line)
                    if m2:
                        comment_section_key = make_key(m2.group("num"), m2.group("title"))
            elif stripped == "":
                continue
            else:
                # first real code line ends the leading comment block
                break

        # Switch section first so this cell's own comments land in the new section.
        if comment_section_key:
            current_key = comment_section_key

        if comments:
            sections_text[current_key].append("\n".join(comments))

        # (Optional) capture top-of-cell docstring if present at the very start.
        # The quote style is bound to the named group "q" so the closing delimiter
        # must match the opening one; the previous "(?P[q]" spelling was not a valid
        # group and made re.match raise re.error.
        m3 = re.match(r'\s*(?P<q>"""|\'\'\')(?P<doc>.*?)(?P=q)', src, flags=re.DOTALL)
        if m3:
            sections_text[current_key].append(m3.group("doc"))

# --- 2) Build per-section term frequencies
section_term_freqs = {}
for sec, chunks in sections_text.items():
    text = "\n\n".join(chunks)
    terms = normalize_terms(text)
    freq = Counter(terms)
    # filter by MIN_FREQ and pick top K
    kept = [(t, c) for t, c in freq.items() if c >= MIN_FREQ]
    kept.sort(key=lambda x: (-x[1], x[0]))
    section_term_freqs[sec] = kept[:TOP_K_PER_SECTION]

# --- 3) Look up definitions per section; also build global view
by_section_defs = {}
global_terms = Counter()

for sec, pairs in section_term_freqs.items():
    sec_defs = {}
    for term, count in pairs:
        global_terms[term] += count
        syns = wn.synsets(term)
        if syns:
            # Prefer noun first, then adjective
            noun_first = [s for s in syns if s.pos() == 'n'] + [s for s in syns if s.pos() == 'a'] + syns
            definition = noun_first[0].definition()
        else:
            definition = "(definition not found)"
        sec_defs[term] = {
            "definition": definition,
            "frequency": count
        }
    by_section_defs[sec] = sec_defs

overall_top = dict()
for term, count in global_terms.most_common(200):
    syns = wn.synsets(term)
    if syns:
        noun_first = [s for s in syns if s.pos() == 'n'] + [s for s in syns if s.pos() == 'a'] + syns
        definition = noun_first[0].definition()
    else:
        definition = "(definition not found)"
    overall_top[term] = {"definition": definition, "frequency": count}

# --- 4) Save JSON (nested: by_section + overall)
payload = {
    "source_notebook": str(NOTEBOOK_PATH),
    "min_freq": MIN_FREQ,
    "top_k_per_section": TOP_K_PER_SECTION,
    "by_section": by_section_defs,
    "overall_top_terms": overall_top
}
SAVE_JSON.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

# --- 5) Pretty preview
print(f"‚úÖ Glossary built by section from {NOTEBOOK_PATH.name}")
print(f"üíæ Saved ‚Üí {SAVE_JSON}\n")

def preview(sec_key, n=10):
    """Print up to *n* glossary entries of one section from ``by_section_defs``.

    The header line shows how many entries are printed out of the section's
    total. An unknown section key prints an empty listing.
    """
    section = by_section_defs.get(sec_key, {})
    entries = list(section.items())[:n]
    print(f"--- {sec_key} (showing {len(entries)}/{len(section)}) ---")
    for rank, (term, info) in enumerate(entries, 1):
        print(f"{rank:>2}. {term} [{info['frequency']}]: {info['definition']}")
    print()

# Show the first 2 sections found (if any)
def _preview_order(sec_key):
    """Sort key that puts numbered sections first, in numeric order.

    The leading token of the key (e.g. "2.1" in "2.1 Missing Scan") is read
    as dot-separated integers so that "2.1" sorts before "10.1". Keys without
    such a prefix, including blank keys, sort after all numbered ones.
    """
    head = sec_key.split(maxsplit=1)
    token = head[0] if head else ""
    pieces = token.split(".")
    if token and all(p.isdecimal() for p in pieces):
        return (0, tuple(int(p) for p in pieces), sec_key)
    return (1, (), sec_key)


for sec_key in sorted(by_section_defs, key=_preview_order)[:2]:
    preview(sec_key, n=10)

# Show overall top terms
print("--- Overall Top Terms (first 15) ---")
for i, (term, data) in enumerate(list(overall_top.items())[:15], 1):
    print(f"{i:>2}. {term} [{data['frequency']}]: {data['definition']}")



In [None]:
### What’s new vs. the previous version
# * **Section tracking** from:
#   * Markdown headings like `## 2.1 Missing / Null / Blank Scan`
#   * *or* the first top-of-cell **comment** like `# 2.1 ‚Ä¶`
# * **Per-section term frequency** (filtered by `MIN_FREQ`, capped by `TOP_K_PER_SECTION`)
# * **Definitions** stored **per section** and an **overall** roll-up
# * Saves a nested JSON you can search or render later

### Tips
# * If some sections don’t start with a numeric heading (e.g., “🧭 Intro”), they’ll fall under **“0.0 Unsectioned”** until the next numeric heading/comment appears.
# * To include **verbs**, add POS tags to `ALLOW_POS` (e.g., `{"NN","NNS","NNP","NNPS","JJ","VB","VBD","VBG","VBN","VBP","VBZ"}`).


###
### 

# ==========================================
# 📖 Collapsible Glossary Renderer (inline)
# ==========================================
from pathlib import Path
import json, html, re
from IPython.display import HTML, display

JSON_PATH = Path("outputs/notebook_glossary_by_section.json")  # adjust if needed

# Stop early with a clear message if the glossary file is not there.
if not JSON_PATH.exists():
    raise FileNotFoundError(f"Glossary JSON not found at: {JSON_PATH}")

with JSON_PATH.open(encoding="utf-8") as fh:
    data = json.load(fh)

# Either part may be absent from the file; fall back to an empty mapping.
by_section, overall = (
    data.get(part, {}) for part in ("by_section", "overall_top_terms")
)

def _sec_sort_key(s):
    # sort by numeric prefix if present, else lexicographically
    m = re.match(r"^\s*((?:\d+\.)*\d+)", s)
    if not m: 
        return (9999, s.lower())
    parts = [int(p) for p in m.group(1).split(".")]
    return (parts + [0]*5)[:5]  # pad for consistent length

def _render_table(rows):
    # rows: list of (term, {definition, frequency})
    if not rows:
        return "<p class='muted'>No terms.</p>"
    rows_html = []
    for term, meta in rows:
        defn = html.escape(str(meta.get("definition","(no definition)")))
        freq = meta.get("frequency", "")
        term_esc = html.escape(term)
        rows_html.append(
            f"<tr class='term-row'><td class='term'>{term_esc}</td>"
            f"<td class='def'>{defn}</td><td class='freq'>{freq}</td></tr>"
        )
    return (
        "<table class='glossary-table'>"
        "<thead><tr><th>Term</th><th>Definition</th><th>Freq</th></tr></thead>"
        f"<tbody>{''.join(rows_html)}</tbody></table>"
    )

# Build HTML
def _section_block(title, term_map):
    """Return one collapsible <details> block listing a section's terms."""
    # Most frequent first; ties are broken alphabetically by term.
    ordered = sorted(
        term_map.items(),
        key=lambda kv: (-kv[1].get("frequency",0), kv[0]),
    )
    pieces = [
        f"<details class='sec' open>",
        f"<summary><span class='sec-title'>{html.escape(title)}</span>",
        f"<span class='count'>({len(ordered)} terms)</span></summary>",
        f"{_render_table(ordered)}",
        f"</details>",
    ]
    return "".join(pieces)


sections_sorted = sorted(by_section.items(), key=lambda pair: _sec_sort_key(pair[0]))

overall_rows = list(overall.items())
overall_html = _render_table(overall_rows[:30])  # show first 30 overall

sec_blocks = [_section_block(title, term_map) for title, term_map in sections_sorted]

# Inline stylesheet for the glossary widget. It styles the class names used in
# the generated markup: the .glossary-wrap container, the search box, the
# collapsible details.sec blocks, the .glossary-table cells and the dashed
# .overall panel.
# NOTE(review): selectors such as .header, .search, .count and .muted are not
# prefixed with .glossary-wrap, so they may also match other output on the
# page that uses the same class names. Confirm whether that matters here.
css = """
<style>
  .glossary-wrap {font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial; line-height:1.45;}
  .header {display:flex; align-items:center; gap:.75rem; margin-bottom:.5rem;}
  .header h2 {margin:0; font-size:1.25rem;}
  .search {margin: .25rem 0 1rem 0;}
  .search input {width:100%; max-width:720px; padding:.6rem .75rem; border:1px solid #d0d7de; border-radius:8px; font-size:.95rem;}
  details.sec {margin-bottom:.6rem; border:1px solid #e5e7eb; border-radius:10px; padding:.4rem .8rem; background:#fff;}
  details.sec > summary {cursor:pointer; font-weight:600; display:flex; justify-content:space-between; list-style:none; outline:none;}
  details.sec > summary::-webkit-details-marker {display:none;}
  .sec-title {font-size:1rem;}
  .count {color:#64748b; font-weight:500;}
  .muted {color:#94a3b8;}
  .glossary-table {border-collapse:collapse; width:100%; margin:.5rem 0 .75rem;}
  .glossary-table th, .glossary-table td {border-top:1px solid #e5e7eb; padding:.5rem .6rem; vertical-align:top;}
  .glossary-table thead th {background:#f8fafc; font-weight:700; text-align:left;}
  .glossary-table td.term {white-space:nowrap; font-weight:600;}
  .glossary-table td.freq {text-align:right; color:#475569; width:70px;}
  .hint {color:#64748b; font-size:.9rem; margin:.25rem 0 1rem;}
  .overall {margin: .2rem 0 1rem; border:1px dashed #cbd5e1; border-radius:10px; padding:.6rem .8rem; background:#f8fafc;}
</style>
"""

# Client-side search filter, shipped as an inline <script>.
# On every "input" event of #glossary-search it:
#   * normalizes the query (lowercase, NFKD, combining marks U+0300-U+036F
#     stripped) and applies the same normalization to each row's term and
#     definition text;
#   * hides every tr.term-row inside .glossary-wrap whose term and definition
#     both lack the query as a substring;
#   * hides each details.sec block that is left with no visible row.
# An empty query restores all rows and sections. The script returns at once
# if the search input is not in the document.
# The backslashes in "\\u0300-\\u036f" are doubled because this is a regular
# (non-raw) Python string; JavaScript receives \u0300-\u036f.
# NOTE(review): the input is found by id and the wrapper by the first
# .glossary-wrap match, so if this output is rendered more than once on the
# same page the script presumably binds to the first copy only. Confirm.
js = """
<script>
(function(){
  const input = document.getElementById('glossary-search');
  if(!input) return;
  const wrap = document.querySelector('.glossary-wrap');
  const rowsSelector = 'table.glossary-table tbody tr.term-row';
  const secDetails = Array.from(document.querySelectorAll('details.sec'));

  function normalize(s){
    return (s || '').toLowerCase().normalize('NFKD').replace(/[\\u0300-\\u036f]/g,'');
  }

  function applyFilter(){
    const q = normalize(input.value.trim());
    if(!q){
      // reset: show all
      secDetails.forEach(d => d.style.display = '');
      wrap.querySelectorAll(rowsSelector).forEach(tr => tr.style.display = '');
      return;
    }
    secDetails.forEach(d => d.style.display = ''); // show all sections by default
    const rows = wrap.querySelectorAll(rowsSelector);
    rows.forEach(tr => {
      const term = normalize(tr.querySelector('.term')?.textContent || '');
      const def  = normalize(tr.querySelector('.def')?.textContent || '');
      const match = term.includes(q) || def.includes(q);
      tr.style.display = match ? '' : 'none';
    });
    // If a section has no visible rows, hide that section
    secDetails.forEach(d => {
      const visible = d.querySelectorAll('tbody tr.term-row:not([style*="display: none"])').length;
      d.style.display = visible ? '' : 'none';
    });
  }

  input.addEventListener('input', applyFilter);
})();
</script>
"""

# Final markup: a header, the search box with its hint, the "Overall Top
# Terms" panel, then one collapsible block per section.
# overall_html and sec_blocks are inserted as-is; they are already HTML
# produced by _render_table and the section-building loop.
html_doc = f"""
<div class="glossary-wrap">
  <div class="header">
    <h2>Notebook Glossary</h2>
  </div>
  <div class="search">
    <input id="glossary-search" type="search" placeholder="Search terms & definitions‚Ä¶" aria-label="Search glossary" />
    <div class="hint">Tip: search works across all sections; hidden sections reappear when you clear the search.</div>
  </div>

  <details class="overall" open>
    <summary><strong>Overall Top Terms</strong> (first 30)</summary>
    {overall_html}
  </details>

  {''.join(sec_blocks)}
</div>
"""

# The script goes last, after the markup it looks up by id and class.
display(HTML(css + html_doc + js))
