# Gerador de json a partir de uma tabela única do nome-alvo colocada na pasta tables

In [44]:
# -*- coding: utf-8 -*-
"""
Gera **1 JSON por lema** (agregando todas as planilhas/abas do lema) com:
- senses[ ] = uma entrada por arquivo/aba do lema
- english_roleset lido do cabeçalho após 'Texto':
    • aceita 'base.02' OU 'base_02' (dois dígitos) → preserva .NN
    • se não houver número, assume '.01'
- Numeração PT é local por lema: .01, .02, ...
- Sem campo 'source' no JSON final.
"""

# --- Dependências ---
try:
    from conllu import parse_incr
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "conllu"])
    from conllu import parse_incr

try:
    import pandas as pd
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pandas", "openpyxl"])
    import pandas as pd

try:
    import requests
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "requests"])
    import requests

try:
    import certifi
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "certifi"])
    import certifi

import warnings
from pathlib import Path
from collections import defaultdict
import json, re, unicodedata
import xml.etree.ElementTree as ET

# -------------------- CONFIG --------------------
CONLLU_PATH = Path("dante.conllu")   # seu .conllu
TABLES_DIR  = Path("tables")         # pasta com .xlsx
OUT_DIR     = Path("jsons")          # pasta de saída
PROCESS_ALL_SHEETS = True            # True = processa todas as abas
ROLESET_SUFFIX_EN = ".01"            # fallback quando o cabeçalho não traz .NN/_NN
MAX_ARGS = 5                         # Arg0..Arg4

# Rede / NomBank
ALLOW_INSECURE_NOMBANK = True
ALLOW_HTTP_FALLBACK   = True
DEBUG_NOMBANK         = True

# ----------------- Utilitários -----------------
SUFFIX_RE = re.compile(r"_(\d+)$")  # sufixo _NN no nome do ARQUIVO (apenas "hint" de polissemia)

def detect_base_and_sense(stem: str) -> tuple[str, int]:
    m = SUFFIX_RE.search(stem)
    if m:
        try:
            return stem[:m.start()], int(m.group(1))
        except Exception:
            return stem, 1
    return stem, 1

def strip_accents(s: str) -> str:
    if s is None:
        return ""
    s = unicodedata.normalize("NFD", str(s))
    return "".join(ch for ch in s if unicodedata.category(ch) != "Mn")

def slugify_id(s: str) -> str:
    s = strip_accents(str(s)).lower()
    s = re.sub(r"[^a-z0-9]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s

def normalize_header(h: str) -> str:
    h = (h or "").strip().lower()
    h = strip_accents(h)
    h = re.sub(r"[_\-–—:;,.()\[\]{}]+", " ", h)
    h = re.sub(r"\s+", " ", h)
    return h.strip()

def find_col_index(columns, *alternatives):
    norm = [normalize_header(c) for c in columns]
    alts = {normalize_header(a) for a in alternatives}
    for i, n in enumerate(norm):
        if n in alts:
            return i
    return None

PLACEHOLDER_RE = re.compile(r"^arg\s*[0-4]$", flags=re.I)
def is_placeholder_arg_header(raw_header: str) -> bool:
    return bool(PLACEHOLDER_RE.match(normalize_header(raw_header)))

# --------- Inglês: extrair base e possível .NN do cabeçalho ----------
# Regras:
# - aceitar '.NN' (1–2 dígitos) no final
# - aceitar '_NN' **apenas** com 2 dígitos no final (evita confundir 'buyer_1' → bug antigo)
# - limpar sufixos "_<n>" herdados de tabelas ruins (candidate_1 → candidate), sem usar esse número
def parse_roleset_token(raw: str) -> tuple[str, int|None]:
    raw = (raw or "").strip()
    ascii_raw = raw.encode("ascii", "ignore").decode("ascii").strip()
    token = re.sub(r"[^A-Za-z0-9_.-]+", "", ascii_raw).lower()

    # 1) '.NN' no fim
    m_dot = re.search(r"\.(\d{1,2})$", token)
    if m_dot:
        num = int(m_dot.group(1))
        base = token[:m_dot.start()]
    else:
        # 2) '_NN' (exatamente 2 dígitos) no fim → vale como número do roleset
        m_us = re.search(r"_(\d{2})$", token)
        if m_us:
            num = int(m_us.group(1))
            base = token[:m_us.start()]
        else:
            num = None
            base = token

    # remover "lixos" finais tipo '_1' ou '-1' (sem captar como número do roleset)
    base = re.sub(r"[_-](\d+)$", "", base)

    # normalizações mínimas
    base = re.sub(r"-{2,}", "-", base)
    base = re.sub(r"_{2,}", "_", base)
    base = base.strip("._-")

    return base, num

# Correções conhecidas de base (para buscar e para exibir)
NOMBANK_BASE_CORRECTIONS = {
    "hideway": "hideaway",
    "enlightment": "enlightenment",
    "selloff": "sell-off",
    "sell_off": "sell-off",
}

def english_roleset_from_post_text(columns, text_idx: int, lemma_pt_base: str) -> str:
    """
    Pega a PRIMEIRA coluna após 'Texto' que seja ASCII, não-POS e ≠ lemma_pt_base;
    lê 'base' e (se houver) número do roleset via '.NN' ou '_NN' (2 dígitos).
    Se não houver número explícito, usa '.01'.
    """
    POS_BLOCK = {
        "nom","verb","noun","adj","adv","pos","adp","propn","pron",
        "num","det","part","intj","aux","punct"
    }
    lemma_norm = slugify_id(lemma_pt_base)

    for raw in columns[text_idx+1:]:
        raw = str(raw).strip()
        ascii_raw = raw.encode("ascii", "ignore").decode("ascii").strip()
        is_ascii = (ascii_raw == raw) and ascii_raw != ""
        if not is_ascii:
            continue

        base, num = parse_roleset_token(ascii_raw)
        if not base:
            continue

        # pula tags POS e o próprio lemma
        base_id = slugify_id(base)
        if base_id in POS_BLOCK or base_id == lemma_norm:
            continue

        base_fixed = NOMBANK_BASE_CORRECTIONS.get(base, base)
        if num is None:
            return base_fixed + ROLESET_SUFFIX_EN
        # sempre emite com ponto ('.NN'), mesmo que o cabeçalho tenha vindo com '_NN'
        return f"{base_fixed}.{num:02d}"

    # fallback: lemma_base + '.01'
    return lemma_norm + ROLESET_SUFFIX_EN

def clean_cell(x):
    import pandas as pd
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    s = str(x).strip()
    return None if s in ("", "-", "—", "--") else s

# ---------- CoNLL-U helpers ----------
def parse_conllu_index(conllu_path: Path):
    index = {}
    with conllu_path.open("r", encoding="utf-8") as f:
        for sent in parse_incr(f):
            sid = sent.metadata.get("sent_id")
            if sid:
                index[sid] = sent
    return index

WORD_RE = re.compile(r"[A-Za-zÀ-ÿ0-9_]+", re.UNICODE)

def content_word(phrase: str | None) -> str | None:
    if not phrase:
        return None
    toks = [t for t in re.split(r"\s+", str(phrase).strip()) if t]
    for w in reversed(toks):
        m = WORD_RE.search(w)
        if m:
            return m.group(0).lower()
    return None

def follow_head_if_conj(token, sent_tokens_by_id):
    if token and token.get("deprel") in ("conj", "flat", "appos"):
        head_id = token.get("head")
        return sent_tokens_by_id.get(head_id, token)
    return token

def arg_pref_deprels(arg_id: str):
    if arg_id == "Arg0":
        return ("nsubj", "csubj", "expl")
    if arg_id == "Arg1":
        return ("obj", "iobj", "nmod", "obl")
    return ("iobj", "nmod", "obl")

def score_deprel(arg_id: str, deprel: str) -> int:
    prefs = arg_pref_deprels(arg_id)
    if deprel in prefs:
        return 2
    if deprel in ("conj","flat","appos"):
        return 0
    return 1

def guess_deprel_for_phrase(sent, arg_id: str, phrase: str | None) -> str | None:
    if not phrase or str(phrase).strip() in ("-", "—", "--", ""):
        return None

    by_form = defaultdict(list)
    by_id   = {}
    for tok in sent:
        if isinstance(tok["id"], int):
            form = str(tok["form"]).lower()
            by_form[form].append(tok)
            by_id[tok["id"]] = tok

    key = content_word(phrase)
    candidates = []
    if key and key in by_form:
        candidates = by_form[key][:]
    else:
        phrase_low = f" {str(phrase).lower()} "
        for tok in sent:
            if not isinstance(tok["id"], int):
                continue
            form = " " + str(tok["form"]).lower() + " "
            if form in phrase_low:
                candidates.append(tok)

    if not candidates:
        return None

    best, best_sc = None, -10
    for tok in candidates:
        tok2 = follow_head_if_conj(tok, by_id)
        deprel = tok2.get("deprel")
        sc = score_deprel(arg_id, deprel)
        upos = tok2.get("upostag") or tok2.get("upos")
        if upos in ("NOUN","PROPN","PRON","NUM"):
            sc += 0.1
        if sc > best_sc:
            best, best_sc = tok2, sc
    return best.get("deprel") if best else None

# ---------- NomBank helpers ----------
_NOMBANK_XML_MEMO: dict[str, tuple[str, str]] = {}  # base_ok -> (xml_text, url_usada)

NOMBANK_BASE_CORRECTIONS_FETCH = {
    **NOMBANK_BASE_CORRECTIONS,
}

def _nombank_base_candidates(base: str) -> list[str]:
    b = (base or "").lower().strip()
    if not b:
        return []
    # variantes com '_' e '-'
    cands = {b, b.replace("_","-"), b.replace("-","_")}
    # tirar números colados no final (ex.: decision_1 → decision)
    cands |= {re.sub(r"[_-](\d+)$", "", x) for x in list(cands)}
    # correções conhecidas
    cands |= {NOMBANK_BASE_CORRECTIONS_FETCH.get(x, x) for x in list(cands)}
    return [x for x in cands if x]

def _fetch_nombank_xml_with_candidates(english_roleset: str, timeout: float = 12.0):
    base0 = (english_roleset or "").split(".")[0].strip().lower()
    candidates = _nombank_base_candidates(base0)
    if DEBUG_NOMBANK:
        print(f"[NomBank] Base candidates for '{base0}': {candidates}")

    headers = {"User-Agent": "NomBank-JSON-Builder/1.3 (+requests)"}

    for base in candidates:
        if base in _NOMBANK_XML_MEMO:
            xml_text, url_used = _NOMBANK_XML_MEMO[base]
            if DEBUG_NOMBANK:
                print(f"[NomBank] (cache) base={base} url={url_used}")
            return xml_text, base, url_used, 200, None

        url_https = f"https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/{base}.xml"
        url_http  = f"http://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/{base}.xml"

        # 1) HTTPS com certifi
        try:
            if DEBUG_NOMBANK: print(f"[NomBank] HTTPS+certifi -> {url_https}")
            r = requests.get(url_https, headers=headers, timeout=timeout, verify=certifi.where())
            if DEBUG_NOMBANK: print(f"          status={r.status_code}")
            r.raise_for_status()
            _NOMBANK_XML_MEMO[base] = (r.text, url_https)
            return r.text, base, url_https, r.status_code, None
        except Exception as e:
            if DEBUG_NOMBANK: print(f"          falhou: {type(e).__name__}: {e}")

        # 2) HTTPS verify=False
        if ALLOW_INSECURE_NOMBANK:
            try:
                if DEBUG_NOMBANK: print(f"[NomBank] HTTPS verify=False -> {url_https}")
                warnings.filterwarnings("ignore", message="Unverified HTTPS request")
                r = requests.get(url_https, headers=headers, timeout=timeout, verify=False)
                if DEBUG_NOMBANK: print(f"          status={r.status_code}")
                r.raise_for_status()
                _NOMBANK_XML_MEMO[base] = (r.text, url_https)
                return r.text, base, url_https, r.status_code, None
            except Exception as e:
                if DEBUG_NOMBANK: print(f"          falhou: {type(e).__name__}: {e}")

        # 3) HTTP sem TLS
        if ALLOW_HTTP_FALLBACK:
            try:
                if DEBUG_NOMBANK: print(f"[NomBank] HTTP (sem TLS) -> {url_http}")
                r = requests.get(url_http, headers=headers, timeout=timeout)
                if DEBUG_NOMBANK: print(f"          status={r.status_code}")
                r.raise_for_status()
                _NOMBANK_XML_MEMO[base] = (r.text, url_http)
                return r.text, base, url_http, r.status_code, None
            except Exception as e:
                if DEBUG_NOMBANK: print(f"          falhou: {type(e).__name__}: {e}")

    return None, None, None, None, "Falha em HTTPS/HTTP para todos os candidatos"

def get_english_roleset_source(english_roleset: str) -> tuple[str|None, str|None]:
    xml_text, base_ok, url_used, status, err = _fetch_nombank_xml_with_candidates(english_roleset)
    if DEBUG_NOMBANK:
        base = (english_roleset or "").split(".")[0]
        print(f"[NomBank] roleset={english_roleset} base_init={base} base_ok={base_ok} url={url_used} status={status} err={err}")

    if not xml_text:
        return None, url_used

    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError as e:
        if DEBUG_NOMBANK: print(f"[NomBank] ParseError em {url_used}: {e}")
        return None, url_used

    wanted = (english_roleset or "").strip().lower()
    for rs in root.iter("roleset"):
        rid = (rs.get("id") or "").strip().lower()
        if rid == wanted:
            src = (rs.get("source") or "").strip() or None
            if DEBUG_NOMBANK: print(f"[NomBank] Encontrado roleset={rid} source={src}")
            return src, url_used

    if DEBUG_NOMBANK:
        print(f"[NomBank] roleset '{wanted}' NÃO encontrado em {url_used}.")
    return None, url_used

# ---------- Núcleo: processar DataFrame (retorna um "sense") ----------
def process_dataframe_to_sense(df: pd.DataFrame, lemma_pt_base: str, sense_hint_num: int,
                               sent_index: dict) -> dict:
    cols = list(df.columns)
    sent_idx = find_col_index(cols, "sent_ID", "sent id", "sentid", "id")
    text_idx = find_col_index(cols, "Texto", "text", "texto")

    if sent_idx is None or text_idx is None:
        raise ValueError(f"[{lemma_pt_base}] Precisa ter 'sent_ID' e 'Texto'/'text' na planilha/aba.")

    # colunas de argumentos = entre sent_ID e Texto
    if sent_idx >= text_idx - 1:
        arg_cols = []
    else:
        arg_cols = cols[sent_idx+1:text_idx]

    # mapear por ORDEM para Arg0..Arg4
    arg_map = {}
    roles_list = []
    for i in range(MAX_ARGS):
        if i < len(arg_cols):
            col = arg_cols[i]
            placeholder = is_placeholder_arg_header(col)
            desc = None if placeholder else str(col).strip()
            arg_map[i] = (col, placeholder)
        else:
            arg_map[i] = (None, False)
            desc = None
        roles_list.append({"id": f"Arg{i}", "desc": desc})

    # english_roleset via colunas **após** Texto (usa .NN/_NN se vier)
    english_roleset = english_roleset_from_post_text(cols, text_idx, lemma_pt_base)

    # DEBUG
    if DEBUG_NOMBANK:
        base = (english_roleset or "").split(".")[0]
        debug_url = f"https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/{base}.xml"
        print(f"[DEBUG] lemma_pt_base='{lemma_pt_base}' sense_hint={sense_hint_num} → english_roleset='{english_roleset}' → URL={debug_url}")

    # buscar 'source' no NomBank
    english_roleset_source, used_url = get_english_roleset_source(english_roleset)

    # construir exemplos (Arg0..Arg4)
    examples = []
    profile = defaultdict(lambda: defaultdict(int))

    for _, row in df.iterrows():
        sid  = clean_cell(row.get(cols[sent_idx]))
        text = clean_cell(row.get(cols[text_idx])) or ""
        if not sid:
            continue

        realization = {}
        syntax = {}
        sent = sent_index.get(sid)

        for i in range(MAX_ARGS):
            arg_id = f"Arg{i}"
            col, is_placeholder = arg_map[i]

            val = None
            if col is not None and not is_placeholder:
                val_raw = row.get(col)
                val = clean_cell(val_raw)

            if arg_id == "Arg1" and val and text and re.search(rf"\bde\s+{re.escape(str(val))}\b", text, flags=re.IGNORECASE):
                val = "de " + str(val)

            realization[arg_id] = val

            syn = None
            if sent is not None and val:
                syn = guess_deprel_for_phrase(sent, arg_id, val)
                if syn:
                    profile[arg_id][syn] += 1
            syntax[arg_id] = syn

        examples.append({
            "sent_ID": sid,
            "text": text,
            "realization": realization,
            "syntax": syntax
        })

    syntactic_profile = {arg: dict(cnts) for arg, cnts in profile.items() if cnts}

    sense_payload = {
        "pt_sense_hint": sense_hint_num,                 # apenas rastreio interno
        "english_roleset": english_roleset,              # ex.: 'appearance.02'
        "english_roleset_source": english_roleset_source,
        "nombank_url": used_url,
        "roles": roles_list,
        "examples": examples,
        "syntactic_profile": syntactic_profile
    }
    return sense_payload

# ----------------- Pipeline principal -----------------
print(f"Lendo CoNLL-U: {CONLLU_PATH.resolve()}")
sent_index = parse_conllu_index(CONLLU_PATH)
print(f"  → {len(sent_index)} sentenças indexadas.")

OUT_DIR.mkdir(parents=True, exist_ok=True)
xlsx_files = sorted(TABLES_DIR.glob("*.xlsx"))
if not xlsx_files:
    print(f"Nenhum .xlsx encontrado em {TABLES_DIR.resolve()}")

# 1) Colete todos os "senses" por lemma_base
lemma_bucket: dict[str, list[dict]] = defaultdict(list)

for xlsx in xlsx_files:
    stem = xlsx.stem
    lemma_base, sense_hint = detect_base_and_sense(stem)
    sheets = pd.read_excel(xlsx, engine="openpyxl", sheet_name=None) if PROCESS_ALL_SHEETS \
             else {"Sheet1": pd.read_excel(xlsx, engine="openpyxl")}
    if len(sheets) == 0:
        print(f"(vazio) {xlsx.name}")
        continue

    for sheet_name, df in sheets.items():
        cols = list(df.columns)
        has_sent = find_col_index(cols, "sent_ID", "sent id", "sentid", "id") is not None
        has_text = find_col_index(cols, "Texto", "text", "texto") is not None
        if not (has_sent and has_text):
            continue

        try:
            sense_payload = process_dataframe_to_sense(
                df=df,
                lemma_pt_base=lemma_base,
                sense_hint_num=sense_hint,
                sent_index=sent_index
            )
            lemma_bucket[lemma_base].append(sense_payload)
        except Exception as e:
            print(f"[ERRO] {xlsx.name} [{sheet_name}] ({lemma_base}): {e}")

# 2) Consolidar por lemma: ordenar e numerar PT localmente (.01, .02, …)
generated = []
for lemma_base, senses in lemma_bucket.items():
    senses_sorted = sorted(
        enumerate(senses),
        key=lambda it: (it[1].get("pt_sense_hint", 1), it[0])
    )
    final_senses = []
    for idx, (orig_pos, s) in enumerate(senses_sorted, start=1):
        final_sense = {
            "pt_roleset": f"{lemma_base}.{idx:02d}",
            "pt_sense_index": idx,
            "pt_sense_hint": s.get("pt_sense_hint"),
            "english_roleset": s.get("english_roleset"),
            "english_roleset_source": s.get("english_roleset_source"),
            "nombank_url": s.get("nombank_url"),
            "roles": s.get("roles", []),
            "examples": s.get("examples", []),
            "syntactic_profile": s.get("syntactic_profile", {})
        }
        final_senses.append(final_sense)

    out_data = {
        "lemma": lemma_base,
        "lemma_base": lemma_base,
        "senses": final_senses
    }

    out_path = OUT_DIR / f"{lemma_base}.json"
    out_path.write_text(json.dumps(out_data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"→ JSON: {out_path}")
    generated.append(out_path.name)

# --- Manifesto ---
manifest = sorted(generated)
(OUT_DIR / "_manifest.json").write_text(
    json.dumps(manifest, ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print(f"✓ Manifesto gerado: {OUT_DIR / '_manifest.json'} ({len(manifest)} itens)")

print("\nConcluído.")


Lendo CoNLL-U: /mnt/c/Users/bryan/Downloads/Dissertação/DEMO/Testing_scripts_json/dante.conllu
  → 4042 sentenças indexadas.
[DEBUG] lemma_pt_base='abastecimento' sense_hint=1 → english_roleset='supply.01' → URL=https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/supply.xml
[NomBank] Base candidates for 'supply': ['supply']
[NomBank] HTTPS+certifi -> https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/supply.xml
          falhou: SSLError: HTTPSConnectionPool(host='nlp.cs.nyu.edu', port=443): Max retries exceeded with url: /meyers/nombank/nombank.1.0/frames/supply.xml (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1016)')))
[NomBank] HTTPS verify=False -> https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/supply.xml
          status=200
[NomBank] roleset=supply.01 base_init=supply base_ok=supply url=https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/supply.

# Gerador de arquivos HTML baseado nos jsons da pasta "jsons"

In [42]:
# -*- coding: utf-8 -*-
"""
Gera páginas HTML (uma por lemma) a partir dos JSONs.

Novidades:
- Top bar minimalista: Home (esq.) • título (centro) • JSON download (dir.)
- Top bar translúcida ao rolar (backdrop blur).
- “← Voltar para X” no topo e também como botão flutuante no rodapé.
- Tabela com colunas fixas + divisórias + highlight “fronteira-segura”.
"""

import re, html, json, unicodedata
from pathlib import Path

escape = html.escape

# ---------- CONFIG ----------
JSON_DIR = Path("jsons")          # onde estão os JSONs
HTML_DIR = Path("site_pages")     # pasta de saída dos .html
CSS_HREF = "../styles.css"        # seu CSS global
MAX_EXAMPLES_IN_SECTION = 2
DOWNLOAD_PREFIX_FROM_PAGE = "../jsons/"  # caminho relativo (de site_pages/ para jsons/)
HTML_DIR.mkdir(parents=True, exist_ok=True)

# CSS inline adicional (somente o mínimo do layout novo)
STYLE_BLOCK = """
:root{
  --header-h: 56px;
}
body{margin:0;}
/* ===== Top bar ===== */
.site-header{
  position: sticky; top:0; z-index:1000;
  /* alpha menor -> mais transparente */
  background: rgba(20,20,20,0.70);
  transition: background .2s ease, backdrop-filter .2s ease;
}
.site-header.scrolled{
  /* ainda mais transparente quando rolar */
  background: rgba(20,20,20,0.40);
  backdrop-filter: blur(6px);
}
.site-nav{
  height: var(--header-h);
  display:flex; align-items:center; justify-content:space-between;
  padding: 0 16px;
}
.site-nav .home-link{
  color:#cfe3ff; text-decoration:none; font-weight:600;
  padding:8px 12px; border-radius:10px; background:rgba(84,102,170,.25);
}
.site-nav .home-link:hover{ background:rgba(84,102,170,.35); }
.site-nav .page-title{
  color:#cfd8ff; opacity:.75; font-weight:600;
  text-align:center; margin:0 12px; white-space:nowrap; overflow:hidden; text-overflow:ellipsis;
}

/* ===== Botão JSON (agora dentro da header, alinhado) ===== */
.site-nav .download-json{
  display:inline-block; text-decoration:none; font-weight:700;
  padding:10px 14px; border-radius:12px; color:#fff;
  background:#2e7d32; box-shadow:0 4px 14px rgba(0,0,0,.25);
  transition:transform .08s ease, box-shadow .2s ease, opacity .2s ease;
  opacity:.95;
}
.site-nav .download-json:hover{ transform:translateY(-1px); box-shadow:0 8px 22px rgba(0,0,0,.35); opacity:1; }

/* ===== “Voltar para X” (flutuante no rodapé) ===== */
.back-link{ color:#9ecbff; text-decoration:none; font-weight:500; }
.back-link:hover{ text-decoration:underline; }
.back-floating{
  position: fixed; right: 20px; bottom: 20px; z-index: 980;
  background:#2b2b2b; color:#cfe3ff; padding:10px 12px; border-radius:12px;
  box-shadow:0 8px 22px rgba(0,0,0,.35); text-decoration:none;
}

/* ===== Tabela de realização sintática ===== */
#relations-table{ width:100%; border-collapse:collapse; }
#relations-table col.numcol{ width:3.5rem; }
#relations-table col.argcol{ width:18rem; }
#relations-table col.textcol{ width:auto; }
#relations-table thead th{
  text-align:center; padding:.45rem .6rem; white-space:nowrap;
}
#relations-table tbody td{
  padding:.45rem .6rem; vertical-align:top; border-bottom:1px solid #fff;
}
#relations-table tbody tr:last-child td{ border-bottom:none; }
#relations-table tbody td:first-child{ text-align:center; }
"""

# ---------- utils ----------
def strip_accents(s: str) -> str:
    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")

def first_letter(lemma: str) -> str:
    s = strip_accents((lemma or "").strip()).lower()
    return s[0] if s and s[0].isalpha() else "a"

def _boundary_safe_pattern(phrase: str) -> re.Pattern:
    core = re.escape(phrase)
    patt = rf"(?<![#\w])({core})(?!\w)"
    return re.compile(patt, flags=re.IGNORECASE)

def highlight_once(text: str, phrase: str, cls: str, deprel: str | None = None) -> str:
    if not phrase: return text
    pat = _boundary_safe_pattern(phrase)
    def _rep(m):
        frag = m.group(1)
        h = f'<span class="{cls}">{escape(frag)}</span>'
        if deprel: h += f'<sub class="deprel">{escape(deprel)}</sub>'
        return h
    new_text, _ = pat.subn(_rep, text, count=1)
    return new_text

def build_roles_list(roles):
    shown, items = [], []
    for k in range(3):
        arg_id = f"Arg{k}"
        desc = next((r.get("desc") for r in roles if r.get("id") == arg_id), None)
        if desc is not None:
            shown.append(arg_id)
            items.append(f'<li class="arg{k}">Arg {k}: {escape(str(desc))}</li>')
    return "<ul>\n" + "\n".join(items) + "\n</ul>", shown

def extract_counts_for(shown_args, syntactic_profile):
    ks = set()
    for arg in shown_args:
        for dep in (syntactic_profile.get(arg) or {}): ks.add(dep)
    rows = []
    for dep in sorted(ks):
        cells = [ (syntactic_profile.get(arg) or {}).get(dep, 0) for arg in shown_args ]
        rows.append((dep, cells))
    return rows

def build_examples_section(lemma, shown_args, examples, max_examples: int):
    parts = []
    lemma_pat = _boundary_safe_pattern(lemma)
    subset = examples[:max_examples]
    order = [a for a in ["Arg1", "Arg2", "Arg0"] if a in shown_args]
    for i, ex in enumerate(subset, start=1):
        text = ex.get("text") or ""
        real = ex.get("realization") or {}
        colored = text
        for arg_id in order:
            k = int(arg_id[-1]); val = real.get(arg_id)
            if val: colored = highlight_once(colored, val, f"arg{k}", deprel=None)
        colored = lemma_pat.sub(r'<span class="rel">\1</span>', colored, count=1)
        parts.append(f'<h3>{i}: {colored}</h3>')
        items = [f'<li class="rel">rel: {escape(lemma)}</li>']
        for arg_id in shown_args:
            k = int(arg_id[-1]); val = real.get(arg_id)
            items.append(f'<li class="arg{k}">Arg {k}: {escape(val) if val is not None else "-"}</li>')
        parts.append("<ul>\n" + "\n".join(items) + "\n</ul>")
    return "\n".join(parts)

def build_realization_table(lemma, shown_args, examples):
    colgroup = ['<col class="numcol">'] + ['<col class="argcol">' for _ in shown_args] + ['<col class="textcol">']
    ths = ['<th>#</th>'] + [f'<th class="arg{int(a[-1])}">{a.replace("Arg","Arg ")}</th>' for a in shown_args] + ['<th>Texto</th>']
    rows = []
    order = [a for a in ["Arg1", "Arg2", "Arg0"] if a in shown_args]
    lemma_pat = _boundary_safe_pattern(lemma)
    for i, ex in enumerate(examples, start=1):
        text = ex.get("text") or ""
        real = ex.get("realization") or {}
        syn  = ex.get("syntax") or {}
        colored = text
        for arg_id in order:
            k = int(arg_id[-1]); val = real.get(arg_id); dep = syn.get(arg_id)
            if val: colored = highlight_once(colored, val, f"arg{k}", dep)
        colored = lemma_pat.sub(r'<span class="rel">\1</span>', colored, count=1)
        tds = [f"<td>{i}</td>"]
        for arg_id in shown_args:
            k = int(arg_id[-1]); val = real.get(arg_id)
            tds.append(f'<td class="arg{k}">{escape(val) if val is not None else "-"}</td>')
        tds.append(f"<td class='texto'>{colored}</td>")
        rows.append("<tr>" + "".join(tds) + "</tr>")
    return f"""
    <table id="relations-table">
      <colgroup>{''.join(colgroup)}</colgroup>
      <thead><tr>{''.join(ths)}</tr></thead>
      <tbody>{''.join(rows)}</tbody>
    </table>
    """

def build_freq_table(shown_args, syntactic_profile):
    rows = extract_counts_for(shown_args, syntactic_profile)
    ths = ['<th>Relações de dependência - <i><a href="https://universaldependencies.org/u/dep/"> Universal Dependencies</a></i></th>'] \
          + [f'<th class="arg{int(a[-1])}">{a.replace("Arg","Arg ")}</th>' for a in shown_args]
    body = []
    for dep, cells in rows:
        body.append("<tr>" + "".join([f"<td>{escape(dep)}</td>"] + [f"<td>{int(v)}</td>" for v in cells]) + "</tr>")
    if not body:
        body.append(f"<tr><td colspan='{len(shown_args)+1}'><i>Sem ocorrências</i></td></tr>")
    return f"""
    <div class="statistics-table-container">
      <h2>Frequência das realizações sintáticas</h2>
      <table id="statistics-table">
        <thead><tr>{''.join(ths)}</tr></thead>
        <tbody>{''.join(body)}</tbody>
      </table>
    </div>
    """
    
def iter_json_docs(payload):
    """
    Aceita payloads em dois formatos:
      - dict: retorna um único (doc, None)
      - list: retorna pares (doc, idx) para cada item dict da lista
    Ignora itens que não forem dict.
    """
    if isinstance(payload, dict):
        yield payload, None
        return
    if isinstance(payload, list):
        for i, item in enumerate(payload):
            if isinstance(item, dict):
                yield item, i
        return
    # formato inesperado -> nada
    return


# --------- rendering ---------
def render_html(doc: dict, json_filename: str | None = None) -> str:
    """
    Mantém o layout original. Se o JSON tiver "senses", achata para um único bloco:
      - Junta pt_roleset/english_roleset/source no cabeçalho (mesma linha).
      - Mantém UMA tabela e UMA seção de exemplos (exemplos somados e limitados).
      - Soma syntactic_profile por Arg/dep.
      - Para roles Arg0..Arg4, pega a primeira descrição não nula encontrada.
    """
    # ---------- Detecta formato novo (com senses) e prepara visão "achatada" ----------
    senses = doc.get("senses") if isinstance(doc, dict) else None
    if isinstance(senses, list) and senses:
        lemma = (doc.get("lemma") or doc.get("lemma_base") or "").strip()

        # pt_roleset(s)
        pt_rolesets = []
        # english rolesets + urls + sources
        en_links = []          # [(eng, url)]
        sources = []           # ["verb-..."]
        # roles (primeira desc não nula por Arg0..Arg4)
        roles = []
        for k in range(5):
            chosen = None
            for s in senses:
                for r in (s.get("roles") or []):
                    if r.get("id") == f"Arg{k}" and r.get("desc") is not None:
                        chosen = r.get("desc")
                        break
                if chosen is not None:
                    break
            roles.append({"id": f"Arg{k}", "desc": chosen})

        # exemplos: concatena todos
        examples = []
        for s in senses:
            examples.extend(s.get("examples") or [])

        # syntactic_profile: soma
        syntactic_profile = {}
        for s in senses:
            sp = s.get("syntactic_profile") or {}
            for arg, deps in sp.items():
                syntactic_profile.setdefault(arg, {})
                for dep, cnt in deps.items():
                    syntactic_profile[arg][dep] = syntactic_profile[arg].get(dep, 0) + int(cnt or 0)

        # coletar cabeçalhos (pt_roleset / english_roleset / sources)
        seen_pt = set()
        seen_en = set()
        seen_src = set()
        for s in senses:
            pr = (s.get("pt_roleset") or "").strip()
            if pr and pr not in seen_pt:
                seen_pt.add(pr); pt_rolesets.append(pr)

            er = (s.get("english_roleset") or "").strip()
            if er and er not in seen_en:
                seen_en.add(er)
                url = s.get("nombank_url")
                if not url:
                    base_en = er.split(".")[0]
                    if base_en:
                        url = f"https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/{base_en}.xml"
                en_links.append((er, url))

            src = (s.get("english_roleset_source") or "").strip()
            if src and src not in seen_src:
                seen_src.add(src); sources.append(src)

        # strings/HTML do cabeçalho
        roleset_id_str = ", ".join(pt_rolesets) if pt_rolesets else f"{lemma}.01"
        en_map_html = ", ".join(
            (f'<a href="{url}">{escape(er)}</a>' if url else escape(er))
            for (er, url) in en_links
        ) if en_links else "-"
        # source(s)
        if sources:
            src_links = []
            for src in sources:
                m = re.match(r"verb-([^.]+)", src, flags=re.I)
                if m:
                    verb = m.group(1).lower()
                    pb_url = f"https://verbs.colorado.edu/propbank/framesets-english-aliases/{verb}.html"
                    src_links.append(f'<a href="{pb_url}">{escape(src)}</a>')
                else:
                    src_links.append(escape(src))
            source_tail = ", source = " + ", ".join(src_links)
        else:
            source_tail = ""

        # --- A partir daqui, seguimos o fluxo original com esses agregados ---
        roles_html, shown_args = build_roles_list(roles)
        examples_html = build_examples_section(lemma, shown_args, examples, max_examples=MAX_EXAMPLES_IN_SECTION)
        table_html = build_realization_table(lemma, shown_args, examples)
        freq_html = build_freq_table(shown_args, syntactic_profile)

        # “Voltar para X”
        letter = first_letter(lemma)
        back_href = f"../index.html?letter={letter}"
        back_label = f"← Voltar para {letter.upper()}"

        # botão de download (dentro da header)
        json_file = json_filename if json_filename else f"{lemma}.json"
        download_href = f"{DOWNLOAD_PREFIX_FROM_PAGE}{json_file}"

        head = f"""<!DOCTYPE html>
<html lang="pt">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{escape(lemma)} - NounBank.DS</title>
<link rel="stylesheet" href="{CSS_HREF}">
<style>{STYLE_BLOCK}</style>
</head>
<body>

<header class="site-header">
  <nav class="site-nav">
    <a class="home-link" href="../index.html">Home</a>
    <span class="page-title">{escape(lemma)}</span>
    <a class="download-json" href="{download_href}" download="{escape(json_file)}" aria-label="Baixar JSON">JSON download</a>
  </nav>
</header>

<script>
(function(){{
  const h = document.querySelector('.site-header');
  if(!h) return;
  const onScroll = () => h.classList.toggle('scrolled', window.scrollY > 16);
  onScroll(); window.addEventListener('scroll', onScroll, {{passive:true}});
}})();
</script>

<div class="content">
  """

        topo = f"""
  <h1>Nome predicador: <i style="color: red;">{escape(lemma)}</i></h1>
  <p><strong>Roleset id:</strong> {escape(roleset_id_str)}, Mapeamento para o inglês: {en_map_html}{source_tail}</p>
  <h2>Roles:</h2>
  {roles_html}
  <h2>Exemplos:</h2>
  {examples_html}
  <br><br>
"""

        tail = f"""
  <h2>Realização sintática da estrutura de argumentos</h2>
  {table_html}
  {freq_html}
</div>

<a class="back-link back-floating" href="{back_href}" title="Voltar para letra {letter.upper()}">{back_label}</a>

</body>
</html>
"""
        return head + topo + tail

    # ---------- Formato antigo (sem senses): comportamento original ----------
    lemma = doc.get("lemma") or ""
    roleset_id = doc.get("roleset_id") or f"{lemma}.01"
    english_roleset = doc.get("english_roleset") or ""
    english_roleset_source = doc.get("english_roleset_source")
    roles = doc.get("roles") or []
    examples = doc.get("examples") or []
    syntactic_profile = doc.get("syntactic_profile") or {}

    # links
    base_en = (english_roleset or "").split(".")[0]
    nombank_url = f"https://nlp.cs.nyu.edu/meyers/nombank/nombank.1.0/frames/{base_en}.xml" if base_en else "#"
    source_tail = ""
    if english_roleset_source:
        m = re.match(r"verb-([^.]+)", english_roleset_source, flags=re.I)
        if m:
            verb = m.group(1).lower()
            pb_url = f"https://verbs.colorado.edu/propbank/framesets-english-aliases/{verb}.html"
            source_tail = f', source = <a href="{pb_url}">{escape(english_roleset_source)}</a>'

    roles_html, shown_args = build_roles_list(roles)
    examples_html = build_examples_section(lemma, shown_args, examples, max_examples=MAX_EXAMPLES_IN_SECTION)
    table_html = build_realization_table(lemma, shown_args, examples)
    freq_html = build_freq_table(shown_args, syntactic_profile)

    # “Voltar para X”
    letter = first_letter(lemma)
    back_href = f"../index.html?letter={letter}"
    back_label = f"← Voltar para {letter.upper()}"

    # botão de download (dentro da header)
    json_file = json_filename if json_filename else f"{lemma}.json"
    download_href = f"{DOWNLOAD_PREFIX_FROM_PAGE}{json_file}"

    head = f"""<!DOCTYPE html>
<html lang="pt">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{escape(lemma)} - NounBank.DS</title>
<link rel="stylesheet" href="{CSS_HREF}">
<style>{STYLE_BLOCK}</style>
</head>
<body>

<header class="site-header">
  <nav class="site-nav">
    <a class="home-link" href="../index.html">Home</a>
    <span class="page-title">{escape(lemma)}</span>
    <a class="download-json" href="{download_href}" download="{escape(json_file)}" aria-label="Baixar JSON">JSON download</a>
  </nav>
</header>

<script>
(function(){{
  const h = document.querySelector('.site-header');
  if(!h) return;
  const onScroll = () => h.classList.toggle('scrolled', window.scrollY > 16);
  onScroll(); window.addEventListener('scroll', onScroll, {{passive:true}});
}})();
</script>

<div class="content">
  """

    topo = f"""
  <h1>Nome predicador: <i style="color: red;">{escape(lemma)}</i></h1>
  <p><strong>Roleset id:</strong> {escape(roleset_id)}, Mapeamento para o inglês: <a href="{nombank_url}">{escape(english_roleset)}</a>{source_tail}</p>
  <h2>Roles:</h2>
  {roles_html}
  <h2>Exemplos:</h2>
  {examples_html}
  <br><br>
"""

    tail = f"""
  <h2>Realização sintática da estrutura de argumentos</h2>
  {table_html}
  {freq_html}
</div>

<a class="back-link back-floating" href="{back_href}" title="Voltar para letra {letter.upper()}">{back_label}</a>

</body>
</html>
"""
    return head + topo + tail

# ---------- driver ----------
for jf in sorted(JSON_DIR.glob("*.json")):
    payload = json.loads(jf.read_text(encoding="utf-8"))

    any_doc = False
    for doc, idx in iter_json_docs(payload):
        any_doc = True
        try:
            html_str = render_html(doc, json_filename=jf.name)
        except Exception as e:
            print(f"[ERRO] {jf.name} (item {idx}): {e}")
            continue

        # nome do arquivo de saída (se vier lista, diferencia com __N)
        lemma_for_name = (doc.get("lemma") or doc.get("lemma_base") or "frame").strip() or "frame"
        out_file = HTML_DIR / (f"{lemma_for_name}__{idx+1}.html" if idx is not None else f"{lemma_for_name}.html")
        out_file.write_text(html_str, encoding="utf-8")
        print("✓", out_file)

    if not any_doc:
        print(f"[AVISO] {jf.name}: payload não é dict nem lista de dicts; ignorado.")


[AVISO] _manifest.json: payload não é dict nem lista de dicts; ignorado.
✓ site_pages/abastecimento.html
✓ site_pages/acesso.html
✓ site_pages/acordo.html
✓ site_pages/aditivo.html
✓ site_pages/administração.html
✓ site_pages/ajuste.html
✓ site_pages/alavancagem.html
✓ site_pages/aliança.html
✓ site_pages/alienação.html
✓ site_pages/alteração.html
✓ site_pages/amor.html
✓ site_pages/antro.html
✓ site_pages/apreensão.html
✓ site_pages/aquisição.html
✓ site_pages/assinatura.html
✓ site_pages/ataque.html
✓ site_pages/atestado.html
✓ site_pages/avaliação.html
✓ site_pages/briga.html
✓ site_pages/caminho.html
✓ site_pages/candidato.html
✓ site_pages/cara.html
✓ site_pages/carteira.html
✓ site_pages/causa.html
✓ site_pages/cenário.html
✓ site_pages/chance.html
✓ site_pages/comissão.html
✓ site_pages/comparação.html
✓ site_pages/compartilhamento.html
✓ site_pages/compra.html
✓ site_pages/comprador.html
✓ site_pages/confirmação.html
✓ site_pages/construção.html
✓ site_pages/contrato.html
✓ sit

# GERADOR DOS ARQUIVOS .JS

In [43]:
# -*- coding: utf-8 -*-
"""
Gera arquivos <lemma>.js (um por JSON) **na mesma estrutura do ACORDO.JS**,
usando os JSONs produzidos anteriormente.

O script também garante que o respectivo <lemma>.html (em `HTML_DIR`) referencie
o .js gerado (inserindo <script src="..."> antes de </body> se ainda não existir).

Regras de preenchimento do JS:
- data[i].Texto           = example.text
- data[i].marks           = [{word: <rel>, type: "rel"}, {word: Arg0, type:"arg0"}, ...] (só arg0..arg2)
- data[i].subscripts      = [{word: <rel>, subscript:"rel"}, {word: <token do arg>, subscript: <deprel>}, ...]
  (token do arg escolhido por heurística simples descrita em pick_sub_word)
- data[i].args            = {"arg0": ..., "arg1": ..., "arg2": ...} com "-" quando vazio

Como usar:
  1) Ajuste JSON_DIR e HTML_DIR abaixo, se necessário.
  2) Execute. Serão criados arquivos <lemma>.js em HTML_DIR e injetadas as tags <script> nos HTMLs.
"""

from pathlib import Path
import json, re
from html import escape

# ----------------- CONFIG -----------------
JSON_DIR = Path("jsons")        # onde estão os *.json gerados anteriormente
HTML_DIR = Path("site_pages")   # onde estão/ficarão os .html e onde salvaremos os .js
HTML_DIR.mkdir(parents=True, exist_ok=True)

# ----------------- Helpers -----------------
def first_rel_surface(lemma: str, text: str) -> str:
    """
    Retorna a 1ª ocorrência do lemma no texto preservando o *case* encontrado.
    Se não encontrar, retorna o próprio lemma.
    """
    m = re.search(re.escape(lemma), text, flags=re.IGNORECASE)
    return m.group(0) if m else lemma

WORD_RE = re.compile(r"[A-Za-zÀ-ÿ0-9]+", re.UNICODE)

def pick_sub_word(phrase: str) -> str | None:
    """
    Escolhe um token (palavra única) dentro da *phrase* para receber o subscrito do UD.
    Heurística leve:
      1) se encontrar 'de|da|do|das|dos|com|para <X>' -> retorna <X>
      2) senão, retorna a última palavra alfanumérica da *phrase*
    """
    if not phrase:
        return None
    m = re.search(r"\b(?:de|da|do|das|dos|com|para)\s+([A-Za-zÀ-ÿ0-9]+)", phrase, flags=re.IGNORECASE)
    if m:
        return m.group(1)
    toks = WORD_RE.findall(phrase)
    return toks[-1] if toks else None

def js_stringify(obj) -> str:
    """Serializa para JS usando JSON (válido em JS)."""
    return json.dumps(obj, ensure_ascii=False, indent=2)

def flatten_doc_for_new_json(payload: dict) -> dict:
    """
    Aceita um JSON no formato novo (com 'senses') e devolve um doc "achatado"
    com as chaves mínimas que o writer antigo espera: lemma, roles, examples.
    - roles: apenas Arg0..Arg2, pegando a primeira descrição não-nula dentre os sentidos
    - examples: concatenação dos exemplos de todos os sentidos
    """
    lemma = (payload.get("lemma") or payload.get("lemma_base") or "").strip()
    senses = payload.get("senses") or []

    # roles Arg0..Arg2 (primeira desc não-nula)
    roles = []
    for k in range(3):
        desc = None
        for s in senses:
            for r in (s.get("roles") or []):
                if r.get("id") == f"Arg{k}" and r.get("desc") is not None:
                    desc = r.get("desc")
                    break
            if desc is not None:
                break
        roles.append({"id": f"Arg{k}", "desc": desc})

    # exemplos: concatena todos
    examples = []
    for s in senses:
        examples.extend(s.get("examples") or [])

    return {"lemma": lemma, "roles": roles, "examples": examples}

# ----------------- Core (escrita do .js) -----------------
def write_js_for_doc(doc: dict, out_js_path: Path):
    # quais args mostrar (apenas Arg0..Arg2 com desc != null)
    shown_args = [r["id"] for r in (doc.get("roles") or [])
                  if r.get("id") in ("Arg0","Arg1","Arg2") and r.get("desc") is not None]
    shown_args_lc = [a.lower() for a in shown_args]  # ex.: ['arg0','arg1']

    # monta os dados (marks/args) mas só para os args exibidos
    def build_js_data(doc, shown):
        lemma = doc.get("lemma") or ""
        examples = doc.get("examples") or []
        data_items = []
        for ex in examples:
            text = ex.get("text") or ""
            real = ex.get("realization") or {}
            syn  = ex.get("syntax") or {}

            # relação (rel)
            rel_surface = first_rel_surface(lemma, text)

            marks = [{"word": rel_surface, "type": "rel"}]
            subs  = [{"word": rel_surface, "subscript": "rel"}]

            for a in shown:
                val = real.get(a)
                if val:
                    marks.append({"word": val, "type": a.lower()})
                    dep = syn.get(a)
                    if dep:
                        tok = pick_sub_word(val)
                        if tok:
                            subs.append({"word": tok, "subscript": dep})

            # args: "-" quando vazio/nulo
            args = {a.lower(): (real.get(a) if (real.get(a) is not None and str(real.get(a)).strip() != "") else "-")
                    for a in shown}

            data_items.append({"Texto": text, "marks": marks, "subscripts": subs, "args": args})
        return data_items

    data_items = build_js_data(doc, shown_args)

    js_code = f"""document.addEventListener('DOMContentLoaded', function() {{
  const SHOWN_ARGS = {json.dumps(shown_args_lc, ensure_ascii=False)};
  const data = {json.dumps(data_items, ensure_ascii=False, indent=2)};

  const tableBody = document.getElementById('relations-table').getElementsByTagName('tbody')[0];
  tableBody.innerHTML = ''; // evita duplicar quando o HTML já trouxe linhas

  data.forEach((item, index) => {{
    let textWithMarks = item.Texto;

    // aplicar cores
    item.marks.forEach(mark => {{
      const pattern = new RegExp(`\\b(${{
        mark.word
      }})\\b`, 'g');
      textWithMarks = textWithMarks.replace(pattern, `<span class="${{mark.type}}">$1</span>`);
    }});

    // subscritos
    item.subscripts.forEach(sub => {{
      const pattern = new RegExp(`\\b(${{
        sub.word
      }})\\b`, 'g');
      textWithMarks = textWithMarks.replace(pattern, `$1<sub>${{sub.subscript}}</sub>`);
    }});

    // monta a linha com # + Args mostrados + Texto
    const row = document.createElement('tr');
    let cells = `<td>${{index + 1}}</td>`;
    SHOWN_ARGS.forEach(a => {{
      const v = (item.args[a] ?? '-');
      const cls = (v && v !== '-') ? a : '';
      cells += `<td class="${{cls}}">${{v}}</td>`;
    }});
    cells += `<td>${{textWithMarks}}</td>`;
    row.innerHTML = cells;
    tableBody.appendChild(row);

    // linha divisória (colspan dinâmico)
    if (index < data.length - 1) {{
      const dividerRow = document.createElement('tr');
      const dividerCell = document.createElement('td');
      dividerCell.colSpan = SHOWN_ARGS.length + 2; // # + args + Texto
      dividerCell.style.borderBottom = '1px solid #ccc';
      dividerRow.appendChild(dividerCell);
      tableBody.appendChild(dividerRow);
    }}
  }});
}});"""
    out_js_path.write_text(js_code, encoding="utf-8")
    print("✓ JS:", out_js_path)

def ensure_script_in_html(html_path: Path, js_filename: str):
    """
    Garante que o HTML inclua <script src="js_filename"></script> antes do </body>.
    Se já houver, não duplica.
    """
    if not html_path.exists():
        return
    html = html_path.read_text(encoding="utf-8")
    if js_filename in html:
        return
    injected = re.sub(
        r"</body>\s*</html>\s*$",
        f"<script src=\"{js_filename}\"></script>\n</body>\n</html>",
        html,
        flags=re.IGNORECASE
    )
    if injected != html:
        html_path.write_text(injected, encoding="utf-8")
        print("→ script adicionado em", html_path.name)

# ----------------- Driver -----------------
for jf in sorted(JSON_DIR.glob("*.json")):
    payload = json.loads(jf.read_text(encoding="utf-8"))

    # Determina o(s) "doc(s)" a processar:
    docs = []
    if isinstance(payload, dict) and "senses" in payload:
        # novo formato (um lemma com vários sentidos)
        docs.append(flatten_doc_for_new_json(payload))
    elif isinstance(payload, dict):
        # formato antigo (um único doc)
        docs.append(payload)
    elif isinstance(payload, list):
        # lista de docs (raro hoje, mas mantemos compat.)
        for item in payload:
            if not isinstance(item, dict):
                continue
            if "senses" in item:
                docs.append(flatten_doc_for_new_json(item))
            else:
                docs.append(item)

    for doc in docs:
        lemma_for_name = (doc.get("lemma") or doc.get("lemma_base") or "frame").strip() or "frame"
        out_js = HTML_DIR / f"{lemma_for_name}.js"
        write_js_for_doc(doc, out_js)

        # injeta <script> no HTML correspondente (mesmo nome do lemma)
        html_path = HTML_DIR / f"{lemma_for_name}.html"
        ensure_script_in_html(html_path, out_js.name)

print("\nConcluído.")


✓ JS: site_pages/abastecimento.js
✓ JS: site_pages/acesso.js
✓ JS: site_pages/acordo.js
✓ JS: site_pages/aditivo.js
✓ JS: site_pages/administração.js
✓ JS: site_pages/ajuste.js
✓ JS: site_pages/alavancagem.js
✓ JS: site_pages/aliança.js
✓ JS: site_pages/alienação.js
✓ JS: site_pages/alteração.js
✓ JS: site_pages/amor.js
✓ JS: site_pages/antro.js
✓ JS: site_pages/apreensão.js
✓ JS: site_pages/aquisição.js
✓ JS: site_pages/assinatura.js
✓ JS: site_pages/ataque.js
✓ JS: site_pages/atestado.js
✓ JS: site_pages/avaliação.js
✓ JS: site_pages/briga.js
✓ JS: site_pages/caminho.js
✓ JS: site_pages/candidato.js
✓ JS: site_pages/cara.js
✓ JS: site_pages/carteira.js
✓ JS: site_pages/causa.js
✓ JS: site_pages/cenário.js
✓ JS: site_pages/chance.js
✓ JS: site_pages/comissão.js
✓ JS: site_pages/comparação.js
✓ JS: site_pages/compartilhamento.js
✓ JS: site_pages/compra.js
✓ JS: site_pages/comprador.js
✓ JS: site_pages/confirmação.js
✓ JS: site_pages/construção.js
✓ JS: site_pages/contrato.js


✓ JS: site_pages/controle.js
✓ JS: site_pages/conversa.js
✓ JS: site_pages/conversão.js
✓ JS: site_pages/convocação.js
✓ JS: site_pages/coordenador.js
✓ JS: site_pages/coragem.js
✓ JS: site_pages/corte.js
✓ JS: site_pages/curiosidade.js
✓ JS: site_pages/custódio.js
✓ JS: site_pages/daytrade.js
✓ JS: site_pages/decisão.js
✓ JS: site_pages/declaração.js
✓ JS: site_pages/deliberação.js
✓ JS: site_pages/demanda.js
✓ JS: site_pages/denúncia.js
✓ JS: site_pages/descoberta.js
✓ JS: site_pages/descolamento.js
✓ JS: site_pages/desova.js
✓ JS: site_pages/diretor.js
✓ JS: site_pages/diretoria.js
✓ JS: site_pages/discussão.js
✓ JS: site_pages/divergência.js
✓ JS: site_pages/divisão.js
✓ JS: site_pages/divulgação.js
✓ JS: site_pages/eleição.js
✓ JS: site_pages/encerramento.js
✓ JS: site_pages/entendimento.js
✓ JS: site_pages/entrada.js
✓ JS: site_pages/esclarecimento.js
✓ JS: site_pages/estimativo.js
✓ JS: site_pages/exemplo.js
✓ JS: site_pages/expectativa.js
✓ JS: site_pages/exploração.js
✓ JS: si