In [None]:
# ============================================================
# PUNTO 4 — COMPARATIVO MULTI-PORTAL (VERSION + AJUSTES FINALES)
# - TOTAL % en comparativas (Índice Global)
# - Top 3 categorías: completar 3 por portal tras excluir vacíos/no definidos
# - Accesibilidad vs Trazabilidad y Calidad vs Trazabilidad: leyenda con % TOTAL global
# - Trazabilidad temporal: mejorar legibilidad (sin textos montados)
# - Licenciamiento: barras verticales AGRUPADAS (no stack)
# ============================================================

import pandas as pd
import numpy as np
import re
import ast
from pathlib import Path
from datetime import datetime

import plotly.express as px
import plotly.graph_objects as go
from plotly.io import to_html

# ============================================================
# 0) CONFIG
# ============================================================
# Glob (relative to the working directory) matching the dataset-level Excel
# outputs to compare. Change it to match the folder/file naming of your runs.
INPUT_GLOB = "Punto3_*_datasetlevel_*_CARPETA*.xlsx"
# File name of the combined HTML report; change it to whatever name you prefer.
OUT_HTML = "Punto4_Comparativo_TODOS_PORTALES.html"
# Column that identifies the portal each dataset row belongs to.
PORTAL_COL = "portal"

# Folder receiving one standalone HTML file per chart; change it as needed.
EXPORT_DIR = Path("Punto4_HTML_por_grafica_COMPARATIVO_")
# Created eagerly (including parents); no error if it already exists.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Plotly config passed to every HTML export: hides the Plotly logo and sets the
# format, size and scale used by the "download image" toolbar button.
PLOTLY_CONFIG = {
    "displaylogo": False,
    "toImageButtonOptions": {
        "format": "png",
        "height": 900,
        "width": 1400,
        "scale": 2
    }
}

# ============================================================
# 1) HELPERS (export + html blocks)
# ============================================================
def _slugify(text: str) -> str:
    text = (text or "").strip().lower()
    text = re.sub(r"[^\w\s-]", "", text, flags=re.UNICODE)
    text = re.sub(r"[\s_-]+", "_", text)
    return text[:90] if len(text) > 90 else text

def save_plotly_figure_html(fig, filename_hint: str, out_dir: Path = EXPORT_DIR) -> str:
    """Write ``fig`` as a standalone HTML file and return the file path.

    The file name is the slugified ``filename_hint`` followed by a
    ``YYYYmmdd_HHMMSS`` timestamp. The file is a full HTML page that embeds
    plotly.js and uses ``PLOTLY_CONFIG``.

    Args:
        fig: Object exposing ``write_html`` (a Plotly figure).
        filename_hint: Free text used to derive the file name.
        out_dir: Destination directory; defaults to ``EXPORT_DIR``.

    Returns:
        The path of the written file, as a string.
    """
    stem = _slugify(filename_hint)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = str(out_dir / f"{stem}_{stamp}.html")
    fig.write_html(
        destination,
        full_html=True,
        include_plotlyjs=True,
        config=PLOTLY_CONFIG,
    )
    return destination

def plotly_polish(fig, height=520):
    """Apply the report's shared layout to ``fig`` and return the same object.

    Sets the ``plotly_white`` template, the given height, fixed margins, the
    title position, a 14pt font and a horizontal legend placed below the plot.
    """
    shared_layout = {
        "template": "plotly_white",
        "height": height,
        "margin": {"l": 80, "r": 30, "t": 90, "b": 110},
        "title": {"x": 0.02, "y": 0.96},
        "font": {"size": 14},
        "legend": {
            "orientation": "h",
            "yanchor": "top",
            "y": -0.22,
            "xanchor": "left",
            "x": 0.0,
        },
    }
    fig.update_layout(**shared_layout)
    return fig

def plotly_caption(text_html: str) -> str:
    """Wrap ``text_html`` in a ``<div class='cap'>`` element (no escaping is applied)."""
    return "<div class='cap'>{}</div>".format(text_html)

def plotly_fig_block(fig, caption_html, export_name=None):
    """Export ``fig`` to its own HTML file and return an embeddable card.

    Args:
        fig: Plotly figure to export and embed.
        caption_html: HTML placed under the figure inside the card.
        export_name: Name hint for the standalone file. When ``None`` the
            figure title is used, falling back to ``"figura"``.

    Returns:
        An HTML fragment: a ``<div class="card">`` holding the figure markup
        (without plotly.js) followed by the caption.
    """
    if export_name is None:
        title = fig.layout.title
        export_name = title.text if title and title.text else "figura"

    # The standalone export is a side effect; its path is not needed here.
    save_plotly_figure_html(fig, export_name)
    fig_html = to_html(fig, full_html=False, include_plotlyjs=False, config=PLOTLY_CONFIG)

    return (
        "\n"
        '    <div class="card">\n'
        f"      {fig_html}\n"
        f"      {caption_html}\n"
        "    </div>\n"
        "    "
    )

def build_html(blocks, title="Gráfica Comparativo – Madurez Open Data"):
    """Assemble the final report page.

    Args:
        blocks: Iterable of HTML fragments, concatenated in order into the body.
        title: Text used for both ``<title>`` and the ``<h1>`` heading.

    Returns:
        A complete HTML document as a string. plotly.js 2.27.0 is loaded from
        the Plotly CDN, so the fragments do not need to embed it.
    """
    # Plain (non-f) string so the CSS braces need no escaping.
    stylesheet = """
    body { font-family: Arial, sans-serif; margin: 22px; background: #fafafa; }
    h1 { margin-bottom: 6px; }
    h2 { margin-top: 24px; margin-bottom: 10px; }
    .sub { color: #555; margin-bottom: 18px; }
    .card {
      background: white;
      border: 1px solid #ddd;
      border-radius: 14px;
      padding: 12px;
      margin-bottom: 16px;
      box-shadow: 0 1px 6px rgba(0,0,0,0.04);
    }
    .cap {
      margin-top: 10px;
      font-size: 13.5px;
      color: #222;
      line-height: 1.35;
      background: #f6f6f6;
      border: 1px solid #eee;
      border-radius: 12px;
      padding: 10px 12px;
    }
  """
    body = "".join(blocks)
    return f"""
<!doctype html>
<html lang="es">
<head>
  <meta charset="utf-8"/>
  <title>{title}</title>
  <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
  <style>{stylesheet}</style>
</head>
<body>
  <h1>{title}</h1>
  <div class="sub">Plotly interactivo: usa el icono de cámara en cada gráfico para descargar la imagen.</div>
  {body}
</body>
</html>
"""

# ============================================================
# 2) LOAD MULTI-PORTAL
# ============================================================
# Collect every workbook in the current directory matching INPUT_GLOB, in
# sorted path order; stop early when nothing matches.
files = sorted(Path(".").glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"No encontré archivos con patrón: {INPUT_GLOB}")

dfs = []
for fp in files:
    d = pd.read_excel(fp)
    # Files without a portal column are labelled with their file name (no extension).
    if PORTAL_COL not in d.columns:
        d[PORTAL_COL] = fp.stem
    dfs.append(d)

# Single long table with all portals stacked and a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

print("Cargado:", df.shape)
print("Portales:", df[PORTAL_COL].unique().tolist())

# ============================================================
# 3) NORMALIZACIONES
# ============================================================
def parse_list_cell(x):
    """Convert a spreadsheet cell into a Python list.

    Accepted inputs:
      * a real ``list``/``tuple``/``set`` -> returned as a list;
      * a missing value or blank string -> ``[]``;
      * the text of a Python list, tuple or set literal (e.g. ``"['csv']"``
        or ``"('csv', 'json')"``) -> its elements;
      * any other text -> tokens split on ``;`` ``,`` ``|`` or whitespace.

    Args:
        x: Raw cell value.

    Returns:
        A list (possibly empty).
    """
    if isinstance(x, (list, tuple, set)):
        return list(x)
    if pd.isna(x):
        return []
    s = str(x).strip()
    if s == "":
        return []
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. "csv, json"); fall back to splitting.
        parsed = None
    # Tuple and set literals are containers too. Splitting their text would
    # leave brackets and quotes glued to the tokens.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [t.strip() for t in re.split(r"[;,|\s]+", s) if t.strip()]

def clean_text_series(s: pd.Series) -> pd.Series:
    """Return ``s`` as stripped strings with common null spellings blanked.

    Values are cast to ``str`` and stripped; any value that is then exactly one
    of the listed tokens (case-sensitive match) is replaced by ``""``.
    """
    null_tokens = ("nan", "NaN", "None", "NaT", "nat", "null", "N/A", "na")
    stripped = s.astype(str).str.strip()
    return stripped.replace(dict.fromkeys(null_tokens, ""))

def present_text(series: pd.Series) -> pd.Series:
    """Flag (1/0) the rows of ``series`` that hold meaningful text.

    A value counts as absent when, after ``clean_text_series``, it is empty or
    its lowercase form is one of the placeholder phrases below.
    """
    cleaned = clean_text_series(series)
    placeholders = {"", "no definido", "no definida", "sin definir", "consultar", "consulte", "none"}
    is_placeholder = cleaned.str.lower().isin(placeholders)
    is_blank = cleaned == ""
    return (~(is_placeholder | is_blank)).astype(int)

def present_binary(series: pd.Series) -> pd.Series:
    """Flag (1/0) the rows of ``series`` where an indicator is set.

    Non-object columns are coerced to numbers (unparseable or missing -> 0)
    and flagged when strictly positive.

    Object columns are evaluated as text through ``present_text``. In addition,
    explicit negatives are forced to 0: a column mixing booleans or 0/1 with
    blanks is loaded with object dtype, and its ``False``/``0`` cells would
    otherwise be counted as "present" simply because they are non-empty text.

    Args:
        series: Column to evaluate.

    Returns:
        Integer Series of 0/1 aligned with ``series``.
    """
    s = series.copy()
    if s.dtype == "object":
        flags = present_text(s)
        tokens = s.astype(str).str.strip().str.lower()
        negatives = tokens.isin({"false", "falso", "no", "0", "0.0"})
        return flags.where(~negatives, 0).astype(int)
    s = pd.to_numeric(s, errors="coerce").fillna(0)
    return (s > 0).astype(int)

def wavg(values, weights):
    """Weighted average that tolerates dirty input.

    Both arguments may be a pandas Series or any plain iterable (list, tuple,
    array). Non-numeric or missing entries become 0; weights that are zero or
    negative are replaced by 1 so that every row still contributes.

    Args:
        values: Numbers to average.
        weights: One weight per value.

    Returns:
        The weighted mean as a float, or ``nan`` when ``values`` is empty
        (there is nothing to average).
    """
    # Wrapping in a Series first guarantees ``fillna`` exists:
    # ``pd.to_numeric`` returns a bare ndarray for list input.
    v = pd.to_numeric(pd.Series(list(values)), errors="coerce").fillna(0).to_numpy(dtype=float)
    w = pd.to_numeric(pd.Series(list(weights)), errors="coerce").fillna(0).to_numpy(dtype=float)
    if v.size == 0:
        return float("nan")
    w = np.where(w <= 0, 1, w)
    return float(np.average(v, weights=w))

# Formats: make sure both list columns exist (one fresh empty list per row),
# then parse each cell into a real Python list.
if "open_formats_list" not in df.columns:
    df["open_formats_list"] = [[] for _ in range(len(df))]
if "non_open_formats_list" not in df.columns:
    df["non_open_formats_list"] = [[] for _ in range(len(df))]

df["open_formats_list_parsed"] = df["open_formats_list"].apply(parse_list_cell)
df["non_open_formats_list_parsed"] = df["non_open_formats_list"].apply(parse_list_cell)

# Licenses: create missing columns with neutral defaults. When absent,
# license_present is derived from whether `license` is non-null.
if "license" not in df.columns:
    df["license"] = np.nan
if "license_present" not in df.columns:
    df["license_present"] = df["license"].notna().astype(int)
if "license_open" not in df.columns:
    df["license_open"] = 0

# Force both flags to integers; unparseable or missing values become 0.
df["license_present"] = pd.to_numeric(df["license_present"], errors="coerce").fillna(0).astype(int)
df["license_open"] = pd.to_numeric(df["license_open"], errors="coerce").fillna(0).astype(int)

def _license_flag(value) -> int:
    """Coerce a 0/1-style flag to int; missing or unparseable values give 0."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def license_bucket(row):
    """Classify one dataset row into a licensing bucket.

    Reads ``license`` (text), ``license_present`` and ``license_open`` (flags)
    from ``row`` via ``row.get`` and applies these rules in order:

    1. ``"Vacío legal"``: no license, or text that only defers to
       permission/consultation or says "not defined".
    2. ``"Restringida"``: non-commercial or no-derivatives terms.
    3. ``"Apertura total"``: the text points to a legal notice ("aviso legal").
    4. ``"Apertura total"``: CC0, or Creative Commons without an NC/ND token.
    5. ``"Apertura total"``: ``license_open`` is 1.
    6. Otherwise ``"Vacío legal"``.

    Args:
        row: Mapping-like object with a ``get`` method (DataFrame row or dict).

    Returns:
        One of ``"Apertura total"``, ``"Restringida"``, ``"Vacío legal"``.
    """
    raw = row.get("license", np.nan)
    lic = str(raw if pd.notna(raw) else "").lower()
    present = _license_flag(row.get("license_present", 0))
    open_flag = _license_flag(row.get("license_open", 0))

    if (
        present == 0 or lic.strip() == ""
        or "no definido" in lic or "sin definir" in lic
        or "consultar" in lic or "consulte" in lic
        or "permiso" in lic or "autoriz" in lic
    ):
        return "Vacío legal"

    if (
        "noncommercial" in lic or "no comercial" in lic or "no-comercial" in lic
        or re.search(r"\bby[-\s]?nc\b", lic)
        or "noderivatives" in lic or "sin deriv" in lic
        or re.search(r"\bby[-\s]?nd\b", lic)
    ):
        return "Restringida"

    if "avisolegal" in lic or "/aviso-legal" in lic or "aviso legal" in lic:
        return "Apertura total"

    # NC/ND must be stand-alone tokens (not flanked by letters or digits).
    # A plain substring test also fires on ordinary words such as "licencia"
    # or "and", which wrongly disqualified open Creative Commons licenses.
    has_nc_nd_token = re.search(r"(?<![^\W_])n[cd](?![^\W_])", lic) is not None
    is_creative_commons = ("creative commons" in lic) or bool(re.search(r"\bcc\b", lic))
    if "cc0" in lic or (is_creative_commons and not has_nc_nd_token):
        return "Apertura total"

    if open_flag == 1:
        return "Apertura total"

    return "Vacío legal"

# Row-wise classification; adds the bucket label used by the licensing chart.
df["license_bucket"] = df.apply(license_bucket, axis=1)

# ============================================================
# 4) MÉTRICAS
# ============================================================
def is_defined_category(x) -> int:
    """Return 1 when ``x`` is a usable category label, otherwise 0.

    Missing values, blanks and the "undefined" spellings listed below
    (compared after stripping and lowercasing) count as not defined.
    """
    if pd.isna(x):
        return 0
    undefined = {"", "nan", "none", "no definido", "no definida", "sin definir"}
    label = str(x).strip().lower()
    return 0 if label in undefined else 1

def update_frequency_group_new(series):
    """Map declared update frequencies to one of four group labels.

    Each value is cast to ``str``, lowercased and stripped, then matched by
    substring against keyword lists in this order: "never" keywords,
    Group 1 keywords, Group 2 keywords. Empty or null-like text goes to
    Group 4; anything unmatched falls back to Group 3.

    Args:
        series: pandas Series of raw frequency values.

    Returns:
        Series of group labels with the same index.
    """
    empty_tokens = frozenset({"", "nan", "none", "nat", "null"})

    # Evaluated top to bottom; the first rule with a matching keyword wins.
    rules = (
        (
            ("never", "nunca", "sin periodicidad", "irregular", "sporadic", "eventual"),
            "Grupo 3: Nunca",
        ),
        (
            (
                "instant", "realtime", "real time", "minut", "minute", "hora", "hour",
                "daily", "diari", "seman", "week",
                "monthly", "mensu",
                "quarter", "trimes",
                "semiannual", "semes", "cada 6", "6 meses", "biannual",
            ),
            "Grupo 1: Maduro (alta/media)",
        ),
        (
            ("annual", "anual", "year", "yearly"),
            "Grupo 2: Baja (semestral/anual)",
        ),
    )

    def classify(value: str):
        if value in empty_tokens:
            return "Grupo 4: Vacío / sin definir"
        for keywords, label in rules:
            for keyword in keywords:
                if keyword in value:
                    return label
        return "Grupo 3: Nunca"

    normalized = series.astype(str).str.lower().str.strip()
    return normalized.apply(classify)

# Derive the frequency group only when the source column exists and the
# grouped column has not already been provided by the input files.
if "update_frequency" in df.columns and "update_frequency_group_new" not in df.columns:
    df["update_frequency_group_new"] = update_frequency_group_new(df["update_frequency"])

def compute_temporal_summary_for_portal(g: pd.DataFrame) -> dict:
    """Summarise temporal traceability for one portal's rows.

    Two shares are computed over all rows of ``g`` (denominator 1 if empty):
      * rows whose ``age_months_modified`` is between 0 and 6 inclusive;
      * rows whose ``update_frequency`` maps to Group 1.
    A missing column contributes a count and share of 0.

    Returns:
        Dict with ``n``, both counts, both percentages (1 decimal) and
        ``score_temporal``: the mean of the two unrounded percentages,
        rounded to 1 decimal.
    """
    total = len(g) or 1

    # A) last modification within 0-6 months
    recent_cnt, recent_pct = 0, 0.0
    if "age_months_modified" in g.columns:
        age = pd.to_numeric(g["age_months_modified"], errors="coerce")
        recent_cnt = int(age.between(0, 6).sum())
        recent_pct = (recent_cnt / total) * 100

    # B) declared update frequency in Group 1
    mature_cnt, mature_pct = 0, 0.0
    if "update_frequency" in g.columns:
        groups = update_frequency_group_new(g["update_frequency"])
        mature_cnt = int(groups.eq("Grupo 1: Maduro (alta/media)").sum())
        mature_pct = (mature_cnt / total) * 100

    summary = {
        "n": int(total),
        "modified_g1_pct": float(round(recent_pct, 1)),
        "modified_g1_cnt": int(recent_cnt),
        "update_g1_pct": float(round(mature_pct, 1)),
        "update_g1_cnt": int(mature_cnt),
    }
    summary["score_temporal"] = float(round(np.mean([recent_pct, mature_pct]), 1))
    return summary

def compute_traceability_scores(g: pd.DataFrame) -> dict:
    """Compute the traceability sub-scores (0-100) for one portal's rows.

    * origin: mean coverage of ``publisher`` and ``identifier``;
    * temporal: ``score_temporal`` from ``compute_temporal_summary_for_portal``;
    * unique reference: mean coverage of ``dataset_uri`` and ``doi``;
    * global: mean of the three sub-scores.
    Coverage is the percentage of rows with meaningful text; a missing
    column counts as 0.

    Returns:
        Dict with the four scores plus the full temporal summary under
        ``temporal_info``.
    """
    def coverage(column: str) -> float:
        if column not in g.columns:
            return 0.0
        return present_text(g[column]).mean() * 100

    origin = float(np.mean([coverage("publisher"), coverage("identifier")]))

    temporal_summary = compute_temporal_summary_for_portal(g)
    temporal = float(temporal_summary["score_temporal"])

    unique_ref = float(np.mean([coverage("dataset_uri"), coverage("doi")]))

    overall = float(np.mean([origin, temporal, unique_ref]))

    return {
        "score_traz_global": overall,
        "score_origen": origin,
        "score_temporal": temporal,
        "score_unica": unique_ref,
        "temporal_info": temporal_summary,
    }


def compute_dim_scores(g: pd.DataFrame) -> dict:
    """Compute the five index components and the global score for one portal.

    Every component is the plain mean of its indicators, each expressed as
    the percentage (0-100) of rows where it is present; an indicator whose
    column is missing counts as 0. The global score is the plain mean of
    the five components.

    Returns:
        Dict with ``dim_scores`` (component name -> score), ``global_score``,
        ``temporal_info`` and ``traz_subscores`` (the full traceability dict).
    """
    def flag_pct(column: str) -> float:
        if column not in g.columns:
            return 0.0
        return present_binary(g[column]).mean() * 100

    def text_pct(column: str) -> float:
        if column not in g.columns:
            return 0.0
        return present_text(g[column]).mean() * 100

    traceability = compute_traceability_scores(g)

    # Technical interoperability
    technical = float(np.mean([
        flag_pct("has_open_format"),
        flag_pct("portal_supports_dcat_dcatap"),
    ]))

    # Semantic interoperability
    if "category" in g.columns:
        category_pct = pd.Series(g["category"]).apply(is_defined_category).mean() * 100
    else:
        category_pct = 0.0
    semantic = float(np.mean([
        category_pct,
        flag_pct("uses_controlled_vocab"),
        flag_pct("has_semantic_serialization"),
    ]))

    # Accessibility
    accessibility_columns = ("portal_has_api_rest", "license_open", "public_access_ok", "download_url_present")
    accessibility = float(np.mean([flag_pct(column) for column in accessibility_columns]))

    # Quality
    quality = float(np.mean([
        text_pct("title"),
        text_pct("description"),
        flag_pct("has_data_dictionary"),
    ]))

    components = {
        "Trazabilidad": float(traceability["score_traz_global"]),
        "Interoperabilidad Semántica": semantic,
        "Interoperabilidad Técnica": technical,
        "Accesibilidad": accessibility,
        "Calidad": quality,
    }

    return {
        "dim_scores": components,
        "global_score": float(np.mean(list(components.values()))),
        "temporal_info": traceability["temporal_info"],
        "traz_subscores": traceability,
    }


# Comparison table `cmp`: one row per portal with its dataset count, global
# score, the five component scores and the temporal indicators (all rounded
# to 1 decimal, counts as ints).
rows = []
for portal, g in df.groupby(PORTAL_COL):
    res = compute_dim_scores(g)
    ds = res["dim_scores"]

    rows.append({
        "portal": portal,
        "n": len(g),

        "global_score": round(res["global_score"], 1),

        "Trazabilidad": round(ds["Trazabilidad"], 1),
        "Interoperabilidad Semántica": round(ds["Interoperabilidad Semántica"], 1),
        "Interoperabilidad Técnica": round(ds["Interoperabilidad Técnica"], 1),
        "Accesibilidad": round(ds["Accesibilidad"], 1),
        "Calidad": round(ds["Calidad"], 1),

        "Temporal_score": round(res["temporal_info"]["score_temporal"], 1),
        "Temporal_modified_0_6": round(res["temporal_info"]["modified_g1_pct"], 1),
        "Temporal_modified_0_6_cnt": int(res["temporal_info"]["modified_g1_cnt"]),
        "Temporal_updatefreq_g1": round(res["temporal_info"]["update_g1_pct"], 1),
        "Temporal_updatefreq_g1_cnt": int(res["temporal_info"]["update_g1_cnt"]),
    })

cmp = pd.DataFrame(rows)


# -------------------------------------------------
# GLOBAL TOTALS (across all portals)
# -------------------------------------------------
TOTAL_METHOD = "mean"   # "mean" (simple average across portals) or "weighted" (weighted by N)

def total_metric(series, weights):
    """Aggregate a per-portal metric into one overall figure.

    With ``TOTAL_METHOD == "weighted"`` the result is ``wavg(series, weights)``.
    For any other setting it is the plain mean of ``series`` after numeric
    coercion, and ``weights`` is ignored.
    """
    if TOTAL_METHOD != "weighted":
        numeric = pd.to_numeric(series, errors="coerce")
        return float(numeric.mean())
    return wavg(series, weights)

# Overall figures shown in chart titles, annotations and legends. The dataset
# count `n` is passed as the weight and only matters when TOTAL_METHOD is "weighted".
TOTAL_GLOBAL   = total_metric(cmp["global_score"], cmp["n"])
TOTAL_ACC      = total_metric(cmp["Accesibilidad"], cmp["n"])
TOTAL_TRAZ     = total_metric(cmp["Trazabilidad"], cmp["n"])
TOTAL_CAL      = total_metric(cmp["Calidad"], cmp["n"])
TOTAL_TEMPORAL = total_metric(cmp["Temporal_score"], cmp["n"])


# ============================================================
# 5) COMPARATIVE CHARTS
# ============================================================
# Ordered list of HTML fragments that make up the final report body.
blocks = []
blocks.append("<h2>Comparativo multi-portal: Índice global y componentes</h2>")

# 5.1 Global index (horizontal bars) + TOTAL
# The annotation must describe the aggregation actually used: TOTAL_GLOBAL is
# only weighted by N when TOTAL_METHOD is "weighted".
total_label_global = (
    "ponderado por N" if TOTAL_METHOD == "weighted" else "promedio simple entre portales"
)

# Best-scoring portal first.
cmp1 = cmp.sort_values("global_score", ascending=False).copy()
fig1 = px.bar(
    cmp1, x="global_score", y="portal", orientation="h",
    text=cmp1["global_score"].round(1),
    title=f"Índice Global de Madurez de los Portales — Comparativo (0–100) — TOTAL: {TOTAL_GLOBAL:.1f}%"
)
# Labels sit outside the bars; the axis runs to 105 so they are not clipped.
fig1.update_traces(textposition="outside", cliponaxis=False)
fig1.update_layout(xaxis_range=[0,105], xaxis_title="Score (0–100)", yaxis_title="")

# Overall total, centred above the plotting area (paper coordinates).
fig1.add_annotation(
    x=0.5, y=1.12, xref="paper", yref="paper",
    text=f"<b>TOTAL ({total_label_global}): {TOTAL_GLOBAL:.1f}%</b>",
    showarrow=False
)

# Height grows with the number of portals so bars keep a readable thickness.
fig1 = plotly_polish(fig1, height=max(520, 45*len(cmp1)+220))

blocks.append(plotly_fig_block(
    fig1,
    plotly_caption(
        "<b>Interpretación.</b> Índice global = promedio de 5 componentes (20% c/u): "
        "Trazabilidad, Interoperabilidad semántica, Interoperabilidad técnica, Accesibilidad y Calidad."
    ),
    export_name="comparativo_indice_global"
))

# 5.2 Index components: grouped VERTICAL BARS
# Reshape to long form: one row per (portal, component) pair.
comp_order = ["Trazabilidad", "Interoperabilidad Semántica", "Interoperabilidad Técnica", "Accesibilidad", "Calidad"]
long_comp = cmp.melt(id_vars=["portal"], value_vars=comp_order, var_name="Componente", value_name="Score")

figC = px.bar(
    long_comp, x="portal", y="Score", color="Componente",
    barmode="group",
    title="Componentes del índice Global — Métricas Detallado",
    text=long_comp["Score"].round(1)
)
# Value labels above the bars; the y axis runs to 105 to leave room for them.
figC.update_traces(textposition="outside", cliponaxis=False)
figC.update_layout(
    yaxis_range=[0,105],
    yaxis_title="Score (0–100)",
    xaxis_title="Portal",
    xaxis_tickangle=-25
)
# Tilted, smaller tick labels with automatic margins for long portal names.
figC.update_xaxes(automargin=True, tickfont=dict(size=11))
figC = plotly_polish(figC, height=680)

blocks.append(plotly_fig_block(
    figC,
    plotly_caption(
        "<b>Interpretación.</b> Comparativo directo de los 5 componentes del índice (barras agrupadas por portal). "
        "Cada componente está expresado en escala 0–100."
    ),
    export_name="comparativo_componentes_barras_verticales"
))

# 5.3 Heatmap (optional): portals as rows, index components as columns
heat = cmp[["portal"] + comp_order].copy().set_index("portal")
figH = go.Figure(data=go.Heatmap(
    z=heat.values,
    x=heat.columns,
    y=heat.index,
    # Fixed colour range so shades are comparable on the 0-100 scale.
    zmin=0, zmax=100,
    hovertemplate="Portal=%{y}<br>Componente=%{x}<br>Score=%{z:.1f}<extra></extra>"
))
figH.update_layout(title="Componentes del Índice — Heatmap comparativo (0–100)",
                   xaxis_title="Componente", yaxis_title="Portal")
# Height grows with the number of portals (rows).
figH = plotly_polish(figH, height=max(520, 38*len(heat)+260))

blocks.append(plotly_fig_block(
    figH,
    plotly_caption(
        "<b>Interpretación.</b> Vista alternativa tipo matriz para identificar patrones rápidos por portal."
    ),
    export_name="comparativo_heatmap_componentes"
))

# 5.4 Accessibility vs Traceability: grouped bars + legend entry with TOTAL
# Legend and caption wording follows TOTAL_METHOD so the chart never claims a
# weighted total when a simple mean across portals was computed.
if TOTAL_METHOD == "weighted":
    total_tag_at = "ponderado"
    total_note_at = "La leyenda incluye el total global ponderado por número de datasets."
else:
    total_tag_at = "promedio simple"
    total_note_at = "La leyenda incluye el total global calculado como promedio simple entre portales."

acc_trz = cmp[["portal", "Accesibilidad", "Trazabilidad"]].copy()
acc_trz_long = acc_trz.melt(id_vars=["portal"], var_name="Métrica", value_name="Score")

figAT = px.bar(
    acc_trz_long, x="portal", y="Score", color="Métrica",
    barmode="group",
    title="Accesibilidad vs Trazabilidad Global — Comparativo por portal",
    text=acc_trz_long["Score"].round(1)
)
figAT.update_traces(textposition="outside", cliponaxis=False)
figAT.update_layout(yaxis_range=[0,105], xaxis_tickangle=-25, yaxis_title="Score (0–100)", xaxis_title="Portal")
figAT.update_xaxes(automargin=True, tickfont=dict(size=11))

# Data-less trace: it draws nothing and exists only to show the totals as an
# extra legend entry.
figAT.add_trace(go.Scatter(
    x=[None], y=[None], mode="markers",
    marker=dict(size=10),
    name=f"TOTAL ({total_tag_at}) → Accesibilidad: {TOTAL_ACC:.1f}% | Trazabilidad: {TOTAL_TRAZ:.1f}%",
    showlegend=True
))
figAT.update_layout(legend_title_text="Métrica (incluye total global)")
figAT = plotly_polish(figAT, height=640)

blocks.append(plotly_fig_block(
    figAT,
    plotly_caption(
        "<b>Interpretación.</b> Comparación de Accesibilidad vs Trazabilidad global por portal (escala 0–100). "
        + total_note_at
    ),
    export_name="comparativo_barras_accesibilidad_vs_trazabilidad"
))

# 5.5 Quality vs Traceability: grouped bars + legend entry with TOTAL
# Legend and caption wording follows TOTAL_METHOD so the chart never claims a
# weighted total when a simple mean across portals was computed.
if TOTAL_METHOD == "weighted":
    total_tag_qt = "ponderado"
    total_note_qt = "La leyenda incluye el total global ponderado por número de datasets."
else:
    total_tag_qt = "promedio simple"
    total_note_qt = "La leyenda incluye el total global calculado como promedio simple entre portales."

cal_trz = cmp[["portal", "Calidad", "Trazabilidad"]].copy()
cal_trz_long = cal_trz.melt(id_vars=["portal"], var_name="Métrica", value_name="Score")

figQT = px.bar(
    cal_trz_long, x="portal", y="Score", color="Métrica",
    barmode="group",
    title="Calidad de Metadatos vs Trazabilidad Global — Comparativo por portal",
    text=cal_trz_long["Score"].round(1)
)
figQT.update_traces(textposition="outside", cliponaxis=False)
figQT.update_layout(yaxis_range=[0,105], xaxis_tickangle=-25, yaxis_title="Score (0–100)", xaxis_title="Portal")
figQT.update_xaxes(automargin=True, tickfont=dict(size=11))

# Data-less trace: it draws nothing and exists only to show the totals as an
# extra legend entry.
figQT.add_trace(go.Scatter(
    x=[None], y=[None], mode="markers",
    marker=dict(size=10),
    name=f"TOTAL ({total_tag_qt}) → Calidad: {TOTAL_CAL:.1f}% | Trazabilidad: {TOTAL_TRAZ:.1f}%",
    showlegend=True
))
figQT.update_layout(legend_title_text="Métrica (incluye total global)")
figQT = plotly_polish(figQT, height=640)

blocks.append(plotly_fig_block(
    figQT,
    plotly_caption(
        "<b>Interpretación.</b> Comparación de Calidad (completitud) vs Trazabilidad global por portal (escala 0–100). "
        + total_note_qt
    ),
    export_name="comparativo_barras_calidad_vs_trazabilidad"
))

# 5.6 Temporal traceability: % + (N) per portal, TOTAL, readable labels
# The annotation wording follows TOTAL_METHOD so it never claims a weighted
# total when a simple mean across portals was computed.
total_label_temporal = (
    "ponderado por N" if TOTAL_METHOD == "weighted" else "promedio simple entre portales"
)

tmpT = cmp[[
    "portal", "n",
    "Temporal_modified_0_6", "Temporal_modified_0_6_cnt",
    "Temporal_updatefreq_g1", "Temporal_updatefreq_g1_cnt",
    "Temporal_score"
]].copy()

# Long form: two rows per portal (one per temporal metric), each with a
# ready-made "pct% (count)" bar label.
longT = []
for _, r in tmpT.iterrows():
    longT.append({
        "portal": r["portal"],
        "Métrica": "Antigüedad desde última actualización (modified) 0–6 meses",
        "Porcentaje": r["Temporal_modified_0_6"],
        "Conteo": r["Temporal_modified_0_6_cnt"],
        "Etiqueta": f"{r['Temporal_modified_0_6']:.1f}% ({int(r['Temporal_modified_0_6_cnt'])})"
    })
    longT.append({
        "portal": r["portal"],
        "Métrica": "Frecuencia declarada Grupo 1 (Maduro)",
        "Porcentaje": r["Temporal_updatefreq_g1"],
        "Conteo": r["Temporal_updatefreq_g1_cnt"],
        "Etiqueta": f"{r['Temporal_updatefreq_g1']:.1f}% ({int(r['Temporal_updatefreq_g1_cnt'])})"
    })

longT = pd.DataFrame(longT)

# Portals ordered by temporal score, best first.
order = tmpT.sort_values("Temporal_score", ascending=False)["portal"].tolist()
longT["portal"] = pd.Categorical(longT["portal"], categories=order, ordered=True)
longT = longT.sort_values("portal")

figT = px.bar(
    longT, x="portal", y="Porcentaje", color="Métrica",
    barmode="group",
    title=f"Trazabilidad Temporal — % por portal (con conteo de datasets) — TOTAL: {TOTAL_TEMPORAL:.1f}%",
    text="Etiqueta"
)
figT.update_traces(textposition="outside", cliponaxis=False)
figT.update_layout(
    yaxis_range=[0,105],
    xaxis_tickangle=-25,
    yaxis_title="% de datasets",
    xaxis_title="Portal",
    bargap=0.25,
    # Labels that would need a font below 10pt are hidden instead of overlapping.
    uniformtext_minsize=10,
    uniformtext_mode="hide",
)
figT.update_xaxes(automargin=True, tickfont=dict(size=11))

# Per-portal temporal score as floating text near the top of the plot, so it
# does not sit on top of the bars.
figT.add_trace(go.Scatter(
    x=tmpT["portal"],
    y=[104]*len(tmpT),
    mode="text",
    text=[f"Score={v:.1f}%" for v in tmpT["Temporal_score"]],
    textposition="top center",
    showlegend=False,
    hoverinfo="skip"
))

# Overall TOTAL above the plotting area (paper coordinates).
figT.add_annotation(
    x=0.5, y=1.14, xref="paper", yref="paper",
    text=f"<b>TOTAL trazabilidad temporal ({total_label_temporal}): {TOTAL_TEMPORAL:.1f}%</b>",
    showarrow=False
)

figT = plotly_polish(figT, height=720)
# Taller top margin than the shared layout, to fit title plus annotation.
figT.update_layout(margin=dict(t=130, l=80, r=30, b=110))

blocks.append(plotly_fig_block(
    figT,
    plotly_caption(
        "<b>Interpretación.</b> La Trazabilidad temporal es el promedio de: "
        "(i) % con <i>modified</i> en 0–6 meses y (ii) % cuya <i>update_frequency</i> cae en Grupo 1 "
        "(instantánea/minutos/horas/diaria/semanal/mensual/trimestral/semestral/cada 6 meses). "
        "Las barras muestran % y conteo (N) entre paréntesis."
    ),
    export_name="comparativo_trazabilidad_temporal_pct_y_n"
))

# 5.7 Licensing: GROUPED vertical bars (not stacked), fixed bucket order
blocks.append("<h2>Comparativo multi-portal: Legal / Técnica / Categorías</h2>")

# Datasets per (portal, bucket), then each bucket's share of its portal total.
lic_cmp = (df.groupby([PORTAL_COL, "license_bucket"]).size().reset_index(name="Datasets"))
# Zero totals are replaced by 1 to avoid dividing by zero.
tot = lic_cmp.groupby(PORTAL_COL)["Datasets"].transform("sum").replace(0, 1)
lic_cmp["Porcentaje"] = (lic_cmp["Datasets"] / tot * 100).round(1)

# Fixed bucket order for bars and legend.
lic_order = ["Apertura total", "Restringida", "Vacío legal"]
lic_cmp["license_bucket"] = pd.Categorical(lic_cmp["license_bucket"], categories=lic_order, ordered=True)

figL = px.bar(
    lic_cmp, x=PORTAL_COL, y="Porcentaje", color="license_bucket",
    barmode="group",
    category_orders={"license_bucket": lic_order},
    title="Licenciamiento (Apertura total / Restringida / Vacío legal) — % por portal",
    text="Porcentaje"
)
figL.update_traces(textposition="outside", cliponaxis=False)
figL.update_layout(
    yaxis_range=[0,105],
    xaxis_tickangle=-25,
    yaxis_title="Porcentaje (%)",
    xaxis_title="Portal",
    legend_title="Licencia"
)
figL.update_xaxes(automargin=True, tickfont=dict(size=11))
figL = plotly_polish(figL, height=650)

blocks.append(plotly_fig_block(
    figL,
    plotly_caption(
        "<b>Interpretación.</b> Distribución porcentual del licenciamiento por portal. "
        "<i>Vacío legal</i> incluye licencia ausente o no definida/consultar."
    ),
    export_name="comparativo_licenciamiento_vertical_group"
))

# 5.8 Formats: vertical grouped bars, with tilted tick labels for long names
rows_fmt = []
for portal, g in df.groupby(PORTAL_COL):
    # Denominator is the portal's dataset count (1 if the group were empty).
    n = len(g) if len(g) else 1
    # A dataset counts once per type when its parsed list has at least one entry.
    any_open = g["open_formats_list_parsed"].apply(lambda lst: 1 if isinstance(lst, list) and len(lst) > 0 else 0).sum()
    any_closed = g["non_open_formats_list_parsed"].apply(lambda lst: 1 if isinstance(lst, list) and len(lst) > 0 else 0).sum()
    rows_fmt.append({"portal": portal, "Tipo": "Datasets con formatos abiertos", "Porcentaje": round(100*any_open/n, 1)})
    rows_fmt.append({"portal": portal, "Tipo": "Datasets con formatos cerrados/propietarios", "Porcentaje": round(100*any_closed/n, 1)})

fmt_long = pd.DataFrame(rows_fmt)

figF = px.bar(
    fmt_long, x="portal", y="Porcentaje", color="Tipo",
    barmode="group",
    title="Formatos (dataset-level) — % por portal (Abiertos vs Cerrados)",
    text="Porcentaje"
)
figF.update_traces(textposition="outside", cliponaxis=False)
figF.update_layout(
    yaxis_range=[0,105],
    xaxis_tickangle=-25,
    yaxis_title="% de datasets",
    xaxis_title="Portal"
)
figF.update_xaxes(automargin=True, tickfont=dict(size=11))
figF = plotly_polish(figF, height=650)

blocks.append(plotly_fig_block(
    figF,
    plotly_caption(
        "<b>Interpretación.</b> Métrica a nivel dataset: % con al menos un formato abierto y % con al menos un formato cerrado/propietario. "
        "No necesariamente suman 100% (un dataset puede ofrecer ambos)."
    ),
    export_name="comparativo_formatos_vertical"
))

# 5.9 Categories: top 3 per portal, chosen after dropping empty/undefined values
# Candidate column names in priority order; the first one present in df is
# used, and CATEGORY_COL is None when none of them exists.
CATEGORY_CANDIDATES = ["category", "categoría", "categoria", "theme", "themes", "tematica", "temática", "tags"]
CATEGORY_COL = next((c for c in CATEGORY_CANDIDATES if c in df.columns), None)

# Lowercase spellings treated as "no category".
INVALID_CATS = {
    "",
    " ",
    None,
    "nan",
    "none",
    "null",
    "n/a",
    "na",
    "no definido",
    "no definida",
    "sin definir",
    "sin categoría",
    "sin categoria",
    "undefined",
    "unknown",
}

def norm_cat(x):
    """Return the stripped category label, or ``None`` when it is not usable.

    A value is unusable when it is missing, blank, listed in ``INVALID_CATS``
    (compared in lowercase) or contains "no definido" / "sin definir".
    The returned label keeps its original casing.
    """
    if pd.isna(x):
        return None
    label = str(x).strip()
    key = label.lower().strip()
    if key in INVALID_CATS or "no definido" in key or "sin definir" in key:
        return None
    return label

if CATEGORY_COL:
    # Keep only rows with a usable category label.
    tmp = df.copy()
    tmp["cat_norm"] = tmp[CATEGORY_COL].apply(norm_cat)
    tmp = tmp.dropna(subset=["cat_norm"]).copy()

    # Datasets per (portal, category).
    cat_counts = (tmp.groupby([PORTAL_COL, "cat_norm"]).size()
                  .reset_index(name="Datasets"))

    # Top 3 categories per portal, largest first within each portal.
    # sort + groupby.head keeps the portal column as a regular column, unlike
    # groupby.apply(...).reset_index(drop=True), which depends on the grouping
    # column being handed to the applied function (deprecated in pandas 2.2).
    cat_top3 = (
        cat_counts
        .sort_values([PORTAL_COL, "Datasets"], ascending=[True, False])
        .groupby(PORTAL_COL, sort=False)
        .head(3)
        .reset_index(drop=True)
    )

    if cat_top3.empty:
        # Every category value was empty/undefined: nothing to plot.
        blocks.append("<div class='card'><div class='cap'><b>Nota.</b> No se encontraron categorías válidas en los archivos.</div></div>")
    else:
        # Facets per row; also drives the dynamic height below. Use 2 or 3
        # depending on how many portals there are.
        facet_wrap = 2

        figK = px.bar(
            cat_top3,
            x="Datasets",
            y="cat_norm",
            orientation="h",
            facet_col=PORTAL_COL,
            facet_col_wrap=facet_wrap,
            color="cat_norm",           # colour by category (NOT by portal)
            title="Top 3 de Categorías por Portal"
        )

        # Readability tweaks
        figK.update_yaxes(automargin=True, categoryorder="total ascending")
        figK.update_xaxes(automargin=True)

        figK.update_layout(
            legend_title="Categoría",
            margin=dict(t=90, l=60, r=30, b=80)
        )

        # Dynamic height: more portals => more facet rows => taller figure.
        n_portals = cat_top3[PORTAL_COL].nunique()
        rows_facets = int(np.ceil(n_portals / facet_wrap))
        figK = plotly_polish(figK, height=max(520, 260 * rows_facets))

        blocks.append(plotly_fig_block(
            figK,
            plotly_caption(
                "<b>Interpretación.</b> Top 3 categorías más frecuentes por portal, excluyendo valores vacíos o no definidos. "
                "Al excluir vacíos, se seleccionan automáticamente las siguientes categorías válidas hasta completar 3."
            ),
            export_name="comparativo_categorias_top3_sin_vacios"
        ))
else:
    blocks.append("<div class='card'><div class='cap'><b>Nota.</b> No se detectó columna de categorías en los archivos.</div></div>")

# ============================================================
# 6) EXPORT Gráfica HTML
# ============================================================
# Assemble all fragments into one page and write it next to the script (UTF-8).
html_final = build_html(blocks, title="Gráfica Comparativo – Madurez del ecosistema de datos abiertos FINAL")
Path(OUT_HTML).write_text(html_final, encoding="utf-8")
print(f"\n Archivo generado exitosamente: {OUT_HTML}")
print(f" HTML individuales por gráfica: {EXPORT_DIR.resolve()}")