# In [None]:
# =============================================================
# PUNTO 4 – Dashboard y resultados de métricas

#   1) Trazabilidad temporal (age_month_issued / age_months_modified)  
#   2) Traceability componentes (conteo) + desglose agregado (normalizado 100)
#   3) Update frequency (vacíos -> "No definido") con conteo + %
#   4) Traceability_score (0/33/66/100) -> barras (NO hist) + interpretación clara
#   5) interoperability_semantics (distribución) + componentes + desglose
#   6) interoperability_technical (distribución) + componentes + desglose
#   7) accessibility_score (distribución) + componentes + desglose
#   8) quality_score (distribución) + componentes (CONTEO) + desglose
#   9) Índice global (portal_maturity) -> scorecard + niveles (conteo + %)
# =============================================================

import ast
import re
from datetime import datetime
from html import escape as html_escape
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Reference date: midnight of the current system day, as a pandas Timestamp.
TODAY = pd.Timestamp(datetime.now().date())

# =============================
# INPUTS / OUTPUT
# =============================
# The three file names below are placeholders and may be changed freely.
DATASET_XLSX = "Punto3_NOMBREDELARCHIVO_USARPARARESULTADOS.xlsx"  # Excel input
DATASET_CSV = "Punto3_NOMBREDELARCHIVO_USARPARARESULTADOS.csv"  # CSV input
OUT_HTML = "Punto4_RESULTADOS.html"  # generated dashboard file

# -----------------------------
# LOAD
# -----------------------------
# Load the Punto 3 results: the Excel file is tried first, the CSV is the fallback.
try:
    df = pd.read_excel(DATASET_XLSX)
except Exception as exc:  # deliberate best-effort fallback
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate, and the reason for the fallback is reported instead
    # of being silently discarded.
    print(f"No se pudo leer {DATASET_XLSX} ({exc!r}); se usa {DATASET_CSV}.")
    df = pd.read_csv(DATASET_CSV)

# -----------------------------
# Normalizaciones base
# -----------------------------
# Date columns: parse to datetime; unparseable values become NaT.
_DATE_COLUMNS = ("issued", "modified")
for c in _DATE_COLUMNS:
    if c not in df.columns:
        continue
    df[c] = pd.to_datetime(df[c], errors="coerce")

# When the file carries no portal column (the Barcelona case), set a fixed one.
if "portal" not in df:
    df["portal"] = "Barcelona"

# Main 0/1 flags: force them to numbers when present; non-numeric or missing
# values end up as 0.
_FLAG_COLUMNS = (
    "portal_has_api_rest",
    "license_present",
    "license_open",
    "has_data_dictionary",
    "has_semantic_serialization",
    "portal_supports_dcat_dcatap",
    "has_allowed_format",
    "has_open_format",
    "uses_controlled_vocab",
    "download_url_present",
    "traceable_origen",
    "traceable_temporal",
    "traceable_reutilizable",
)
for c in _FLAG_COLUMNS:
    if c not in df.columns:
        continue
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

# -----------------------------
# parse de listas de formatos (para gráficas 1 y 2)
# -----------------------------
def parse_list_cell(x):
    """Normalise one cell of a "list of formats" column into a Python list.

    Handling, in order:
      * list / tuple / set objects are returned as a list;
      * missing values (NaN/None) and blank strings give ``[]``;
      * strings holding a Python literal list, tuple or set
        (e.g. ``"['CSV', 'JSON']"`` or ``"('CSV', 'JSON')"``) are evaluated
        with ``ast.literal_eval`` and returned as a list;
      * any other string is split on ``;``, ``,``, ``|`` and whitespace.

    Returns a list (possibly empty). Elements coming from a literal keep the
    type they have in that literal.
    """
    if isinstance(x, (list, tuple, set)):
        return list(x)
    if pd.isna(x):
        return []
    s = str(x).strip()
    if s == "":
        return []
    try:
        v = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. "CSV; JSON"): fall back to delimiter splitting.
        pass
    else:
        # Tuples and sets are accepted as well as lists; otherwise their repr
        # would be split into tokens such as "('CSV'" and "'JSON')".
        if isinstance(v, (list, tuple, set)):
            return list(v)
    return [t.strip() for t in re.split(r"[;,|\s]+", s) if t.strip()]

# Guarantee both raw list columns exist (one fresh empty list per row), then
# derive their parsed counterparts with parse_list_cell.
_FORMAT_LIST_COLUMNS = ("open_formats_list", "non_open_formats_list")

for _list_col in _FORMAT_LIST_COLUMNS:
    if _list_col not in df.columns:
        df[_list_col] = [[] for _ in df.index]

for _list_col in _FORMAT_LIST_COLUMNS:
    df[_list_col + "_parsed"] = df[_list_col].apply(parse_list_cell)

# -----------------------------
# Estilo global (márgenes + altura consistente)
# -----------------------------
def polish(fig, height=460):
    """Apply the dashboard's shared layout to ``fig`` and return the same figure.

    Sets the figure height, fixed margins, a horizontal legend anchored above
    the plot area on the left, and a 14 pt font.
    """
    shared_layout = {
        "height": height,
        "margin": {"l": 60, "r": 30, "t": 70, "b": 80},
        "legend": {
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "left",
            "x": 0,
        },
        "font": {"size": 14},
    }
    fig.update_layout(**shared_layout)
    return fig

# =============================================================
# Paletas / Colores
# =============================================================
# Named hex colours shared by both palettes below.
_BLUE = "#1f77b4"
_ORANGE = "#ff7f0e"
_GREEN = "#2ca02c"
_RED = "#d62728"
_PURPLE = "#9467bd"
_GREY = "#7f7f7f"

# Colours for the open / closed / restricted / empty categories.
COLORS = {
    "abierto": _BLUE,
    "cerrado": _ORANGE,
    "restringido": _RED,
    "vacio": _GREY,
}

# Colour per component label, grouped by the dimension the label belongs to.
COMP_COLORS = {
    # Traceability
    "Origen": _BLUE,
    "Temporal": _GREEN,
    "Reutilizable (DOI)": _RED,
    "DOI/Reutilizable": _RED,
    # Semantic interoperability
    "DCAT/DCAT-AP": _BLUE,
    "API type presente": _PURPLE,
    "Vocabulario controlado": _GREEN,
    "Serialización semántica": _ORANGE,
    # Technical interoperability
    "Licencia abierta": _BLUE,
    "Formato abierto": _ORANGE,
    # Accessibility
    "API REST": _BLUE,
    "Formato permitido": _ORANGE,
    "Licencia presente": _GREEN,
    "URL descarga": _RED,
    # Quality
    "Diccionario de datos": _BLUE,
    "Descripción presente": _ORANGE,
}

# =============================================================
# Helpers (gráficas)
# =============================================================
def stacked_breakdown(labels, values, title):
    """Build one horizontal bar split into a segment per component.

    ``values`` are rescaled so the segments add up to 100 whenever their sum is
    positive; otherwise they are plotted as given. Each segment shows its share
    as text and takes its colour from COMP_COLORS (no explicit colour when the
    label is not in that palette). Returns the figure after
    ``polish(fig, height=300)``.
    """
    shares = np.array(values, dtype=float)
    total = shares.sum()
    if total > 0:
        shares = shares / total * 100.0

    segments = []
    offset = 0.0  # x position where the next segment starts
    for label, share in zip(labels, shares):
        segments.append(
            go.Bar(
                x=[share],
                y=[""],
                orientation="h",
                name=label,
                marker=dict(color=COMP_COLORS.get(label)),
                text=[f"{share:.1f}%"],
                textposition="inside",
                insidetextanchor="middle",
                base=offset,
            )
        )
        offset += share

    fig = go.Figure(data=segments)
    fig.update_layout(
        barmode="stack",
        xaxis=dict(range=[0, 100], title="Contribución (%)"),
        yaxis=dict(showticklabels=False),
        title=title,
    )
    return polish(fig, height=300)

def hist_score(data, col, title, xlabel="Score (0–100)", nbins=20):
    """Histogram of the numeric values of ``data[col]`` on a fixed 0-100 x axis.

    Values that cannot be converted to numbers are dropped before plotting and
    ``data`` itself is left untouched. Returns the figure after
    ``polish(fig, height=420)``.
    """
    numeric = pd.to_numeric(data[col], errors="coerce")
    keep = numeric.notna()
    plot_data = data.loc[keep].copy()
    plot_data[col] = numeric.loc[keep].to_numpy()

    axis_labels = {col: xlabel, "count": "Nº de datasets"}
    fig = px.histogram(plot_data, x=col, nbins=nbins, title=title, labels=axis_labels)
    fig.update_layout(xaxis_range=[0, 100])
    fig.update_yaxes(title_text="Nº de datasets")
    return polish(fig, height=420)

def bar_components_pct(labels, values, title):
    """Bar chart with one bar per component, values read as % of datasets.

    Bars are labelled with the value rounded to one decimal and coloured via
    COMP_COLORS (blue when the label is not in the palette). The y axis is
    fixed to 0-105. Returns the figure after ``polish(fig, height=420)``.
    """
    comp = pd.DataFrame({"Componente": labels, "Valor": values})
    rounded_text = comp["Valor"].round(1)
    bar_colors = [COMP_COLORS.get(name, "#1f77b4") for name in comp["Componente"]]

    fig = px.bar(
        comp,
        x="Componente",
        y="Valor",
        text=rounded_text,
        title=title,
        labels={"Valor": "% de datasets"},
    )
    fig.update_traces(textposition="outside", cliponaxis=False, marker_color=bar_colors)
    fig.update_layout(yaxis_range=[0, 105], xaxis_tickangle=-20)
    return polish(fig, height=420)

def bar_components_count(labels, counts, title):
    """Bar chart with one bar per component, values read as dataset counts.

    Bars are labelled with their count and coloured via COMP_COLORS (blue when
    the label is not in the palette). Returns the figure after
    ``polish(fig, height=420)``.
    """
    comp = pd.DataFrame({"Componente": labels, "Datasets": counts})
    bar_colors = [COMP_COLORS.get(name, "#1f77b4") for name in comp["Componente"]]

    fig = px.bar(
        comp,
        x="Componente",
        y="Datasets",
        text="Datasets",
        title=title,
        labels={"Datasets": "Nº de datasets"},
    )
    fig.update_traces(textposition="outside", cliponaxis=False, marker_color=bar_colors)
    fig.update_layout(xaxis_tickangle=-20)
    fig.update_yaxes(title_text="Nº de datasets")
    return polish(fig, height=420)

def scorecard(scores_dict, title):
    """Bar chart of average scores, highest first.

    ``scores_dict`` maps a metric name to its score; scores are coerced to
    numbers and shown rounded to one decimal on a fixed 0-105 y axis. Returns
    the figure after ``polish(fig, height=460)``.
    """
    metric_names = list(scores_dict.keys())
    metric_scores = list(scores_dict.values())
    table = pd.DataFrame({"Métrica": metric_names, "Score": metric_scores})
    table["Score"] = pd.to_numeric(table["Score"], errors="coerce")
    table = table.sort_values("Score", ascending=False)

    fig = px.bar(
        table,
        x="Métrica",
        y="Score",
        text=table["Score"].round(1),
        title=title,
        labels={"Score": "Promedio (0–100)"},
    )
    fig.update_traces(textposition="outside", cliponaxis=False)
    fig.update_layout(yaxis_range=[0, 105], xaxis_tickangle=-20)
    return polish(fig, height=460)

def traceability_score_bars(g, portal_name):
    """
    Bar chart of how many datasets fall on each traceability level.

    traceability_score arrives from Punto 3 as:
        ((traceable_origen + traceable_temporal + traceable_reutilizable)/3)*100
    so the expected values are roughly 0, 33.33, 66.67 and 100. Each numeric
    score is snapped to the nearest of those four levels (the lower level wins
    a tie) and the datasets per level are counted; non-numeric scores are
    ignored.
    """
    level_values = np.array([0, 33.33, 66.67, 100.0])
    scores = pd.to_numeric(g["traceability_score"], errors="coerce").dropna().to_numpy(dtype=float)

    # Distance of every score to every level -> index of the closest level.
    distances = np.abs(scores[:, None] - level_values[None, :])
    nearest_level = distances.argmin(axis=1)
    per_level = np.bincount(nearest_level, minlength=len(level_values))

    dfb = pd.DataFrame({
        "Nivel": ["0", "33", "66", "100"],
        "Datasets": per_level,
    })

    fig = px.bar(
        dfb,
        x="Nivel",
        y="Datasets",
        text="Datasets",
        title=f"Traceability score (0–100) – distribución por niveles | {portal_name}",
        labels={"Datasets": "Nº de datasets"},
    )
    # One colour per level, from lowest (red) to highest (blue).
    level_colors = ["#d62728", "#ff7f0e", "#2ca02c", "#1f77b4"]
    fig.update_traces(marker_color=level_colors, textposition="outside", cliponaxis=False)
    fig.update_yaxes(title_text="Nº de datasets")
    fig.update_layout(xaxis_title="Score (0–100)")
    return polish(fig, height=420)

# =============================================================
# 1) Interoperabilidad técnica – Tipos de formatos detectados 
# =============================================================
# Open formats counted individually, in their upper-case spelling.
OPEN_FORMATS = ["CSV","JSON","GEOJSON","XML","RDF","TTL","TURTLE","N-TRIPLES","NT","JSON-LD","JSONLD"]

# Normalise every parsed token once (trimmed, upper-cased) so that "csv",
# " Csv " and "CSV" all count towards the same format. The previous
# case-sensitive test silently skipped any token not already in upper case.
_open_format_sets = [
    {str(token).strip().upper() for token in lst}
    for lst in df["open_formats_list_parsed"]
]

# One row per open format offered by at least one dataset.
open_counts = []
for f in OPEN_FORMATS:
    cnt = sum(1 for tokens in _open_format_sets if f in tokens)
    if cnt > 0:
        open_counts.append({"Formato": f, "Datasets": cnt})
open_counts_df = pd.DataFrame(open_counts, columns=["Formato", "Datasets"])

# Datasets that list at least one non-open format, grouped into a single bar.
others_count = sum(1 for lst in df["non_open_formats_list_parsed"] if len(lst) > 0)
others_rows = [{"Formato": "Otros formatos (cerrados)", "Datasets": others_count}]
others_df = pd.DataFrame(others_rows)

# Built from the row dicts directly so that an empty open-format table never
# has to be concatenated.
fmt_bar_df = pd.DataFrame(open_counts + others_rows, columns=["Formato", "Datasets"])
fmt_bar_df = fmt_bar_df.sort_values("Datasets", ascending=False)

fmt_bar_df["Tipo"] = fmt_bar_df["Formato"].apply(lambda x: "Abierto" if x in OPEN_FORMATS else "Cerrado")

fig_fmt_types = px.bar(
    fmt_bar_df,
    x="Formato", y="Datasets",
    text="Datasets",
    title="Clasificación de formatos para interoperabilidad técnica",
    labels={"Datasets": "Nº de datasets"},
    color="Tipo",
    color_discrete_map={"Abierto": COLORS["abierto"], "Cerrado": COLORS["cerrado"]}
)
fig_fmt_types.update_traces(textposition="outside", cliponaxis=False)
fig_fmt_types.update_layout(xaxis=dict(tickangle=-25, categoryorder="total descending"))
polish(fig_fmt_types, height=500)

caption_fmt_types = (
    "<b>Interpretación.</b> Cada barra de formato abierto indica cuántos datasets del portal lo ofrecen. "
    "La barra <i>Otros formatos (cerrados)</i> agrupa datasets con al menos un formato propietario o no estándar."
)

# =============================================================
# 2) Interoperabilidad en formatos – porcentaje 
# =============================================================
# Use the per-dataset counts delivered with the data; rebuild them from the
# parsed lists when a count column is absent or entirely empty.
for _count_col, _parsed_col in (
    ("open_format_count", "open_formats_list_parsed"),
    ("non_open_format_count", "non_open_formats_list_parsed"),
):
    _needs_rebuild = _count_col not in df.columns or df[_count_col].isna().all()
    if _needs_rebuild:
        df[_count_col] = df[_parsed_col].apply(len)

open_total = float(df["open_format_count"].fillna(0).sum())
non_open_total = float(df["non_open_format_count"].fillna(0).sum())
_formats_total = open_total + non_open_total
den = _formats_total if _formats_total > 0 else 1  # never divide by zero

_open_share = 100 * open_total / den
_non_open_share = 100 * non_open_total / den
fmt_pct = pd.DataFrame({
    "Clasificación": ["Abiertos", "Cerrados/Propietarios"],
    "Porcentaje": [_open_share, _non_open_share],
})

_pct_colors = {"Abiertos": COLORS["abierto"], "Cerrados/Propietarios": COLORS["cerrado"]}
fig_fmt_pct = px.bar(
    fmt_pct,
    x="Clasificación",
    y="Porcentaje",
    text=fmt_pct["Porcentaje"].round(1),
    title="Distribución porcentual de formatos (dimensión técnica)",
    labels={"Porcentaje": "Porcentaje (%)"},
    color="Clasificación",
    color_discrete_map=_pct_colors,
)
fig_fmt_pct.update_traces(textposition="outside", cliponaxis=False)
fig_fmt_pct.update_layout(yaxis_range=[0, 105])
polish(fig_fmt_pct, height=420)

caption_fmt_pct = (
    "<b>Interpretación.</b> Proporción calculada sobre el total de formatos reportados en los metadatos "
    "(sumatoria de formatos abiertos y cerrados por dataset)."
)

# =============================================================
# 3) Licenciamiento permitido – 3 categorías
# =============================================================
# Guarantee the three licensing columns exist before classifying:
#   license          -> all missing when absent
#   license_present  -> 1 where `license` is not null
#   license_open     -> 0 when absent
_columns_on_entry = set(df.columns)
if "license" not in _columns_on_entry:
    df["license"] = np.nan
if "license_present" not in _columns_on_entry:
    df["license_present"] = df["license"].notna().astype(int)
if "license_open" not in _columns_on_entry:
    df["license_open"] = 0

# Stand-alone "nc" / "nd" tokens (e.g. "cc-by-nc", "cc nd"), as opposed to the
# same letters inside ordinary words such as "licencia" or "condiciones".
_LICENSE_NC_ND_TOKEN = re.compile(r"(?<![a-z0-9])n[cd](?![a-z0-9])")

# Explicit non-commercial / no-derivatives wording (English and Spanish),
# with or without a space or hyphen between the words.
_LICENSE_RESTRICTED = re.compile(
    r"non[-\s]?commercial"
    r"|no[-\s]?comercial"
    r"|\bby[-\s]?nc\b"
    r"|noderiv"
    r"|\bno[-\s]deriv"
    r"|sin[-\s]?(?:obra[-\s]?)?deriv"
    r"|\bby[-\s]?nd\b"
)

# Wording that signals an undefined licence or one requiring prior consultation.
_LICENSE_UNDEFINED_HINTS = (
    "no definido",
    "sin definir",
    "consultar",
    "consulte",
    "consejer",
    "permiso",
    "autoriz",
)


def license_bucket(row):
    """Classify one dataset's licence as "Apertura total", "Restringida" or "Vacío legal".

    ``row`` is any mapping-like object with ``.get`` (a DataFrame row or a
    dict) that may carry ``license`` (text), ``license_present`` and
    ``license_open`` (0/1 flags). Flags that are missing or not numeric count
    as 0.

    Checks are applied in this order:
      1. no licence, or wording that leaves it undefined -> "Vacío legal"
      2. non-commercial / no-derivatives clauses          -> "Restringida"
      3. standard legal notice ("aviso legal")            -> "Apertura total"
      4. CC0, or Creative Commons without NC/ND tokens    -> "Apertura total"
      5. ``license_open == 1``                            -> "Apertura total"
      6. anything else                                    -> "Vacío legal"
    """
    raw_license = row.get("license", "")
    lic = str(raw_license).lower() if pd.notna(raw_license) else ""

    def _flag(name):
        # NaN is truthy, so `int(value or 0)` would raise on a missing flag.
        value = pd.to_numeric(row.get(name, 0), errors="coerce")
        return 0 if pd.isna(value) else int(value)

    present = _flag("license_present")
    open_flag = _flag("license_open")

    # 1) Legal void / undefined licence.
    if present == 0 or not lic.strip() or any(hint in lic for hint in _LICENSE_UNDEFINED_HINTS):
        return "Vacío legal"

    # 2) Explicit restrictions: take precedence over every "open" rule below.
    if _LICENSE_RESTRICTED.search(lic):
        return "Restringida"

    # 3) Standard legal notice (treated as open).
    if "avisolegal" in lic or "/aviso-legal" in lic or "aviso legal" in lic:
        return "Apertura total"

    # 4) Creative Commons licences that are actually open (no NC / ND token).
    #    Token matching is used because a plain substring test for "nc"/"nd"
    #    also hits words like "licencia", "licence" or "condiciones".
    if "cc0" in lic:
        return "Apertura total"
    mentions_cc = "creative commons" in lic or re.search(r"\bcc\b", lic) is not None
    if mentions_cc and not _LICENSE_NC_ND_TOKEN.search(lic):
        return "Apertura total"

    # 5) Fallback on the openness flag that arrives with the data.
    if open_flag == 1:
        return "Apertura total"

    return "Vacío legal"


# Classify every dataset, then count datasets per category in a fixed order.
df["license_bucket"] = df.apply(license_bucket, axis=1)

# `fill_value=0` keeps the counts as integers even when a category is absent
# (reindex + fillna would turn the whole column into floats).
_LICENSE_CATEGORIES = ["Apertura total", "Restringida", "Vacío legal"]
lic_counts = (
    df["license_bucket"]
    .value_counts()
    .reindex(_LICENSE_CATEGORIES, fill_value=0)
    .astype(int)
    .reset_index()
)
lic_counts.columns = ["Categoría","Datasets"]

color_map_lic = {
    "Apertura total": COLORS["abierto"],
    "Restringida": COLORS["restringido"],
    "Vacío legal": COLORS["vacio"]
}

fig_license = px.bar(
    lic_counts, x="Categoría", y="Datasets",
    text="Datasets",
    title="Licenciamiento permitido (dimensión legal)",
    labels={"Datasets":"Nº de datasets"},
    color="Categoría",
    color_discrete_map=color_map_lic
)
fig_license.update_traces(textposition="outside", cliponaxis=False)
polish(fig_license, height=420)

# The caption lists what license_bucket actually places in each category: the
# no-derivatives clause is restricted too, and licences that require
# consultation or authorisation count as a legal void.
caption_license = (
    "<b>Interpretación.</b> <i>Apertura total</i>: CC0, CC BY/Creative Commons sin cláusulas NC/ND y equivalentes. "
    "<i>Restringida</i>: cláusulas “No comercial” (CC BY-NC) o “Sin obra derivada” (CC BY-ND). "
    "<i>Vacío legal</i>: licencia ausente, no definida o sujeta a consulta/autorización."
)

# =============================================================
# MÉTRICAS MADURAS POR PORTAL
# =============================================================
# Columns read by the per-portal section, with the value each one receives
# when the input file does not provide it.
_METRIC_COLUMN_DEFAULTS = {
    # dimension scores
    "traceability_score": np.nan,
    "interoperability_semantics": np.nan,
    "interoperability_technical": np.nan,
    "accessibility_score": np.nan,
    "quality_score": np.nan,
    # traceability components
    "traceable_origen": 0,
    "traceable_temporal": 0,
    "traceable_reutilizable": 0,
    # ages in months
    "age_month_issued": np.nan,
    "age_months_modified": np.nan,
    # semantic interoperability components
    "portal_supports_dcat_dcatap": 0,
    "api_type": np.nan,
    "uses_controlled_vocab": 0,
    "has_semantic_serialization": 0,
    # technical interoperability components
    "license_open": 0,
    "has_open_format": 0,
    # accessibility components
    "portal_has_api_rest": 0,
    "has_allowed_format": 0,
    "license_present": 0,
    "download_url_present": 0,
    # quality components
    "has_data_dictionary": 0,
    "description": np.nan,
    # declared update frequency
    "update_frequency": np.nan,
    # global index
    "portal_maturity": np.nan,
    "portal_maturity_level": None,
}

for col, default in _METRIC_COLUMN_DEFAULTS.items():
    if col not in df.columns:
        df[col] = default

# auxiliares
# 1 when api_type holds any non-null value.
# NOTE(review): an empty string also counts as present here — confirm that
# this matches how the semantic score was computed upstream.
df["_api_type_present"] = df["api_type"].notna().astype(int)

# 1 when the description holds real text. Null values and the textual
# placeholders that stringified nulls turn into ("nan", "NaN", "None", "<NA>")
# count as absent, consistent with how update_frequency is cleaned.
_description_text = df["description"].astype(str).str.strip()
_DESCRIPTION_PLACEHOLDERS = ["", "nan", "NaN", "None", "<NA>"]
df["_description_present"] = (
    df["description"].notna() & ~_description_text.isin(_DESCRIPTION_PLACEHOLDERS)
).astype(int)

# Si portal_maturity no viene, lo calculamos como en Punto 3
# Where portal_maturity is missing (or not numeric), fill it with the plain
# average of the five dimension scores; a missing score leaves the result NaN.
_MATURITY_SCORE_COLUMNS = (
    "accessibility_score",
    "interoperability_semantics",
    "interoperability_technical",
    "traceability_score",
    "quality_score",
)

df["portal_maturity"] = pd.to_numeric(df["portal_maturity"], errors="coerce")
mask_missing = df["portal_maturity"].isna()
if mask_missing.any():
    _score_sum = None
    for _score_col in _MATURITY_SCORE_COLUMNS:
        _score = pd.to_numeric(df.loc[mask_missing, _score_col], errors="coerce")
        _score_sum = _score if _score_sum is None else _score_sum + _score
    df.loc[mask_missing, "portal_maturity"] = _score_sum / 5.0

# =========================
# Generar figs por portal
# =========================
# Maps each portal name to its ordered list of (figure, caption HTML) pairs.
portal_figs = {}  # portal -> list of (fig, caption)

# One dashboard section per distinct value of the `portal` column.
# NOTE(review): pandas groupby drops rows whose key is NaN by default, so
# datasets without a portal value would get no section — confirm this is intended.
for portal_name, g in df.groupby("portal"):
    figs = []
    # Work on a copy so the column coercions below do not touch `df`.
    g = g.copy()

    # =========================================================
    # 1) Trazabilidad temporal (USAR edades ya calculadas en Punto 3)
    # =========================================================
    # Ages (in months) are used as they arrive; non-numeric values become NaN.
    for age_col in ("age_month_issued", "age_months_modified"):
        g[age_col] = pd.to_numeric(g[age_col], errors="coerce")

    # Histogram: months since the last modification.
    rows_with_mod_age = g.dropna(subset=["age_months_modified"])
    fig_age_mod = px.histogram(
        rows_with_mod_age,
        x="age_months_modified",
        nbins=25,
        title=f"Trazabilidad temporal – antigüedad desde última actualización (meses) | {portal_name}",
        labels={"age_months_modified": "Meses", "count": "Nº de datasets"},
    )
    fig_age_mod.update_yaxes(title_text="Nº de datasets")
    caption_age_mod = (
        "<b>Interpretación.</b> Antigüedad en meses desde <i>modified</i>, calculada en Punto 3 como "
        "<i>age_months_modified</i> (referencia: fecha del sistema)."
    )
    figs.append((polish(fig_age_mod, height=420), caption_age_mod))

    # Histogram: months since publication.
    rows_with_issue_age = g.dropna(subset=["age_month_issued"])
    fig_age_iss = px.histogram(
        rows_with_issue_age,
        x="age_month_issued",
        nbins=25,
        title=f"Trazabilidad temporal – antigüedad desde publicación (meses) | {portal_name}",
        labels={"age_month_issued": "Meses", "count": "Nº de datasets"},
    )
    fig_age_iss.update_yaxes(title_text="Nº de datasets")
    caption_age_iss = (
        "<b>Interpretación.</b> Antigüedad en meses desde <i>issued</i>, calculada en Punto 3 como "
        "<i>age_month_issued</i>."
    )
    figs.append((polish(fig_age_iss, height=420), caption_age_iss))

    # =========================================================
    # 2) Traceability – componentes (conteo) + desglose agregado
    # =========================================================
    n = len(g)  # datasets in this portal; also read by the quality section

    # A component counts as fulfilled only where its flag equals exactly 1.
    trace_flags = {
        flag_col: pd.to_numeric(g[flag_col], errors="coerce").fillna(0).eq(1)
        for flag_col in ("traceable_origen", "traceable_temporal", "traceable_reutilizable")
    }
    origin_count = int(trace_flags["traceable_origen"].sum())
    temporal_count = int(trace_flags["traceable_temporal"].sum())
    doi_count = int(trace_flags["traceable_reutilizable"].sum())

    fig_trace_comp_count = bar_components_count(
        ["Origen", "Temporal", "Reutilizable (DOI)"],
        [origin_count, temporal_count, doi_count],
        f"Traceability – componentes (conteo de datasets) | {portal_name}",
    )
    caption_trace_count = (
        "<b>Interpretación.</b> Conteo de datasets que cumplen cada componente: "
        "<i>origen</i> (publisher+url+identifier), <i>temporal</i> (issued+modified+update_frequency) y "
        "<i>reutilizable</i> (DOI)."
    )
    figs.append((fig_trace_comp_count, caption_trace_count))

    # Same components as a share of the portal's datasets (0 when it has none).
    if n:
        origin_pct = origin_count / n * 100
        temporal_pct = temporal_count / n * 100
        doi_pct = doi_count / n * 100
    else:
        origin_pct = temporal_pct = doi_pct = 0

    fig_trace_break = stacked_breakdown(
        ["Origen", "Temporal", "DOI/Reutilizable"],
        [origin_pct, temporal_pct, doi_pct],
        f"Traceability – desglose agregado (normalizado a 100) | {portal_name}",
    )
    caption_trace_break = (
        "<b>Interpretación.</b> Lectura agregada y comparable: muestra el peso relativo de cada componente. "
        "Si <i>DOI</i>=0, la trazabilidad máxima observable queda limitada."
    )
    figs.append((fig_trace_break, caption_trace_break))

    # =========================================================
    # 3) Update frequency (vacíos -> No definido) con conteo + %
    # =========================================================
    # Declared update frequency as trimmed text; blanks and stringified nulls
    # are flagged as missing.
    raw_uf = g["update_frequency"].astype(str).str.strip()
    uf_missing = raw_uf.isin(["", "nan", "NaN", "None"])

    # Lower-case and map the Spanish terms to their English equivalents so the
    # same frequency is not split across two bars.
    uf = raw_uf.str.lower().replace({
        "mensual": "monthly",
        "semanal": "weekly",
        "diaria": "daily",
        "anual": "annual",
        "trimestral": "quarterly",
        "semestral": "semiannual",
    })

    # The placeholder is applied AFTER lower-casing so the bar reads
    # "No definido", exactly as the caption announces (it used to be
    # lower-cased together with the real values).
    uf = uf.where(~uf_missing, "No definido")

    freq_counts = uf.value_counts().reset_index()
    freq_counts.columns = ["Frecuencia", "Datasets"]
    freq_counts["Porcentaje"] = (freq_counts["Datasets"] / len(g) * 100).round(1)
    freq_counts["Etiqueta"] = freq_counts.apply(lambda r: f"{int(r['Datasets'])} ({r['Porcentaje']}%)", axis=1)

    fig_update_freq = px.bar(
        freq_counts,
        x="Frecuencia", y="Datasets",
        text="Etiqueta",
        title=f"Frecuencia de actualización declarada (update_frequency) | {portal_name}",
        labels={"Datasets": "Nº de datasets"}
    )
    fig_update_freq.update_traces(textposition="outside", cliponaxis=False)
    fig_update_freq.update_layout(xaxis_tickangle=-25)
    fig_update_freq.update_yaxes(title_text="Nº de datasets")

    figs.append((polish(fig_update_freq, height=460),
                 "<b>Interpretación.</b> Frecuencia tomada directamente del Excel (columna <i>update_frequency</i>, Punto 3). "
                 "Los valores vacíos se agrupan como <i>No definido</i>. La etiqueta muestra conteo y porcentaje."))

    # =========================================================
    # 4) Traceability_score -> barras por niveles (0/33/66/100)
    # =========================================================
    # Datasets per traceability level (0/33/66/100).
    caption_trace_levels = (
        "<b>Interpretación.</b> Gráfico construido desde la columna <i>traceability_score</i> (Excel, Punto 3). "
        "Se calcula como <i>((traceable_origen + traceable_temporal + traceable_reutilizable)/3)*100</i>, "
        "por eso sus niveles esperados son 0/33/66/100."
    )
    fig_trace_score_levels = traceability_score_bars(g, portal_name)
    figs.append((polish(fig_trace_score_levels, height=420), caption_trace_levels))

    # =========================================================
    # 5) Interoperabilidad semántica (dist + comp % + desglose)
    # =========================================================
    # Distribution of the semantic interoperability score.
    fig_sem_hist = hist_score(
        g,
        "interoperability_semantics",
        f"Interoperabilidad semántica (0–100) – distribución | {portal_name}",
    )
    figs.append((
        fig_sem_hist,
        "<b>Interpretación.</b> Score (0–100) basado en: DCAT/DCAT-AP, presencia de api_type, vocabulario controlado y serialización semántica.",
    ))

    # Share of datasets (in %) whose flag equals exactly 1, per component.
    dcat_pct, api_pct, vocab_pct, ser_pct = (
        float(pd.to_numeric(g[flag_col], errors="coerce").fillna(0).eq(1).mean() * 100)
        for flag_col in (
            "portal_supports_dcat_dcatap",
            "_api_type_present",
            "uses_controlled_vocab",
            "has_semantic_serialization",
        )
    )
    semantic_labels = ["DCAT/DCAT-AP", "API type presente", "Vocabulario controlado", "Serialización semántica"]
    semantic_values = [dcat_pct, api_pct, vocab_pct, ser_pct]

    fig_sem_components = bar_components_pct(
        semantic_labels,
        semantic_values,
        f"Interoperabilidad semántica – componentes (% datasets) | {portal_name}",
    )
    figs.append((
        fig_sem_components,
        "<b>Interpretación.</b> Porcentaje de datasets que evidencian cada componente semántico.",
    ))

    fig_sem_breakdown = stacked_breakdown(
        semantic_labels,
        semantic_values,
        f"Interoperabilidad semántica – desglose agregado (normalizado a 100) | {portal_name}",
    )
    figs.append((
        fig_sem_breakdown,
        "<b>Interpretación.</b> Peso relativo de cada componente en la dimensión semántica (normalizado).",
    ))

    # =========================================================
    # 6) Interoperabilidad técnica (dist + comp % + desglose)
    # =========================================================
    # Distribution of the technical interoperability score.
    fig_tech_hist = hist_score(
        g,
        "interoperability_technical",
        f"Interoperabilidad técnica (0–100) – distribución | {portal_name}",
    )
    figs.append((
        fig_tech_hist,
        "<b>Interpretación.</b> Score (0–100) basado en licencia abierta y presencia de formato abierto.",
    ))

    # Share of datasets (in %) whose flag equals exactly 1, per component.
    lic_open_pct, fmt_open_pct = (
        float(pd.to_numeric(g[flag_col], errors="coerce").fillna(0).eq(1).mean() * 100)
        for flag_col in ("license_open", "has_open_format")
    )
    technical_labels = ["Licencia abierta", "Formato abierto"]
    technical_values = [lic_open_pct, fmt_open_pct]

    fig_tech_components = bar_components_pct(
        technical_labels,
        technical_values,
        f"Interoperabilidad técnica – componentes (% datasets) | {portal_name}",
    )
    figs.append((
        fig_tech_components,
        "<b>Interpretación.</b> Cumplimiento técnico desagregado: licencias y formatos abiertos.",
    ))

    fig_tech_breakdown = stacked_breakdown(
        technical_labels,
        technical_values,
        f"Interoperabilidad técnica – desglose agregado (normalizado a 100) | {portal_name}",
    )
    figs.append((
        fig_tech_breakdown,
        "<b>Interpretación.</b> Peso relativo de licencia vs formato (normalizado).",
    ))

    # =========================================================
    # 7) Accesibilidad (dist + comp % + desglose)
    # =========================================================
    # Distribution of the accessibility score.
    fig_acc_hist = hist_score(
        g,
        "accessibility_score",
        f"Accesibilidad (0–100) – distribución | {portal_name}",
    )
    figs.append((
        fig_acc_hist,
        "<b>Interpretación.</b> Accesibilidad medida por API REST, formato permitido, licencia presente y URL de descarga.",
    ))

    # Share of datasets (in %) whose flag equals exactly 1, per condition.
    api_rest_pct, allowed_pct, lic_pres_pct, dl_pct = (
        float(pd.to_numeric(g[flag_col], errors="coerce").fillna(0).eq(1).mean() * 100)
        for flag_col in (
            "portal_has_api_rest",
            "has_allowed_format",
            "license_present",
            "download_url_present",
        )
    )
    accessibility_labels = ["API REST", "Formato permitido", "Licencia presente", "URL descarga"]
    accessibility_values = [api_rest_pct, allowed_pct, lic_pres_pct, dl_pct]

    fig_acc_components = bar_components_pct(
        accessibility_labels,
        accessibility_values,
        f"Accesibilidad – componentes (% datasets) | {portal_name}",
    )
    figs.append((
        fig_acc_components,
        "<b>Interpretación.</b> Porcentaje de datasets que soportan cada condición de accesibilidad.",
    ))

    fig_acc_breakdown = stacked_breakdown(
        accessibility_labels,
        accessibility_values,
        f"Accesibilidad – desglose agregado (normalizado a 100) | {portal_name}",
    )
    figs.append((
        fig_acc_breakdown,
        "<b>Interpretación.</b> Peso relativo de cada componente de accesibilidad.",
    ))

    # =========================================================
    # 8) Calidad (dist + componentes CONTEO + desglose)
    # =========================================================
    # Distribution of the quality score.
    fig_q_hist = hist_score(
        g,
        "quality_score",
        f"Calidad (0–100) – distribución | {portal_name}",
    )
    figs.append((
        fig_q_hist,
        "<b>Interpretación.</b> Calidad medida por diccionario de datos y presencia de descripción.",
    ))

    # Datasets whose flag equals exactly 1, per quality component.
    has_dictionary = pd.to_numeric(g["has_data_dictionary"], errors="coerce").fillna(0).eq(1)
    has_description = pd.to_numeric(g["_description_present"], errors="coerce").fillna(0).eq(1)
    dict_count = int(has_dictionary.sum())
    desc_count = int(has_description.sum())

    quality_labels = ["Diccionario de datos", "Descripción presente"]
    fig_q_comp_count = bar_components_count(
        quality_labels,
        [dict_count, desc_count],
        f"Calidad – componentes (conteo de datasets) | {portal_name}",
    )
    figs.append((
        fig_q_comp_count,
        "<b>Interpretación.</b> Conteo de datasets con diccionario de datos y con descripción disponible.",
    ))

    # `n` is the portal's dataset count set earlier in this loop iteration.
    if n:
        dict_pct = dict_count / n * 100
        desc_pct = desc_count / n * 100
    else:
        dict_pct = desc_pct = 0

    fig_q_breakdown = stacked_breakdown(
        quality_labels,
        [dict_pct, desc_pct],
        f"Calidad – desglose agregado (normalizado a 100) | {portal_name}",
    )
    figs.append((
        fig_q_breakdown,
        "<b>Interpretación.</b> Peso relativo de diccionario vs descripción (normalizado).",
    ))

    # =========================================================
    # 9) Scorecard e Índice global + niveles (conteo + %)
    # =========================================================
    # Portal-level averages: the five dimension scores, then the global index.
    m = {
        score_col: float(pd.to_numeric(g[score_col], errors="coerce").mean())
        for score_col in (
            "accessibility_score",
            "interoperability_semantics",
            "interoperability_technical",
            "traceability_score",
            "quality_score",
        )
    }
    m["Índice global (promedio 5 scores)"] = float(
        pd.to_numeric(g["portal_maturity"], errors="coerce").mean()
    )

    caption_scorecard = (
        "<b>Interpretación.</b> El <i>Índice global</i> (<i>portal_maturity</i>) es el promedio de 5 scores: "
        "<i>accessibility_score</i>, <i>interoperability_semantics</i>, <i>interoperability_technical</i>, "
        "<i>traceability_score</i> y <i>quality_score</i>."
    )
    fig_scorecard = scorecard(m, f"Scorecard de madurez (promedios 0–100) | {portal_name}")
    figs.append((fig_scorecard, caption_scorecard))

    # Maturity level per dataset: use the delivered column when it has any
    # value, otherwise derive it from portal_maturity with cuts at 40 and 70.
    has_declared_levels = (
        "portal_maturity_level" in g.columns
        and g["portal_maturity_level"].notna().any()
    )
    if has_declared_levels:
        levels = g["portal_maturity_level"].astype(str)
    else:
        maturity_values = pd.to_numeric(g["portal_maturity"], errors="coerce")
        levels = pd.cut(
            maturity_values,
            bins=[-0.1, 40, 70, 100],
            labels=["bajo", "medio", "alto"],
        ).astype(str)

    vc = levels.value_counts(dropna=False)
    total_n = int(len(g))

    level_names = ["bajo", "medio", "alto"]
    level_df = pd.DataFrame({
        "Nivel": level_names,
        "Datasets": [int(vc.get(level_name, 0)) for level_name in level_names],
    })
    level_df["Porcentaje"] = (level_df["Datasets"] / max(total_n, 1) * 100).round(1)
    level_df["Etiqueta"] = [
        f"{count} ({pct}%)"
        for count, pct in zip(level_df["Datasets"], level_df["Porcentaje"])
    ]

    fig_levels = px.bar(
        level_df,
        x="Nivel",
        y="Datasets",
        text="Etiqueta",
        title=f"Distribución de niveles de madurez (índice global) | {portal_name}",
        labels={"Datasets": "Nº de datasets"},
        color="Nivel",
        color_discrete_map={"bajo": "#d62728", "medio": "#ff7f0e", "alto": "#2ca02c"},
    )
    fig_levels.update_traces(textposition="outside", cliponaxis=False)
    fig_levels.update_yaxes(title_text="Nº de datasets")

    caption_levels = (
        f"<b>Interpretación.</b> Nivel calculado desde <i>portal_maturity</i> (Excel, Punto 3). "
        f"Se reporta conteo y porcentaje sobre el total del portal: <b>N={total_n}</b>. "
        "Cortes: bajo ≤ 40, medio (40–70], alto > 70."
    )
    figs.append((polish(fig_levels, height=420), caption_levels))

    # Store this portal's section.
    portal_figs[portal_name] = figs

# =============================================================
# Render HTML con “captions” fuera del gráfico 
# =============================================================
def fig_html(fig):
    """Return the figure as an HTML fragment: no page wrapper and no bundled plotly.js."""
    fragment = fig.to_html(include_plotlyjs=False, full_html=False)
    return fragment

def card(fig, caption):
    """Wrap a figure and its caption in a "card" block of HTML.

    ``caption`` is inserted as-is, so it may contain HTML markup.
    """
    plot_fragment = fig_html(fig)
    return f"""
    <div class="card">
      {plot_fragment}
      <div class="caption">{caption}</div>
    </div>
    """

# Global cards first (format types, format percentages, licensing); the page
# template addresses these three by position.
cards_html = [
    card(fig_fmt_types, caption_fmt_types),
    card(fig_fmt_pct, caption_fmt_pct),
    card(fig_license, caption_license),
]

# Then one heading plus its cards per portal.
for portal_name, figs in portal_figs.items():
    # The portal name comes from the dataset, so it is escaped before being
    # placed in the markup (e.g. a name containing "&" or "<").
    safe_portal_name = html_escape(str(portal_name))
    cards_html.append(f"<h2 style='margin-top:26px;'>Portal: {safe_portal_name}</h2>")
    for fig, cap in figs:
        cards_html.append(card(fig, cap))

# Full page: the first two cards share a grid row, the third card gets its own
# row, and every remaining entry of cards_html (portal headings and their
# cards) is concatenated below. Doubled braces are literal CSS braces inside
# the f-string.
# NOTE(review): plotly.js is loaded from the CDN at a pinned version (2.27.0);
# confirm it is compatible with the installed plotly Python package.
html = f"""
<!doctype html>
<html lang="es">
<head>
  <meta charset="utf-8"/>
  <title>Dashboard – Madurez Ecosistema Open Data</title>
  <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
  <style>
    body {{ font-family: Arial, sans-serif; margin: 22px; }}
    h1 {{ margin-bottom: 6px; }}
    h2 {{ margin-bottom: 8px; }}
    .sub {{ color: #555; margin-bottom: 18px; }}
    .grid2 {{ display: grid; grid-template-columns: 1fr; gap: 18px; }}
    .card {{
      border: 1px solid #ddd; border-radius: 12px; padding: 12px;
      background: #fff;
      margin-bottom: 14px;
    }}
    .caption {{
      margin-top: 10px;
      font-size: 13.5px;
      color: #333;
      line-height: 1.35;
      background: #fafafa;
      border: 1px solid #eee;
      border-radius: 10px;
      padding: 10px 12px;
    }}
    @media (min-width: 1100px) {{
      .grid2 {{ grid-template-columns: 1fr 1fr; }}
    }}
  </style>
</head>
<body>
  <h1>Dashboard – Análisis de madurez del ecosistema de datos abiertos</h1>
  <div class="sub">Visualizaciones interactivas basadas en los metadatos procesados (Punto 3 – V9 LIMPIO).</div>

  <div class="grid2">
    {cards_html[0]}
    {cards_html[1]}
  </div>

  <div class="grid2">
    {cards_html[2]}
  </div>

  <div>
    {"".join(cards_html[3:])}
  </div>

</body>
</html>
"""

# Write the page to disk as UTF-8 and report the output file name.
output_path = Path(OUT_HTML)
output_path.write_text(html, encoding="utf-8")
print(" Dashboard PRO generado:", OUT_HTML)