# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import re
import time
import requests
import pandas as pd
from datetime import datetime, timezone, timedelta
import unicodedata
from urllib.parse import quote

# ============================================================
# POINT 1 (METADATA) - EXAMPLE PORTAL: datos.gob.es (national portal)
# Source: APIDATA (Linked Data API) + (optional) SPARQL for labels
# ============================================================

PORTAL = "datos.gob.es (Portal Nacional)"
API_TYPE = "APIDATA+SPARQL"  # API type label written to every output row

APIDATA_BASE = "https://datos.gob.es/apidata/catalog/dataset.json" # APIDATA dataset listing URL
SPARQL_ENDPOINT = "https://datos.gob.es/virtuoso/sparql"           # SPARQL endpoint URL

TIMEOUT = 60            # passed as `timeout=` to requests.get
SLEEP = 0.05            # pause handed to time.sleep between iterations
MAX_DATASETS = 500      # cap on the number of collected datasets; can be changed
PAGE_SIZE = 500         # requested page size; clamped to 50 when the request is built

MIN_AGE_MONTHS = 12             # minimum dataset age criterion (months); can be changed

# HTTP headers sent with every request issued through safe_get_json.
UA_HEADERS = {
    "User-Agent": "TFM-IIP-Metadata-Extractor/1.0",
    "Accept": "application/json",
}

# Normalized format labels that count as an allowed (open) format.
OPEN_FORMATS = {"CSV","JSON","GEOJSON","XML","RDF","TTL","TURTLE","N-TRIPLES","NT","JSON-LD","JSONLD"}
# Normalized format labels that count as a semantic (RDF-family) serialization.
SEMANTIC_FORMATS = {"RDF","TTL","TURTLE","N-TRIPLES","NT","JSON-LD","JSONLD"}

# Regex patterns searched in lower-cased titles/descriptions/URLs as hints
# that a data dictionary or similar documentation exists.
DATA_DICT_PATTERNS = [
    r"diccionario de datos", r"data dictionary", r"schema", r"esquema",
    r"documentaci[oó]n", r"metadatos", r"data model", r"glosario"
]

 # PUEDE CAMBIAR LOS CRITERIOS DE CATEGORIAS
# Category name -> keywords that assign a dataset to that category.
# Keywords are accent-stripped and lower-cased before being compared, and
# categories are tried in the order listed here (first match wins).
CATEGORIAS_KEYWORDS = {
    "Transporte y Movilidad": [
        "transporte", "movilidad", "trafico", "tráfico", "carreteras",
        "vehiculos", "vehículos", "autobuses", "metro", "ciclistas",
        "aparcamientos", "taxis", "vialidad"
    ],
    "Ciencia y Tecnología": [
        "ciencia", "tecnologia", "tecnología", "innovacion", "innovación",
        "investigacion", "investigación", "i+d", "proyectos",
        "desarrollo", "sistemas", "tecnologías"
    ],
    "Salud": [
        "salud", "sanidad", "hospitales", "urgencias", "epidemiologia",
        "epidemiología", "asistencia sanitaria", "covid", "enfermedades",
        "vacunacion", "vacunación", "salud pública", "farmacias"
    ],
    "Educación": [
        "educacion", "educación", "formacion", "formación", "universidad",
        "universidades", "colegios", "institutos", "centros educativos",
        "profesorado", "alumnado", "matriculas", "matrículas"
    ],
    "Datos Geográficos y Medioambientales": [
        "medio ambiente", "medio-ambiente", "geografia", "geografía",
        "cartografia", "cartografía", "clima", "meteorologia", "meteorología",
        "contaminacion", "contaminación", "biodiversidad", "parques",
        "rios", "ríos", "fauna", "flora", "ecologia", "ecología"
    ],
    "Datos Demográficos y Estadísticos": [
        "demografia", "demografía", "estadistica", "estadística",
        "poblacion", "población", "municipios", "censos",
        "indicadores", "series temporales"
    ]
}

# Captures a DOI-like token: "10.", 4-9 digits, "/", then suffix characters.
# extract_doi_from_text applies it case-insensitively.
DOI_REGEX = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"

# ------------------------------------------------------------
# Helpers generales
# ------------------------------------------------------------
def safe_get_json(url, params=None, max_retries=5, backoff_base=1.4):
    """GET *url* and return the decoded JSON body, retrying transient failures.

    Responses with status 429/500/502/503/504, network errors and bodies that
    cannot be decoded as JSON are retried, sleeping ``backoff_base ** attempt``
    seconds between attempts. Any other HTTP error status (for example 400 or
    404) will not be cured by retrying, so it is raised immediately.

    Args:
        url: Address to request.
        params: Optional query-string parameters.
        max_retries: Maximum number of attempts; must be at least 1.
        backoff_base: Base of the exponential wait between attempts.

    Returns:
        The parsed JSON document.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        requests.HTTPError: On a non-transient HTTP error status.
        Exception: The last error seen once every attempt has failed.
    """
    transient_statuses = (429, 500, 502, 503, 504)
    last_err = None

    for attempt in range(max_retries):
        try:
            r = requests.get(
                url,
                params=params,
                headers=UA_HEADERS,
                timeout=TIMEOUT,
                allow_redirects=True,
            )
            if r.status_code in transient_statuses:
                last_err = RuntimeError(f"HTTP {r.status_code} temporal: {r.url}")
            else:
                r.raise_for_status()
                return r.json()
        except requests.HTTPError:
            # Permanent client/server error: retrying only wastes time.
            raise
        except Exception as e:
            last_err = e

        # Back off only when another attempt will actually follow.
        if attempt < max_retries - 1:
            time.sleep(backoff_base ** attempt)

    if last_err is None:
        # The loop never ran, so there is no recorded error to re-raise.
        raise ValueError("max_retries must be at least 1")
    raise last_err

def normalize_text(s: str) -> str:
    """Return *s* lower-cased with combining accent marks removed.

    Falsy input yields an empty string; other values are converted with
    ``str`` first.
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFD", str(s).lower())
    # Category "Mn" marks the combining characters produced by NFD.
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

def parse_dt(s: str):
    """Parse an ISO-8601 timestamp into a timezone-aware UTC datetime.

    A "Z" is rewritten to "+00:00" before parsing, and values without an
    offset are taken to be UTC. Returns ``None`` for falsy input or when the
    value cannot be parsed.
    """
    if not s:
        return None
    try:
        iso_text = str(s).strip().replace("Z", "+00:00")
        parsed = datetime.fromisoformat(iso_text)
        aware = parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed
        return aware.astimezone(timezone.utc)
    except Exception:
        return None

def meets_age(dt_utc):
    """Tell whether a dataset dated *dt_utc* satisfies the minimum-age rule.

    A missing date counts as satisfying the rule. Otherwise the date must be
    no later than now minus ``MIN_AGE_MONTHS`` months (30.44 days each).
    """
    if dt_utc:
        cutoff = datetime.now(timezone.utc) - timedelta(days=MIN_AGE_MONTHS * 30.44)
        return dt_utc <= cutoff
    return True

def contains_any(text: str, patterns) -> bool:
    """Return True when any regex in *patterns* matches the lower-cased *text*.

    Falsy *text* never matches.
    """
    if not text:
        return False
    haystack = str(text).lower()
    for pattern in patterns:
        if re.search(pattern, haystack):
            return True
    return False

def pick_lang_value(val, preferred="es"):
    """Extract a plain string from an APIDATA literal.

    APIDATA usually returns title/description as a list such as
    ``[{"_value": "...", "_lang": "es"}, ...]``, but the value may also be a
    plain string or a single dict.

    Args:
        val: ``None``, a string, a dict with ``_value``/``value``, or a list
            of such entries.
        preferred: Language tag to favour when several entries are present.

    Returns:
        The text in the preferred language if available, otherwise the first
        entry that has a ``_value``, otherwise whatever can be extracted from
        the first list element. ``None`` and an empty list both give ``""``.
    """
    if val is None:
        return ""
    if isinstance(val, str):
        return val
    if isinstance(val, list):
        if not val:
            # An empty list used to be stringified as "[]".
            return ""
        tagged = [it for it in val if isinstance(it, dict) and it.get("_value")]
        for it in tagged:
            if it.get("_lang") == preferred:
                return str(it["_value"])
        if tagged:
            return str(tagged[0]["_value"])
        # No entry with "_value": resolve the first element with the same
        # rules, so [None] or [{}] give "" instead of their repr.
        return pick_lang_value(val[0], preferred)
    if isinstance(val, dict):
        return str(val.get("_value") or val.get("value") or "")
    return str(val)

def normalize_format(fmt_value: str) -> str:
    """Map a MIME type or free-form format string to a short label.

    Examples: ``'text/csv' -> 'CSV'``, ``'application/json' -> 'JSON'``.

    Exact MIME types are looked up first, then substring heuristics are
    applied, and finally the last segment of the value (upper-cased and
    truncated to 20 characters) is returned.

    Args:
        fmt_value: MIME type or format name; falsy values give ``""``.

    Returns:
        Upper-case format label such as ``"CSV"``, ``"JSON-LD"`` or ``"RDF"``.
    """
    if not fmt_value:
        return ""
    s = str(fmt_value).strip().lower()

    mapping = {
        "text/csv": "CSV",
        "application/csv": "CSV",
        "application/json": "JSON",
        "application/geo+json": "GEOJSON",
        "application/geojson": "GEOJSON",
        "application/xml": "XML",
        "text/xml": "XML",
        "application/rdf+xml": "RDF",
        "text/turtle": "TTL",
        "application/x-turtle": "TTL",
        "application/ld+json": "JSON-LD",
        "application/vnd.ms-excel": "XLS",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "XLSX",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
    }
    if s in mapping:
        return mapping[s]

    # Office Open XML MIME types contain "xml" ("openxmlformats") but are
    # office documents, so they must never reach the generic XML heuristic.
    is_office_openxml = "openxmlformats" in s
    if is_office_openxml:
        office_kinds = (
            ("spreadsheetml", "XLSX"),
            ("wordprocessingml", "DOCX"),
            ("presentationml", "PPTX"),
        )
        for marker, label in office_kinds:
            if marker in s:
                return label

    # substring heuristics
    if "csv" in s:
        return "CSV"
    if "json" in s and "ld" in s:
        return "JSON-LD"
    if "json" in s:
        return "JSON"
    if "rdf" in s and "xml" in s:
        # RDF/XML is an RDF serialization, consistent with the exact mapping
        # of "application/rdf+xml" above.
        return "RDF"
    if "xml" in s and not is_office_openxml:
        return "XML"
    if "turtle" in s or "ttl" in s:
        return "TTL"
    if "rdf" in s:
        return "RDF"
    if "xlsx" in s:
        return "XLSX"

    # fallback: last segment of the MIME type
    if "/" in s:
        tail = s.split("/")[-1].upper()
        return tail[:20]
    return s.upper()[:20]

def extract_doi_from_text(*parts) -> str:
    """Return the first DOI found in the given text fragments, or ``""``.

    Falsy fragments are skipped; the rest are joined with spaces and searched
    with ``DOI_REGEX``, ignoring case.
    """
    haystack = " ".join(fragment for fragment in parts if fragment)
    found = re.search(DOI_REGEX, haystack, flags=re.I)
    if found is None:
        return ""
    return found.group(1)

def classify_category_from_blob(title, description, keywords, theme_uri) -> tuple[str, str]:
    """Assign a category from the dataset's own text.

    This portal has no CKAN groups/tags, so the classification relies on:
    - keywords (APIDATA 'keyword')
    - theme URI (APIDATA 'theme'), reduced to its last path segment
    - text (title + description)

    A keyword matches only at the start of a word, so "rios" no longer
    matches inside "usuarios" while plural or derived forms such as
    "transportes" still match "transporte".

    Args:
        title: Dataset title.
        description: Dataset description.
        keywords: List of keyword literals, a single value, or empty.
        theme_uri: One theme URI or a list of them.
            NOTE(review): dict entries are read through "_about" - confirm
            against real APIDATA payloads.

    Returns:
        ``(category, source)``; ``("No definido", "none")`` when nothing matches.
    """
    if isinstance(keywords, list):
        kw_blob = " ".join(pick_lang_value(k) for k in keywords)
    elif keywords:
        kw_blob = str(keywords)
    else:
        kw_blob = ""

    # The theme may be one URI or several; keep the last path segment of each.
    if isinstance(theme_uri, (list, tuple, set)):
        themes = list(theme_uri)
    elif theme_uri:
        themes = [theme_uri]
    else:
        themes = []
    slugs = []
    for theme in themes:
        if isinstance(theme, dict):
            theme = theme.get("_about") or ""
        if theme:
            slugs.append(str(theme).rstrip("/").split("/")[-1])
    theme_txt = " ".join(slugs)

    blob = normalize_text(f"{title} {description} {kw_blob} {theme_txt}")

    for cat, kws in CATEGORIAS_KEYWORDS.items():
        for kw in kws:
            needle = normalize_text(kw)
            # An empty needle would match everything, so it is skipped.
            if needle and re.search(r"(?<!\w)" + re.escape(needle), blob):
                # source "keyword/theme/text" (no CKAN groups)
                return cat, "keyword/theme/text"

    return "No definido", "none"

def detect_controlled_vocab_from_apidata(theme_uri, keywords):
    """Report whether the item carries a theme or keywords, and which one.

    Returns ``(True, "theme")`` when a theme is present, else
    ``(True, "keyword")`` when keywords are present, else ``(False, "none")``.
    """
    # The theme is checked first; keywords only count when no theme is present.
    for present, source in ((theme_uri, "theme"), (keywords, "keyword")):
        if present:
            return True, source
    return False, "none"

# ------------------------------------------------------------
# APIDATA fetch
# ------------------------------------------------------------
def apidata_fetch_page(page: int, page_size: int):
    """Fetch one page of dataset items from APIDATA.

    The page size is capped at 50 before the request is sent. Returns the
    ``result.items`` list of the response, or an empty list when it is missing.
    """
    query = {
        "_page": page,
        "_pageSize": min(int(page_size), 50),
        "_metadata": "all",
        "_sort": "title",
    }
    payload = safe_get_json(APIDATA_BASE, params=query) or {}
    listing = payload.get("result") or {}
    return listing.get("items") or []

# ------------------------------------------------------------
# SPARQL: resolver labels de URIs (publisher principalmente)
# ------------------------------------------------------------
def sparql_select(query: str):
    """Send *query* to the SPARQL endpoint and return the parsed JSON results."""
    return safe_get_json(
        SPARQL_ENDPOINT,
        params={
            "query": query,
            "format": "application/sparql-results+json",
        },
    )

def sparql_resolve_labels(uris: list[str], prefer_lang="es") -> dict[str, str]:
    """Resolve rdfs:label values for a list of URIs through SPARQL, in batches.

    Only non-empty strings starting with "http" are queried. URIs containing
    characters that cannot appear inside a SPARQL ``<...>`` reference are
    skipped, so one malformed URI cannot invalidate a whole batch. Duplicates
    are queried once.

    Args:
        uris: Candidate URIs; other values are ignored.
        prefer_lang: Language tag whose label is preferred.

    Returns:
        Mapping ``uri -> label``. URIs from a failed batch are absent; the
        failure is printed and the remaining batches are still processed.
    """
    # Characters not allowed inside a SPARQL IRI reference.
    illegal_in_iri = re.compile(r'[\x00-\x20<>"{}|^`\\]')

    clean = []
    seen = set()
    for u in uris or []:
        if not (u and isinstance(u, str) and u.startswith("http")):
            continue
        if u in seen or illegal_in_iri.search(u):
            continue
        seen.add(u)
        clean.append(u)
    if not clean:
        return {}

    # Keep only language-tag characters so the value cannot break the query.
    lang = re.sub(r"[^A-Za-z0-9-]", "", str(prefer_lang))

    out = {}
    batch_size = 50
    for i in range(0, len(clean), batch_size):
        if i:
            # Pause between consecutive requests, also after a failed batch.
            time.sleep(SLEEP)

        chunk = clean[i:i + batch_size]
        values = " ".join(f"<{u}>" for u in chunk)

        q = f"""
        SELECT ?s (COALESCE(?lbl_es, ?lbl_any, STR(?s)) AS ?label) WHERE {{
          VALUES ?s {{ {values} }}
          OPTIONAL {{ ?s <http://www.w3.org/2000/01/rdf-schema#label> ?lbl_es .
                     FILTER(lang(?lbl_es) = "{lang}") }}
          OPTIONAL {{ ?s <http://www.w3.org/2000/01/rdf-schema#label> ?lbl_any . }}
        }}
        """
        try:
            js = sparql_select(q)
            bindings = (js or {}).get("results", {}).get("bindings", []) or []
            for b in bindings:
                s = b.get("s", {}).get("value")
                lab = b.get("label", {}).get("value")
                if s:
                    out[s] = lab or s
        except Exception as exc:
            # Label resolution is best-effort: report the failure and go on.
            print(f"[WARN] SPARQL label lookup failed for batch at offset {i}: {exc}")
            continue

    return out

# ------------------------------------------------------------
# Evaluación dataset -> fila (APIDATA item)
# ------------------------------------------------------------
def evaluate_dataset_apidata(item: dict, publisher_label_map: dict[str, str]):
    """Turn one APIDATA dataset item into a flat row of metadata indicators.

    No network request is made here: every indicator is derived from the
    fields already present in *item*.

    Args:
        item: One element of the APIDATA ``items`` list.
        publisher_label_map: Mapping ``publisher URI -> label``; a URI that
            is not in the map is reported as-is.

    Returns:
        dict: One output row. Flag columns hold the integers 0 or 1.
    """
    dataset_uri = item.get("_about") or item.get("identifier") or ""
    identifier = dataset_uri

    title = pick_lang_value(item.get("title"))
    description = pick_lang_value(item.get("description"))

    issued_raw = item.get("issued") or ""
    modified_raw = item.get("modified") or ""

    issued_dt = parse_dt(issued_raw)
    meets_age_flag = 1 if meets_age(issued_dt) else 0

    # The publisher normally arrives as a URI string. A list or dict is
    # reduced to a string first, because an unhashable value would make the
    # label-map lookup raise TypeError.
    # NOTE(review): dict publishers are read through "_about" - confirm.
    publisher_raw = item.get("publisher") or ""
    if isinstance(publisher_raw, list):
        publisher_raw = publisher_raw[0] if publisher_raw else ""
    if isinstance(publisher_raw, dict):
        publisher_uri = str(publisher_raw.get("_about") or pick_lang_value(publisher_raw))
    elif isinstance(publisher_raw, str):
        publisher_uri = publisher_raw
    else:
        publisher_uri = str(publisher_raw or "")
    publisher = publisher_label_map.get(publisher_uri, publisher_uri)

    # licence (may be a URI or free text)
    license_val = item.get("license") or ""
    if isinstance(license_val, (list, dict)):
        license_text = pick_lang_value(license_val)
        if not license_text and isinstance(license_val, dict):
            # A resource object may carry only its URI.
            license_text = str(license_val.get("_about") or "")
    else:
        license_text = str(license_val)
    license_present = 1 if str(license_text).strip() else 0
    # Recognize Creative Commons both as text ("Creative Commons", "CC BY")
    # and as URI/identifier ("creativecommons.org/...", "cc-by", "cc0").
    license_open = 1 if re.search(
        r"creative\s*commons|\bcc[\s_-]*(?:by|0|zero)\b", license_text.lower()
    ) else 0

    # theme / keyword
    theme_uri = item.get("theme") or ""
    keywords = item.get("keyword") or []

    category, category_source = classify_category_from_blob(title, description, keywords, theme_uri)
    uses_cv, cv_source = detect_controlled_vocab_from_apidata(theme_uri, keywords)

    # distributions: a single distribution may come as a bare dict
    distributions = item.get("distribution") or []
    if isinstance(distributions, dict):
        distributions = [distributions]
    download_urls = []
    formats = []
    doc_hint = []

    for d in distributions:
        if not isinstance(d, dict):
            continue
        acc = d.get("accessURL") or ""
        dl = d.get("downloadURL") or ""
        # Prefer the direct download link; fall back to the access page.
        if dl:
            download_urls.append(dl)
        elif acc:
            download_urls.append(acc)

        fmt_obj = d.get("format")
        fmt_val = ""
        if isinstance(fmt_obj, dict):
            fmt_val = fmt_obj.get("value") or fmt_obj.get("_value") or ""
        elif isinstance(fmt_obj, str):
            fmt_val = fmt_obj
        fmt_norm = normalize_format(fmt_val)
        if fmt_norm:
            formats.append(fmt_norm)

        # hints for data-dictionary detection
        t = pick_lang_value(d.get("title"))
        doc_hint.append(t)
        doc_hint.append(d.get("accessURL") or "")
        doc_hint.append(d.get("downloadURL") or "")
        doc_hint.append(pick_lang_value(d.get("description")))

    formats_unique = sorted(set([f for f in formats if f]))
    format_join = ", ".join(formats_unique)
    n_formats = len(formats_unique)

    has_allowed_format = 1 if any(f in OPEN_FORMATS for f in formats_unique) else 0
    has_semantic_serialization = 1 if any(f in SEMANTIC_FORMATS for f in formats_unique) else 0

    download_url = download_urls[0] if download_urls else ""

    # DOI
    doi = extract_doi_from_text(title, description, download_url, " ".join(download_urls))
    has_doi = 1 if doi else 0

    # data dictionary
    has_data_dictionary = 1 if (
        contains_any(title + " " + description, DATA_DICT_PATTERNS) or
        contains_any(" ".join(doc_hint), DATA_DICT_PATTERNS)
    ) else 0

    # update frequency (read from accrualPeriodicity)
    update_frequency = ""
    ap = item.get("accrualPeriodicity")
    if isinstance(ap, dict):
        # may be a structure; prefer its URI, then its "value"
        update_frequency = str(ap.get("_about") or ap.get("value") or ap)
    elif ap:
        update_frequency = str(ap)
    update_frequency = update_frequency or "No definido"
    frequency_documented = 1 if update_frequency != "No definido" else 0

    # performance: no mass HEAD/GET; having a URL is taken as accessible
    public_access_ok = 1 if download_url else 0

    # DCAT on the national portal: yes
    portal_supports_dcat_dcatap = 1
    metadata_rdf_available = 1 if dataset_uri else 0

    # Result columns may be added or removed as long as they are defined above.
    return {
        "portal": PORTAL,
        "api_type": API_TYPE,
        "portal_has_api_rest": 1,
        "portal_supports_dcat_dcatap": portal_supports_dcat_dcatap,

        "identifier": identifier,
        "doi": doi,
        "has_doi": has_doi,

        "publisher": publisher,

        "download_url": download_url,
        "download_urls": " | ".join(download_urls),

        "license": license_text,
        "license_present": license_present,
        "license_open": license_open,

        "dataset_id": dataset_uri,       # on the national portal the practical id is the URI
        "dataset_uri": dataset_uri,
        "title": title,
        "description": description,

        "category": category,
        "category_source": category_source,

        "uses_controlled_vocab": 1 if uses_cv else 0,
        "controlled_vocab_source": cv_source,

        "issued": issued_raw,
        "modified": modified_raw,
        "meets_age_criterion": meets_age_flag,

        "format": format_join,
        "n_formats": n_formats,
        "metadata_rdf_available": metadata_rdf_available,

        "has_allowed_format": has_allowed_format,
        "has_semantic_serialization": has_semantic_serialization,
        "has_data_dictionary": has_data_dictionary,

        "update_frequency": update_frequency,
        "frequency_documented": frequency_documented,

        "public_access_ok": public_access_ok,
    }

def main():
    """Run the pipeline: collect items, resolve publishers, filter, export.

    Steps:
    1. Page through APIDATA until a page is empty or ``MAX_DATASETS`` items
       have been collected.
    2. Resolve publisher URIs to labels through SPARQL.
    3. Evaluate every item and keep those with a defined category, the
       minimum age, an allowed format and a download URL.
    4. Write the kept rows to a CSV and an XLSX file and print a summary.
    """
    # 1) Collect items
    items_all = []
    page = 0
    ps = min(int(PAGE_SIZE), 50)  # page size is capped at 50

    while True:
        items = apidata_fetch_page(page=page, page_size=ps)
        if not items:
            break
        items_all.extend(items)

        if MAX_DATASETS and len(items_all) >= MAX_DATASETS:
            items_all = items_all[:MAX_DATASETS]
            break

        page += 1
        time.sleep(SLEEP)

    print(f"Items APIDATA recolectados: {len(items_all)} (pageSize efectivo={ps})")

    # 2) Resolve publisher labels through SPARQL (optional but useful)
    publishers = sorted({
        p
        for p in (it.get("publisher") for it in items_all)
        if p and isinstance(p, str) and p.startswith("http")
    })

    print(f"Publishers únicos a resolver por SPARQL: {len(publishers)}")
    publisher_label_map = sparql_resolve_labels(publishers, prefer_lang="es")

    # 3) Evaluate and filter
    rows = []
    stats = {"sin_categoria": 0, "no_age": 0, "no_format": 0, "no_download": 0, "ok": 0}

    for it in items_all:
        reg = evaluate_dataset_apidata(it, publisher_label_map)

        # The first failing criterion, in this order, decides the rejection.
        if reg["category"] == "No definido":
            rejected_by = "sin_categoria"
        elif reg["meets_age_criterion"] != 1:
            rejected_by = "no_age"
        elif reg["has_allowed_format"] != 1:
            rejected_by = "no_format"
        elif not reg["download_url"]:
            rejected_by = "no_download"
        else:
            rejected_by = None

        if rejected_by is not None:
            stats[rejected_by] += 1
            continue

        rows.append(reg)
        stats["ok"] += 1

        if stats["ok"] % 50 == 0:
            print(f"OK={stats['ok']} | stats={stats}")
        # No pause here: this loop makes no network request.

    df = pd.DataFrame(rows)
    # The names of the CSV and XLSX result files can be changed here.
    out_csv = "Punto1_NOMBREPORTAL.csv"
    out_xlsx = "Punto1_NOMBREPORTAL.xlsx"
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    df.to_excel(out_xlsx, index=False, engine="openpyxl")

    print("\n Listo:", PORTAL)
    print(" Total final:", len(df))
    print(" Stats:", stats)
    print("Archivos:", out_csv, "|", out_xlsx)

# Run the pipeline only when the file is executed directly, not on import.
if __name__ == "__main__":
    main()