In [1]:
import re
import time
import requests
import pandas as pd
from datetime import datetime, timezone, timedelta
from urllib.parse import urljoin
import unicodedata

# ============================================================
# ITEM 1 (METADATA) - PORTAL: Junta de Andalucía (CKAN)
# Output: Excel/CSV with the key columns + formats + categories + dates
# ============================================================
PORTAL = "Junta de Andalucía"
API_TYPE = "CKAN"            # API type label copied into every output row
API_BASE = "https://www.juntadeandalucia.es/datosabiertos/portal/api/3/action/"        # CKAN action API base; action names are resolved against it with urljoin()
SITE_BASE = "https://www.juntadeandalucia.es/datosabiertos/portal"                   # Portal web root, used to build the landing-page and RDF URLs below

# URL templates; "{name}" is filled in later with the dataset slug via str.format().
DATASET_LANDING_TEMPLATE = f"{SITE_BASE}/dataset/{{name}}"
RDF_TEMPLATE = f"{SITE_BASE}/dataset/{{name}}.rdf"

# Passed as `timeout=` to every requests call in this script.
TIMEOUT = 45
# Passed to time.sleep() after each dataset is processed.
SLEEP = 0.05
# Page size requested from package_search.
ROWS = 200
MAX_DATASETS = 600                     # Cap on collected datasets; a falsy value disables the cap
MIN_AGE_MONTHS = 12                  # Minimum dataset age; converted to days at 30.44 days per month


# HTTP headers sent with every request.
UA_HEADERS = {
    "User-Agent": "TFM-IIP-Metadata-Extractor/1.0",
    "Accept": "application/json",
}

# Upper-cased resource formats that set `has_allowed_format` in the output.
OPEN_FORMATS = {"CSV","JSON","GEOJSON","XML","RDF","TTL","TURTLE","N-TRIPLES","NT","JSON-LD","JSONLD","XLSX"}
# Upper-cased resource formats that set `has_semantic_serialization` in the output.
SEMANTIC_FORMATS = {"RDF","TTL","TURTLE","N-TRIPLES","NT","JSON-LD","JSONLD"}

# Regular expressions searched (against lower-cased text) to guess whether a
# dataset ships a data dictionary or similar documentation.
DATA_DICT_PATTERNS = [
    r"diccionario de datos", r"data dictionary", r"schema", r"esquema",
    r"documentaci[oó]n", r"metadatos", r"data model", r"glosario"
]

# The category criteria below can be changed.

# Maps each output category to the keywords that select it.
# classify_category_ckan() walks the categories in the order written here and
# returns the first one with a keyword that occurs as a substring of the
# lower-cased, accent-stripped text being classified; listing the unaccented
# spelling of every keyword guarantees it can match.
CATEGORIAS_KEYWORDS = {
    "Transporte y Movilidad": [
        "transporte", "movilidad", "trafico", "tráfico", "carreteras",
        "vehiculos", "vehículos", "autobuses", "metro", "ciclistas",
        "aparcamientos", "taxis", "vialidad"
    ],
    "Ciencia y Tecnología": [
        "ciencia", "tecnologia", "tecnología", "innovacion", "innovación",
        "investigacion", "investigación", "i+d", "proyectos",
        "desarrollo", "sistemas", "tecnologías"
    ],
    "Salud": [
        "salud", "sanidad", "hospitales", "urgencias", "epidemiologia",
        "epidemiología", "asistencia sanitaria", "covid", "enfermedades",
        "vacunacion", "vacunación", "salud pública", "farmacias"
    ],
    "Educación": [
        "educacion", "educación", "formacion", "formación", "universidad",
        "universidades", "colegios", "institutos", "centros educativos",
        "profesorado", "alumnado", "matriculas", "matrículas"
    ],
    "Datos Geográficos y Medioambientales": [
        "medio ambiente", "medio-ambiente", "geografia", "geografía",
        "cartografia", "cartografía", "clima", "meteorologia", "meteorología",
        "contaminacion", "contaminación", "biodiversidad", "parques",
        "rios", "ríos", "fauna", "flora", "ecologia", "ecología"
    ],
    "Datos Demográficos y Estadísticos": [
        "demografia", "demografía", "estadistica", "estadística",
        "poblacion", "población", "municipios", "censos",
        "indicadores", "series temporales"
    ]
}

# -------------------------------
# Helpers
# -------------------------------
def safe_get(url, params=None):
    """Issue a GET request and return the response only when it is JSON.

    Args:
        url: Address to fetch.
        params: Optional query-string parameters.

    Returns:
        The ``requests`` response object.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the ``Content-Type`` header does not mention JSON;
            the message carries the first 500 characters of the body.
    """
    response = requests.get(
        url,
        params=params,
        headers=UA_HEADERS,
        timeout=TIMEOUT,
        allow_redirects=True,
    )
    response.raise_for_status()

    content_type = (response.headers.get("Content-Type") or "").lower()
    if "json" in content_type:
        return response

    # Flatten newlines so the preview stays on a single line of the message.
    snippet = response.text[:500].replace("\n", " ")
    raise RuntimeError(f"Respuesta NO JSON desde {response.url} | Content-Type={content_type} | preview={snippet}")

def head_or_get_public(url: str) -> bool:
    """Report whether *url* answers with a 2xx/3xx status without credentials.

    A HEAD request is tried first. When the server rejects HEAD with 403 or
    405, a GET is issued instead. The GET is streamed and closed right away,
    so only the status line and headers are read and the (possibly very
    large) file behind a download URL is never transferred.

    Args:
        url: Address to probe. Anything empty or not starting with ``http``
            is reported as not reachable without any network access.

    Returns:
        True when the final status code is in ``[200, 400)``, False otherwise
        (including on any network error).
    """
    if not url or not str(url).startswith("http"):
        return False
    try:
        head = requests.head(url, headers=UA_HEADERS, timeout=TIMEOUT, allow_redirects=True)
        if head.status_code not in (403, 405):
            return 200 <= head.status_code < 400

        # Some servers refuse HEAD; retry with GET but do not download the body.
        probe = requests.get(
            url,
            headers=UA_HEADERS,
            timeout=TIMEOUT,
            allow_redirects=True,
            stream=True,
        )
        try:
            return 200 <= probe.status_code < 400
        finally:
            probe.close()
    except Exception:
        # Deliberate best-effort: any failure simply means "not reachable".
        return False

def normalize_text(s: str) -> str:
    """Return *s* lower-cased with its combining accent marks removed.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not s:
        return ""
    # NFD splits accented letters into base letter + combining mark ("Mn"),
    # which lets the marks be filtered out on their own.
    decomposed = unicodedata.normalize("NFD", s.lower())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

def parse_dt(s: str):
    """Parse an ISO-8601 string into a timezone-aware UTC datetime.

    A ``Z`` is rewritten to ``+00:00`` before parsing, and a value without
    any offset is taken to be UTC already, so callers never end up mixing
    naive and aware datetimes.

    Returns:
        The UTC datetime, or ``None`` when the input is falsy or cannot be
        parsed.
    """
    if not s:
        return None
    try:
        parsed = datetime.fromisoformat(str(s).strip().replace("Z", "+00:00"))
        aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
        return aware.astimezone(timezone.utc)
    except Exception:
        return None

def meets_age(dt_utc):
    """Tell whether a dataset is at least ``MIN_AGE_MONTHS`` old.

    Args:
        dt_utc: Timezone-aware datetime of the dataset, or a falsy value when
            the date is unknown.

    Returns:
        True when the date is unknown or lies on/before the cut-off
        (now minus ``MIN_AGE_MONTHS`` months of 30.44 days each).
    """
    if not dt_utc:
        # An unknown date is counted as meeting the criterion.
        return True
    min_age = timedelta(days=MIN_AGE_MONTHS * 30.44)
    cutoff = datetime.now(timezone.utc) - min_age
    return dt_utc <= cutoff

def contains_any(text: str, patterns) -> bool:
    """Return True when any regex in *patterns* matches the lower-cased *text*.

    Falsy *text* never matches. Non-string values are converted with
    ``str()`` before searching.
    """
    if not text:
        return False
    haystack = str(text).lower()
    for pattern in patterns:
        if re.search(pattern, haystack):
            return True
    return False

def extras_to_dict(extras):
    """Flatten a list of ``{"key": ..., "value": ...}`` items into a dict.

    Items that are not dicts or lack a ``"key"`` entry are skipped, keys are
    converted to strings, and a later duplicate key overwrites an earlier
    one. Anything that is not a list yields an empty dict.
    """
    if not isinstance(extras, list):
        return {}
    return {
        str(entry.get("key")): entry.get("value")
        for entry in extras
        if isinstance(entry, dict) and "key" in entry
    }

def classify_category_ckan(title, notes, groups, tags):
    """Assign a dataset to one of the ``CATEGORIAS_KEYWORDS`` categories.

    Sources are consulted in order of trust: group titles/names first, then
    tag names, then the title and description together. Within a source, the
    categories are tried in dictionary order and the first keyword that occurs
    as a substring wins.

    Both the text and the keywords go through ``normalize_text`` so that a
    keyword written with accents still matches the accent-stripped text.

    Args:
        title: Dataset title.
        notes: Dataset description.
        groups: Iterable of group dicts (``title``/``name`` keys); entries
            that are not dicts are ignored.
        tags: Iterable of tag dicts (``name`` key); entries that are not
            dicts are ignored.

    Returns:
        ``(category, source)`` where source is ``"group"``, ``"tag"``,
        ``"text"``, or ``("No definido", "none")`` when nothing matches.
    """
    # Normalise the keywords once per call, the same way as the text.
    normalized_keywords = [
        (cat, [normalize_text(kw) for kw in kws])
        for cat, kws in CATEGORIAS_KEYWORDS.items()
    ]

    def first_matching_category(text):
        for cat, kws in normalized_keywords:
            if any(kw in text for kw in kws):
                return cat
        return None

    for g in groups or []:
        if not isinstance(g, dict):
            continue
        cat = first_matching_category(normalize_text(g.get("title") or g.get("name")))
        if cat is not None:
            return cat, "group"

    for t in tags or []:
        if not isinstance(t, dict):
            continue
        cat = first_matching_category(normalize_text(t.get("name")))
        if cat is not None:
            return cat, "tag"

    cat = first_matching_category(normalize_text(f"{title} {notes}"))
    if cat is not None:
        return cat, "text"

    return "No definido", "none"

def detect_controlled_vocab_ckan(groups, tags):
    """Guess whether a dataset is described with a controlled vocabulary.

    CKAN heuristic:
    - any group counts as an official controlled vocabulary;
    - otherwise a tag counts as a weaker signal when its name is at most
      25 characters long and contains a hyphen.

    Returns:
        ``(True, "group")``, ``(True, "tag")`` or ``(False, "none")``.
    """
    if groups:
        return True, "group"
    if not tags:
        return False, "none"

    # Only dict-shaped tags carry a name; everything else is ignored.
    tag_names = [tag.get("name", "") for tag in tags if isinstance(tag, dict)]
    for tag_name in tag_names:
        if len(tag_name) <= 25 and "-" in tag_name:
            return True, "tag"
    return False, "none"

def portal_supports_dcat(site_base: str) -> bool:
    """Check whether the portal publishes a DCAT catalogue.

    Probes ``catalog/dcat.json`` and then ``catalog/dcat.rdf`` under
    *site_base*; the first one that is publicly reachable settles it.
    """
    root = site_base.rstrip("/") + "/"
    for endpoint in ("catalog/dcat.json", "catalog/dcat.rdf"):
        if head_or_get_public(urljoin(root, endpoint)):
            return True
    return False

def is_open_license(license_value: str) -> int:
    """Return 1 when the licence text looks like an open licence, else 0.

    The check is case-insensitive and looks for "creative commons", "cc-by",
    "cc by" (with optional whitespace, as a whole word) or "avisolegal".
    """
    text = (license_value or "").strip().lower()
    literal_hit = any(
        marker in text for marker in ("creative commons", "cc-by", "avisolegal")
    )
    if literal_hit or re.search(r"\bcc\s*by\b", text):
        return 1
    return 0

def extract_identifier(ds: dict, exd: dict) -> str:
    """Pick the dataset identifier as a string.

    The keys ``identifier``, ``dct:identifier`` and ``dcat:identifier`` are
    tried in that order, each first in the dataset dict *ds* and then in the
    extras dict *exd*. When none holds a truthy value, the dataset ``id`` is
    used, then its ``name``, then an empty string.
    """
    candidates = (
        ds.get(key) or exd.get(key)
        for key in ("identifier", "dct:identifier", "dcat:identifier")
    )
    found = next((value for value in candidates if value), None)
    if found:
        return str(found)
    # Strong fallback: CKAN's own id, then the slug.
    return str(ds.get("id") or ds.get("name") or "")

def extract_doi(ds: dict, exd: dict) -> str:
    """Find a DOI in the dataset metadata.

    The ``doi`` and ``DOI`` fields (dataset dict first, extras second) are
    inspected before every value of the extras dict. The first value that
    contains something shaped like ``10.<4-9 digits>/<suffix>`` wins.

    Returns:
        The DOI without any surrounding text, or an empty string.
    """
    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"

    # Dedicated DOI fields come first, then every extras value as a fallback.
    candidates = [ds.get(key) or exd.get(key) for key in ("doi", "DOI")]
    candidates += list(exd.values())

    for candidate in candidates:
        if not candidate:
            continue
        found = re.search(doi_pattern, str(candidate), flags=re.I)
        if found:
            return found.group(1)
    return ""

def find_update_frequency(exd: dict) -> str:
    """Look up the update frequency among the dataset extras.

    Several spellings of the periodicity key are tried in a fixed order; the
    first truthy value is returned as a string.

    Returns:
        The frequency value, or ``"No definido"`` when no key is populated.
    """
    candidate_keys = (
        "accrualPeriodicity", "dct:accrualPeriodicity", "dcat:accrualPeriodicity",
        "accrual_periodicity", "accrualperiodicity",
        "update_frequency", "updateFrequency", "update_frecuency",
        "frequency", "frecuencia_actualizacion", "periodicity", "periodicidad",
    )
    values = (exd.get(key) for key in candidate_keys)
    hit = next((value for value in values if value), None)
    return "No definido" if hit is None else str(hit)

def find_update_frequency_from_rdf(dataset_name: str) -> str:
    """Try to read ``accrualPeriodicity`` from the dataset's RDF document.

    The RDF is fetched from ``RDF_TEMPLATE`` and scanned with two regular
    expressions: first for an ``rdf:resource`` URI (its last path segment is
    returned), then for a literal value between tags.

    A blank capture is never returned: the result is either a non-empty
    string or the ``"No definido"`` sentinel, so callers that compare against
    the sentinel can fall through to their next source.

    Args:
        dataset_name: Dataset slug; a falsy value short-circuits without any
            network access.

    Returns:
        The frequency text, or ``"No definido"`` when the document cannot be
        fetched or holds no usable value.
    """
    if not dataset_name:
        return "No definido"

    rdf_url = RDF_TEMPLATE.format(name=dataset_name)

    try:
        r = requests.get(rdf_url, headers=UA_HEADERS, timeout=TIMEOUT, allow_redirects=True)
        if r.status_code >= 400:
            return "No definido"
        txt = r.text
    except Exception:
        # Deliberate best-effort: a failed download just means "unknown".
        return "No definido"

    # Case 1: the value is a resource URI; keep its last path segment.
    m = re.search(r"accrualPeriodicity[^>]+rdf:resource\s*=\s*\"([^\"]+)\"", txt, flags=re.I)
    if m:
        tail = m.group(1).strip().rstrip("/").split("/")[-1]
        if tail:
            return tail

    # Case 2: the value is a literal. The capture must start with a
    # non-whitespace character, otherwise the indentation before a nested
    # element would be picked up as the value.
    m2 = re.search(r"accrualPeriodicity[^>]*>\s*([^<\s][^<]*)<", txt, flags=re.I)
    if m2:
        literal = m2.group(1).strip()
        if literal:
            return literal

    return "No definido"


def find_update_frequency_from_catalog_html(dataset_uri: str) -> str:
    """Try to scrape "Frecuencia de actualización" from the dataset web page.

    The page is downloaded and searched for the label followed by a closing
    tag; the text right after that tag is taken as the value, with runs of
    whitespace collapsed and the result cut to 80 characters.

    A blank capture is never returned: the result is either a non-empty
    string or the ``"No definido"`` sentinel.

    Args:
        dataset_uri: Landing-page URL; anything falsy or not starting with
            ``http`` short-circuits without any network access.

    Returns:
        The frequency text, or ``"No definido"`` when the page cannot be
        fetched or holds no usable value.
    """
    if not dataset_uri or not str(dataset_uri).startswith("http"):
        return "No definido"
    try:
        r = requests.get(dataset_uri, headers=UA_HEADERS, timeout=TIMEOUT, allow_redirects=True)
        if r.status_code >= 400:
            return "No definido"
        html = r.text
    except Exception:
        # Deliberate best-effort: a failed download just means "unknown".
        return "No definido"

    m = re.search(
        r"Frecuencia\s+de\s+actualizaci[oó]n.*?</[^>]+>\s*([^<\n\r]+)",
        html,
        flags=re.I | re.S
    )
    if m:
        val = re.sub(r"\s{2,}", " ", m.group(1).strip())
        val = val[:80].strip()
        # The capture can consist of indentation only; treat that as "not found".
        if val:
            return val

    return "No definido"


# -------------------------------
# CKAN fetch
# -------------------------------
def ckan_package_search(start=0, rows=ROWS):
    """Fetch one page of datasets from the CKAN ``package_search`` action.

    Args:
        start: Offset of the first dataset to return.
        rows: Number of datasets requested for the page.

    Returns:
        The ``result`` object of the API response.

    Raises:
        RuntimeError: If the API reports ``success`` as falsy (and whatever
            ``safe_get`` raises for transport or content-type problems).
    """
    endpoint = urljoin(API_BASE, "package_search")
    payload = safe_get(endpoint, params={"start": start, "rows": rows}).json()
    if payload.get("success"):
        return payload["result"]
    raise RuntimeError(f"CKAN package_search no success: {payload}")

def ckan_package_show(ds_id):
    """Fetch the full record of one dataset from CKAN ``package_show``.

    Args:
        ds_id: Dataset id or slug, sent as the ``id`` query parameter.

    Returns:
        The ``result`` object of the API response.

    Raises:
        RuntimeError: If the API reports ``success`` as falsy (and whatever
            ``safe_get`` raises for transport or content-type problems).
    """
    endpoint = urljoin(API_BASE, "package_show")
    payload = safe_get(endpoint, params={"id": ds_id}).json()
    if payload.get("success"):
        return payload["result"]
    raise RuntimeError(f"CKAN package_show no success: {ds_id}")

# -------------------------------
# Dataset evaluation -> output row
# -------------------------------
def evaluate_dataset_ckan(ds: dict, portal_has_dcat: bool):
    """Turn one CKAN dataset record into a flat row of metadata indicators.

    Besides reading the record itself, this function performs network
    requests: one probe of the dataset's RDF document, up to two downloads to
    discover the update frequency (RDF, then landing page) when the extras do
    not state it, and probes of the download URLs until one is reachable.

    Args:
        ds: Dataset record as returned by ``package_show``.
        portal_has_dcat: Whether the portal exposes a DCAT catalogue; copied
            into the row as a 0/1 flag.

    Returns:
        A dict with one entry per output column. Flags are 0/1 integers and
        missing text values are empty strings (``"No definido"`` for the
        category and the update frequency).
    """
    extras = extras_to_dict(ds.get("extras", []))

    ds_id = ds.get("id", "") or ""
    name = ds.get("name", "") or ""
    title = ds.get("title", "") or ""
    notes = ds.get("notes", "") or ""
    groups = ds.get("groups") or []
    tags = ds.get("tags") or []

    category, category_source = classify_category_ckan(title=title, notes=notes, groups=groups, tags=tags)
    uses_controlled_vocab, controlled_vocab_source = detect_controlled_vocab_ckan(groups=groups, tags=tags)

    # Dates: prefer the explicit issued/modified metadata and fall back to the
    # CKAN record timestamps when they are missing.
    issued_raw = extras.get("issued") or extras.get("dct:issued") or ds.get("issued") or ds.get("metadata_created") or ""
    modified_raw = extras.get("modified") or extras.get("dct:modified") or ds.get("modified") or ds.get("metadata_modified") or ""

    # Only the publication date is parsed: it drives the age criterion, while
    # the row itself reports both dates exactly as received.
    meets_age_flag = 1 if meets_age(parse_dt(issued_raw)) else 0

    # Publisher: owning organisation first, then the author field.
    publisher = ""
    if isinstance(ds.get("organization"), dict):
        publisher = ds["organization"].get("title") or ds["organization"].get("name") or ""
    publisher = publisher or ds.get("author", "") or ""

    # Licence
    license_text = ds.get("license_title") or ds.get("license_id") or ""
    license_present = 1 if str(license_text).strip() else 0
    license_open = is_open_license(license_text)

    # Identifiers
    identifier = extract_identifier(ds, extras)
    doi = extract_doi(ds, extras)
    has_doi = 1 if doi else 0

    # Resources: collect formats, download URLs and free text that may hint at
    # documentation (URLs plus resource descriptions/names).
    resources = ds.get("resources") or []
    formats = []
    download_urls = []
    doc_hints = []

    for r in resources:
        if not isinstance(r, dict):
            continue
        fmt = (r.get("format") or "").strip()
        if fmt:
            formats.append(fmt.upper())

        u = (r.get("url") or "").strip()
        if u:
            download_urls.append(u)
            doc_hints.append(u)

        desc_r = (r.get("description") or "") + " " + (r.get("name") or "")
        if desc_r.strip():
            doc_hints.append(desc_r)

    formats_unique = sorted(set(formats))

    # metadata_rdf_available: check that the dataset's RDF document exists.
    # When it does, RDF is counted among the dataset's formats as well.
    rdf_meta_url = RDF_TEMPLATE.format(name=name) if name else ""
    metadata_rdf_available = 1 if (rdf_meta_url and head_or_get_public(rdf_meta_url)) else 0
    if metadata_rdf_available and "RDF" not in formats_unique:
        formats_unique = sorted(formats_unique + ["RDF"])

    format_join = ", ".join(formats_unique)
    n_formats = len(formats_unique)

    has_allowed_format = 1 if any(f in OPEN_FORMATS for f in formats_unique) else 0
    has_semantic_serialization = 1 if any(f in SEMANTIC_FORMATS for f in formats_unique) else 0

    # Data dictionary: hinted at by the title/description, by resource
    # URLs/descriptions, or by the name of an extras key.
    extras_key_hint = any(
        token in str(key).lower()
        for key in extras
        for token in ("schema", "dictionary", "diccionario")
    )
    has_data_dictionary = 1 if (
        contains_any(title + " " + notes, DATA_DICT_PATTERNS) or
        contains_any(" ".join(doc_hints), DATA_DICT_PATTERNS) or
        extras_key_hint
    ) else 0

    # dataset_uri: public landing page built from the slug; the record's own
    # URL is used only when there is no slug.
    dataset_uri = DATASET_LANDING_TEMPLATE.format(name=name) if name else (ds.get("url") or "")

    # First download URL, reported on its own next to the full list.
    download_url = download_urls[0] if download_urls else ""

    # update_frequency: extras -> RDF -> landing-page HTML
    update_frequency = find_update_frequency(extras)
    if update_frequency == "No definido":
        update_frequency = find_update_frequency_from_rdf(name)  # takes the slug
    if update_frequency == "No definido":
        update_frequency = find_update_frequency_from_catalog_html(dataset_uri)  # takes the real URL

    frequency_documented = 1 if (update_frequency.strip() and update_frequency != "No definido") else 0

    # public_access_ok: at least one download URL answers without credentials
    # (probing stops at the first reachable one).
    public_access_ok = 1 if any(head_or_get_public(u) for u in download_urls) else 0

    # Columns may be added or removed here as long as their values are
    # defined above.
    return {
        # ===== required columns (do not change) =====
        "portal": PORTAL,
        "api_type": API_TYPE,
        "portal_has_api_rest": 1,
        "portal_supports_dcat_dcatap": 1 if portal_has_dcat else 0,

        "identifier": identifier,
        "doi": doi,
        "has_doi": has_doi,

        "publisher": publisher,

        "download_url": download_url,
        "download_urls": " | ".join(download_urls),

        "license": license_text,
        "license_present": license_present,
        "license_open": license_open,

        # ===== core columns of the item-1 table =====
        "dataset_id": ds_id,
        "dataset_uri": dataset_uri,
        "title": title,
        "description": notes,
        "category": category,
        "category_source": category_source,

        "uses_controlled_vocab": 1 if uses_controlled_vocab else 0,
        "controlled_vocab_source": controlled_vocab_source,

        "issued": issued_raw,
        "modified": modified_raw,
        "meets_age_criterion": meets_age_flag,

        "format": format_join,
        "n_formats": n_formats,
        "metadata_rdf_available": metadata_rdf_available,

        "has_allowed_format": has_allowed_format,
        "has_semantic_serialization": has_semantic_serialization,
        "has_data_dictionary": has_data_dictionary,

        "update_frequency": update_frequency,
        "frequency_documented": frequency_documented,

        "public_access_ok": public_access_ok,
    }

def main():
    """Crawl the portal page by page and export the metadata table.

    Datasets are listed with ``package_search``, each one is re-fetched with
    ``package_show`` and evaluated, and the crawl stops when the catalogue is
    exhausted or ``MAX_DATASETS`` rows have been collected. The rows are then
    written to a CSV and an XLSX file and a short summary is printed.
    """
    portal_has_dcat = portal_supports_dcat(SITE_BASE)

    records = []
    offset = 0
    total_count = None
    limit_reached = False

    while True:
        page = ckan_package_search(start=offset, rows=ROWS)
        if total_count is None:
            # The catalogue size is read once, from the first page.
            total_count = int(page.get("count", 0))

        batch = page.get("results", []) or []
        if not batch:
            break

        for summary in batch:
            full_record = ckan_package_show(summary.get("id") or summary.get("name"))
            records.append(evaluate_dataset_ckan(full_record, portal_has_dcat))
            time.sleep(SLEEP)

            if MAX_DATASETS and len(records) >= MAX_DATASETS:
                limit_reached = True
                break

        if limit_reached:
            break

        offset += ROWS
        if offset >= total_count:
            break

    df = pd.DataFrame(records)

    # The names of the CSV and XLSX result files can be changed here.
    out_csv = "Punto1_NOMBREPORTAL.csv"
    out_xlsx = "Punto1_NOMBREPORTAL.xlsx"
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    df.to_excel(out_xlsx, index=False, engine="openpyxl")

    print(f"\n✅ Listo: {PORTAL}")
    print(f"Datasets extraídos: {len(df)} (MAX_DATASETS={MAX_DATASETS})")
    print(f"Archivos: {out_csv} | {out_xlsx}")

if __name__ == "__main__":
    main()


✅ Listo: Junta de Andalucía
Datasets extraídos: 600 (MAX_DATASETS=600)
Archivos: Punto1_Ayto_Andalucia_v1.csv | Punto1_Ayto_Andalucia_v1.xlsx
