In [1]:
# ===============================
# PORTAL NAME – OPENDATASOFT   (template; example portal: Junta de Castilla y León, analisis.datosabiertos.jcyl.es)

# ===============================

import re
import time
import requests
import pandas as pd
from datetime import datetime, timezone
import unicodedata

# Portal-specific configuration: edit these values to point the script at
# another Opendatasoft portal.
PORTAL = "PORTAL"             # label written to the "portal" column and printed in the final summary
API_TYPE = "OPENDATASOFT"     # API type label written to the "api_type" column
API_BASE = "https://analisis.datosabiertos.jcyl.es/api/explore/v2.1"      # base URL of the catalogue API
SITE_BASE = "https://analisis.datosabiertos.jcyl.es"                     # base URL of the portal site; used to build dataset page, CSV download and DCAT metadata URLs

TIMEOUT = 45                    # timeout passed to every requests call (seconds)
SLEEP = 0.05                    # value passed to time.sleep() after each catalogue page
LIMIT = 100                     # page size ("limit" parameter) of catalogue listing requests
MAX_DATASETS = 600              # cap on the number of datasets harvested; a falsy value disables the cap

UA_HEADERS = {"User-Agent": "TFM-IIP-Metadata-Extractor/1.0"}  # headers sent with every HTTP request

# ---------------- Helpers ----------------

def safe_get(url, params=None):
    """Perform a GET request and return the response object.

    The request carries the module's ``UA_HEADERS`` and ``TIMEOUT``.
    ``raise_for_status()`` is called before returning, so an HTTP error
    status surfaces as an exception instead of a response.

    Args:
        url: Address to request.
        params: Optional query-string parameters.

    Returns:
        The ``requests`` response for a successful request.
    """
    response = requests.get(
        url,
        headers=UA_HEADERS,
        params=params,
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    return response

def pick_first(*vals, default=""):
    """Return the first non-blank string among *vals*, stripped.

    Non-string values and strings that are empty or whitespace-only are
    skipped. When nothing qualifies, *default* is returned unchanged.
    """
    stripped_candidates = (v.strip() for v in vals if isinstance(v, str))
    return next((text for text in stripped_candidates if text), default)

def parse_dt(s):
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    A ``Z`` in the text is rewritten to ``+00:00`` before parsing. A value
    parsed without timezone information is tagged as UTC.

    Returns:
        The aware ``datetime``, or ``None`` when *s* is falsy or cannot be
        parsed.
    """
    if not s:
        return None
    try:
        parsed = datetime.fromisoformat(str(s).replace("Z", "+00:00"))
    except Exception:
        # Best-effort parsing: any failure means "no usable date".
        return None
    if parsed.tzinfo:
        return parsed
    return parsed.replace(tzinfo=timezone.utc)

def meets_age_criterion(issued_dt, years=1):
    """Tell whether *issued_dt* lies at least *years* years in the past.

    A year is counted as 365 days and the comparison uses whole elapsed
    days measured against the current UTC time.

    Args:
        issued_dt: Timezone-aware ``datetime``; a falsy value yields ``False``.
        years: Minimum age, in years.
    """
    if not issued_dt:
        return False
    elapsed = datetime.now(timezone.utc) - issued_dt
    required_days = 365 * years
    return elapsed.days >= required_days

def head_or_get_public(url):
    """Check whether *url* answers with a success or redirect status.

    A HEAD request is tried first. When the server answers HEAD with 403 or
    405, a GET request is issued instead and its status decides the result.

    Args:
        url: Address to probe. Falsy values and strings that do not start
            with ``"http"`` are rejected without any network traffic.

    Returns:
        ``True`` when the deciding response status is in the 200-399 range,
        ``False`` otherwise, including when the request raises.
    """
    if not url or not url.startswith("http"):
        return False
    try:
        head = requests.head(
            url, headers=UA_HEADERS, timeout=TIMEOUT, allow_redirects=True
        )
        if head.status_code not in (403, 405):
            return 200 <= head.status_code < 400
        # stream=True defers the body download: only the status line and
        # headers are needed, and the target may be a full data export.
        # The context manager releases the connection afterwards.
        with requests.get(
            url,
            headers=UA_HEADERS,
            timeout=TIMEOUT,
            allow_redirects=True,
            stream=True,
        ) as fallback:
            return 200 <= fallback.status_code < 400
    except Exception:
        # Deliberate best-effort probe: any failure counts as "not public".
        return False

def is_open_license(text):
    """Classify a licence description as open (1) or not open (0).

    The text is lower-cased and runs of whitespace, hyphens and underscores
    are collapsed to single spaces, so "CC-BY", "cc_by" and "CC BY" are
    treated alike. Restriction markers take precedence over open markers.

    Markers are matched as whole words. A plain substring test for "nd"
    would also fire on ordinary words such as "and" or "condiciones" and
    reject licences that are in fact open.

    Args:
        text: Licence name or description; ``None`` or empty yields 0.

    Returns:
        1 when an open marker (CC BY, CC0, public domain, "aviso legal") is
        present and no non-commercial / no-derivatives marker is; 0 otherwise.
    """
    s = (text or "").lower()
    if not s:
        return 0

    normalized = re.sub(r"[\s_-]+", " ", s).strip()

    # Non-commercial / no-derivatives clauses, abbreviated or spelled out.
    restricted = re.search(
        r"\b(?:nc|nd|non ?commercial|no comercial|no ?derivatives|no ?derivs)\b",
        normalized,
    )
    if restricted:
        return 0

    open_marker = re.search(
        r"\bcc ?by\b|\bcc ?0\b|public domain|aviso legal",
        normalized,
    )
    return 1 if open_marker else 0

def license_bucket_p1(text):
    """Map a licence description to one of three bucket labels.

    Returns:
        ``"Vacío legal"`` when *text* is falsy, ``"Apertura total"`` when
        ``is_open_license`` accepts it, and ``"Restringida"`` otherwise.
    """
    if not text:
        return "Vacío legal"
    if is_open_license(text):
        return "Apertura total"
    return "Restringida"

# ---------------- Fetch ODS ----------------

def fetch_datasets():
    """Collect dataset entries from the catalogue listing, page by page.

    Pages of ``LIMIT`` entries are requested at increasing offsets until a
    page comes back with no results. After each non-empty page the function
    sleeps for ``SLEEP`` and then, when ``MAX_DATASETS`` is truthy and has
    been reached, returns the first ``MAX_DATASETS`` entries.

    Returns:
        List of the entries found under the ``"results"`` key of each page.
    """
    endpoint = f"{API_BASE}/catalog/datasets"

    def load_page(start):
        # One catalogue page beginning at *start*; a missing key counts as empty.
        payload = safe_get(endpoint, params={"limit": LIMIT, "offset": start}).json()
        return payload.get("results", [])

    collected = []
    position = 0
    page = load_page(position)
    while page:
        collected.extend(page)
        position += LIMIT
        time.sleep(SLEEP)
        if MAX_DATASETS and len(collected) >= MAX_DATASETS:
            return collected[:MAX_DATASETS]
        page = load_page(position)
    return collected

def fetch_dataset_detail(dataset_id):
    """Request the catalogue record of one dataset and return the decoded JSON."""
    detail_url = f"{API_BASE}/catalog/datasets/{dataset_id}"
    response = safe_get(detail_url)
    return response.json()

# ---------------- MAIN ----------------

def main():
    """Harvest per-dataset metadata from the portal and export it.

    For every entry returned by ``fetch_datasets`` the detail record is
    requested, descriptive fields are read from its ``metas`` section, the
    CSV download URL and the DCAT metadata URL are probed for public
    access, and one row is appended to the result table.

    The table is written to ``Punto1_NOMBREPORTAL.csv`` and
    ``Punto1_NOMBREPORTAL.xlsx`` in the working directory, and a summary
    line with the number of rows is printed.

    A failure of a single detail request does not abort the run; the
    catalogue listing entry is used for that dataset instead.
    """
    rows = []
    items = fetch_datasets()

    for it in items:
        dataset_id = it.get("dataset_id")
        if not dataset_id:
            continue

        try:
            detail = fetch_dataset_detail(dataset_id)
        except (requests.RequestException, ValueError) as exc:
            # One unreachable or malformed record must not discard the rows
            # already collected. NOTE(review): presumably the listing entry
            # carries the same "metas" structure as the detail record —
            # confirm against the API; if it does not, the row is kept with
            # empty metadata fields.
            print(f"⚠️ {dataset_id}: detail request failed ({exc}); using catalogue listing entry")
            detail = it

        # "or {}" also covers keys that are present with a null value.
        metas = detail.get("metas") or {}
        metas_default = metas.get("default") or {}
        metas_dcat = metas.get("dcat") or {}

        title = pick_first(metas_default.get("title"), dataset_id)
        notes = pick_first(metas_default.get("description"))

        publisher = pick_first(
            metas_default.get("publisher"),
            metas_dcat.get("publisher"),
            default="Junta de Castilla y León"
        )

        license_text = pick_first(
            metas_default.get("license"),
            metas_dcat.get("license")
        )

        issued = pick_first(
            metas_default.get("metadata_processed"),
            metas_default.get("created")
        )

        modified = pick_first(
            metas_default.get("modified"),
            metas_default.get("data_processed")
        )

        issued_dt = parse_dt(issued)

        update_frequency = pick_first(metas_default.get("accrualperiodicity"))
        frequency_documented = bool(update_frequency)

        dataset_uri = f"{SITE_BASE}/explore/dataset/{dataset_id}/"
        download_url = f"{SITE_BASE}/explore/dataset/{dataset_id}/download/?format=csv"

        # Public access is judged on the download URL only.
        download_url_ok = head_or_get_public(download_url)
        public_access_ok = 1 if download_url_ok else 0

        rdf_meta_url = f"{SITE_BASE}/api/v2/catalog/datasets/{dataset_id}?format=dcat"
        metadata_rdf_available = 1 if head_or_get_public(rdf_meta_url) else 0

        # Result columns may be added or removed here, provided each value
        # is defined above.
        rows.append({
            "portal": PORTAL,
            "api_type": API_TYPE,
            "portal_has_api_rest": 1,
            "portal_supports_dcat_dcatap": 1,

            "identifier": "",
            "doi": "",
            "has_doi": 0,

            "publisher": publisher,

            "download_url": download_url,
            "download_urls": download_url,

            "license": license_text,
            "license_present": bool(license_text),
            "license_open": is_open_license(license_text),
            "license_bucket": license_bucket_p1(license_text),

            "dataset_id": dataset_id,
            "dataset_uri": dataset_uri,

            "title": title,
            "description": notes,
            "category": "",
            "category_source": "",
            "uses_controlled_vocab": 0,
            "controlled_vocab_source": "",

            "issued": issued,
            "modified": modified,
            "meets_age_criterion": meets_age_criterion(issued_dt),

            "format": "CSV",
            "n_formats": 1,
            "metadata_rdf_available": metadata_rdf_available,

            "has_allowed_format": 1,
            "has_semantic_serialization": metadata_rdf_available,
            "has_data_dictionary": 0,
            "update_frequency": update_frequency,
            "frequency_documented": frequency_documented,
            "public_access_ok": public_access_ok,
        })

    # The names of the CSV and XLSX result files can be changed here.
    df = pd.DataFrame(rows)
    df.to_csv("Punto1_NOMBREPORTAL.csv", index=False, encoding="utf-8-sig")
    df.to_excel("Punto1_NOMBREPORTAL.xlsx", index=False)

    print(f"✅ {PORTAL} – datasets: {len(df)}")

# Run the harvest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

✅ Castilla y León – datasets: 419
