# In [None]:  (notebook cell marker left over from the export)
import re
from datetime import datetime, timedelta

import pandas as pd
import requests

# ===========================================
# CONFIG
# ===========================================

# Label written to the "portal" column of every exported row.
PORTAL_NAME = "UK Research Datasets (Crossref + OpenAlex)"
# Path of the Excel workbook produced by main().
OUTPUT_FILE = "Punto1_UKDA_PortalV2.xlsx"

# Minimum dataset age. A month is approximated as 30 days, so 12 months
# corresponds to a 360-day cutoff.
MIN_AGE_MONTHS = 12
TODAY = datetime.now()
AGE_LIMIT = TODAY - timedelta(days=30 * MIN_AGE_MONTHS)

# ===========================================
# CATEGORIZATION
# ===========================================

# Category label -> lowercase keywords. classify_dataset() walks this mapping
# in insertion order and returns the first category with a keyword that occurs
# as a substring of the dataset text, so the order of the entries matters.
CATEGORIES = {
    "Social & Household Surveys": [
        "survey",
        "longitudinal",
        "household",
        "panel",
        "cohort",
        "social survey",
        "ukhls",
        "british cohort",
        "bhps",
    ],
    "Economic & Labour Market": [
        "economic",
        "business",
        "productivity",
        "labour",
        "employment",
        "industry",
    ],
    "Health & Education": [
        "health",
        "epidemiology",
        "wellbeing",
        "education",
        "school",
        "hospital",
    ],
    "Environment & Census": [
        "census",
        "climate",
        "biodiversity",
        "environment",
        "demography",
        "population",
    ],
}

# Institution names and acronyms that institution_match() looks for
# (case-insensitively) in the stringified author/affiliation data.
INSTITUTIONS = [
    # Full names
    "UK Data Archive", "UK Data Service", "University of Essex",
    # Acronym, followed by the full name and acronym of the statistics office
    "ESRC", "Office for National Statistics", "ONS",
    # Remaining short markers
    "UCL", "British Cohort",
]


# ===========================================
# HELPERS
# ===========================================

def safe_date(x):
    """Parse the leading ``YYYY-MM-DD`` part of a date string.

    Only the first 10 characters are inspected, so full ISO timestamps such
    as ``"2020-01-15T10:00:00Z"`` are accepted as well as plain dates.

    Args:
        x: Date or timestamp string. Any non-string value (including
            ``None``) is treated as "no date".

    Returns:
        A naive ``datetime`` at midnight of the parsed day, or ``None`` when
        the value is missing or does not start with a valid date.
    """
    if not isinstance(x, str):
        return None
    try:
        return datetime.strptime(x[:10], "%Y-%m-%d")
    except ValueError:
        # Catch only parse failures; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return None

def classify_dataset(title, description, categories=None):
    """Assign a dataset to the first category whose keyword appears in its text.

    Title and description are joined and lowercased; each category's keywords
    are then tested as plain substrings, in the mapping's iteration order.

    Args:
        title: Dataset title. ``None`` or empty is treated as no text.
        description: Dataset description. ``None`` or empty is treated as no
            text.
        categories: Optional mapping of category label to an iterable of
            lowercase keywords. Defaults to the module-level ``CATEGORIES``.

    Returns:
        The label of the first matching category, or ``"Other"`` when no
        keyword matches.
    """
    if categories is None:
        categories = CATEGORIES
    # ``or ''`` keeps a missing title/description from raising on ``.lower()``.
    text = f"{title or ''} {description or ''}".lower()
    for cat, keywords in categories.items():
        if any(k in text for k in keywords):
            return cat
    return "Other"


def institution_match(affiliations, institutions=None):
    """Report whether any known institution is named in the affiliation data.

    The affiliation data is stringified and lowercased, then each institution
    name is searched for as a whole word or phrase. Whole-word matching
    matters for the short acronyms: a plain substring test finds "ons" inside
    "institutions" and "ucl" inside "nuclear".

    Args:
        affiliations: Affiliation data of any type; it is converted with
            ``str()``. Falsy values (``None``, ``""``, ``[]``) never match.
        institutions: Optional iterable of institution names. Defaults to the
            module-level ``INSTITUTIONS``.

    Returns:
        ``True`` if at least one institution name occurs in the text,
        otherwise ``False``.
    """
    if not affiliations:
        return False
    if institutions is None:
        institutions = INSTITUTIONS
    text = str(affiliations).lower()
    # Lookarounds instead of ``\b`` so names that start or end with a
    # non-word character would still be delimited correctly.
    return any(
        re.search(r"(?<!\w)" + re.escape(inst.lower()) + r"(?!\w)", text)
        for inst in institutions
    )


# ===========================================
# CROSSREF QUERY
# ===========================================

def fetch_crossref_datasets():
    """Download one page of dataset-type works from the Crossref REST API.

    Requests up to 1000 rows with ``filter=type:dataset`` and a 20-second
    timeout. No paging is performed.

    Returns:
        The list found under ``message.items`` in the JSON response, or an
        empty list when that part of the payload is missing.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = (
        "https://api.crossref.org/works?"
        "filter=type:dataset&rows=1000"
    )
    response = requests.get(url, timeout=20)
    # Fail loudly on HTTP errors instead of trying to read an error payload
    # as if it were a result page.
    response.raise_for_status()
    message = response.json().get("message")
    # Guard against a non-dict ``message``, which would otherwise raise
    # AttributeError on ``.get``.
    if not isinstance(message, dict):
        return []
    return message.get("items") or []


# ===========================================
# OPENALEX QUERY
# ===========================================

def fetch_openalex_datasets():
    """Download one page of dataset-type works from the OpenAlex API.

    Requests up to 200 works with ``filter=type:dataset`` and a 20-second
    timeout. No paging is performed.

    Returns:
        The list found under ``results`` in the JSON response, or an empty
        list when that key is missing or null.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = (
        "https://api.openalex.org/works?"
        "filter=type:dataset&per-page=200"
    )
    response = requests.get(url, timeout=20)
    # Without this check an error response silently turns into "no results".
    response.raise_for_status()
    return response.json().get("results") or []


# ===========================================
# MAIN
# ===========================================

# Column order of the exported sheet. Passed explicitly to the DataFrame so
# the header row is written even when no dataset passes the filters.
_OUTPUT_COLUMNS = [
    "portal", "category", "dataset_uri", "title", "description", "issued",
    "modified", "publisher", "format", "download_url", "license", "source",
]

# Resolver prefixes that may already be present on a DOI value.
_DOI_URL_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
)


def _bare_doi(doi):
    """Return the DOI without a leading resolver URL, if it has one."""
    doi = str(doi).strip()
    lowered = doi.lower()
    for prefix in _DOI_URL_PREFIXES:
        if lowered.startswith(prefix):
            return doi[len(prefix):]
    return doi


def _abstract_from_inverted_index(index):
    """Rebuild plain text from a ``{word: [positions]}`` inverted index."""
    if not isinstance(index, dict):
        return ""
    placed = sorted(
        (pos, word)
        for word, positions in index.items()
        for pos in (positions or ())
    )
    return " ".join(word for _, word in placed)


def _make_row(source, doi, title, desc, issued, aff):
    """Apply the selection criteria and build one output row.

    Returns ``None`` when the record has no DOI, has no parseable date, is
    newer than ``AGE_LIMIT``, or names none of the known institutions.
    """
    if not doi or not issued or issued > AGE_LIMIT:
        return None
    if not institution_match(aff):
        return None

    uri = f"https://doi.org/{_bare_doi(doi)}"
    return {
        "portal": PORTAL_NAME,
        "category": classify_dataset(title, desc),
        "dataset_uri": uri,
        "title": title,
        "description": desc,
        "issued": issued.strftime("%Y-%m-%d"),
        "modified": "",
        "publisher": aff,
        "format": "DOI-DATASET",
        "download_url": uri,
        "license": "",
        "source": source,
    }


def main():
    """Fetch datasets from Crossref and OpenAlex, filter them, export to Excel.

    A record is kept only if it has a DOI, a parseable date on or before
    ``AGE_LIMIT``, and author/affiliation data that names one of the known
    institutions. Kept records are categorised and written to ``OUTPUT_FILE``;
    progress messages are printed along the way.
    """
    print("\n🔎 Descargando datasets de Crossref…")
    crossref_items = fetch_crossref_datasets()

    print("🔎 Descargando datasets de OpenAlex…")
    openalex_items = fetch_openalex_datasets()

    rows = []

    print("\n📌 Procesando datasets…")

    # ---------------------------------------------------
    # 1) PROCESS CROSSREF
    # ---------------------------------------------------
    for it in crossref_items:
        row = _make_row(
            source="crossref",
            doi=it.get("DOI"),
            title=" | ".join(it.get("title") or []),
            desc=it.get("abstract") or "",
            # The record's "created" timestamp is used as the issue date.
            issued=safe_date((it.get("created") or {}).get("date-time", "")),
            aff=str(it.get("author", "")),
        )
        if row is not None:
            rows.append(row)

    # ---------------------------------------------------
    # 2) PROCESS OPENALEX
    # ---------------------------------------------------
    for it in openalex_items:
        row = _make_row(
            source="openalex",
            # A DOI that is already a resolver URL is normalised in
            # _make_row so the prefix is not added twice.
            doi=it.get("doi"),
            # The title may be present but null.
            title=it.get("title") or "",
            # The abstract is stored as an inverted index; turn it into text
            # so it can be classified and written to a spreadsheet cell.
            desc=_abstract_from_inverted_index(it.get("abstract_inverted_index")),
            issued=safe_date(it.get("publication_date", "")),
            aff=str(it.get("authorships", "")),
        )
        if row is not None:
            rows.append(row)

    df = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
    df.to_excel(OUTPUT_FILE, index=False)

    print(f"\n✔ Archivo generado: {OUTPUT_FILE}")
    print(f"📊 Total datasets finales: {len(df)}")


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()