In [None]:
import pandas as pd
import requests
import re
import time
import random
from urllib.parse import quote
from deep_translator import GoogleTranslator

# =========================================================
# UTILIDADES
# =========================================================

def translate_to_en(text):
    """Translate *text* to English, falling back to the input on any failure.

    Uses ``GoogleTranslator`` with automatic source-language detection.

    Args:
        text: The text to translate.

    Returns:
        The translated string, or *text* unchanged when it is blank or not a
        string, when the translator raises, or when the translator does not
        return a non-empty string.
    """
    # Nothing to translate: skip the translator call entirely.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Best effort: translation is optional. ``Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return text
    # Guard against a None/empty result so callers always get usable text.
    if isinstance(translated, str) and translated:
        return translated
    return text

def clean_text(text):
    """Normalize *text* into lowercase ASCII alphanumerics separated by single spaces.

    Every character other than ``a-z``, ``A-Z``, ``0-9`` and the space is
    replaced by a space (this includes accented letters), runs of whitespace
    are collapsed, and the result is trimmed.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    t = re.sub(r"[^a-zA-Z0-9 ]", " ", text.lower())
    # Strip AFTER substituting: leading/trailing punctuation turns into spaces
    # that would otherwise survive at the edges of the result.
    return re.sub(r"\s+", " ", t).strip()


def extract_keywords(title, description, k=6):
    """Return up to *k* distinct keywords taken from the title and description.

    The two texts are joined, normalized with ``clean_text`` and split on
    whitespace. A word is kept when it is longer than four characters and is
    not in the stop list; duplicates are dropped, keeping first-seen order.

    Args:
        title: Dataset title.
        description: Dataset description.
        k: Maximum number of keywords to return.

    Returns:
        A list with at most *k* keywords.
    """
    stop = {"the", "of", "and", "for", "data", "study"}
    normalized = clean_text(f"{title} {description}")

    # A dict preserves insertion order, so it doubles as an ordered set.
    seen = {}
    for token in normalized.split():
        if len(token) <= 4 or token in stop:
            continue
        seen.setdefault(token, None)

    return list(seen)[:k]


# =========================================================
# REUSE SOURCES
# =========================================================

def search_crossref(query):
    """Query the Crossref works API and return up to five normalized hits.

    Args:
        query: Free-text query; it is URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``source``, ``external_id``, ``title``,
        ``url`` and ``date``. Returns ``[]`` on any failure (invalid query,
        network error, HTTP error status, unexpected payload).
    """
    try:
        url = f"https://api.crossref.org/works?query={quote(query)}&rows=5"
        response = requests.get(url, timeout=10)
        # Treat HTTP errors as failures explicitly instead of relying on the
        # error payload happening to lack the expected keys.
        response.raise_for_status()
        items = response.json().get("message", {}).get("items", [])
        return [{
            "source": "crossref",
            "external_id": it.get("DOI"),
            # ``title`` is a list here; tolerate a missing or null value.
            "title": " | ".join(it.get("title") or []),
            "url": it.get("URL"),
            "date": (it.get("created") or {}).get("date-time", "")
        } for it in items]
    except Exception:
        # Best effort: one failing source must not abort the whole search.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
        return []


def search_openalex(query):
    """Query the OpenAlex works API by title and return up to five normalized hits.

    Args:
        query: Text used in the ``title.search`` filter; it is URL-quoted
            before being sent.

    Returns:
        A list of dicts with the keys ``source``, ``external_id``, ``title``,
        ``url`` (taken from the record's ``doi`` field) and ``date``. Returns
        ``[]`` on any failure (invalid query, network error, HTTP error
        status, unexpected payload).
    """
    try:
        url = f"https://api.openalex.org/works?filter=title.search:{quote(query)}&per-page=5"
        response = requests.get(url, timeout=10)
        # Treat HTTP errors as failures explicitly instead of relying on the
        # error payload happening to lack the expected keys.
        response.raise_for_status()
        items = response.json().get("results") or []
        return [{
            "source": "openalex",
            "external_id": it.get("id"),
            "title": it.get("title"),
            "url": it.get("doi"),
            "date": it.get("publication_date")
        } for it in items]
    except Exception:
        # Best effort: one failing source must not abort the whole search.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
        return []


def search_zenodo(query):
    """Query the Zenodo records API and return up to five normalized hits.

    Args:
        query: Free-text query; it is URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``source``, ``external_id``, ``title``,
        ``url`` and ``date``. ``url`` is built from the record's DOI, or is
        ``None`` when the record has no DOI. Returns ``[]`` on any failure
        (invalid query, network error, HTTP error status, unexpected payload).
    """
    def doi_to_url(doi):
        # The metadata DOI is expected to be a bare identifier, so turn it
        # into a resolvable link (assumption from the Zenodo API docs — TODO
        # confirm). Values that already are URLs are passed through.
        if not doi:
            return None
        doi = str(doi)
        return doi if doi.startswith("http") else f"https://doi.org/{doi}"

    try:
        url = f"https://zenodo.org/api/records/?q={quote(query)}&size=5"
        response = requests.get(url, timeout=10)
        # Treat HTTP errors as failures explicitly instead of relying on the
        # error payload happening to lack the expected keys.
        response.raise_for_status()
        items = response.json().get("hits", {}).get("hits", [])
        hits = []
        for it in items:
            meta = it.get("metadata") or {}
            hits.append({
                "source": "zenodo",
                "external_id": it.get("id"),
                "title": meta.get("title"),
                "url": doi_to_url(meta.get("doi")),
                "date": meta.get("publication_date")
            })
        return hits
    except Exception:
        # Best effort: one failing source must not abort the whole search.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
        return []


def search_cordis(query):
    """Query the CORDIS search endpoint and return up to five normalized hits.

    Args:
        query: Free-text query; it is URL-quoted before being sent.

    Returns:
        A list of dicts with the keys ``source``, ``external_id``, ``title``,
        ``url`` and ``date``. ``url`` is the project page built from the
        project id, or ``None`` when the id is missing. Returns ``[]`` on any
        failure (invalid query, network error, HTTP error status, unexpected
        payload).
    """
    # NOTE(review): the endpoint and the ``projects`` response key are taken
    # as-is from the original code — confirm against the CORDIS API.
    try:
        url = f"https://cordis.europa.eu/api/search?q={quote(query)}&num=5"
        response = requests.get(url, timeout=10)
        # Treat HTTP errors as failures explicitly instead of relying on the
        # error payload happening to lack the expected keys.
        response.raise_for_status()
        items = response.json().get("projects") or []
        hits = []
        for it in items:
            project_id = it.get("id")
            hits.append({
                "source": "cordis",
                "external_id": project_id,
                "title": it.get("title"),
                # Avoid emitting a bogus ".../project/id/None" link.
                "url": (
                    f"https://cordis.europa.eu/project/id/{project_id}"
                    if project_id is not None else None
                ),
                "date": it.get("startDate")
            })
        return hits
    except Exception:
        # Best effort: one failing source must not abort the whole search.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
        return []


def search_github(query):
    """Search GitHub code for *query* and return up to five normalized hits.

    If the ``GITHUB_TOKEN`` environment variable is set, it is sent as a
    bearer token; otherwise the request is sent without credentials, exactly
    as before.

    Args:
        query: Text to look for in file contents; it is URL-quoted before
            being sent.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (repository
        full name), ``title`` (file name), ``url`` and ``date``. Returns
        ``[]`` on any failure (invalid query, network error, HTTP error
        status, unexpected payload).
    """
    import os  # local import: only this function needs the environment

    try:
        url = f"https://api.github.com/search/code?q={quote(query)}+in:file&per_page=5"
        # NOTE(review): GitHub's REST documentation states that the code
        # search endpoint requires authentication, so without a token this
        # source is expected to return nothing — confirm and provide a token.
        token = os.environ.get("GITHUB_TOKEN")
        headers = {"Authorization": f"Bearer {token}"} if token else None
        response = requests.get(url, headers=headers, timeout=10)
        # Surface 401/403 (auth, rate limit) as a failure explicitly.
        response.raise_for_status()
        res = response.json().get("items") or []
        hits = []
        for it in res:
            repo = it.get("repository") or {}
            hits.append({
                "source": "github",
                "external_id": repo.get("full_name"),
                "title": it.get("name"),
                "url": it.get("html_url"),
                "date": repo.get("pushed_at")
            })
        return hits
    except Exception:
        # Best effort: one failing source must not abort the whole search.
        # ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
        return []


# =========================================================
# MAIN SEARCH
# =========================================================

def search_reuse(dataset_uri, title, description, doi):
    """Search every external source for evidence that a dataset was reused.

    Queries are tried in this order, each tagged with a match type:
    the DOI (``DOI_MATCH``), the cleaned title (``TITLE_ES``), its English
    translation (``TITLE_EN``), the dataset URI (``URI_MATCH``) and each
    keyword extracted from title and description (``KEYWORD``).

    Args:
        dataset_uri: URI of the dataset.
        title: Dataset title.
        description: Dataset description.
        doi: Dataset DOI; may be empty or missing.

    Returns:
        A list of hit dicts as produced by the ``search_*`` functions, each
        extended with ``match_type`` and ``query_used``.
    """
    title_clean = clean_text(title)
    title_en = translate_to_en(title_clean)
    keywords = extract_keywords(title, description)

    candidates = [
        (doi, "DOI_MATCH"),
        (title_clean, "TITLE_ES"),
        (title_en, "TITLE_EN"),
        (dataset_uri, "URI_MATCH"),
    ]
    candidates.extend((kw, "KEYWORD") for kw in keywords)

    # Keep only non-blank strings. Empty spreadsheet cells arrive as float
    # NaN (which is truthy) and the translator may yield None; calling
    # ``.strip()`` on either would raise AttributeError.
    queries = [
        (q, match_type)
        for q, match_type in candidates
        if isinstance(q, str) and q.strip()
    ]

    sources = (
        search_crossref,
        search_openalex,
        search_zenodo,
        search_cordis,
        search_github,
    )

    results = []
    hits_by_query = {}

    for q, match_type in queries:
        # The same text can appear under several match types (e.g. a title
        # that is already in English). Hit the network once per distinct
        # text and reuse the hits for the other labels.
        if q not in hits_by_query:
            hits_by_query[q] = [hit for fn in sources for hit in fn(q)]
            # Jittered pause between query rounds to be gentle on the APIs.
            time.sleep(0.25 + random.random() * 0.25)

        # Copy each hit so cached dicts are never relabelled in place.
        results.extend(
            dict(hit, match_type=match_type, query_used=q)
            for hit in hits_by_query[q]
        )

    return results


# =========================================================
# MAIN
# =========================================================

def main():
    """Search reuse evidence for every dataset in the input workbook and save it.

    Reads ``INPUT`` with pandas, calls ``search_reuse`` once per row using the
    ``dataset_uri``, ``title``, ``description`` and (optional) ``doi``
    columns, and writes one row per hit to ``OUTPUT``. The ``issued`` column
    of the input is copied to every output row.
    """

    INPUT = "Punto1_UKDA_PortalV1 - copia.xlsx"
    OUTPUT = "Punto2_UK_ReutilizacionV1.xlsx"

    # Fixed column order; also guarantees a header row when nothing is found.
    COLUMNS = [
        "dataset_uri",
        "title_dataset",
        "issued",
        "source",
        "external_id",
        "match_type",
        "query_used",
        "source_title",
        "url",
        "date",
    ]

    def as_text(value):
        # Empty cells are read as NaN; map them to "" so they are neither
        # searched as the literal text "nan" nor passed on as floats.
        return "" if pd.isna(value) else str(value)

    df = pd.read_excel(INPUT)
    rows = []

    for _, r in df.iterrows():
        dataset_uri = as_text(r["dataset_uri"])
        title = as_text(r["title"])
        description = as_text(r["description"])
        doi = as_text(r.get("doi", ""))

        print(f"🔎 Buscando reutilización: {title}")

        hits = search_reuse(dataset_uri, title, description, doi)

        rows.extend(
            {
                "dataset_uri": dataset_uri,
                "title_dataset": title,
                "issued": r["issued"],
                "source": h.get("source"),
                "external_id": h.get("external_id"),
                "match_type": h.get("match_type"),
                "query_used": h.get("query_used"),
                "source_title": h.get("title"),
                "url": h.get("url"),
                "date": h.get("date"),
            }
            for h in hits
        )

    out = pd.DataFrame(rows, columns=COLUMNS)
    out.to_excel(OUTPUT, index=False)

    print("\n✔ Archivo generado:", OUTPUT)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()