# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import requests
import re
import time
import random
from urllib.parse import quote, urlparse
from collections import Counter

# ==========================================================
# CONFIGURATION
# ==========================================================

INPUT_FILE  = "Punto1_NOMBREARCHIVORESULTADO.xlsx"   # result file of script 1 (read by this script)
OUTPUT_FILE = "Punto2_NOMBREARCHIVORESULTADO.xlsx"   # result file written by this script (script 2)
PORTAL_NAME = "NOMBRE DEL PORTAL"                   # portal name, only used in console messages

# Optional GitHub token: put it here for better results from the GitHub search
GITHUB_TOKEN = None  # "EJEMPLO_xxxxxxxxxxxxxxxxxxxxx"


# ==========================================================
# UTILITIES
# ==========================================================

def clean_text(text: str) -> str:
    """Normalize *text* for case-insensitive substring comparisons.

    Anything that is not a ``str`` (``None``, NaN, numbers, ...) yields an
    empty string. Otherwise the text is stripped, lower-cased and every run
    of whitespace is collapsed into a single space.
    """
    if isinstance(text, str):
        return re.sub(r"\s+", " ", text.strip().lower())
    return ""


def extract_dataset_id(dataset_uri: str) -> str:
    """Return the last path segment (slug) of *dataset_uri*.

    Trailing slashes are ignored, so ``".../catalogo/calle-cortada/"`` gives
    ``"calle-cortada"``. A value that is not a ``str`` (for example a NaN
    coming from an empty spreadsheet cell) yields ``""``.
    """
    # Explicit type guard instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt / SystemExit.
    if not isinstance(dataset_uri, str):
        return ""
    return dataset_uri.rstrip("/").rpartition("/")[2]


def extract_domain(dataset_uri: str) -> str:
    """Return the network location (host[:port]) of *dataset_uri*.

    Always returns a ``str``: ``""`` when the input is not a string, has no
    ``scheme://host`` part, or cannot be parsed as a URL.
    """
    # Guard the type explicitly: urlparse(None) would otherwise hand back a
    # *bytes* netloc, and other non-strings raise inside urllib.
    if not isinstance(dataset_uri, str):
        return ""
    try:
        return urlparse(dataset_uri).netloc
    except ValueError:
        # urlparse raises ValueError for malformed authorities
        # (e.g. an unbalanced IPv6 bracket).
        return ""


def extract_file_tokens(download_url: str):
    """Build strong tokens from the ``download_url`` column.

    The column may hold several URLs separated by ``;``. For each URL the
    file name (last path segment, query string and fragment removed) is taken
    without its extension and normalized with ``clean_text``, e.g.
    ``.../Transporte-Bus.csv`` -> ``transporte-bus``.

    Only the LAST extension is removed, so a dotted name such as
    ``paradas.2020.csv`` keeps its full stem ``paradas.2020`` instead of being
    truncated at the first dot.

    Returns a set of tokens; empty when the input is not a string or no URL
    carries a file name.
    """
    tokens = set()
    if not isinstance(download_url, str):
        return tokens

    for entry in download_url.split(";"):
        # Query string and fragment are not part of the file name.
        path = entry.strip().split("?", 1)[0].split("#", 1)[0]
        fname = path.rpartition("/")[2]
        if not fname:
            continue  # blank entry or URL ending in "/"

        stem, dot, _ext = fname.rpartition(".")
        # rpartition puts everything in the last slot when there is no dot.
        base_clean = clean_text(stem if dot else fname)
        if base_clean:
            tokens.add(base_clean)
    return tokens


def build_strong_tokens(dataset_uri: str, title: str, download_url: str):
    """Collect the "strong" tokens used to detect real reuse of a dataset.

    The returned set contains, when non-empty:
    - the normalized dataset slug (last segment of the URI),
    - the normalized full dataset title,
    - the download file names without extension.
    """
    slug_clean = clean_text(extract_dataset_id(dataset_uri))
    title_clean = clean_text(title)

    # Keep only the candidates that survived normalization.
    own_tokens = {candidate for candidate in (slug_clean, title_clean) if candidate}

    return own_tokens | extract_file_tokens(download_url)


# ==========================================================
# REUSE SOURCES
# (all of them are queried with strong identifiers)
# ==========================================================

def search_crossref(query: str):
    """Query the Crossref works API for *query* (5 rows requested).

    Returns a list of hit dicts with the keys ``source`` ("crossref"),
    ``external_id`` (DOI), ``title`` (all titles joined with " | "), ``url``
    and ``date`` (``created.date-time``).

    Best-effort: a network failure, an HTTP error status, or an invalid or
    unexpectedly shaped payload yields ``[]`` instead of raising.
    """
    url = f"https://api.crossref.org/works?query={quote(query)}&rows=5"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        items = resp.json().get("message", {}).get("items", [])
        return [
            {
                "source": "crossref",
                "external_id": it.get("DOI"),
                # "or" guards against an explicit null in the payload
                "title": " | ".join(it.get("title") or []),
                "url": it.get("URL"),
                "date": (it.get("created") or {}).get("date-time", ""),
            }
            for it in items
        ]
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_openalex(query: str):
    """Query the OpenAlex works API with a title search (5 per page).

    Kept for completeness; its hits go through the same filter as Crossref.

    Returns a list of hit dicts with the keys ``source`` ("openalex"),
    ``external_id`` (OpenAlex id), ``title``, ``url`` (DOI) and ``date``
    (``publication_date``).

    Best-effort: a network failure, an HTTP error status, or an invalid or
    unexpectedly shaped payload yields ``[]`` instead of raising.
    """
    url = f"https://api.openalex.org/works?filter=title.search:{quote(query)}&per-page=5"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return [
            {
                "source": "openalex",
                "external_id": it.get("id"),
                "title": it.get("title"),
                "url": it.get("doi"),
                "date": it.get("publication_date"),
            }
            for it in resp.json().get("results", [])
        ]
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_zenodo(query: str):
    """Query the Zenodo records API for *query* (page size 5).

    Returns a list of hit dicts with the keys ``source`` ("zenodo"),
    ``external_id`` (record id), ``title``, ``url`` (DOI) and ``date``
    (``publication_date``), the last three read from the record metadata.

    Best-effort: a network failure, an HTTP error status, or an invalid or
    unexpectedly shaped payload yields ``[]`` instead of raising.
    """
    url = f"https://zenodo.org/api/records/?q={quote(query)}&size=5"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        hits = resp.json().get("hits", {}).get("hits", [])
        results = []
        for it in hits:
            # "or" guards against an explicit null metadata field
            md = it.get("metadata") or {}
            results.append({
                "source": "zenodo",
                "external_id": it.get("id"),
                "title": md.get("title"),
                "url": md.get("doi"),
                "date": md.get("publication_date"),
            })
        return results
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_github(query: str):
    """Query the GitHub code-search API for files containing *query*.

    The query is stripped and truncated to 80 characters, the ``in:file``
    qualifier is appended and 5 results per page are requested. When
    ``GITHUB_TOKEN`` is set it is sent in the ``Authorization`` header.

    Returns a list of hit dicts with the keys ``source`` ("github"),
    ``external_id`` (repository full name), ``title`` (file name), ``url``
    (``html_url``) and ``date`` (repository ``pushed_at``, else
    ``created_at``).

    Best-effort: a network failure, an HTTP error status, or an invalid or
    unexpectedly shaped payload yields ``[]`` instead of raising.
    """
    headers = {"Accept": "application/vnd.github+json"}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    q = query.strip()[:80]

    url = f"https://api.github.com/search/code?q={quote(q)}+in:file&per_page=5"
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        results = []
        for it in resp.json().get("items", []):
            # "or" guards against an explicit null repository field
            repo = it.get("repository") or {}
            results.append({
                "source": "github",
                "external_id": repo.get("full_name"),
                "title": it.get("name"),
                "url": it.get("html_url"),
                "date": repo.get("pushed_at") or repo.get("created_at", ""),
            })
        return results
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_kaggle(query: str):
    """Scrape the Kaggle dataset search page for *query*.

    Titles and ``/datasets/...`` links are extracted from the returned HTML
    with regular expressions and paired by position; at most 5 titles are
    kept. A title without a link at the same position gets an empty ``url``.

    Returns a list of hit dicts with the keys ``source`` ("kaggle"),
    ``external_id`` (always ""), ``title``, ``url`` and ``date`` (always "").

    Best-effort: a network failure or an HTTP error status yields ``[]``
    instead of raising.
    """
    try:
        url = f"https://www.kaggle.com/datasets?search={quote(query)}"
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        html = resp.text

        titles = re.findall(r'data-testid="title">(.*?)</span>', html)
        links = re.findall(r'href="(/datasets/[^"]+)"', html)

        results = []
        for i, title in enumerate(titles[:5]):
            url_rel = links[i] if i < len(links) else ""
            results.append({
                "source": "kaggle",
                "external_id": "",
                "title": title,
                "url": "https://www.kaggle.com" + url_rel if url_rel else "",
                "date": "",
            })
        return results
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_cordis(query: str):
    """Query the CORDIS search endpoint for *query* (5 results requested).

    Hits are read from the ``projects`` key of the JSON payload, falling back
    to ``results``.

    Returns a list of hit dicts with the keys ``source`` ("cordis"),
    ``external_id`` (id), ``title``, ``url`` (the ``rcn`` field as text) and
    ``date`` (``startDate``).

    Best-effort: a network failure, an HTTP error status, or an invalid or
    unexpectedly shaped payload yields ``[]`` instead of raising.
    """
    try:
        url = f"https://cordis.europa.eu/api/search?q={quote(query)}&num=5"
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
        projects = payload.get("projects") or payload.get("results") or []
        return [
            {
                "source": "cordis",
                "external_id": it.get("id"),
                "title": it.get("title", ""),
                "url": str(it.get("rcn", "")),
                "date": it.get("startDate", ""),
            }
            for it in projects
        ]
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_google_play(query: str):
    """Scrape the Google Play app search page for *query*.

    App titles are extracted with a regular expression from the lower-cased
    HTML; at most 5 are kept.

    Returns a list of hit dicts with the keys ``source`` ("google_play"),
    ``title``, and empty ``external_id``, ``url`` and ``date``.

    Best-effort: a network failure or an HTTP error status yields ``[]``
    instead of raising.
    """
    try:
        url = f"https://play.google.com/store/search?q={quote(query)}&c=apps"
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        html = resp.text.lower()
        apps = re.findall(r'\"title\":\{\"label\":\"(.*?)\"', html)
        return [{
            "source": "google_play",
            "external_id": "",
            "title": a,
            "url": "",
            "date": "",
        } for a in apps[:5]]
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


def search_appstore(query: str):
    """Scrape the Apple App Store (``/es``) search page for *query*.

    App titles are extracted with a regular expression that matches one
    specific ``<h3>`` class list; at most 5 are kept.

    Returns a list of hit dicts with the keys ``source`` ("app_store"),
    ``title``, and empty ``external_id``, ``url`` and ``date``.

    Best-effort: a network failure or an HTTP error status yields ``[]``
    instead of raising.
    """
    try:
        url = f"https://apps.apple.com/es/search?term={quote(query)}"
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        html = resp.text
        apps = re.findall(
            r'<h3 class="we-truncate we-truncate--single-line we-truncate--interactive we-truncate--multi-line-medium">([^<]+)</h3>',
            html
        )
        return [{
            "source": "app_store",
            "external_id": "",
            "title": a,
            "url": "",
            "date": "",
        } for a in apps[:5]]
    # Named exceptions instead of a bare ``except:`` so Ctrl-C still stops the run.
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        return []


# ==========================================================
# REAL REUSE (includes Crossref/Zenodo)
# ==========================================================

def is_real_reuse(hit: dict,
                  dataset_uri: str,
                  strong_tokens: set,
                  portal_markers: set,
                  query_used: str) -> bool:
    """Tell whether *hit* itself refers to the dataset.

    The hit's ``title``, ``url`` and ``external_id`` are normalized with
    ``clean_text`` and searched (substring match) for, in order:

    1. the dataset URI,
    2. any strong token (slug, file name, normalized title),
    3. any portal marker that carries a path of its own
       (e.g. ``datos.lorca.es/calle-cortada``). A bare host, with or without
       a scheme, is not specific enough to identify one dataset.

    ``query_used`` is accepted for interface compatibility but is
    deliberately NOT part of the searched text: the queries are built from
    the URI, tokens and markers themselves, so including the query would
    make every hit match and turn this filter into a no-op.

    Returns True only when one of the criteria above is met.
    """
    title = clean_text(hit.get("title", "") or "")
    url = clean_text(hit.get("url", "") or "")
    extid = clean_text(str(hit.get("external_id", "") or ""))

    # Only data that comes from the hit is evidence of reuse.
    haystack = " ".join([title, url, extid])

    # 1) Exact dataset URL (very strong)
    ds_uri = clean_text(dataset_uri or "")
    if ds_uri and ds_uri in haystack:
        return True

    # 2) Strong tokens (slug, file names, exact title)
    for tok in strong_tokens:
        tok_clean = clean_text(tok)
        if tok_clean and tok_clean in haystack:
            return True

    # 3) Portal markers that include a path
    for mk in portal_markers:
        mk_clean = clean_text(mk)
        if not mk_clean or mk_clean not in haystack:
            continue
        # Ignore the "//" of the scheme and any trailing slash when deciding
        # whether the marker has a path component.
        without_scheme = mk_clean.split("://", 1)[-1].strip("/")
        if "/" in without_scheme:
            return True

    # No criterion met: not considered reuse
    return False


# ==========================================================
# QUERY CONSTRUCTION PER DATASET
# ==========================================================

def build_queries(dataset_uri: str,
                  strong_tokens: set,
                  portal_markers: set):
    """Build a short, reproducible list of "strong" queries for one dataset.

    Candidates, in priority order:
    - the full dataset URI (``URI_STRICT``),
    - its domain (``DOMAIN_STRICT``),
    - up to 3 strong tokens (``TOKEN_STRONG``), longest first,
    - the portal markers (``PORTAL_MARKER``), in alphabetical order.

    Sets have no stable iteration order between runs, so tokens and markers
    are sorted before being truncated; otherwise the queries sent would
    change from one execution to the next.

    Duplicates are removed by query text, keeping the first (highest
    priority) match type, so the same text is never sent to the sources
    twice. A non-string ``dataset_uri`` (e.g. NaN from an empty cell) is
    treated as empty.

    Returns a list of at most 8 ``(query, match_type)`` tuples.
    """
    candidates = []

    ds_uri = dataset_uri.strip() if isinstance(dataset_uri, str) else ""
    if ds_uri:
        candidates.append((ds_uri, "URI_STRICT"))
        domain = extract_domain(ds_uri)
        if domain:
            candidates.append((domain, "DOMAIN_STRICT"))

    # Strong tokens (slug + file names + normalized title), capped at 3 to
    # limit API traffic. Longer tokens are tried first as the more specific.
    tokens = [t for t in strong_tokens if isinstance(t, str) and t.strip()]
    tokens.sort(key=lambda t: (-len(t), t))
    candidates.extend((t, "TOKEN_STRONG") for t in tokens[:3])

    # Portal markers, sorted for a stable order
    markers = sorted(m for m in portal_markers if isinstance(m, str) and m.strip())
    candidates.extend((m, "PORTAL_MARKER") for m in markers)

    # Drop duplicates while keeping order
    seen = set()
    final_queries = []
    for text, match_type in candidates:
        text = text.strip()
        if text and text not in seen:
            seen.add(text)
            final_queries.append((text, match_type))

    return final_queries[:8]


# ==========================================================
# MULTI-SOURCE SEARCH PER DATASET
# ==========================================================

def search_reuse(dataset_uri: str,
                 title: str,
                 download_url: str):
    """Run every query of a dataset against every source and keep real reuse.

    Each hit accepted by ``is_real_reuse`` is annotated in place with
    ``match_type`` and ``query_used`` and added to the returned list. After
    each query has been sent to all sources the function sleeps between
    0.25 and 0.5 seconds.
    """
    # Global markers of the Murcia portal
    portal_markers = {
        "https://datosabiertos.regiondemurcia.es",
        "datosabiertos.regiondemurcia.es",
        "http://datos.lorca.es",
        "datos.lorca.es",
        "http://nexo.carm.es",
        "nexo.carm.es",
    }

    strong_tokens = build_strong_tokens(dataset_uri, title, download_url)
    queries = build_queries(dataset_uri, strong_tokens, portal_markers)

    backends = (
        search_crossref,
        search_openalex,
        search_zenodo,
        search_github,
        search_kaggle,
        search_cordis,
        search_google_play,
        search_appstore,
    )

    accepted = []
    for query, match_type in queries:
        for backend in backends:
            for hit in backend(query):
                if not is_real_reuse(hit, dataset_uri, strong_tokens, portal_markers, query):
                    continue
                hit["match_type"] = match_type
                hit["query_used"] = query
                accepted.append(hit)
        # Pause so the APIs are not hammered
        time.sleep(0.25 + random.random() * 0.25)

    return accepted


# ==========================================================
# MAIN
# ==========================================================

def _cell_to_text(value) -> str:
    """Return a spreadsheet cell as text, mapping missing values to ``""``."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ""
    return str(value)


def main():
    """Read the datasets from ``INPUT_FILE``, search reuse, write ``OUTPUT_FILE``.

    For every input row the columns ``dataset_uri``, ``title``,
    ``download_url`` and ``issued`` are read; empty cells (NaN) in the three
    text columns are treated as empty strings so the search helpers always
    receive ``str``. One output row is written per accepted hit, and a
    per-source count is printed at the end.
    """
    df = pd.read_excel(INPUT_FILE)

    rows = []
    counter = Counter()

    print(f"\n🔎 Iniciando búsqueda de reutilización REAL (con Crossref/Zenodo filtrados) – {PORTAL_NAME}\n")

    for _, r in df.iterrows():
        dataset_uri  = _cell_to_text(r.get("dataset_uri", ""))
        title        = _cell_to_text(r.get("title", ""))
        download_url = _cell_to_text(r.get("download_url", ""))
        issued       = r.get("issued")

        print(f" → Buscando reutilización para: {title}")

        hits = search_reuse(dataset_uri, title, download_url)
        # The output columns may be changed as long as the hit dicts define them.
        for h in hits:
            rows.append({
                "dataset_uri":   dataset_uri,
                "title_dataset": title,
                "issued":        issued,
                "source":        h.get("source"),
                "external_id":   h.get("external_id"),
                "match_type":    h.get("match_type"),
                "query_used":    h.get("query_used"),
                "source_title":  h.get("title"),
                "url":           h.get("url"),
                "date":          h.get("date"),
            })
            counter[h.get("source")] += 1

    # Explicit columns keep the header row even when no hit was found.
    columns = [
        "dataset_uri", "title_dataset", "issued", "source", "external_id",
        "match_type", "query_used", "source_title", "url", "date",
    ]
    out = pd.DataFrame(rows, columns=columns)
    out.to_excel(OUTPUT_FILE, index=False)

    print(f"\n Archivo generado correctamente → {OUTPUT_FILE}\n")
    print(f" Recuento de huellas de reutilización REAL (filtradas) – {PORTAL_NAME}:\n")
    if not counter:
        print("  (No se han encontrado huellas estrictas de reutilización en ninguna fuente)")
    else:
        for src, cnt in counter.items():
            print(f"  {src}: {cnt} coincidencias")


if __name__ == "__main__":
    main()