# In [None]:
import pandas as pd
import requests
import re
import time
import random
from urllib.parse import quote, urlparse
from collections import Counter
from deep_translator import GoogleTranslator 


# ===============================================================
# UTILIDADES GENERALES
# ===============================================================

def clean_text(text):
    """Normalize free text for keyword extraction and phrase queries.

    The input is lower-cased, every character that is not an ASCII letter or
    digit, a Spanish accented vowel, ``ü``, ``ñ`` or a space is replaced by a
    space, and runs of whitespace are collapsed.

    Args:
        text: Value to normalize. Anything that is not a ``str`` (``None``,
            a pandas ``NaN`` float, ...) yields an empty string.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower().strip()
    # The whitelist used to contain mis-encoded bytes instead of the accented
    # letters, so "población" was split into "poblaci n". Upper-case ranges
    # are unnecessary because the text is already lower-cased.
    lowered = re.sub(r"[^a-z0-9áéíóúüñ ]", " ", lowered)
    return re.sub(r"\s+", " ", lowered).strip()


def translate_to_en(text):
    """Translate text to English on a best-effort basis.

    Args:
        text: Text to translate. Non-string values are returned unchanged.

    Returns:
        The English translation, or the original ``text`` when it is very
        short (fewer than 3 words and fewer than 15 characters), when the
        translator fails, or when it returns an empty result.
    """
    if not isinstance(text, str):
        return text
    # Very short inputs are passed through untouched (no remote call).
    if len(text.split()) < 3 and len(text) < 15:
        return text
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Best effort: keep the original text on any translator/network error.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # propagate so a long batch run can still be interrupted.
        return text
    return translated or text


def extract_keywords(title, description, k=5):
    """Return the most frequent long words of a title and description.

    Args:
        title: Dataset title. Non-string values (``None``, pandas ``NaN``)
            are ignored.
        description: Dataset description. Non-string values are ignored.
        k: Maximum number of keywords to return.

    Returns:
        Up to ``k`` distinct words longer than 4 characters, ordered by
        descending frequency in the normalized text.
    """
    # An empty Excel cell arrives as NaN (a truthy float): `NaN or ""` stays
    # NaN and concatenating it with a string raises TypeError, so only real
    # strings are joined.
    parts = [part for part in (title, description) if isinstance(part, str)]
    text = clean_text(" ".join(parts))
    # Every stop word here has at most 4 characters, so the length filter
    # below already drops them; the set matters only if that threshold is
    # lowered.
    stop = {"de", "del", "la", "el", "y", "en", "por", "con", "los", "las", "un", "una", "o", "a", "para", "es", "que", "se", "como", "mas"}
    words = [w for w in text.split() if len(w) > 4 and w not in stop]
    return [word for word, _ in Counter(words).most_common(k)]


def get_quoted_query(query, quoted):
    """Build the URL-encoded query string, as an exact phrase or loose terms.

    Args:
        query: Raw query text. Embedded double quotes are removed.
        quoted: When true the cleaned query is wrapped in double quotes
            (phrase search) before encoding; otherwise it is encoded as is.

    Returns:
        The percent-encoded query, or ``None`` when ``query`` is not a string
        (e.g. ``None`` or a pandas ``NaN``) or is blank after cleaning.
    """
    # Callers invoke this outside their try/except, so a non-string value must
    # not raise here.
    if not isinstance(query, str):
        return None
    phrase = query.replace('"', '').strip()
    if not phrase:
        return None
    return quote(f'"{phrase}"') if quoted else quote(phrase)


# ===============================================================
# FUENTES DE REUTILIZACIÓN (Búsqueda por Contenido)
# ===============================================================

# ---------- CROSSREF ----------
def search_crossref(query, quoted=True):
    """Search Crossref works and return normalized hits.

    Args:
        query: Free-text query (URI, file name, title or keywords).
        quoted: When true the query is sent wrapped in double quotes (phrase),
            otherwise as loose terms.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (the DOI),
        ``title``, ``url`` and ``date``. Empty when the query is blank or when
        the request or the parsing of the response fails (best effort).
    """
    quoted_query = get_quoted_query(query, quoted)
    if not quoted_query:
        return []
    # `query` is the free-text parameter of /works. The previous `query.full`
    # is not among the documented field queries.
    # NOTE(review): confirm against the Crossref REST API documentation.
    url = f"https://api.crossref.org/works?query={quoted_query}&rows=10"
    try:
        payload = requests.get(url, timeout=15).json()
        items = payload.get("message", {}).get("items", [])
        return [
            {
                "source": "crossref",
                "external_id": it.get("DOI"),
                # `title` is joined as a list; `or []` also covers a null value.
                "title": " | ".join(it.get("title") or []),
                "url": it.get("URL"),
                "date": it.get("created", {}).get("date-time", ""),
            }
            for it in items
        ]
    except Exception:
        # Best-effort lookup: any network/JSON/shape problem means "no hits".
        # Not a bare `except:` so Ctrl+C can still stop a long batch run.
        return []


# ---------- OPENALEX ----------
def search_openalex(query, quoted=True):
    """Search OpenAlex works and return normalized hits.

    Args:
        query: Free-text query (URI, file name, title or keywords).
        quoted: When true the query is sent wrapped in double quotes (phrase),
            otherwise as loose terms.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (the
        OpenAlex ``id``), ``title``, ``url`` (the ``doi`` field) and ``date``.
        Empty when the query is blank or when the request or the parsing of
        the response fails (best effort).
    """
    quoted_query = get_quoted_query(query, quoted)
    if not quoted_query:
        return []
    # Per the original author's note, `default.search` is expected to match
    # title and abstract — NOTE(review): confirm the exact fields in the docs.
    url = f"https://api.openalex.org/works?filter=default.search:{quoted_query}&per-page=10"
    try:
        payload = requests.get(url, timeout=15).json()
        return [
            {
                "source": "openalex",
                "external_id": it.get("id"),
                "title": it.get("title"),
                "url": it.get("doi"),
                "date": it.get("publication_date"),
            }
            for it in payload.get("results", [])
        ]
    except Exception:
        # Best-effort lookup: any network/JSON/shape problem means "no hits".
        # Not a bare `except:` so Ctrl+C can still stop a long batch run.
        return []


# ---------- ZENODO ----------
def search_zenodo(query, quoted=True):
    """Search Zenodo records and return normalized hits.

    Args:
        query: Free-text query (URI, file name, title or keywords).
        quoted: When true the query is sent wrapped in double quotes (phrase),
            otherwise as loose terms.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (the record
        ``id``), ``title``, ``url`` (the metadata ``doi`` field) and ``date``.
        Empty when the query is blank or when the request or the parsing of
        the response fails (best effort).
    """
    quoted_query = get_quoted_query(query, quoted)
    if not quoted_query:
        return []
    url = f"https://zenodo.org/api/records/?q={quoted_query}&size=10"
    try:
        payload = requests.get(url, timeout=15).json()
        results = []
        for record in payload.get("hits", {}).get("hits", []):
            md = record.get("metadata", {})
            results.append({
                "source": "zenodo",
                "external_id": record.get("id"),
                "title": md.get("title"),
                "url": md.get("doi"),
                "date": md.get("publication_date"),
            })
        return results
    except Exception:
        # Best-effort lookup: any network/JSON/shape problem means "no hits".
        # Not a bare `except:` so Ctrl+C can still stop a long batch run.
        return []


# ---------- GITHUB - SIN TOKEN ----------
def search_github(query, is_uri=False):
    """Search GitHub code for a URI, a file name or keywords.

    The search expression depends on the query:

    * ``is_uri`` true: the exact string inside md/ipynb/json/py files.
    * query containing a dot: treated as a file name and matched on the path.
    * several words: exact phrase in READMEs; a single word: loose README
      match.

    If the ``GITHUB_TOKEN`` environment variable is set it is sent as a bearer
    token; otherwise the request is anonymous, as before.
    NOTE(review): GitHub documents the code-search endpoint as requiring
    authentication, so anonymous calls are expected to return no items —
    confirm against the REST API docs.

    Args:
        query: Text to look for. Non-string or blank values yield no results.
        is_uri: Whether ``query`` is a dataset URI to be matched verbatim.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (repository
        full name), ``title`` (file name), ``url`` and ``date``. Empty on any
        error, on an error payload, or when the rate limit is hit (after a
        60 second pause).
    """
    import os  # local import keeps this block self-contained

    # A NaN/None query would otherwise crash on `"." in query` or be searched
    # literally as "nan".
    if not isinstance(query, str) or not query.strip():
        return []

    headers = {"Accept": "application/vnd.github.v3.text-match+json",
               'User-Agent': 'TFM-Data-Reuse-Analyzer/1.0'}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    if is_uri:
        q_final = f'"{query}" in:file extension:md,ipynb,json,py'
    elif "." in query:
        q_final = f'filename:"{query}" in:path'
    elif len(query.split()) > 1:
        # A multi-word query is searched as an exact phrase in READMEs.
        q_final = f'"{query}" in:readme'
    else:
        q_final = f'{query} in:readme'

    url = f"https://api.github.com/search/code?q={quote(q_final)}&per_page=10"

    try:
        payload = requests.get(url, headers=headers, timeout=15).json()

        message = payload.get("message")
        if isinstance(message, str) and 'rate limit' in message.lower():
            # Back off before the caller issues the next request.
            print("üö® L√≠mite de tasa de GitHub alcanzado (sin token). Pausando 60s...")
            time.sleep(60)
            return []

        # Other error payloads ("Not Found", ...) carry no "items" and
        # therefore produce an empty list.
        results = []
        for it in payload.get("items", []):
            repo = it.get("repository", {})
            results.append({
                "source": "github",
                "external_id": repo.get("full_name"),
                "title": it.get("name"),
                "url": it.get("html_url"),
                "date": repo.get("pushed_at") or repo.get("created_at", ""),
            })
        return results
    except Exception:
        # Best-effort lookup: any network/JSON/shape problem means "no hits".
        return []


# ---------- KAGGLE (Scraping de datasets) ----------
def search_kaggle(query, quoted=True):
    """Scrape the Kaggle dataset search page and return up to five hits.

    Titles and dataset links are extracted from the returned HTML with two
    independent regular expressions and paired by position.
    NOTE(review): this only works if the page is served with those elements
    in the initial HTML; verify that the markup still matches.

    Args:
        query: Free-text query.
        quoted: Ignored; kept so every source shares the same call signature.
            The query is always sent without surrounding quotes.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (the dataset
        path, e.g. ``/datasets/...``, or ``""`` when no link was paired),
        ``title``, ``url`` and ``date`` (always ``""``). Empty when the query
        is blank or the request fails (best effort).
    """
    query_clean = get_quoted_query(query, False)
    if not query_clean:
        return []

    try:
        url = f"https://www.kaggle.com/datasets?search={query_clean}"
        html = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'}).text
        titles = re.findall(r'data-testid="title">(.*?)</span>', html)
        links = re.findall(r'href="(/datasets/[^"]+)"', html)

        results = []
        for i, raw_title in enumerate(titles[:5]):
            link = links[i] if i < len(links) else ""
            results.append({
                "source": "kaggle",
                # The dataset path doubles as the identifier. With a constant
                # "" every Kaggle hit of a dataset shares the same
                # (dataset_uri, external_id, source) key and is dropped as a
                # duplicate when results are de-duplicated on those fields.
                "external_id": link,
                "title": clean_text(raw_title),
                "url": "https://www.kaggle.com" + link if link else "",
                "date": "",
            })
        return results
    except Exception:
        # Best-effort lookup: any network/parsing problem means "no hits".
        # Not a bare `except:` so Ctrl+C can still stop a long batch run.
        return []


# ---------- CORDIS ----------
def search_cordis(query, quoted=True):
    """Search CORDIS projects and return normalized hits.

    NOTE(review): the endpoint and the ``projects`` response key are taken as
    given from the original code; confirm them against the CORDIS API.

    Args:
        query: Free-text query (URI, file name, title or keywords).
        quoted: When true the query is sent wrapped in double quotes (phrase),
            otherwise as loose terms.

    Returns:
        A list of dicts with the keys ``source``, ``external_id`` (project
        ``id``), ``title``, ``url`` (project page built from the id) and
        ``date`` (``startDate``). Empty when the query is blank or when the
        request or the parsing of the response fails (best effort).
    """
    quoted_query = get_quoted_query(query, quoted)
    if not quoted_query:
        return []

    try:
        url = f"https://cordis.europa.eu/api/search?q={quoted_query}&num=5"
        payload = requests.get(url, timeout=15).json()
        return [
            {
                "source": "cordis",
                "external_id": it.get("id"),
                "title": it.get("title"),
                "url": f"https://cordis.europa.eu/project/id/{it.get('id')}",
                "date": it.get("startDate", ""),
            }
            for it in payload.get("projects", [])
        ]
    except Exception:
        # Best-effort lookup: any network/JSON/shape problem means "no hits".
        # Not a bare `except:` so Ctrl+C can still stop a long batch run.
        return []


# ===============================================================
# FUNCIÓN PRINCIPAL DE BÚSQUEDA JERÁRQUICA (REFUERZO DE KEYWORDS)
# ===============================================================

def _parse_file_names(raw):
    """Return the dataset's file names as a list of strings."""
    import ast  # local import keeps this block self-contained

    if isinstance(raw, str):
        # The column holds the textual form of a list when it comes from
        # Excel. It is parsed with ast.literal_eval instead of eval: the
        # spreadsheet is external data and eval would execute any expression
        # stored in a cell.
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal: treat the cell as one plain file name.
            return [raw]
    if isinstance(raw, str):
        return [raw]
    if isinstance(raw, (list, tuple, set)):
        return [name for name in raw if isinstance(name, str)]
    # NaN (empty cell), numbers, None, ...
    return []


def search_reuse(r):
    """Look for traces of reuse of one dataset across all external sources.

    Queries are built from most to least precise and each one is sent to
    every source:

    1. ``URI_STRICT``: the dataset URI, verbatim.
    2. ``FILENAME``: each file name longer than 8 characters containing a dot.
    3. ``TITLE_EXACT``: the normalized title as a phrase (more than 3 words).
    4. ``KEYWORD_BROAD``: the top keywords of title + description, translated
       to English and sent unquoted (at least two words).

    Args:
        r: Mapping-like row (e.g. a pandas Series) with the keys
            ``dataset_uri``, ``title``, ``description`` and ``file_names``.
            Missing values (``NaN``/``None``) are tolerated.

    Returns:
        A list of hit dicts as produced by the source functions, each with an
        added ``match_type`` key naming the query that produced it.
    """
    dataset_uri = r["dataset_uri"]
    title = r["title"]
    # The description is needed to extract keywords.
    description = r["description"]
    file_names = _parse_file_names(r["file_names"])

    # Empty cells arrive as NaN floats; normalize them before any string work.
    title = title if isinstance(title, str) else ""
    description = description if isinstance(description, str) else ""

    queries = []

    # 1. High precision: exact URI.
    if isinstance(dataset_uri, str) and dataset_uri.strip():
        queries.append({"q": dataset_uri, "match_type": "URI_STRICT", "is_uri": True, "quoted": True})

    # 2. Medium precision: distinctive file names.
    for file_name in file_names:
        if len(file_name) > 8 and '.' in file_name:
            queries.append({"q": file_name, "match_type": "FILENAME", "is_uri": False, "quoted": True})

    # 3. Low precision: exact title (as a phrase).
    title_clean = clean_text(title)
    if len(title_clean.split()) > 3:
        queries.append({"q": title_clean, "match_type": "TITLE_EXACT", "is_uri": False, "quoted": True})

    # 4. Low precision (broad): translated keywords.
    keywords_es = extract_keywords(title, description, k=5)
    if keywords_es:
        # Translate the joined keywords in a single call. translate_to_en
        # returns inputs of fewer than 3 words and 15 characters untouched, so
        # translating word by word left almost every keyword untranslated
        # (and cost one call per word).
        keywords_en = translate_to_en(" ".join(keywords_es))

        # At least two keywords are required for a meaningful search.
        if keywords_en and len(keywords_en.split()) > 1:
            queries.append({"q": keywords_en, "match_type": "KEYWORD_BROAD", "is_uri": False, "quoted": False})

    results = []

    sources = [
        (search_openalex, "openalex"),
        (search_crossref, "crossref"),
        (search_zenodo, "zenodo"),
        (search_github, "github"),
        (search_kaggle, "kaggle"),
        (search_cordis, "cordis"),
    ]

    for query_data in queries:
        q = query_data["q"]
        match_type = query_data["match_type"]

        if not q or not str(q).strip():
            continue

        for search_fn, src in sources:
            # GitHub takes `is_uri`; every other source takes `quoted`.
            if src == "github":
                hits = search_fn(q, is_uri=query_data["is_uri"])
            else:
                hits = search_fn(q, quoted=query_data["quoted"])

            for hit in hits:
                hit["match_type"] = match_type
                results.append(hit)

        # Random 0.5-1.0 s pause between queries.
        time.sleep(0.5 + random.random() * 0.5)

    return results


# ===============================================================
# MAIN PUNTO 2
# ===============================================================

def main():
    """Run the reuse search for every dataset and write the results to Excel.

    Reads the input workbook, checks that the required columns exist, calls
    ``search_reuse`` for each row, keeps one row per
    (dataset_uri, external_id, source) preferring the most precise match type,
    writes the output workbook and prints a per-source summary.

    Returns:
        None. Returns early (after printing an error) when the input file is
        missing or lacks a required column.
    """
    # NOTE: set the file names here.
    INPUT = "Punto1_NOMBREARCHIVORESULTADO.xlsx"                # output of script 1
    OUTPUT = "Punto2_NOMBREARCHIVORESULTADO.xlsx"   # output of this script

    try:
        df = pd.read_excel(INPUT)
    except FileNotFoundError:
        print(f"Error: No se encontr√≥ el archivo de entrada '{INPUT}'. Aseg√∫rate de ejecutar el Punto 1 y que el archivo exista.")
        return

    # Make sure every column used below is present.
    required_cols = ["dataset_uri", "title", "description", "file_names", "category", "issued"]
    if not all(col in df.columns for col in required_cols):
        print(f"Error: El archivo de entrada '{INPUT}' no contiene todas las columnas requeridas para la b√∫squeda de reutilizaci√≥n ({', '.join(required_cols)}).")
        return

    rows = []

    print("\n Iniciando b√∫squeda OPTIMIZADA y REFORZADA de reutilizaci√≥n (V5)...\n")
    print(f"Total de datasets a procesar: {len(df)}")

    # Count positions explicitly instead of relying on the index being 0..n-1.
    for pos, (_, r) in enumerate(df.iterrows(), start=1):

        print(f"[{pos}/{len(df)}] ‚Üí Buscando para: {r['title']}")

        for h in search_reuse(r):
            rows.append({
                "dataset_uri": r["dataset_uri"],
                "title_dataset": r["title"],
                "category": r["category"],
                "issued": r["issued"],

                "source": h.get("source"),
                "external_id": h.get("external_id"),
                "match_type": h.get("match_type"),
                "source_title": h.get("title"),
                "url": h.get("url"),
                "date": h.get("date")
            })

    # Explicit columns keep the frame well-formed when there are no hits at
    # all; without them an empty frame has no 'match_type' column and the sort
    # below raises KeyError.
    out_columns = [
        "dataset_uri", "title_dataset", "category", "issued",
        "source", "external_id", "match_type", "source_title", "url", "date",
    ]
    out = pd.DataFrame(rows, columns=out_columns)

    # Disambiguation: order by match precision (URI_STRICT first) and keep the
    # first occurrence of each (dataset, external id, source).
    match_order = {'URI_STRICT': 0, 'FILENAME': 1, 'TITLE_EXACT': 2, 'KEYWORD_BROAD': 3}
    out_final = out.sort_values(
        by='match_type',
        key=lambda x: x.map(match_order),
        ascending=True,
        kind="stable",  # deterministic order among equally precise hits
    )
    out_final = out_final.drop_duplicates(subset=['dataset_uri', 'external_id', 'source'], keep='first')

    out_final.to_excel(OUTPUT, index=False)

    print("\n Archivo generado correctamente ‚Üí", OUTPUT)

    print("\n Recuento de huellas √∫nicas por fuente:\n")
    for src, cnt in out_final["source"].value_counts().items():
        print(f"  {src}: {cnt} huellas √∫nicas")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()