## 🧰 Librerías e importaciones

In [None]:
import re
import time
import pandas as pd
import dateparser
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

## 💻 Iniciar sesión del navegador

In [None]:
# Start a maximized Chrome window. The chromedriver path handed to Service is
# whatever ChromeDriverManager().install() returns.
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

In [None]:
# Input CSV; it is expected to carry an "enlace_experiencia" column (one URL per row).
df = pd.read_csv("experiencias_tripadvisor_mallorca_limpio.csv")

# Empty accumulator for the scraped reviews, one column per review field.
comentarios_df = pd.DataFrame(
    columns=[
        "titulo",
        "usuario",
        "pais",
        "comentario",
        "fecha",
        "categoria",
        "review_score",
    ]
)


## 🧩 Funciones auxiliares

In [None]:
def extract_comment(card):
    """Return the review body text of a review card.

    Args:
        card: Review card element exposing Selenium's ``find_element`` API.

    Returns:
        The stripped text of the first element matching the comment selector,
        or ``""`` when the lookup fails for any ordinary reason.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, 'div.biGQs._P.pZUbB.AWdfh').text.strip()
    except Exception:
        # Best-effort field: a failed lookup must not abort the whole row.
        # Catching Exception (instead of a bare except) still lets
        # KeyboardInterrupt/SystemExit through, so a long run can be stopped.
        return ""

def extract_user(card):
    """Return the reviewer's display name from a review card.

    Args:
        card: Review card element exposing Selenium's ``find_element`` API.

    Returns:
        The stripped text of the profile link inside the user-name span, or
        ``""`` when the lookup fails for any ordinary reason.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, 'span.biGQs._P.ezezH a').text.strip()
    except Exception:
        # Best-effort field. Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating to the caller.
        return ""

def extract_country(card):
    """Return the reviewer's location text from a review card.

    Args:
        card: Review card element exposing Selenium's ``find_elements`` API.

    Returns:
        The stripped text of the second matching element when at least two
        are found, otherwise of the only match; ``""`` when nothing matches
        or the lookup fails for any ordinary reason.
    """
    try:
        matches = card.find_elements(By.CSS_SELECTOR, 'div.vYLts div.biGQs._P.navcl')
        # NOTE(review): the second match is preferred over the first; presumably
        # the first one holds something other than the location - confirm
        # against the live page markup.
        if len(matches) > 1:
            return matches[1].text.strip()
        if matches:
            return matches[0].text.strip()
    except Exception:
        # Best-effort field. Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating to the caller.
        pass
    return ""


def _parse_bubble_score(score_text):
    """Parse a rating title such as "5 of 5 bubbles" into a number, or None."""
    if not score_text:
        return None
    # The optional decimal part matters: with a plain (\d+) the title
    # "5.0 of 5 bubbles" matches at the "0" and yields 0 instead of 5.
    # "de" is accepted next to "of" because the rest of the scraper handles
    # Spanish pages too - presumably their title reads "N de 5"; confirm.
    m = re.search(r'(\d+(?:[.,]\d+)?)\s*(?:of|de)\s*5', score_text)
    if m is None:
        return None
    value = float(m.group(1).replace(',', '.'))
    return int(value) if value.is_integer() else value


def extract_score(card):
    """Return the bubble rating of a review card.

    Reads the ``<title>`` inside the rating SVG (e.g. "5 of 5 bubbles").

    Args:
        card: Review card element exposing Selenium's ``find_element`` API.

    Returns:
        The rating as an ``int`` when it is a whole number (a ``float`` for a
        fractional rating), or ``None`` when the element is missing, the text
        does not match, or the lookup fails for any ordinary reason.
    """
    try:
        svg = card.find_element(By.CSS_SELECTOR, 'svg[data-automation="bubbleRatingImage"]')
        title_tag = svg.find_element(By.TAG_NAME, 'title')
        return _parse_bubble_score(title_tag.get_attribute("innerText"))
    except Exception:
        # Best-effort field. Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating to the caller.
        return None

def extract_date_and_categoria(card):
    """Return ``(fecha, categoria)`` parsed from a review card's date line.

    The text of ``div.RpeCd`` is split at the first "•": the left part is
    parsed as a date (Spanish or English) and the right part is returned as
    the category.

    Args:
        card: Review card element exposing Selenium's ``find_element`` API.

    Returns:
        A ``(fecha, categoria)`` tuple. ``fecha`` is formatted ``YYYY-MM-DD``
        or is ``""`` when dateparser cannot parse it; ``categoria`` is ``""``
        when the line has no bullet. Both are ``""`` when the lookup fails
        for any ordinary reason.
    """
    try:
        fecha_div = card.find_element(By.CSS_SELECTOR, 'div.RpeCd').text.strip()
        # partition() cuts at the first bullet only. The previous
        # split('•') + two-value unpacking raised on a second bullet and the
        # handler below then threw the date away as well.
        fecha_raw, _, categoria_raw = fecha_div.partition('•')
        fecha_raw = fecha_raw.strip()
        categoria_raw = categoria_raw.strip()
        fecha_dt = dateparser.parse(fecha_raw, languages=['es', 'en'])
        fecha = fecha_dt.strftime('%Y-%m-%d') if fecha_dt else ""
        return fecha, categoria_raw
    except Exception:
        # Best-effort field. Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating to the caller.
        return "", ""

def parse_review(card, titulo_experiencia):
    """Build one review record from a review card.

    Args:
        card: Review card element passed on to the ``extract_*`` helpers.
        titulo_experiencia: Title of the experience the review belongs to;
            stored unchanged under ``"titulo"``.

    Returns:
        A dict with the keys ``titulo``, ``usuario``, ``pais``,
        ``comentario``, ``fecha``, ``categoria`` and ``review_score``.
    """
    # Date and category come from the same element, so they are read together.
    fecha, categoria = extract_date_and_categoria(card)
    usuario = extract_user(card)
    pais = extract_country(card)
    comentario = extract_comment(card)
    puntuacion = extract_score(card)
    return dict(
        titulo=titulo_experiencia,
        usuario=usuario,
        pais=pais,
        comentario=comentario,
        fecha=fecha,
        categoria=categoria,
        review_score=puntuacion,
    )


def click_ver_todos():
    """Click the "see all reviews" link on the current page, if there is one.

    Uses the module-level ``driver``. The link is matched by its Spanish or
    English label, scrolled to the centre of the viewport and clicked, with
    fixed pauses before and after the click.

    Returns:
        ``True`` when a link was found and clicked; ``False`` when none was
        found or the interaction failed for any ordinary reason.
    """
    try:
        enlaces = driver.find_elements(By.XPATH, '//a[contains(text(), "Ver todos los comentarios") or contains(text(), "See all reviews")]')
        if not enlaces:
            return False
        enlace = enlaces[0]
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", enlace)
        time.sleep(1.5)
        enlace.click()
        time.sleep(2)
        return True
    except Exception:
        # Best-effort click. Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating so the run can be stopped.
        return False

def click_next_page():
    """Click the pagination "next" arrow on the current page.

    Uses the module-level ``driver``. The arrow is scrolled to the centre of
    the viewport and clicked, with fixed pauses before and after the click.

    Returns:
        ``True`` when the arrow was found and clicked; ``False`` when it is
        missing or the interaction failed for any ordinary reason.
    """
    try:
        next_btn = driver.find_element(By.CSS_SELECTOR, 'div.HXmEd a[data-smoke-attr="pagination-next-arrow"]')
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", next_btn)
        time.sleep(1.5)
        next_btn.click()
        time.sleep(2)
    except Exception:
        # A missing arrow is how callers learn there is no further page.
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # propagating so the run can be stopped.
        return False
    return True


## 🔎 Scraping

In [None]:
def scrapear_experiencia(url):
    """Collect every review of one experience page.

    Loads ``url`` in the module-level ``driver``, dismisses the cookie banner
    when it shows up, opens the full review list, and then walks the
    pagination, parsing each review card on every page.

    Args:
        url: Address of the experience page to scrape.

    Returns:
        A list of review dicts as produced by ``parse_review``; empty when
        the page shows no review cards.
    """
    driver.get(url)
    time.sleep(3)

    # Dismiss the cookie banner. It is not always shown, so a timeout or any
    # other ordinary failure here is ignored. Exception (not a bare except)
    # keeps KeyboardInterrupt/SystemExit propagating.
    try:
        WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        ).click()
    except Exception:
        pass

    # Open the full review list when the page offers that link.
    click_ver_todos()

    # Experience title, with a placeholder when the heading cannot be read.
    try:
        titulo = driver.find_element(By.CSS_SELECTOR, 'h1[data-automation="mainH1"]').text.strip()
    except Exception:
        titulo = "Sin título"

    comentarios = []

    while True:
        # Nudge the page down before the cards are collected.
        driver.execute_script("window.scrollBy(0, 500);")
        time.sleep(1)

        cards = driver.find_elements(By.CSS_SELECTOR, 'div._c[data-automation="reviewCard"]')
        if not cards:
            break
        for card in cards:
            try:
                comentarios.append(parse_review(card, titulo))
            except StaleElementReferenceException:
                # The card was detached from the DOM while being read; skip it.
                continue
        # Stop once there is no further page to move to.
        if not click_next_page():
            break
    return comentarios

# Scrape every experience listed in the input CSV. Rows are gathered in a plain
# list and merged into ``comentarios_df`` once, instead of re-copying the whole
# DataFrame with pd.concat on every iteration.
filas_nuevas = []
urls = df["enlace_experiencia"]
total = len(urls)
try:
    for n, url in enumerate(urls, start=1):
        print(f"Procesando {n}/{total}: {url}")
        try:
            filas_nuevas.extend(scrapear_experiencia(url))
        except Exception as exc:
            # One failing page must not throw away the reviews already collected.
            print(f"Error al procesar {url}: {exc}")
        time.sleep(1)
finally:
    # Runs even when the loop is interrupted, so partial results stay
    # available in ``comentarios_df``.
    if filas_nuevas:
        comentarios_df = pd.concat(
            [comentarios_df, pd.DataFrame(filas_nuevas)], ignore_index=True
        )


## 💾 Guardar CSV final

In [None]:
# Write the collected reviews to disk. The file name lives in one variable so
# the confirmation message always names the file that was actually written
# (it previously announced "trip_reviews_valencia.csv").
archivo_salida = "trip_reviews_mallorca.csv"
try:
    # Non-numeric or missing scores become NaN instead of raising.
    comentarios_df["review_score"] = pd.to_numeric(comentarios_df["review_score"], errors="coerce")
    comentarios_df.to_csv(archivo_salida, index=False)
    print(f"Archivo guardado: {archivo_salida}")
finally:
    # Close the browser even if saving fails.
    driver.quit()