In [None]:
# === Travel Reviews Pipeline (Booking + GetYourGuide + Ticketmaster + Reddit + Twitter)
# Requisitos extra se instalan solos si faltan (praw, langdetect, playwright, nest_asyncio, pytz)
import sys, subprocess, importlib
def _ensure(pkg):
    try:
        return importlib.import_module(pkg)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        return importlib.import_module(pkg)

praw = _ensure("praw")
try:
    langdetect = _ensure("langdetect"); from langdetect import detect; HAS_LANGDETECT=True
except Exception:
    HAS_LANGDETECT=False

# para Twitter
playwright = _ensure("playwright")
_na = _ensure("nest_asyncio"); _na.apply()
pytz = _ensure("pytz")

import asyncio
from playwright.async_api import async_playwright

import os, re, time, random, logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from urllib.parse import urlsplit, urlunsplit, quote
import pandas as pd
import requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, InvalidArgumentException, NoSuchElementException, StaleElementReferenceException
from IPython.display import display

# ================== CONFIG (EDITA AQUÍ) ==================
HEADLESS = True
ENABLE_BOOKING_ATTRACTIONS = True
ENABLE_GETYOURGUIDE = True
ENABLE_TWITTER = True                      

# Ticketmaster
TICKETMASTER_API_KEY = "AsIPMT0pnI4AdbyDWkMWRHBbaG2i8vba"

# Reddit (PRAW)
REDDIT_CLIENT_ID     = "0HAMZIgeCyxEjt61rbilyA"
REDDIT_CLIENT_SECRET = "iL6VAggmYaMq7AR0T6n94opTJx8sNQ"
REDDIT_USER_AGENT    = "Turismo1675"

# Twitter (Playwright con sesión guardada)
TWITTER_SESSION_JSON = "twitter_sesion.json"     # debe existir (guardado previamente)
TWITTER_KEYWORDS = ["viaje","hotel","restaurante","nightlife","que hacer","playa","ocio"]
TWITTER_MAX_TWEETS_PER_KEYWORD = 80

# Búsquedas
CITIES = ["Barcelona"]
REDDIT_KEYWORDS = ["viaje","hotel","restaurante","nightlife","que hacer","playa","ocio"]
REDDIT_MAX_POSTS_PER_KEYWORD = 3

# Límites
MAX_ITEMS = 10
REVIEWS_PER_ITEM = 5

WAIT_S = 0.35   # sleeps cortos en scroll/clicks
WAIT_M = 8 
# =========================================================

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
log = logging.getLogger("pipeline")
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# -------- Clasificador por keywords (experiencia) --------
class ExperienceClassifier:
    KW = {
        "museo": ["museo","museum","pinacoteca","galeria de arte","exposicion","exhibicion","exhibition","arte","ciencia","fotografia","fundacion","galeria"],
        "vida nocturna": ["vida nocturna","noche","club","discoteca","pub","bar de copas","fiesta","nightlife","karaoke","cocktail","coctel"],
        "deporte": ["deporte","futbol","baloncesto","surf","kayak","senderismo","hiking","ciclismo","bici","mtb","ski","esqui","rafting","buceo","snorkel","diving","escalada","golf","tenis","padel","parapente","yoga","running","maraton"],
        "playa": ["playa","beach","caleta","cala","bahia","costa","chiringuito","litoral"],
        "comida": ["tapas","comida","gastron","food","restaurante","bodega","vino","cerveza","beer","degustacion","cata","mercado","paella","brunch","cafeteria","bar de tapas"],
        "cultura": ["catedral","iglesia","monasterio","histori","patrimonio","arquitect","teatro","palacio","castillo","yacimiento","barrio","plaza","monumento","mirador","flamenco","gaudi","park guell","montjuic"],
        "hotel": ["hotel","hostal","albergue","parador","resort","apartahotel"],
        "tour": ["tour","visita guiada","free tour","excursion","recorrido","ruta","paseo","hop-on","city tour","fast track"],
        "ocio": ["zoo","acuario","aquarium","parque de atracciones","parque tematico","water park","teleferico"],
        "eventos": ["concierto","festival","show","espectaculo","obra de teatro","musical","conferencia","feria","congreso"]
    }
    @classmethod
    def classify(cls, title: str, text: str="")->str:
        content = f"{title} {text}".lower()
        for cat, kws in cls.KW.items():
            for kw in kws:
                if kw in content: return cat
        return "otros"

SPANISH_MONTHS = {'enero':1,'febrero':2,'marzo':3,'abril':4,'mayo':5,'junio':6,'julio':7,'agosto':8,'septiembre':9,'setiembre':9,'octubre':10,'noviembre':11,'diciembre':12}
EN_MONTHS = {'january':1,'february':2,'march':3,'april':4,'may':5,'june':6,'july':7,'august':8,'september':9,'october':10,'november':11,'december':12}

def _parse_es(text: str)->Optional[str]:
    if not text: return None
    t = text.strip().lower()
    if 'se aloj' in t: return None
    m = re.search(r'(\d{1,2})\s+de\s+([a-záéíóú]+)\s+de\s+(\d{4})', t)
    if m:
        d=int(m.group(1)); mon_name=m.group(2).translate(str.maketrans('áéíóú','aeiou')); mon=SPANISH_MONTHS.get(mon_name,0); y=int(m.group(3))
        if mon:
            try: return datetime(y,mon,d).strftime('%Y-%m-%d')
            except: return None
    m = re.search(r'([a-záéíóú]+)\s+de\s+(\d{4})', t)
    if m:
        mon_name=m.group(1).translate(str.maketrans('áéíóú','aeiou')); mon=SPANISH_MONTHS.get(mon_name,0); y=int(m.group(2))
        if mon: return f"{y:04d}-{mon:02d}-01"
    return None

def _parse_en(text: str)->Optional[str]:
    if not text: return None
    t = text.strip().lower()
    m = re.search(r'([a-z]+)\s+(\d{1,2}),\s*(\d{4})', t)
    if m:
        mon=EN_MONTHS.get(m.group(1),0)
        if mon:
            try: return datetime(int(m.group(3)), mon, int(m.group(2))).strftime('%Y-%m-%d')
            except: return None
    m = re.search(r'([a-z]+)\s+(\d{4})', t)
    if m:
        mon=EN_MONTHS.get(m.group(1),0)
        if mon: return f"{int(m.group(2)):04d}-{mon:02d}-01"
    return None

def normalize_review_date(text:str)->Optional[str]:
    return _parse_es(text) or _parse_en(text)

def canonical_booking_path(hotel_url: str)->str:
    p = urlsplit(hotel_url)
    return urlunsplit((p.scheme or 'https', p.netloc or 'www.booking.com', quote(p.path or ''), '', ''))

def booking_review_urls(hotel_url:str, lang='es')->List[str]:
    base = canonical_booking_path(hotel_url); p = urlsplit(base); out=[]
    m = re.match(r"^/hotel/([^/]+)/([^/?#]+)", p.path or "")
    if m:
        country, slug = m.groups()
        out.append(urlunsplit((p.scheme,p.netloc,f"/reviews/{country}/hotel/{slug}", f"r_lang={lang}&sort=latest", '')))
    out.append(urlunsplit((p.scheme,p.netloc,p.path, f"tab=reviews&lang={lang}", "")))
    out.append(urlunsplit((p.scheme,p.netloc,p.path, "", "tab-reviews")))
    if base not in out: out.append(base)
    return out

# -------------- Driver (Selenium) --------------
class DM:
    def __init__(self, headless=True):
        self.headless=headless; self.driver=None
    def up(self):
        if self.driver:
            try: _=self.driver.current_url; return self.driver
            except: self.quit()
        o=webdriver.ChromeOptions()
        if self.headless: o.add_argument('--headless=new')
        o.add_argument('--no-sandbox'); o.add_argument('--disable-dev-shm-usage')
        o.add_argument('--disable-blink-features=AutomationControlled')
        o.add_argument(f'--user-agent={UA}')
        o.add_argument('--window-size=1400,2500')
        o.add_experimental_option("excludeSwitches", ["enable-automation"])
        o.add_experimental_option('useAutomationExtension', False)
        try: o.page_load_strategy = 'eager'
        except Exception: pass
        self.driver=webdriver.Chrome(service=Service(), options=o)
        try:
            self.driver.execute_script("Object.defineProperty(navigator,'webdriver',{get:()=>undefined})")
        except Exception: pass
        self.driver.set_page_load_timeout(45)
        return self.driver
    def quit(self):
        try:
            if self.driver: self.driver.quit()
        finally:
            self.driver=None
    def close_popups(self,d):
        sels=[
            "#onetrust-accept-btn-handler","button[id^='onetrust-accept']","[id*='accept'][role='button']",
            "button[aria-label*='Aceptar']","button[aria-label*='Accept']",
            "[role='dialog'] button[aria-label*='Dismiss']",".modal-mask button",
            "#_evidon-accept-button","[data-automation='closeModal']",".ui_close_x","button[aria-label='Close']"
        ]
        for s in sels:
            try:
                els = d.find_elements(By.CSS_SELECTOR, s)
                for el in els:
                    if el.is_displayed() and el.is_enabled():
                        d.execute_script("arguments[0].click();", el); time.sleep(0.05)
            except: pass
    def text(self,root,sel):
        try: return root.find_element(By.CSS_SELECTOR, sel).text.strip()
        except: return ""
    def blocked(self,d)->bool:
        html = (d.page_source or "").lower()
        kw = ["are you a robot","not a robot","verify you are a human","px-captcha",
              "asegúrate de que no eres un robot","comprueba que no eres un robot",
              "we're sorry, but the service is not available"]
        return any(k in html for k in kw)

# -------------- Booking Hoteles --------------
class BookingHotels:
    def __init__(self, dm:DM): self.dm=dm
    def list(self, city, checkin, checkout, max_hotels=30)->List[Dict]:
        d=self.dm.up(); out=[]
        base_q=f"checkin={checkin}&checkout={checkout}&group_adults=2&lang=es&sb=1"
        urls=[f"https://www.booking.com/searchresults.html?ss={city}&{base_q}",
              f"https://www.booking.com/searchresults.es.html?ss={city}&{base_q}"]
        try:
            for url in urls:
                d.get(url); self.dm.close_popups(d); time.sleep(random.uniform(0.4,0.9))
                if self.dm.blocked(d): continue
                try:
                    WebDriverWait(d,12).until(EC.any_of(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR,'[data-testid="property-card"]')),
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR,'.sr_property_block'))
                    ))
                except TimeoutException: pass
                for _ in range(2): d.execute_script("window.scrollBy(0, 1400);"); time.sleep(0.3)
                cards=d.find_elements(By.CSS_SELECTOR,'[data-testid="property-card"], .sr_property_block, div[data-component*="property-card"]')
                if not cards: continue
                log.info(f"Booking hoteles encontrados: {len(cards)}")
                for it in cards[:max_hotels]:
                    url_h=None
                    for sel in ('a[data-testid="title-link"]','[data-testid="title"] a','h3 a','.sr-hotel__name a','a[href*="/hotel/"]'):
                        try:
                            a=it.find_element(By.CSS_SELECTOR,sel); href=a.get_attribute('href')
                            if href and '/hotel/' in href: url_h=href; break
                        except: continue
                    if not url_h: continue
                    title=""
                    for sel in ('[data-testid="title"]','a[data-testid="title-link"]','h3 a','.sr-hotel__name a'):
                        try:
                            t=it.find_element(By.CSS_SELECTOR,sel).text.strip()
                            if t: title=t; break
                        except: continue
                    out.append({"title":title or "Hotel","url":url_h,"source":"booking_hotels"})
                if out: break
        finally:
            self.dm.quit()
        return out
    def _ready_reviews(self,d)->bool:
        try:
            WebDriverWait(d,10).until(EC.any_of(
                EC.presence_of_element_located((By.CSS_SELECTOR,'[data-testid="review-card"]')),
                EC.presence_of_element_located((By.CSS_SELECTOR,'#review_list .review_item_review'))
            )); return True
        except TimeoutException: return False
    def _rating(self,card)->Optional[int]:
        for sel in ('[data-testid="review-score"]','.bui-review-score__badge','[aria-label*="Puntuación"]','.review-score-badge','meta[itemprop="ratingValue"]'):
            try:
                el=card.find_element(By.CSS_SELECTOR,sel)
                raw=(el.get_attribute('aria-label') or el.get_attribute('content') or el.text or '').strip()
                m=re.search(r'(\d+[.,]?\d*)', raw)
                if m:
                    v=float(m.group(1).replace(',','.'))
                    return max(1,min(5,int(round(v/2 if v>5 else v))))
            except: pass
        return None
    def _date(self,card)->str:
        txt=self.dm.text(card,'[data-testid="review-date"]')
        iso=normalize_review_date(txt) or normalize_review_date(card.text)
        if iso: return iso
        for sel,attr in (('time[datetime]','datetime'),('meta[itemprop="datePublished"]','content')):
            try:
                val=card.find_element(By.CSS_SELECTOR,sel).get_attribute(attr) or ''
                if val: return (val[:10] if len(val)>=10 else val)
            except: pass
        return datetime.now().strftime('%Y-%m-%d')
    def reviews(self, hotel_url:str, max_reviews:int)->List[Dict]:
        if not hotel_url: return []
        d=self.dm.up(); out=[]
        try:
            for u in booking_review_urls(hotel_url):
                try: d.get(u)
                except InvalidArgumentException: continue
                self.dm.close_popups(d)
                if self.dm.blocked(d): continue
                if not self._ready_reviews(d): continue
                for _ in range(2): d.execute_script("window.scrollBy(0,1200)"); time.sleep(0.2)
                cards=d.find_elements(By.CSS_SELECTOR,'[data-testid="review-card"], #review_list .review_item_review')
                for c in cards:
                    if len(out)>=max_reviews: break
                    pos=self.dm.text(c,'[data-testid="review-positive-text"]') or self.dm.text(c,'p.review_pos span[itemprop="reviewBody"]') or self.dm.text(c,'p.review_pos')
                    neg=self.dm.text(c,'[data-testid="review-negative-text"]') or self.dm.text(c,'p.review_neg span[itemprop="reviewBody"]') or self.dm.text(c,'p.review_neg')
                    body=self.dm.text(c,'.c-review__body')
                    txt=" | ".join([t for t in (pos,neg,body) if t]).strip()
                    if not txt: continue
                    out.append({'texto':txt,'rating':self._rating(c),'fecha':self._date(c)})
                if out: break
        finally:
            self.dm.quit()
        return out

# -------------- Booking Atracciones (opcional) --------------
class BookingAttractions:
    def __init__(self, dm:DM): self.dm=dm
    def list(self, city, max_attractions=40)->List[Dict]:
        d=self.dm.up(); out=[]
        slug=city.lower().replace(' ','-')
        url=f"https://www.booking.com/attractions/searchresults/es/{slug}.es.html"
        try:
            d.get(url); self.dm.close_popups(d)
            if self.dm.blocked(d): return out
            try: WebDriverWait(d,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'[data-testid="card"]')))
            except TimeoutException: pass
            d.execute_script("window.scrollTo(0, document.body.scrollHeight);"); time.sleep(0.4)
            cards=d.find_elements(By.CSS_SELECTOR,'[data-testid="card"]')
            log.info(f"Booking (atracciones) encontradas: {len(cards)}")
            for it in cards[:max_attractions]:
                link=None; title=""
                for sel in ('a[href*="/attractions/"]','a[data-testid="card-cta"]','h3 a','a'):
                    try:
                        a=it.find_element(By.CSS_SELECTOR,sel); href=a.get_attribute('href') or ''
                        if '/attractions/' in href: link=href; break
                    except: continue
                if not link: continue
                for sel in ('[data-testid="card-title"]','h3 a','a[data-testid="card-cta"]','h3','a'):
                    try:
                        t=it.find_element(By.CSS_SELECTOR,sel).text.strip()
                        if t: title=t; break
                    except: continue
                out.append({"title":title or "Atracción","url":link,"source":"booking_attractions"})
        finally:
        pass
    def _open_reviews_modal(self, d)->Optional[object]:
        try:
            btn=d.find_element(By.CSS_SELECTOR,'[data-testid="reviews-link"]')
            d.execute_script("arguments[0].scrollIntoView({block:\'center\'});", btn); time.sleep(0.1)
            d.execute_script("arguments[0].click();", btn); time.sleep(0.4)
        except NoSuchElementException: pass
        for sel in ("[data-testid='reviews-modal']",".a9f1d9ba2c","div[role='dialog'][aria-modal='true']"):
            try: return d.find_element(By.CSS_SELECTOR, sel)
            except: continue
        return None
    def _scroll_container(self, d, container):
        try: d.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", container); time.sleep(0.3)
        except: pass
    def _next_page(self, d, container)->bool:
        sels = ('[data-testid="reviews-pagination"] button[aria-label="Siguiente"]','button[aria-label="Siguiente"]','[data-testid="pagination-next-button"]')
        for s in sels:
            try:
                nxt = container.find_element(By.CSS_SELECTOR, s)
                if nxt.get_attribute("disabled") is not None: return False
                d.execute_script("arguments[0].scrollIntoView({block:\'center\'});", nxt); time.sleep(0.1)
                d.execute_script("arguments[0].click();", nxt); time.sleep(0.7)
                return True
            except: continue
        return False
    def reviews(self, url:str, max_reviews:int)->List[Dict]:
        if not url: return []
        d=self.dm.up(); out=[]; seen=set()
        try:
            d.get(url); self.dm.close_popups(d)
            if self.dm.blocked(d): return out
            title=""
            for sel in ("[data-testid='product-title']", ".a4ac75716e.css-1mpsda3", "h1"):
                try:
                    t=d.find_element(By.CSS_SELECTOR, sel).text.strip()
                    if t: title=t; break
                except: continue
            modal=self._open_reviews_modal(d)
            source = modal if modal else d
            last_count=-1; idle_loops=0
            while len(out) < max_reviews:
                try: items = source.find_elements(By.CSS_SELECTOR,'[data-testid="review-item"], [data-testid="review"]')
                except StaleElementReferenceException: items = source.find_elements(By.CSS_SELECTOR,'[data-testid="review-item"], [data-testid="review"]')
                for it in items:
                    if len(out)>=max_reviews: break
                    for sel in ('[data-testid="review-read-more"]','button[aria-expanded="false"]'):
                        for btn in it.find_elements(By.CSS_SELECTOR, sel):
                            try: d.execute_script("arguments[0].click();", btn); time.sleep(0.05)
                            except: pass
                    txt=""
                    for sel in (".b99b6ef58f","[data-testid='review-text']","p"):
                        try: txt=it.find_element(By.CSS_SELECTOR, sel).text.strip()
                        except: txt=""
                        if txt: break
                    if not txt: continue
                    if txt in seen: continue
                    seen.add(txt)
                    score=None
                    try:
                        raw = it.find_element(By.CSS_SELECTOR,'[data-testid="review-star-rating"]').get_attribute('aria-label') or ''
                        m=re.search(r'(\d+)', raw); score=int(m.group(1)) if m else None
                    except: pass
                    fecha = datetime.now().strftime('%Y-%m-%d')  # a veces no hay fecha
                    out.append({'titulo': title or "Actividad", 'texto': txt, 'rating': score, 'fecha': fecha})
                if len(out) >= max_reviews: break
                curr_count = len(items)
                if curr_count == last_count:
                    pag_ok = (modal and self._next_page(d, modal))
                    if not pag_ok:
                        if modal: self._scroll_container(d, modal)
                        else: d.execute_script("window.scrollBy(0, 1000)"); time.sleep(0.3)
                        idle_loops += 1
                        if idle_loops >= 4: break
                else:
                    idle_loops = 0
                last_count = curr_count
            return out
        finally:
            self.dm.quit()

# -------------- GetYourGuide (listado + reseñas) --------------
class GetYourGuide:
    CITY_L_MAP = {"barcelona":"barcelona-l45","madrid":"madrid-l46","sevilla":"sevilla-l48","valencia":"valencia-l49","malaga":"malaga-l402"}
    def __init__(self, dm:DM): self.dm=dm
    def _city_urls(self, city:str)->List[str]:
        slug=city.lower().strip().replace(" ","-")
        urls=[]
        if slug in self.CITY_L_MAP:
            urls.append(f"https://www.getyourguide.es/{self.CITY_L_MAP[slug]}/")
        urls += [f"https://www.getyourguide.es/{slug}/", f"https://www.getyourguide.es/s/?q={quote(city)}"]
        return urls
    def list(self, city:str, max_items:int=30)->List[Dict]:
        d=self.dm.up(); out=[]
        try:
            for url in self._city_urls(city):
                d.get(url); time.sleep(1.0); self.dm.close_popups(d)
                # Cargar "Ver más"
                for _ in range(10):
                    try:
                        btns = d.find_elements(By.CSS_SELECTOR, "button:contains('Ver más'), button:contains('Show more')")
                        clicked=False
                        for b in btns:
                            dis = b.get_attribute("disabled")
                            if b.is_displayed() and (not dis):
                                d.execute_script("arguments[0].scrollIntoView({block:'center'});", b); time.sleep(0.1)
                                d.execute_script("arguments[0].click();", b); clicked=True; time.sleep(1.0); break
                        if not clicked: break
                    except: break
                # Scroll para cargar cards
                prev=0
                for _ in range(40):
                    d.execute_script("window.scrollBy(0, 1600);"); time.sleep(0.3)
                    cards=d.find_elements(By.CSS_SELECTOR,'a.vertical-activity-card__container')
                    if len(cards)==prev: break
                    prev=len(cards)
                cards=d.find_elements(By.CSS_SELECTOR,'a.vertical-activity-card__container')
                if not cards: continue
                for c in cards[:max_items]:
                    try:
                        href=c.get_attribute('href') or ''
                        title=""
                        for s in ("h3[data-test-id='activity-card-title'] span","h3","[data-test-id='activity-card-title']"):
                            try:
                                t=c.find_element(By.CSS_SELECTOR,s).text
                                title=(t or "").strip()
                                if title: break
                            except: continue
                        if href:
                            out.append({"title": title or "Actividad", "url": href, "source":"getyourguide"})
                    except: continue
                if out: break
        finally:
            self.dm.quit()
        log.info(f"GetYourGuide items: {len(out)}")
        return out

    def _expand_reviews(self, d):
        try:
            btn = WebDriverWait(d,4).until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button[data-test-id='activity-review-see-more-reviews-button']")))
            d.execute_script("arguments[0].scrollIntoView({block:'center'});", btn); time.sleep(0.2)
            d.execute_script("arguments[0].click();", btn); time.sleep(0.7)
        except: pass

    def reviews(self, url:str, max_reviews:int=5)->List[Dict]:
        if not url: return []
        d=self.dm.up(); out=[]; seen=set()
        try:
            d.get(url); time.sleep(1.0); self.dm.close_popups(d)
            try: title=d.find_element(By.CSS_SELECTOR, "h1#adp-title-text").text.strip()
            except: title=""
            idle=0; last_n=-1
            while len(out)<max_reviews and idle<6:
                self._expand_reviews(d)
                d.execute_script("window.scrollBy(0, 1200);"); time.sleep(0.4)
                cards = d.find_elements(By.CSS_SELECTOR,"section.review-card[data-test-id='activity-review-card']")
                for c in cards:
                    if len(out)>=max_reviews: break
                    try:
                        more=c.find_element(By.CSS_SELECTOR,"button[aria-expanded='false']")
                        d.execute_script("arguments[0].click();", more); time.sleep(0.05)
                    except: pass
                    txt=""
                    for sel in ("div.toggle-content__content", "p", "[data-test-id='activity-review-card-text']"):
                        try: 
                            txt=c.find_element(By.CSS_SELECTOR, sel).text.strip()
                        except: txt=""
                        if txt: break
                    if not txt or txt in seen: continue
                    seen.add(txt)
                    score=None
                    try:
                        raw = c.find_element(By.CSS_SELECTOR,"span.rating-star__label").text.strip()
                        m = re.search(r'(\d+[.,]?)', raw)
                        if m:
                            val=float(m.group(1).replace(',','.'))
                            score=int(round(max(1,min(5,val))))
                    except: pass
                    date_txt=""
                    for sel in ("span.review-card__author-date","span.review-card___author-date"):
                        try:
                            date_txt=c.find_element(By.CSS_SELECTOR, sel).text.strip()
                            if date_txt: break
                        except: continue
                    fecha = normalize_review_date(date_txt) or datetime.now().strftime('%Y-%m-%d')
                    out.append({'titulo': title or "Actividad", 'texto': txt, 'rating': score, 'fecha': fecha})
            return out
        finally:
            self.dm.quit()

# -------------- Ticketmaster --------------
class Ticketmaster:
    BASE='https://app.ticketmaster.com/discovery/v2/events.json'
    def __init__(self, api_key:str): self.key=api_key
    def _backoff(self,n): time.sleep(min(20,2**(n-1))+random.uniform(0.2,0.8))
    def fetch(self, cities:List[str], limit:int=50)->List[Dict]:
        out=[]
        if not self.key:
            log.warning("Falta TICKETMASTER_API_KEY; se omiten eventos.")
            return out
        today = datetime.today().strftime('%Y-%m-%dT00:00:00Z')
        headers={"User-Agent":UA}
        for city in cities:
            log.info(f"Buscando eventos en {city}")
            page=0; got=0
            while got<limit:
                params={'apikey':self.key,'countryCode':'ES','city':city,'startDateTime':today,'sort':'date,asc','size':min(100,limit-got),'page':page}
                try:
                    r=requests.get(self.BASE, params=params, headers=headers, timeout=25)
                    if r.status_code==200:
                        data=r.json(); evs=data.get('_embedded',{}).get('events',[])
                        if not evs: break
                        for e in evs:
                            got+=1
                            out.append({
                                'titulo': e.get('name',''),
                                'texto' : e.get('info','') or '',
                                'sentimiento': None,
                                'experiencia': ExperienceClassifier.classify(e.get('name',''), e.get('info','') or ''),
                                'fecha': e.get('dates',{}).get('start',{}).get('localDate') or datetime.now().strftime('%Y-%m-%d'),
                                'fuente': 'ticketmaster'
                            })
                        page+=1; time.sleep(random.uniform(0.3,0.8))
                    elif r.status_code in (429,500,502,503,504):
                        log.warning(f"Ticketmaster {city} {r.status_code}, reintentando…")
                        ok=False
                        for n in range(1,5):
                            self._backoff(n)
                            r=requests.get(self.BASE, params=params, headers=headers, timeout=25)
                            if r.status_code==200: ok=True; break
                        if not ok: break
                    else:
                        break
                except Exception as e:
                    log.warning(f"Ticketmaster error {city}: {e}"); break
        return out

# -------------- Reddit --------------
class RedditScraper:
    def __init__(self, client_id, client_secret, user_agent):
        self.enabled = bool(client_id and client_secret and user_agent)
        self.reddit = None
        if self.enabled:
            try:
                self.reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent, check_for_async=False)
            except Exception as e:
                log.warning(f"No se pudo inicializar PRAW: {e}")
                self.enabled = False
    def _lang_ok(self, text:str)->bool:
        if not HAS_LANGDETECT: return True
        try: return detect(text or '') in ('es','en')
        except Exception: return True
    def search_city(self, city:str, keywords:List[str], per_keyword:int)->List[Dict]:
        rows=[]; seen=set()
        if not self.enabled or not self.reddit: return rows
        for kw in keywords:
            query = f'"{city}" {kw}'
            log.info(f"Reddit: buscando '{query}'")
            try:
                for post in self.reddit.subreddit('all').search(query, sort='new', limit=per_keyword):
                    try:
                        title=(post.title or '').strip(); body=(post.selftext or '').strip()
                        if not (title or body): continue
                        text_all=f"{title} {body}".strip()
                        if not self._lang_ok(text_all): continue
                        key=getattr(post,'id',None) or f"{title[:40]}|{body[:40]}"
                        if key in seen: continue
                        seen.add(key)
                        fecha=datetime.utcfromtimestamp(getattr(post,'created_utc', time.time())).strftime('%Y-%m-%d')
                        rows.append({
                            'titulo': title,
                            'texto': body if body else title,
                            'sentimiento': None,
                            'experiencia': ExperienceClassifier.classify(title, body),
                            'fecha': fecha,
                            'fuente': 'reddit'
                        })
                    except Exception: continue
                time.sleep(random.uniform(0.3,0.9))
            except Exception as e:
                log.warning(f"Reddit error '{query}': {e}"); time.sleep(1.0); continue
        return rows

# -------------- Twitter (Playwright + sesión guardada) --------------
class TwitterScraper:
    def __init__(self, session_json:str=None, headless:bool=True):
        self.session_json = session_json
        self.headless = headless
        self.enabled = bool(session_json and os.path.exists(session_json))

    async def _scrape_query_async(self, p, query:str, max_tweets:int)->List[Dict]:
        rows=[]; seen=set()
        browser = await p.chromium.launch(headless=self.headless)
        try:
            ctx = await browser.new_context(storage_state=self.session_json)
            page = await ctx.new_page()
            search_url = f"https://twitter.com/search?q={quote(query)}&src=typed_query&f=live"
            await page.goto(search_url)
            # cookies (si salen)
            try:
                btn = await page.wait_for_selector('div[role="button"]:has-text("Aceptar todas las cookies")', timeout=5000)
                await btn.click()
            except: pass

            await asyncio.sleep(2.5)
            no_new=0
            while len(rows) < max_tweets and no_new < 6:
                before=len(rows)
                await page.keyboard.press("PageDown")
                await asyncio.sleep(1.6)
                cards = await page.query_selector_all('article[data-testid="tweet"]')
                for c in cards:
                    if len(rows) >= max_tweets: break
                    # expandir "mostrar más"
                    try:
                        sm = await c.query_selector('button[data-testid="tweet-text-show-more-link"]')
                        if sm: await sm.click(); await asyncio.sleep(0.2)
                    except: pass
                    # texto
                    text_el = await c.query_selector('[data-testid="tweetText"]')
                    text = (await text_el.inner_text()) if text_el else None
                    if not text: continue
                    # fecha
                    fecha_local=None
                    t = await c.query_selector('time')
                    if t:
                        iso = await t.get_attribute('datetime')
                        if iso:
                            try:
                                dt_utc = datetime.fromisoformat(iso.replace("Z","+00:00"))
                                tz_madrid = pytz.timezone("Europe/Madrid")
                                fecha_local = dt_utc.astimezone(tz_madrid).strftime("%Y-%m-%d")
                            except:
                                try: fecha_local = iso[:10]
                                except: fecha_local = datetime.utcnow().strftime("%Y-%m-%d")
                    key = (text.strip(), fecha_local or "")
                    if key in seen: continue
                    seen.add(key)
                    rows.append({
                        'titulo': (text[:80] + ("…" if len(text)>80 else "")),
                        'texto': text,
                        'sentimiento': None,  # sin nota => vacío
                        'experiencia': ExperienceClassifier.classify(text, ""),
                        'fecha': fecha_local or datetime.utcnow().strftime("%Y-%m-%d"),
                        'fuente': 'twitter'
                    })
                no_new = no_new + 1 if len(rows)==before else 0
            await ctx.close()
        finally:
            await browser.close()
        return rows

    def search_city(self, city:str, keywords:List[str], per_keyword:int)->List[Dict]:
        if not self.enabled:
            log.info("Twitter: sesión no encontrada o desactivado; se omite.")
            return []
        async def runner():
            out=[]
            async with async_playwright() as p:
                for kw in keywords:
                    query=f"{city} {kw}"
                    log.info(f"Twitter: buscando '{query}'")
                    try:
                        res = await self._scrape_query_async(p, query, per_keyword)
                        out.extend(res)
                    except Exception as e:
                        log.warning(f"Twitter fallo '{query}': {e}")
                        await asyncio.sleep(1.0)
            return out
        # Ejecutar en Jupyter con nest_asyncio aplicado
        return asyncio.run(runner())

# -------------- Pipeline --------------
class Pipeline:
    def __init__(self, headless=True):
        self.dm=DM(headless=headless)
        self.bh=BookingHotels(self.dm)
        self.ba=BookingAttractions(self.dm)
        self.gyg=GetYourGuide(self.dm)
        self.tm=Ticketmaster(TICKETMASTER_API_KEY)
        self.rs=RedditScraper(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT)
        self.tw=TwitterScraper(TWITTER_SESSION_JSON, headless=headless) if ENABLE_TWITTER else TwitterScraper(None, headless=headless)

    def run(self, cities:List[str], checkin:Optional[str], checkout:Optional[str],
        max_items:int=6, reviews_per_item:int=3)->pd.DataFrame:
    if not checkin: checkin=(datetime.now()+timedelta(days=1)).strftime('%Y-%m-%d')
    if not checkout: checkout=(datetime.now()+timedelta(days=2)).strftime('%Y-%m-%d')
    rows=[]

    for city in cities:
        log.info(f"\n=== Ciudad: {city} ===")

        # ---------- 1) MISMO driver para Booking Hoteles + Atracciones ----------
        self.dm.up()
        try:
            # Booking Hoteles
            try:
                hotels = self.bh.list(city, checkin, checkout, max_items)
            except Exception as e:
                log.warning(f"Booking hoteles error: {e}"); hotels=[]

            for h in hotels[:min(6, max_items)]:
                log.info(f"Reviews hotel: {h['title']}")
                try:
                    revs = self.bh.reviews(h['url'], reviews_per_item)
                    for r in revs:
                        rows.append({
                            'titulo': h['title'],
                            'texto': r['texto'],
                            'sentimiento': r['rating'],          # puede ser None (queda vacío)
                            'experiencia': 'hotel',               # forzado para hoteles
                            'fecha': r['fecha'],
                            'fuente': 'booking_hotels'
                        })
                except Exception as e:
                    log.warning(f"Reviews hotel fallo: {e}")

            # Booking Atracciones (si está activo) usando el MISMO driver
            if ENABLE_BOOKING_ATTRACTIONS:
                try:
                    atts = self.ba.list(city, max_items)
                except Exception as e:
                    log.warning(f"Booking atracciones error: {e}"); atts=[]

                for a in atts[:min(6, max_items)]:
                    log.info(f"Reviews atracción: {a['title']}")
                    try:
                        revs = self.ba.reviews(a['url'], reviews_per_item)
                        for r in revs:
                            rows.append({
                                'titulo': r['titulo'],
                                'texto': r['texto'],
                                'sentimiento': r['rating'],       # puede ser None
                                'experiencia': ExperienceClassifier.classify(r['titulo'], r['texto']),
                                'fecha': r['fecha'],
                                'fuente': 'booking_attractions'
                            })
                    except Exception as e:
                        log.warning(f"Reviews atracción fallo: {e}")
        finally:
            # Cerramos el driver de Booking
            self.dm.quit()

        # ---------- 2) Nuevo driver para GetYourGuide ----------
        if ENABLE_GETYOURGUIDE:
            self.dm.up()
            try:
                try:
                    items = self.gyg.list(city, max_items)
                except Exception as e:
                    log.warning(f"GYG listado error: {e}"); items=[]

                for it in items[:min(6, max_items)]:
                    log.info(f"GYG reviews: {it['title']}")
                    try:
                        revs = self.gyg.reviews(it['url'], reviews_per_item)
                        for r in revs:
                            rows.append({
                                'titulo': r['titulo'],
                                'texto': r['texto'],
                                'sentimiento': r['rating'],       # puede ser None
                                'experiencia': ExperienceClassifier.classify(r['titulo'], r['texto']),
                                'fecha': r['fecha'],
                                'fuente': 'getyourguide'
                            })
                    except Exception as e:
                        log.warning(f"GYG reviews fallo: {e}")
            finally:
                # Cerramos el driver de GYG
                self.dm.quit()

        # ---------- 3) Fuentes sin Selenium ----------
        # Ticketmaster (sin nota)
        rows.extend(self.tm.fetch([city], limit=max_items))
        # Reddit (sin nota)
        rows.extend(self.rs.search_city(city, REDDIT_KEYWORDS, REDDIT_MAX_POSTS_PER_KEYWORD))

    df = pd.DataFrame(rows, columns=['titulo','texto','sentimiento','experiencia','fecha','fuente'])
    if not df.empty:
        def _iso(x):
            try: return datetime.fromisoformat(str(x)[:10]).strftime('%Y-%m-%d')
            except: return datetime.now().strftime('%Y-%m-%d')
        df['fecha'] = df['fecha'].astype(str).map(_iso)
    log.info(f"Pipeline completado. Registros: {len(df)}")
    return df

            # Ticketmaster -> sin nota (vacío) + clasificador
            rows.extend(self.tm.fetch([city], limit=max_items))

            # Reddit -> sin nota (vacío) + clasificador
            rows.extend(self.rs.search_city(city, REDDIT_KEYWORDS, REDDIT_MAX_POSTS_PER_KEYWORD))

            # Twitter -> sin nota (vacío) + clasificador
            if ENABLE_TWITTER:
                rows.extend(self.tw.search_city(city, TWITTER_KEYWORDS or REDDIT_KEYWORDS, TWITTER_MAX_TWEETS_PER_KEYWORD))

        df=pd.DataFrame(rows, columns=['titulo','texto','sentimiento','experiencia','fecha','fuente'])
        if not df.empty:
            # Normaliza fecha a ISO pero NO tocamos sentimiento (para dejar vacíos donde no hay nota)
            def _iso(x):
                try: return datetime.fromisoformat(str(x)[:10]).strftime('%Y-%m-%d')
                except: return datetime.now().strftime('%Y-%m-%d')
            df['fecha']=df['fecha'].astype(str).map(_iso)
        log.info(f"Pipeline completado. Registros: {len(df)}")
        return df

    def save(self, df:pd.DataFrame, filename:Optional[str]=None)->str:
        if filename is None: filename=f"reviews_scraping_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False, encoding='utf-8')
        log.info(f"Guardado: {filename}")
        return filename

# -------- Ejecutar --------
pipe = Pipeline(headless=HEADLESS)
df = pipe.run(CITIES, checkin=None, checkout=None, max_items=MAX_ITEMS, reviews_per_item=REVIEWS_PER_ITEM)
display(df.head(30))
out = pipe.save(df)
print("Total registros:", len(df))
print("Archivo CSV:", out)



In [4]:
!playwright install chromium.

Failed to install browsers
Error: Invalid installation targets: 'chromium.'. Expecting one of: chromium, chromium-headless-shell, chromium-tip-of-tree-headless-shell, chrome, chrome-beta, msedge, msedge-beta, msedge-dev, _bidiChromium, firefox, webkit


In [11]:
# === Travel Reviews Pipeline (Booking + Attractions + GetYourGuide + Ticketmaster + Reddit + Twitter)
# Ejecutar TODO en una sola celda de Jupyter

# ---------- dependencias opcionales ----------
import sys, subprocess, importlib, os
def _ensure(pkg, pip_name=None, post=None):
    try:
        return importlib.import_module(pkg)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or pkg])
        if post: subprocess.check_call([sys.executable, "-m"] + post)
        return importlib.import_module(pkg)

# Web + scraping
# (Selenium ya suele venir con notebook; si no, instala webdriver-manager manualmente)
praw        = _ensure("praw")
try:
    langdetect = _ensure("langdetect"); from langdetect import detect; HAS_LANG=True
except Exception:
    HAS_LANG=False
pytz        = _ensure("pytz")
playwright  = _ensure("playwright")
nest_asyncio= _ensure("nest_asyncio")
# aseguramos navegador de Playwright (una vez)
try:
    from playwright.async_api import async_playwright
except Exception:
    _ensure("playwright", post=["playwright","install","chromium"])
    from playwright.async_api import async_playwright

# ---------- imports estándar ----------
import re, time, random, logging, asyncio
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from urllib.parse import urlsplit, urlunsplit, quote
import pandas as pd
import requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, InvalidArgumentException, NoSuchElementException, StaleElementReferenceException
from IPython.display import display

nest_asyncio.apply()

# ================== CONFIG (EDITA AQUÍ) ==================
HEADLESS = False

ENABLE_BOOKING_ATTRACTIONS = True
ENABLE_GETYOURGUIDE       = True
ENABLE_TICKETMASTER       = True
ENABLE_REDDIT             = True
ENABLE_TWITTER            = True    # requiere tener twitter_sesion.json en el cwd

# Ticketmaster
TICKETMASTER_API_KEY = "AsIPMT0pnI4AdbyDWkMWRHBbaG2i8vba"

# Reddit (PRAW)
REDDIT_CLIENT_ID     = "0HAMZIgeCyxEjt61rbilyA"
REDDIT_CLIENT_SECRET = "iL6VAggmYaMq7AR0T6n94opTJx8sNQ"
REDDIT_USER_AGENT    = "Turismo1675"

# Twitter (Playwright) - archivo de estado ya logueado
TWITTER_SESSION_FILE = "twitter_sesion.json"
TWITTER_MAX_TWEETS_PER_QUERY = 3

# Búsquedas
CITIES = ["Barcelona"]
REDDIT_KEYWORDS = ["viaje","hotel","restaurante","nightlife","que hacer","playa","ocio"]
TWITTER_KEYWORDS = ["viaje","hotel","restaurante","nightlife","turismo","planes"]
REDDIT_MAX_POSTS_PER_KEYWORD = 3

# Límites
MAX_ITEMS = 5
REVIEWS_PER_ITEM = 4
# =========================================================

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
log = logging.getLogger("pipeline")
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# -------- Clasificador por keywords (experiencia) --------
class ExperienceClassifier:
    KW = {
        "museo": ["museo","museum","pinacoteca","galeria de arte","exposicion","exhibicion","exhibition","arte","ciencia","fotografia","fundacion","galeria"],
        "vida nocturna": ["vida nocturna","noche","club","discoteca","pub","bar de copas","fiesta","nightlife","karaoke","cocktail","coctel"],
        "deporte": ["deporte","futbol","baloncesto","surf","kayak","senderismo","hiking","ciclismo","bici","mtb","ski","esqui","rafting","buceo","snorkel","diving","escalada","golf","tenis","padel","parapente","yoga","running","maraton"],
        "playa": ["playa","beach","caleta","cala","bahia","costa","chiringuito","litoral"],
        "comida": ["tapas","comida","gastron","food","restaurante","bodega","vino","cerveza","beer","degustacion","cata","mercado","paella","brunch","cafeteria","bar de tapas"],
        "cultura": ["catedral","iglesia","monasterio","histori","patrimonio","arquitect","teatro","palacio","castillo","yacimiento","barrio","plaza","monumento","mirador","flamenco","gaudi","park guell","montjuic"],
        "hotel": ["hotel","hostal","albergue","parador","resort","apartahotel"],
        "tour": ["tour","visita guiada","free tour","excursion","recorrido","ruta","paseo","hop-on","city tour","fast track"],
        "ocio": ["zoo","acuario","aquarium","parque de atracciones","parque tematico","water park","teleferico"],
        "eventos": ["concierto","festival","show","espectaculo","obra de teatro","musical","conferencia","feria","congreso"]
    }
    @classmethod
    def classify(cls, title: str, text: str="")->str:
        content = f"{title} {text}".lower()
        for cat, kws in cls.KW.items():
            for kw in kws:
                if kw in content: return cat
        return "otros"

SPANISH_MONTHS = {'enero':1,'febrero':2,'marzo':3,'abril':4,'mayo':5,'junio':6,'julio':7,'agosto':8,'septiembre':9,'setiembre':9,'octubre':10,'noviembre':11,'diciembre':12}
EN_MONTHS = {'january':1,'february':2,'march':3,'april':4,'may':5,'june':6,'july':7,'august':8,'september':9,'october':10,'november':11,'december':12}

def _parse_es(text: str)->Optional[str]:
    if not text: return None
    t = text.strip().lower()
    if 'se aloj' in t: return None
    m = re.search(r'(\d{1,2})\s+de\s+([a-záéíóú]+)\s+de\s+(\d{4})', t)
    if m:
        d=int(m.group(1)); mon_name=m.group(2).translate(str.maketrans('áéíóú','aeiou')); mon=SPANISH_MONTHS.get(mon_name,0); y=int(m.group(3))
        if mon:
            try: return datetime(y,mon,d).strftime('%Y-%m-%d')
            except: return None
    m = re.search(r'([a-záéíóú]+)\s+de\s+(\d{4})', t)
    if m:
        mon_name=m.group(1).translate(str.maketrans('áéíóú','aeiou')); mon=SPANISH_MONTHS.get(mon_name,0); y=int(m.group(2))
        if mon: return f"{y:04d}-{mon:02d}-01"
    return None

def _parse_en(text: str)->Optional[str]:
    if not text: return None
    t = text.strip().lower()
    m = re.search(r'([a-z]+)\s+(\d{1,2}),\s*(\d{4})', t)
    if m:
        mon=EN_MONTHS.get(m.group(1),0)
        if mon:
            try: return datetime(int(m.group(3)), mon, int(m.group(2))).strftime('%Y-%m-%d')
            except: return None
    m = re.search(r'([a-z]+)\s+(\d{4})', t)
    if m:
        mon=EN_MONTHS.get(m.group(1),0)
        if mon: return f"{int(m.group(2)):04d}-{mon:02d}-01"
    return None

def normalize_review_date(text:str)->Optional[str]:
    return _parse_es(text) or _parse_en(text)

def canonical_booking_path(hotel_url: str)->str:
    p = urlsplit(hotel_url)
    return urlunsplit((p.scheme or 'https', p.netloc or 'www.booking.com', quote(p.path or ''), '', ''))

def booking_review_urls(hotel_url:str, lang='es')->List[str]:
    base = canonical_booking_path(hotel_url); p = urlsplit(base); out=[]
    m = re.match(r"^/hotel/([^/]+)/([^/?#]+)", p.path or "")
    if m:
        country, slug = m.groups()
        out.append(urlunsplit((p.scheme,p.netloc,f"/reviews/{country}/hotel/{slug}", f"r_lang={lang}&sort=latest", '')))
    out.append(urlunsplit((p.scheme,p.netloc,p.path, f"tab=reviews&lang={lang}", "")))
    out.append(urlunsplit((p.scheme,p.netloc,p.path, "", "tab-reviews")))
    if base not in out: out.append(base)
    return out

# -------------- Driver manager --------------
class DM:
    def __init__(self, headless=True):
        self.headless=headless; self.driver=None
    def up(self):
        if self.driver:
            try: _=self.driver.current_url; return self.driver
            except: self.quit()
        o=webdriver.ChromeOptions()
        if self.headless: o.add_argument('--headless=new')
        o.add_argument('--no-sandbox'); o.add_argument('--disable-dev-shm-usage')
        o.add_argument('--disable-blink-features=AutomationControlled')
        o.add_argument(f'--user-agent={UA}')
        o.add_argument('--window-size=1400,2500')
        o.add_experimental_option("excludeSwitches", ["enable-automation"])
        o.add_experimental_option('useAutomationExtension', False)
        try: o.page_load_strategy = 'eager'
        except Exception: pass
        self.driver=webdriver.Chrome(service=Service(), options=o)
        try:
            self.driver.execute_script("Object.defineProperty(navigator,'webdriver',{get:()=>undefined})")
        except Exception: pass
        self.driver.set_page_load_timeout(45)
        return self.driver
    def quit(self):
        try:
            if self.driver: self.driver.quit()
        finally:
            self.driver=None
    def close_popups(self,d):
        sels=[
            "#onetrust-accept-btn-handler","button[id^='onetrust-accept']","[id*='accept'][role='button']",
            "button[aria-label*='Aceptar']","button[aria-label*='Accept']",
            "[role='dialog'] button[aria-label*='Dismiss']",".modal-mask button",
            "#_evidon-accept-button","[data-automation='closeModal']",".ui_close_x","button[aria-label='Close']"
        ]
        for s in sels:
            try:
                els = d.find_elements(By.CSS_SELECTOR, s)
                for el in els:
                    if el.is_displayed() and el.is_enabled():
                        d.execute_script("arguments[0].click();", el); time.sleep(0.05)
            except: pass
    def text(self,root,sel):
        try: return root.find_element(By.CSS_SELECTOR, sel).text.strip()
        except: return ""
    def blocked(self,d)->bool:
        html = (d.page_source or "").lower()
        kw = ["are you a robot","not a robot","verify you are a human","px-captcha",
              "asegúrate de que no eres un robot","comprueba que no eres un robot",
              "we're sorry, but the service is not available"]
        return any(k in html for k in kw)

# -------------- Booking Hoteles --------------
class BookingHotels:
    def __init__(self, dm:DM): self.dm=dm
    def list(self, city, checkin, checkout, max_hotels=30)->List[Dict]:
        d=self.dm.up(); out=[]
        base_q=f"checkin={checkin}&checkout={checkout}&group_adults=2&lang=es&sb=1"
        urls=[f"https://www.booking.com/searchresults.html?ss={city}&{base_q}",
              f"https://www.booking.com/searchresults.es.html?ss={city}&{base_q}"]
        for url in urls:
            try:
                d.get(url); self.dm.close_popups(d)
                WebDriverWait(d,10).until(EC.any_of(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR,'[data-testid="property-card"]')),
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR,'.sr_property_block'))
                ))
            except TimeoutException:
                continue
            for _ in range(2): d.execute_script("window.scrollBy(0, 1400);"); time.sleep(0.15)
            cards=d.find_elements(By.CSS_SELECTOR,'[data-testid="property-card"], .sr_property_block, div[data-component*="property-card"]')
            if not cards: continue
            log.info(f"Booking hoteles encontrados: {len(cards)}")
            for it in cards[:max_hotels]:
                url_h=None
                for sel in ('a[data-testid="title-link"]','[data-testid="title"] a','h3 a','.sr-hotel__name a','a[href*="/hotel/"]'):
                    try:
                        a=it.find_element(By.CSS_SELECTOR,sel); href=a.get_attribute('href')
                        if href and '/hotel/' in href: url_h=href; break
                    except: continue
                if not url_h: continue
                title=""
                for sel in ('[data-testid="title"]','a[data-testid="title-link"]','h3 a','.sr-hotel__name a'):
                    try:
                        t=it.find_element(By.CSS_SELECTOR,sel).text.strip()
                        if t: title=t; break
                    except: continue
                out.append({"title":title or "Hotel","url":url_h,"source":"booking_hotels"})
            if out: break
        return out
    def _ready_reviews(self,d)->bool:
        try:
            WebDriverWait(d,10).until(EC.any_of(
                EC.presence_of_element_located((By.CSS_SELECTOR,'[data-testid="review-card"]')),
                EC.presence_of_element_located((By.CSS_SELECTOR,'#review_list .review_item_review'))
            )); return True
        except TimeoutException: return False
    def _rating(self,card)->Optional[int]:
        for sel in ('[data-testid="review-score"]','.bui-review-score__badge','[aria-label*="Puntuación"]','.review-score-badge','meta[itemprop="ratingValue"]'):
            try:
                el=card.find_element(By.CSS_SELECTOR,sel)
                raw=(el.get_attribute('aria-label') or el.get_attribute('content') or el.text or '').strip()
                m=re.search(r'(\d+[.,]?\d*)', raw)
                if m:
                    v=float(m.group(1).replace(',','.'))
                    return max(1,min(5,int(round(v/2 if v>5 else v))))
            except: pass
        return None
    def _date(self,card)->str:
        txt=self.dm.text(card,'[data-testid="review-date"]') or self.dm.text(card,'.c-review-block__date')
        iso=normalize_review_date(txt) or normalize_review_date(card.text)
        if iso: return iso
        for sel,attr in (('time[datetime]','datetime'),('meta[itemprop="datePublished"]','content')):
            try:
                val=card.find_element(By.CSS_SELECTOR,sel).get_attribute(attr) or ''
                if val: return (val[:10] if len(val)>=10 else val)
            except: pass
        return datetime.now().strftime('%Y-%m-%d')
    def reviews(self, hotel_url:str, max_reviews:int)->List[Dict]:
        if not hotel_url: return []
        d=self.dm.up(); out=[]
        for u in booking_review_urls(hotel_url):
            try:
                d.get(u)
            except InvalidArgumentException:
                continue
            self.dm.close_popups(d)
            if self.dm.blocked(d): continue
            if not self._ready_reviews(d): continue
            for _ in range(2): d.execute_script("window.scrollBy(0,1200)"); time.sleep(0.1)
            cards=d.find_elements(By.CSS_SELECTOR,'[data-testid="review-card"], #review_list .review_item_review')
            for c in cards:
                if len(out)>=max_reviews: break
                pos=self.dm.text(c,'[data-testid="review-positive-text"]') or self.dm.text(c,'p.review_pos span[itemprop="reviewBody"]') or self.dm.text(c,'p.review_pos')
                neg=self.dm.text(c,'[data-testid="review-negative-text"]') or self.dm.text(c,'p.review_neg span[itemprop="reviewBody"]') or self.dm.text(c,'p.review_neg')
                body=self.dm.text(c,'.c-review__body')
                txt=" | ".join([t for t in (pos,neg,body) if t]).strip()
                if not txt: continue
                out.append({'texto':txt,'rating':self._rating(c),'fecha':self._date(c)})
            if out: break
        return out

# -------------- Booking Atracciones --------------
class BookingAttractions:
    def __init__(self, dm:DM): self.dm=dm
    def list(self, city, max_attractions=40)->List[Dict]:
        d=self.dm.up(); out=[]
        slug=city.lower().replace(' ','-')
        url=f"https://www.booking.com/attractions/searchresults/es/{slug}.es.html"
        try:
            d.get(url); self.dm.close_popups(d)
            WebDriverWait(d,8).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'[data-testid="card"]')))
        except TimeoutException:
            return out
        d.execute_script("window.scrollTo(0, document.body.scrollHeight);"); time.sleep(0.2)
        cards=d.find_elements(By.CSS_SELECTOR,'[data-testid="card"]')
        log.info(f"Booking (atracciones) encontradas: {len(cards)}")
        for it in cards[:max_attractions]:
            link=None; title=""
            try:
                a=it.find_element(By.CSS_SELECTOR,'a[href*="/attractions/"]')
                link=a.get_attribute("href") or None
            except: link=None
            for sel in ('[data-testid="card-title"]','h3 a','a[data-testid="card-cta"]','h3','a'):
                try:
                    t=it.find_element(By.CSS_SELECTOR,sel).text.strip()
                    if t: title=t; break
                except: continue
            if link:
                out.append({"title":title or "Atracción","url":link,"source":"booking_attractions"})
        return out
    def _open_reviews_modal(self, d)->Optional[object]:
        try:
            btn=d.find_element(By.CSS_SELECTOR,'[data-testid="reviews-link"]')
            d.execute_script("arguments[0].scrollIntoView({block:'center'});", btn); time.sleep(0.05)
            d.execute_script("arguments[0].click();", btn); time.sleep(0.3)
        except NoSuchElementException: pass
        for sel in ("[data-testid='reviews-modal']",".a9f1d9ba2c","div[role='dialog'][aria-modal='true']"):
            try: return d.find_element(By.CSS_SELECTOR, sel)
            except: continue
        return None
    def _next_page(self, d, container)->bool:
        sels = ('[data-testid="reviews-pagination"]//button[@aria-label="Siguiente"]',)
        try:
            nxt = container.find_element(By.XPATH, "//button[@aria-label='Siguiente' and not(@disabled)]")
            d.execute_script("arguments[0].scrollIntoView({block:'center'});", nxt); time.sleep(0.05)
            d.execute_script("arguments[0].click();", nxt); time.sleep(0.4)
            return True
        except: return False
    def reviews(self, url:str, max_reviews:int)->List[Dict]:
        if not url: return []
        d=self.dm.up(); out=[]; seen=set()
        d.get(url); self.dm.close_popups(d)
        title=""
        for sel in ("[data-testid='product-title']", ".a4ac75716e.css-1mpsda3", "h1"):
            try:
                t=d.find_element(By.CSS_SELECTOR, sel).text.strip()
                if t: title=t; break
            except: continue
        modal=self._open_reviews_modal(d)
        source = modal if modal else d
        idle=0
        while len(out) < max_reviews and idle < 5:
            items = source.find_elements(By.CSS_SELECTOR,'[data-testid="review-item"], [data-testid="review"]')
            start=len(out)
            for it in items:
                if len(out)>=max_reviews: break
                txt=""
                for sel in (".b99b6ef58f","[data-testid='review-text']","p"):
                    try: txt=it.find_element(By.CSS_SELECTOR, sel).text.strip()
                    except: txt=""
                    if txt: break
                if not txt or txt in seen: continue
                seen.add(txt)
                score=None
                try:
                    raw = it.find_element(By.CSS_SELECTOR,'[data-testid="review-star-rating"]').get_attribute('aria-label') or ''
                    m=re.search(r'(\d+)', raw); score=int(m.group(1)) if m else None
                except: pass
                fecha = datetime.now().strftime('%Y-%m-%d')
                out.append({'titulo': title or "Actividad", 'texto': txt, 'rating': score, 'fecha': fecha})
            if len(out)==start:
                if modal and self._next_page(d, modal): idle=0
                else: idle+=1; d.execute_script("window.scrollBy(0, 900)"); time.sleep(0.2)
        return out

# -------------- GetYourGuide (listado + reseñas) --------------
class GetYourGuide:
    CITY_L_MAP = {"barcelona":"barcelona-l45","madrid":"madrid-l46","sevilla":"sevilla-l48","valencia":"valencia-l49","malaga":"malaga-l402"}
    def __init__(self, dm:DM): self.dm=dm
    def _city_urls(self, city:str)->List[str]:
        slug=city.lower().strip().replace(" ","-")
        urls=[]
        if slug in self.CITY_L_MAP:
            urls.append(f"https://www.getyourguide.es/{self.CITY_L_MAP[slug]}/")
        urls += [f"https://www.getyourguide.es/{slug}/", f"https://www.getyourguide.es/s/?q={quote(city)}"]
        return urls
    def list(self, city:str, max_items:int=30)->List[Dict]:
        d=self.dm.up(); out=[]
        for url in self._city_urls(city):
            try:
                d.get(url); time.sleep(0.6); self.dm.close_popups(d)
                # Click "Ver más" via XPATH (no :contains en CSS)
                for _ in range(8):
                    try:
                        btn = WebDriverWait(d,2).until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'Ver más') or contains(.,'Show more')]")))
                        d.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
                        d.execute_script("arguments[0].click();", btn); time.sleep(0.4)
                    except TimeoutException:
                        break
                # Scroll para cargar cards
                prev=0
                for _ in range(30):
                    d.execute_script("window.scrollBy(0, 1600);"); time.sleep(0.15)
                    cards=d.find_elements(By.CSS_SELECTOR,'a.vertical-activity-card__container')
                    if len(cards)==prev: break
                    prev=len(cards)
                cards=d.find_elements(By.CSS_SELECTOR,'a.vertical-activity-card__container')
                if not cards: continue
                for c in cards[:max_items]:
                    try:
                        href=c.get_attribute('href') or ''
                        title=""
                        for s in ("h3[data-test-id='activity-card-title'] span","h3","[data-test-id='activity-card-title']"):
                            try:
                                t=c.find_element(By.CSS_SELECTOR,s).text
                                title=(t or "").strip()
                                if title: break
                            except: continue
                        if href:
                            out.append({"title": title or "Actividad", "url": href, "source":"getyourguide"})
                    except: continue
                if out: break
            except Exception:
                continue
        log.info(f"GetYourGuide items: {len(out)}")
        return out
    def reviews(self, url:str, max_reviews:int=5)->List[Dict]:
        if not url: return []
        d=self.dm.up(); out=[]; seen=set()
        d.get(url); time.sleep(0.6); self.dm.close_popups(d)
        try: title=d.find_element(By.CSS_SELECTOR, "h1#adp-title-text").text.strip()
        except: title=""
        # expand boton ver más reseñas
        try:
            btn = WebDriverWait(d,2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button[data-test-id='activity-review-see-more-reviews-button']")))
            d.execute_script("arguments[0].scrollIntoView({block:'center'});", btn); d.execute_script("arguments[0].click();", btn); time.sleep(0.3)
        except TimeoutException:
            pass
        idle=0
        while len(out)<max_reviews and idle<6:
            d.execute_script("window.scrollBy(0, 1200);"); time.sleep(0.2)
            cards = d.find_elements(By.CSS_SELECTOR,"section.review-card[data-test-id='activity-review-card']")
            got = 0
            for c in cards:
                if len(out)>=max_reviews: break
                try:
                    more=c.find_element(By.CSS_SELECTOR,"button[aria-expanded='false']")
                    d.execute_script("arguments[0].click();", more); time.sleep(0.05)
                except: pass
                txt=""
                for sel in ("div.toggle-content__content", "p", "[data-test-id='activity-review-card-text']"):
                    try: 
                        txt=c.find_element(By.CSS_SELECTOR, sel).text.strip()
                    except: txt=""
                    if txt: break
                if not txt or txt in seen: continue
                seen.add(txt)
                score=None
                try:
                    raw = c.find_element(By.CSS_SELECTOR,"span.rating-star__label").text.strip()
                    m = re.search(r'(\d+[.,]?)', raw)
                    if m:
                        val=float(m.group(1).replace(',','.'))
                        score=int(round(max(1,min(5,val))))
                except: pass
                date_txt=""
                for sel in ("span.review-card__author-date","span.review-card___author-date"):
                    try:
                        date_txt=c.find_element(By.CSS_SELECTOR, sel).text.strip()
                        if date_txt: break
                    except: continue
                fecha = normalize_review_date(date_txt) or datetime.now().strftime('%Y-%m-%d')
                out.append({'titulo': title or "Actividad", 'texto': txt, 'rating': score, 'fecha': fecha})
                got += 1
            idle = 0 if got else (idle+1)
        return out

# -------------- Ticketmaster --------------
class Ticketmaster:
    BASE='https://app.ticketmaster.com/discovery/v2/events.json'
    def __init__(self, api_key:str): self.key=api_key
    def _backoff(self,n): time.sleep(min(15,2**(n-1))+random.uniform(0.1,0.4))
    def fetch(self, cities:List[str], limit:int=50)->List[Dict]:
        out=[]
        if not self.key or not ENABLE_TICKETMASTER:
            return out
        today = datetime.today().strftime('%Y-%m-%dT00:00:00Z')
        headers={"User-Agent":UA}
        for city in cities:
            log.info(f"Buscando eventos en {city}")
            page=0; got=0
            while got<limit:
                params={'apikey':self.key,'countryCode':'ES','city':city,'startDateTime':today,'sort':'date,asc','size':min(100,limit-got),'page':page}
                try:
                    r=requests.get(self.BASE, params=params, headers=headers, timeout=20)
                    if r.status_code==200:
                        data=r.json(); evs=data.get('_embedded',{}).get('events',[])
                        if not evs: break
                        for e in evs:
                            got+=1
                            out.append({
                                'titulo': e.get('name',''),
                                'texto' : e.get('info','') or '',
                                'sentimiento': None,  # vacío
                                'experiencia': ExperienceClassifier.classify(e.get('name',''), e.get('info','') or ''),
                                'fecha': e.get('dates',{}).get('start',{}).get('localDate') or datetime.now().strftime('%Y-%m-%d'),
                                'fuente': 'ticketmaster'
                            })
                        page+=1
                    elif r.status_code in (429,500,502,503,504):
                        for n in range(1,4):
                            self._backoff(n)
                            r=requests.get(self.BASE, params=params, headers=headers, timeout=20)
                            if r.status_code==200: break
                        else:
                            break
                    else:
                        break
                except Exception:
                    break
        return out

# -------------- Reddit --------------
class RedditScraper:
    def __init__(self, client_id, client_secret, user_agent):
        self.enabled = (ENABLE_REDDIT and client_id and client_secret and user_agent)
        self.reddit = None
        if self.enabled:
            try:
                self.reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent, check_for_async=False)
            except Exception as e:
                log.warning(f"No se pudo inicializar PRAW: {e}")
                self.enabled = False
    def _lang_ok(self, text:str)->bool:
        if not HAS_LANG: return True
        try: return detect(text or '') in ('es','en')
        except Exception: return True
    def search_city(self, city:str, keywords:List[str], per_keyword:int)->List[Dict]:
        rows=[]; seen=set()
        if not self.enabled or not self.reddit: return rows
        for kw in keywords:
            query = f'"{city}" {kw}'
            log.info(f"Reddit: buscando '{query}'")
            try:
                for post in self.reddit.subreddit('all').search(query, sort='new', limit=per_keyword):
                    try:
                        title=(post.title or '').strip(); body=(post.selftext or '').strip()
                        if not (title or body): continue
                        text_all=f"{title} {body}".strip()
                        if not self._lang_ok(text_all): continue
                        key=getattr(post,'id',None) or f"{title[:40]}|{body[:40]}"
                        if key in seen: continue
                        seen.add(key)
                        fecha=datetime.utcfromtimestamp(getattr(post,'created_utc', time.time())).strftime('%Y-%m-%d')
                        rows.append({
                            'titulo': title,
                            'texto': body if body else title,
                            'sentimiento': None,
                            'experiencia': ExperienceClassifier.classify(title, body),
                            'fecha': fecha,
                            'fuente': 'reddit'
                        })
                    except Exception: continue
                time.sleep(0.2)
            except Exception as e:
                log.warning(f"Reddit error '{query}': {e}"); time.sleep(0.5); continue
        return rows

# -------------- Twitter (Playwright con sesión guardada) --------------
class TwitterScraper:
    def __init__(self, session_file:str, headless:bool=True):
        self.session_file=session_file
        self.headless=headless
        self.enabled = ENABLE_TWITTER and os.path.exists(session_file)
        if not self.enabled:
            log.info("Twitter deshabilitado (no se encontró session json o flag).")
    async def _scrape_query(self, query:str, max_tweets:int=120)->List[Dict]:
        rows=[]
        if not self.enabled: return rows
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            context = await browser.new_context(storage_state=self.session_file, user_agent=UA, viewport={"width":1280,"height":1600})
            page = await context.new_page()
            url = f"https://twitter.com/search?q={quote(query)}&src=typed_query&f=live"
            await page.goto(url, wait_until="domcontentloaded")
            try:
                # cookies (según región)
                btn = await page.query_selector('div[role="button"] :text("Aceptar todas las cookies")')
                if btn: await btn.click()
            except: pass
            await asyncio.sleep(2)
            tweets=set()
            idle=0
            while len(rows)<max_tweets and idle<5:
                cards = await page.query_selector_all('article[data-testid="tweet"]')
                before = len(rows)
                for c in cards:
                    try:
                        t = await c.query_selector('[data-testid="tweetText"]')
                        text = (await t.inner_text()).strip() if t else ""
                        tm = await c.query_selector('time')
                        fecha_local=None
                        if tm:
                            dt = await tm.get_attribute('datetime')
                            if dt:
                                try:
                                    from datetime import datetime as dtc
                                    import pytz
                                    dt_utc = dtc.fromisoformat(dt.replace("Z","+00:00"))
                                    tz = pytz.timezone("Europe/Madrid")
                                    fecha_local = dt_utc.astimezone(tz).strftime("%Y-%m-%d %H:%M:%S")
                                except Exception:
                                    fecha_local = dt
                        key=(text,fecha_local)
                        if text and key not in tweets:
                            tweets.add(key)
                            rows.append({
                                'titulo': f'Twitter: {query}',
                                'texto': text,
                                'sentimiento': None,
                                'experiencia': ExperienceClassifier.classify(query, text),
                                'fecha': (fecha_local or datetime.now().strftime("%Y-%m-%d")),
                                'fuente': 'twitter'
                            })
                            if len(rows)>=max_tweets: break
                    except Exception:
                        continue
                if len(rows)==before:
                    idle+=1
                else:
                    idle=0
                await page.keyboard.press("PageDown")
                await asyncio.sleep(1.2)
            await browser.close()
        return rows
    def search_city(self, city:str, keywords:List[str], per_keyword:int=120)->List[Dict]:
        if not self.enabled: return []
        loop = asyncio.get_event_loop()
        rows=[]
        for kw in keywords:
            q=f"{city} {kw}"
            log.info(f"Twitter: '{q}'")
            try:
                rows.extend(loop.run_until_complete(self._scrape_query(q, per_keyword)))
            except RuntimeError:
                # si ya hay loop activo (colab/nb), usa create_task
                rows.extend(asyncio.run(self._scrape_query(q, per_keyword)))
        return rows

# -------------- Pipeline --------------
class Pipeline:
    def __init__(self, headless=True):
        self.dm=DM(headless=headless)
        self.bh=BookingHotels(self.dm)
        self.ba=BookingAttractions(self.dm)
        self.gyg=GetYourGuide(self.dm)
        self.tm=Ticketmaster(TICKETMASTER_API_KEY)
        self.rs=RedditScraper(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT)
        self.ts=TwitterScraper(TWITTER_SESSION_FILE, headless=True)

    def run(self, cities:List[str], checkin:Optional[str], checkout:Optional[str], max_items:int=6, reviews_per_item:int=3)->pd.DataFrame:
        if not checkin: checkin=(datetime.now()+timedelta(days=1)).strftime('%Y-%m-%d')
        if not checkout: checkout=(datetime.now()+timedelta(days=2)).strftime('%Y-%m-%d')
        rows=[]
        d = self.dm.up()  # una sola sesión selenium reutilizada
        try:
            for city in cities:
                log.info(f"\n=== Ciudad: {city} ===")

                # Booking Hoteles -> experiencia = 'hotel'; sentimiento = rating real
                try: hotels=self.bh.list(city, checkin, checkout, max_items)
                except Exception as e: log.warning(f"Booking hoteles error: {e}"); hotels=[]
                for h in hotels[:min(6,max_items)]:
                    log.info(f"Reviews hotel: {h['title']}")
                    try:
                        revs=self.bh.reviews(h['url'], reviews_per_item)
                        for r in revs:
                            rows.append({
                                'titulo':h['title'],
                                'texto':r['texto'],
                                'sentimiento': r['rating'],   # None si no hay
                                'experiencia':'hotel',
                                'fecha':r['fecha'],
                                'fuente':'booking_hotels'
                            })
                    except Exception as e:
                        log.warning(f"Reviews hotel fallo: {e}")

                # Booking Atracciones
                if ENABLE_BOOKING_ATTRACTIONS:
                    try: atts=self.ba.list(city, max_items)
                    except Exception as e: log.warning(f"Booking atracciones error: {e}"); atts=[]
                    for a in atts[:min(6,max_items)]:
                        log.info(f"Reviews atracción: {a['title']}")
                        try:
                            revs=self.ba.reviews(a['url'], reviews_per_item)
                            for r in revs:
                                rows.append({
                                    'titulo':r['titulo'],
                                    'texto':r['texto'],
                                    'sentimiento': r['rating'],
                                    'experiencia': ExperienceClassifier.classify(r['titulo'], r['texto']),
                                    'fecha':r['fecha'],
                                    'fuente':'booking_attractions'
                                })
                        except Exception as e:
                            log.warning(f"Reviews atracción fallo: {e}")

                # GetYourGuide
                if ENABLE_GETYOURGUIDE:
                    try: items=self.gyg.list(city, max_items)
                    except Exception as e: log.warning(f"GYG listado error: {e}"); items=[]
                    for it in items[:min(6,max_items)]:
                        log.info(f"GYG reviews: {it['title']}")
                        try:
                            revs=self.gyg.reviews(it['url'], reviews_per_item)
                            for r in revs:
                                rows.append({
                                    'titulo':r['titulo'],
                                    'texto':r['texto'],
                                    'sentimiento': r['rating'],
                                    'experiencia': ExperienceClassifier.classify(r['titulo'], r['texto']),
                                    'fecha':r['fecha'],
                                    'fuente':'getyourguide'
                                })
                        except Exception as e:
                            log.warning(f"GYG reviews fallo: {e}")

                # Ticketmaster
                rows.extend(self.tm.fetch([city], limit=max_items))

                # Reddit
                rows.extend(self.rs.search_city(city, REDDIT_KEYWORDS, REDDIT_MAX_POSTS_PER_KEYWORD))

                # Twitter
                rows.extend(self.ts.search_city(city, TWITTER_KEYWORDS, TWITTER_MAX_TWEETS_PER_QUERY))
        finally:
            self.dm.quit()

        df=pd.DataFrame(rows, columns=['titulo','texto','sentimiento','experiencia','fecha','fuente'])
        if not df.empty:
            def _iso(x):
                x=str(x)
                try: return datetime.fromisoformat(x[:10]).strftime('%Y-%m-%d')
                except: 
                    try: return normalize_review_date(x) or datetime.now().strftime('%Y-%m-%d')
                    except: return datetime.now().strftime('%Y-%m-%d')
            df['fecha']=df['fecha'].astype(str).map(_iso)
        log.info(f"Pipeline completado. Registros: {len(df)}")
        return df

    def save(self, df:pd.DataFrame, filename:Optional[str]=None)->str:
        if filename is None: filename=f"reviews_scraping_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False, encoding='utf-8')
        log.info(f"Guardado: {filename}")
        return filename

# -------- Ejecutar --------
pipe = Pipeline(headless=HEADLESS)
df = pipe.run(CITIES, checkin=None, checkout=None, max_items=MAX_ITEMS, reviews_per_item=REVIEWS_PER_ITEM)
display(df.head(30))
out = pipe.save(df)
print("Total registros:", len(df))
print("Archivo CSV:", out)


2025-09-16 20:41:55,157 - INFO - 
=== Ciudad: Barcelona ===
2025-09-16 20:41:59,666 - INFO - Booking hoteles encontrados: 25
2025-09-16 20:41:59,964 - INFO - Reviews hotel: BYPILLOW Wander
2025-09-16 20:42:28,506 - INFO - Reviews hotel: Hotel Royal Passeig de Gracia
2025-09-16 20:42:57,598 - INFO - Reviews hotel: Exe Plaza Catalunya
2025-09-16 20:43:27,987 - INFO - Reviews hotel: Vincci Gala
2025-09-16 20:43:59,230 - INFO - Reviews hotel: Catalonia La Boquería
2025-09-16 20:44:29,207 - INFO - Booking (atracciones) encontradas: 15
2025-09-16 20:44:29,300 - INFO - Reviews atracción: Entrada a la Sagrada Familia con audioguía
2025-09-16 20:44:31,758 - INFO - Reviews atracción: Entrada al Park Güell
2025-09-16 20:44:33,846 - INFO - Reviews atracción: Tour en autobús turístico
2025-09-16 20:44:36,037 - INFO - Reviews atracción: Billete para el teleférico de Montjuïc
2025-09-16 20:44:38,050 - INFO - Reviews atracción: Entrada a la Sagrada Familia
2025-09-16 20:44:46,233 - INFO - GetYourGuide

Unnamed: 0,titulo,texto,sentimiento,experiencia,fecha,fuente
0,BYPILLOW Wander,"Buena ubicación, personal amable, la cama cómo...",4.0,hotel,2025-07-31,booking_hotels
1,BYPILLOW Wander,Ubicacion y atencion | Que hubo una noche que ...,4.0,hotel,2025-07-07,booking_hotels
2,BYPILLOW Wander,La atención y la habitación están muy bien | T...,4.0,hotel,2025-06-27,booking_hotels
3,BYPILLOW Wander,"La atención del personal, buena ubicación.",4.0,hotel,2025-06-20,booking_hotels
4,Hotel Royal Passeig de Gracia,La ubicación nos encantó !!\nEn la habitación ...,5.0,hotel,2025-09-04,booking_hotels
5,Hotel Royal Passeig de Gracia,Excelente ubicación! Desayuno con muchas opcio...,5.0,hotel,2025-08-25,booking_hotels
6,Hotel Royal Passeig de Gracia,La atención de los recepcionistas. Nos ayudaro...,5.0,hotel,2025-08-07,booking_hotels
7,Hotel Royal Passeig de Gracia,"Todo. El personal muy amable, efectivo y atento",5.0,hotel,2025-08-06,booking_hotels
8,Exe Plaza Catalunya,Excelente la ubicación y la atención,5.0,hotel,2025-09-16,booking_hotels
9,Exe Plaza Catalunya,"Ubicación, instalaciones y persona amable",5.0,hotel,2025-09-15,booking_hotels


2025-09-16 20:45:37,042 - INFO - Guardado: reviews_scraping_20250916_204537.csv


Total registros: 100
Archivo CSV: reviews_scraping_20250916_204537.csv
