#  Scraping de Civitatis

## 📦 Instalación

In [None]:
pip install selenium

## ⚙️ Configuración de destinos

Elegimos los 8 destinos de nuestro estudio


In [None]:
# Cities to scrape, in processing order. Every name listed here must also be
# a key of SLUG_MAP, because the runner looks the slug up by city name.
CITY_ORDER = [
    "Tenerife", "Barcelona", "Madrid", "Malaga",
    "Gran Canaria", "Seville", "Valencia", "Palma de Mallorca"
]

# City name -> Civitatis URL slug (Spanish site).
SLUG_MAP = {
    "Tenerife": "tenerife",
    "Barcelona": "barcelona",
    "Madrid": "madrid",
    "Malaga": "malaga",
    "Gran Canaria": "gran-canaria",
    # CITY_ORDER uses the English spelling "Seville"; without this key the
    # lookup for that city raised KeyError.
    "Seville": "sevilla",
    # Spanish spelling kept as an alias so existing lookups keep working.
    "Sevilla": "sevilla",
    "Valencia": "valencia",
    "Palma de Mallorca": "palma-de-mallorca"
}

# URL template of a city's activity listing; `{slug}` is filled from SLUG_MAP.
BASE_URL = "https://www.civitatis.com/es/{slug}/"

# Output folder (optional). By default files are saved in the working directory.
OUTPUT_DIR = "."


## 🧰 Librerías e importaciones

In [None]:
import os
import re
import csv
import time
import random
from pathlib import Path
from typing import List, Dict, Any, Optional

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException, TimeoutException, StaleElementReferenceException
)


## 🔧 Funciones auxiliares

Funciones para navegar por la web, auto scroll y avisos



In [None]:
def handle_no_such_element_exception(func):
    """Call ``func`` with no arguments and return its result.

    If the call raises ``NoSuchElementException``, ``TimeoutException`` or
    ``StaleElementReferenceException`` the element is treated as unavailable
    and ``None`` is returned instead. Any other exception propagates.
    """
    try:
        outcome = func()
    except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
        outcome = None
    return outcome


def auto_scroll(driver, pause_time: float = 2.0, max_scrolls: int = 10) -> None:
    """Scroll to the bottom of the page repeatedly to trigger lazy loading.

    After each scroll the function sleeps ``pause_time`` seconds and re-reads
    ``document.body.scrollHeight``. It stops as soon as the height no longer
    changes, or after ``max_scrolls`` scrolls, whichever comes first.
    """
    def page_height():
        return driver.execute_script("return document.body.scrollHeight")

    previous = page_height()
    for _attempt in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        current = page_height()
        if current == previous:
            # Height unchanged: nothing new was loaded by the last scroll.
            return
        previous = current


## 🚗 Driver y utilidades de navegación

In [None]:
def make_driver(headless: bool = False):
    """Build and return a Chrome WebDriver.

    Pass ``headless=True`` to add the ``--headless=new`` flag; the remaining
    command-line flags are always applied, in the order listed below.
    """
    chrome_flags = ["--headless=new"] if headless else []
    chrome_flags += [
        "--start-maximized",
        "--disable-gpu",
        "--no-sandbox",
        "--window-size=1920,1080",
    ]

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    # Preferences (e.g. to disable images/JS) could be added to `options` here.
    return webdriver.Chrome(service=Service(), options=options)


def accept_cookies_if_present(driver, timeout: int = 10) -> None:
    """Click the Didomi cookie banner's agree button if it shows up.

    Waits up to ``timeout`` seconds for the element with id
    ``didomi-notice-agree-button`` to become clickable, clicks it and pauses
    one second. Any failure along the way is ignored.
    """
    try:
        agree_locator = (By.ID, "didomi-notice-agree-button")
        waiter = WebDriverWait(driver, timeout)
        agree_button = waiter.until(EC.element_to_be_clickable(agree_locator))
        agree_button.click()
        time.sleep(1)
    except Exception:
        # Best effort: no banner appeared, or cookies were already accepted.
        return


## 🕷️ Scraper de una ciudad

1. Abre la URL de la ciudad.
2. Acepta cookies (si aparecen).
3. Carga el listado y realiza *auto scroll*.
4. Recorre **todas las tarjetas** (`article.comfort-card`), entrando por título o card para capturar la **URL de detalle**, vuelve atrás y extrae:
   - `title`, `address` (texto de la card), `review_score`, `review_count`, `description`, `price`.
5. Avanza con el botón `#activity-list-next-page` hasta agotar páginas.
6. Guarda los datos en `CSV` con el **mismo esquema** de columnas.


In [None]:
def _card_text(card, selector):
    """Return the text of the first element matching `selector` inside `card`, or None if the lookup fails."""
    return handle_no_such_element_exception(
        lambda: card.find_element(By.CSS_SELECTOR, selector).text
    )


def scrape_civitatis_city(
    city_name: str,
    slug: str,
    base_url: str = BASE_URL,
    out_dir: str = OUTPUT_DIR,
    headless: bool = False,
) -> str:
    """Scrape the activity listing of one city and save it as a CSV file.

    The function opens ``base_url.format(slug=slug)``, accepts the cookie
    banner if present, and then, page by page, visits every
    ``article.comfort-card`` element: it clicks into the card to capture the
    detail URL, navigates back, and reads the card's text fields. Pagination
    follows the ``#activity-list-next-page`` button until it is missing or
    disabled.

    Args:
        city_name: Human-readable city name; only used in progress messages.
        slug: City slug substituted into ``base_url``; also used in the
            output file name.
        base_url: URL template containing a ``{slug}`` placeholder.
        out_dir: Directory for the CSV file; created if it does not exist.
        headless: Forwarded to ``make_driver``. Defaults to ``False`` (a
            visible browser window), matching the previous hard-coded value.

    Returns:
        Path of the written file, ``<out_dir>/atracciones_<slug>.csv``, with
        columns ``id, url, title, address, review_score, review_count,
        description, price``.

    Raises:
        Any exception raised outside the per-card click handling (for example
        the initial wait for cards timing out) propagates after the browser
        has been closed; in that case no CSV is written.
    """
    url = base_url.format(slug=slug)
    print(f"🌐 {city_name} → {url}")

    card_selector = 'article.comfort-card'
    driver = make_driver(headless=headless)
    items: List[Dict[str, Any]] = []
    seen_ids = set()  # card ids already recorded, to skip duplicates

    try:
        driver.get(url)
        accept_cookies_if_present(driver)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
        )

        page = 1
        while True:
            print(f"📄 Página {page}: cargando y desplazando...")
            auto_scroll(driver, pause_time=2, max_scrolls=10)

            activities = driver.find_elements(By.CSS_SELECTOR, card_selector)
            print(f"🔍 Se encontraron {len(activities)} actividades en esta página.")

            i = 0
            while i < len(activities):
                # Cards are re-located on every iteration and addressed by
                # index, presumably because navigating to a detail page and
                # back invalidates previously fetched element references.
                activities = driver.find_elements(By.CSS_SELECTOR, card_selector)
                if i >= len(activities):
                    break
                activity = activities[i]

                activity_id = activity.get_attribute("data-cs-override-id")
                if not activity_id or activity_id in seen_ids:
                    i += 1
                    continue
                seen_ids.add(activity_id)

                # Try to open the detail page to capture its URL.
                url_detail = ""
                try:
                    listing_url = driver.current_url
                    driver.execute_script("arguments[0].scrollIntoView(true);", activity)
                    try:
                        title_elem = activity.find_element(By.CSS_SELECTOR, ".comfort-card__title")
                        driver.execute_script("arguments[0].click();", title_elem)
                    except Exception:
                        # No title element (or click failed): click the card itself.
                        driver.execute_script("arguments[0].click();", activity)
                    time.sleep(2)
                    # Only record the URL and go back if the click actually
                    # navigated. Otherwise the listing URL would be stored as
                    # the detail URL and back() would leave the listing page.
                    if driver.current_url != listing_url:
                        url_detail = driver.current_url
                        driver.back()
                        time.sleep(2)
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
                        )
                except Exception as e:
                    print(f"No se pudo hacer clic en {activity_id}: {e}")
                    url_detail = ""

                # Read the fields from the card on the listing page.
                activities = driver.find_elements(By.CSS_SELECTOR, card_selector)
                if i < len(activities):
                    activity = activities[i]
                    title = _card_text(activity, '.comfort-card__title')
                    address = _card_text(activity, '.__city')
                    review_score = _card_text(activity, '.m-rating--text')
                    review_count = _card_text(activity, '.text--rating-total')
                    if review_count:
                        # Keep only the first numeric token of the review text.
                        numbers = re.findall(r'\d[\d\.]*', review_count)
                        review_count = numbers[0] if numbers else None
                    description = _card_text(activity, '.comfort-card__text')
                    # The price gets a short explicit wait (up to 2 s) before
                    # giving up, unlike the other fields.
                    price = handle_no_such_element_exception(lambda: WebDriverWait(activity, 2).until(
                        lambda d: activity.find_element(By.CSS_SELECTOR, '.comfort-card__price__text__wrapper')
                    ).text)
                else:
                    # The card is no longer at this index; keep the row with
                    # the id/url captured so far.
                    title = address = review_score = review_count = description = price = None

                items.append({
                    "id": activity_id,
                    "url": url_detail,
                    "title": title,
                    "address": address,
                    "review_score": review_score,
                    "review_count": review_count,
                    "description": description,
                    "price": price
                })
                i += 1

            # Pagination
            try:
                next_button = driver.find_element(By.ID, 'activity-list-next-page')
                # get_attribute returns None when the attribute is absent;
                # treat that as "no classes" instead of failing on `in None`.
                if "disabled" in (next_button.get_attribute("class") or ""):
                    print("🚫 Botón 'Siguiente' desactivado. Fin.")
                    break
                driver.execute_script("arguments[0].click();", next_button)
                print("➡ Pasando a la siguiente página...")
                time.sleep(random.uniform(2, 4))
                page += 1
            except NoSuchElementException:
                print("🚫 Botón 'Siguiente' no encontrado. Fin.")
                break

    finally:
        driver.quit()

    # Save with the fixed column schema; make sure the target folder exists.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"atracciones_{slug}.csv")
    with open(out_path, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "id","url","title","address","review_score","review_count","description","price"
        ])
        writer.writeheader()
        writer.writerows(items)

    print(f"✅ Guardado: {out_path} ({len(items)} filas)")
    return out_path


## 🏷️ Post-proceso: normalizar `address` a la ciudad

Sobrescribimos la columna `address` de cada fila con el nombre de la ciudad y guardamos el resultado en un nuevo CSV con el sufijo `_ciudad_.csv`.


In [None]:
def force_address_to_city(csv_in: str, city_name: str) -> str:
    """Write a copy of a scraped CSV with every ``address`` set to ``city_name``.

    Args:
        csv_in: Path of the input CSV (UTF-8, with a header row).
        city_name: Value written into the ``address`` column of every row.

    Returns:
        Path of the new file: ``csv_in`` without its extension plus the
        suffix ``_ciudad_.csv``. The input file is left untouched.

    Raises:
        ValueError: If a row contains a column outside the fixed output
            schema (raised by ``csv.DictWriter``).
    """
    # newline="" lets the csv module handle line endings itself, so line
    # breaks embedded in quoted fields (e.g. descriptions) are read unchanged.
    with open(csv_in, "r", newline="", encoding="utf-8") as infile:
        rows = [{**row, "address": city_name} for row in csv.DictReader(infile)]

    csv_out = os.path.splitext(csv_in)[0] + "_ciudad_.csv"
    with open(csv_out, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=[
            "id","url","title","address","review_score","review_count","description","price"
        ])
        writer.writeheader()
        writer.writerows(rows)
    print(f"🏷️ Address normalizado → {csv_out}")
    return csv_out


## ▶️ Ejecutar para las 8 ciudades

El siguiente bloque recorre las 8 ciudades **en el orden indicado**. Tras cada scraping,
también genera el CSV con `address` normalizado al nombre de ciudad.


In [None]:
def run_all_cities(city_order=CITY_ORDER, slug_map=SLUG_MAP):
    """Scrape every city in ``city_order`` and normalize each resulting CSV.

    For each city the slug is looked up in ``slug_map``, the listing is
    scraped, and a second CSV with ``address`` forced to the city name is
    generated. A failure for one city is printed and does not stop the
    remaining cities.

    Args:
        city_order: Iterable of city names, processed in order.
        slug_map: Mapping from city name to Civitatis slug.

    Returns:
        A list of ``(city, raw_csv_path, normalized_csv_path)`` tuples, one
        per city that completed successfully.
    """
    resultados = []
    for city in city_order:
        # A city without a slug is reported and skipped; previously the
        # lookup sat outside the try block and aborted the whole run.
        try:
            slug = slug_map[city]
        except KeyError:
            print(f"❌ Error con {city}: no hay slug definido para esta ciudad")
            continue

        try:
            raw_csv = scrape_civitatis_city(city, slug)
            final_csv = force_address_to_city(raw_csv, city)
            resultados.append((city, raw_csv, final_csv))
        except Exception as e:
            # Keep going with the next city; this one is left out of the result.
            print(f"❌ Error con {city}: {e}")
    return resultados


# Run the scraper for every configured city using the default order and slug
# map. This executes as soon as the cell (or module) is run.
run_all_cities()
