In [12]:
import sys
import os
from pathlib import Path
import pandas as pd
import json
from apify_client import ApifyClient

# Go up two levels: notebooks/scraping/ -> notebooks/ -> tfme-horeca-intelligence/
project_root = Path.cwd().parent.parent
# Only add the project root once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Project configuration and database helpers. These imports rely on the
# sys.path entry added above, which is why they cannot sit with the other imports.
from config import APIFY_API_KEY, DB_PATH
from src.utils.database import get_db_connection, insert_dataframe_to_table

# Apify client. The token is taken from the project configuration rather than
# being written as a literal in the notebook (an empty token cannot authenticate).
# NOTE(review): presumably APIFY_API_KEY holds the Apify API token - confirm in config.py.
client = ApifyClient(APIFY_API_KEY)

# Search result pages to scrape. Only the uncommented URL is part of the list;
# the commented-out entries are other result pages kept here for reference
# (the first two are marked as already completed).
SEARCH_URLS = [
    #"https://www.tripadvisor.es/Restaurants-g187499-oa120-Girona_Province_of_Girona_Catalonia.html", <-- already completed
    #"https://www.tripadvisor.es/Restaurants-g187499-oa150-Girona_Province_of_Girona_Catalonia.html", <-- already completed
    "https://www.tripadvisor.es/Restaurants-g187499-oa180-Girona_Province_of_Girona_Catalonia.html",
    #"https://www.tripadvisor.es/Restaurants-g187499-oa210-Girona_Province_of_Girona_Catalonia.html",
    #"https://www.tripadvisor.es/Restaurants-g187499-oa240-Girona_Province_of_Girona_Catalonia.html",
    #"https://www.tripadvisor.es/Restaurants-g187499-oa270-Girona_Province_of_Girona_Catalonia.html"
]

In [2]:
def run_apify_actor_slow(urls=None, max_items=20, wait_secs=600):
    """Run the TripAdvisor Apify actor with a deliberately slow configuration.

    The actor is called with a single concurrent request, a 10 000 ms minimum
    gap between requests and a Spanish residential proxy, to reduce the chance
    of being blocked.

    Args:
        urls: Iterable of listing URLs to scrape. Defaults to the module-level
            ``SEARCH_URLS`` when omitted.
        max_items: Value sent as the actor's ``maxItems`` input.
        wait_secs: Maximum number of seconds to wait for the run to finish.

    Returns:
        The list of items stored in the run's default dataset, or ``None`` when
        there is nothing to scrape, the run produced no items, or an error
        occurred (the error is printed, not raised).
    """
    if urls is None:
        urls = SEARCH_URLS
    urls = list(urls)

    print("🚀 Ejecutando actor de Apify (modo lento)...")
    print(f"📋 Número de URLs: {len(urls)}")

    # Nothing to scrape: do not start an actor run for an empty URL list.
    if not urls:
        print("❌ No hay URLs para procesar")
        return None

    try:
        run = client.actor("maxcopell/tripadvisor").call(
            run_input={
                "startUrls": [{"url": url} for url in urls],
                "maxItems": max_items,
                "proxyConfiguration": {
                    "useApifyProxy": True,
                    "apifyProxyGroups": ["RESIDENTIAL"],
                    "apifyProxyCountry": "ES"
                },
                "maxConcurrency": 1,
                "minTimeBetweenRequests": 10000,
                "searchLanguage": "es",
                "reviews": False,
            },
            wait_secs=wait_secs
        )

        # The client may hand back no run object at all; fail with a clear
        # message instead of a TypeError on the subscript below.
        if not run:
            print("❌ El actor no devolvió información de la ejecución")
            return None

        # wait_secs caps the wait, so the run can still be in progress (or have
        # failed) when call() returns. Warn, but keep whatever was collected.
        status = run.get("status")
        if status != "SUCCEEDED":
            print(f"⚠️ La ejecución no terminó correctamente (estado: {status}); los datos pueden estar incompletos")

        print("📦 Recopilando datos...")
        dataset = client.dataset(run["defaultDatasetId"]).list_items().items

        if not dataset:
            print("❌ No se encontraron datos")
            return None

        print(f"✅ Se obtuvieron {len(dataset)} restaurantes")
        return dataset

    except Exception as e:
        # Best-effort by design: report the problem and let the notebook continue.
        print(f"❌ Error ejecutando el actor de Apify: {e}")
        return None

In [3]:
# Run the actor and keep the scraped items (None when the run failed or returned nothing)
dataset = run_apify_actor_slow()

🚀 Ejecutando actor de Apify (modo lento)...
📋 Número de URLs: 1


[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> Status: RUNNING, Message: 
[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> 2025-09-20T19:23:31.715Z ACTOR: Pulling Docker image of build 0EU8Lvjqh6AkaDl65 from registry.
[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> 2025-09-20T19:23:31.717Z ACTOR: Creating Docker container.
[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> 2025-09-20T19:23:31.847Z ACTOR: Starting Docker container.
[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> 2025-09-20T19:23:32.118Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c npm run start:prod --silent
[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> 2025-09-20T19:23:33.307Z [32mINFO[39m  System info[90m {"apifyVersion":"3.1.15","apifyClientVersion":"2.8.4","crawleeVersion":"3.7.2","osType":"Linux","nodeVersion":"v20.19.4"}[39m
[36m[apify.tripadvisor runId:OdWhUcRvZmlLngHsY][0m -> 2025-09-20T19:23:33.440Z [32mINFO[39m 

📦 Recopilando datos...
✅ Se obtuvieron 20 restaurantes


In [4]:
def extract_url_path(full_url):
    """Strip the scheme and host from a TripAdvisor URL, keeping only the path.

    Any TripAdvisor domain is recognised (``tripadvisor.com``,
    ``tripadvisor.es``, ``tripadvisor.co.uk``, ...), with or without a scheme
    and with any subdomain. The host must be at the start of the URL, so a
    third-party URL that merely mentions TripAdvisor in its path or query
    string is left untouched.

    Args:
        full_url: URL to process. Empty or non-string values are returned as-is.

    Returns:
        Everything after the TripAdvisor host (path, query and fragment) for
        TripAdvisor URLs; otherwise the input unchanged.
    """
    import re  # local import: the notebook's import cell does not bring in `re`

    if not full_url or not isinstance(full_url, str):
        return full_url

    host_prefix = re.match(
        r'^(?:https?:)?(?://)?'          # optional scheme
        r'(?:[\w-]+\.)*tripadvisor'      # optional subdomains + site name
        r'(?:\.[a-z]{2,3}){1,2}'         # TLD such as .com, .es, .co.uk
        r'(?::\d+)?'                     # optional port
        r'(?=[/?#]|$)',                  # host must end here
        full_url,
        flags=re.IGNORECASE,
    )
    if host_prefix is None:
        return full_url
    return full_url[host_prefix.end():]

def extract_cuisines(cuisines_data):
    """Convert cuisine data into a single comma-separated string.

    Args:
        cuisines_data: A list whose entries are either plain values or dicts
            with a ``'name'`` key, or any other single value.

    Returns:
        ``''`` for empty input; for a list, the entry names joined with
        ``', '``; for anything else, ``str(cuisines_data)``. Entries that are
        empty, ``None`` or dicts without a usable ``'name'`` are skipped, so the
        result never contains dangling separators.
    """
    if not cuisines_data:
        return ''

    if not isinstance(cuisines_data, list):
        return str(cuisines_data)

    names = []
    for entry in cuisines_data:
        # Decide per entry, so a list mixing dicts and plain values still works.
        name = entry.get('name') if isinstance(entry, dict) else entry
        if name:
            names.append(str(name))  # str() keeps join() safe for non-string values
    return ', '.join(names)

In [5]:
# Quick look at what the actor returned
print("🔍 EXPLORACIÓN DE DATOS OBTENIDOS")
print("=" * 50)

if not dataset:
    print("No hay datos para explorar")
else:
    # Raw DataFrame, used only for inspection
    df_raw = pd.DataFrame(dataset)
    print(f"📊 Total de items: {len(df_raw)}")
    print(f"📋 Columnas: {list(df_raw.columns)}")

    # First rows, rendered with the notebook's rich display
    print("\n👀 Primeras filas:")
    display(df_raw.head())

    # Field-by-field view of the first item: type name plus the value cut to 80 characters
    print("\n📋 Estructura del primer restaurante:")
    first_item = dataset[0]
    for key, value in first_item.items():
        print(f"  {key}: {type(value).__name__} = {str(value)[:80]}...")

🔍 EXPLORACIÓN DE DATOS OBTENIDOS
📊 Total de items: 20
📋 Columnas: ['reviewTags', 'id', 'type', 'category', 'subcategories', 'name', 'locationString', 'description', 'image', 'photoCount', 'rankingPosition', 'rating', 'rawRanking', 'phone', 'address', 'addressObj', 'localName', 'localAddress', 'localLangCode', 'email', 'latitude', 'longitude', 'webUrl', 'website', 'rankingString', 'rankingDenominator', 'rankingSource', 'neighborhoodLocations', 'nearestMetroStations', 'ancestorLocations', 'ratingHistogram', 'numberOfReviews', 'isClosed', 'isLongClosed', 'openNowText', 'priceLevel', 'dishes', 'hours', 'menuWebUrl', 'ownersTopReasons', 'isNearbyResult', 'photos', 'travelerChoiceAward', 'cuisines', 'dietaryRestrictions', 'establishmentTypes', 'features', 'mealTypes', 'isClaimedIcon', 'isClaimedText', 'orderOnline', 'input']

👀 Primeras filas:


Unnamed: 0,reviewTags,id,type,category,subcategories,name,locationString,description,image,photoCount,...,travelerChoiceAward,cuisines,dietaryRestrictions,establishmentTypes,features,mealTypes,isClaimedIcon,isClaimedText,orderOnline,input
0,[],12455037,RESTAURANT,restaurant,[Sit down],Bellavista Restaurant,"Girona, Province of Girona, Catalonia",Our Bellavista Restaurant offers local and Med...,https://media-cdn.tripadvisor.com/media/photo-...,15,...,,"[Mediterranean, Spanish, Fusion]",[],[Restaurants],"[Reservations, Seating, Wheelchair Accessible,...",[Dinner],False,,[],https://www.tripadvisor.es/Restaurants-g187499...
1,[],7660694,RESTAURANT,restaurant,[],Savoy,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,46,...,,"[Cafe, Mediterranean, Spanish]",[],[Restaurants],"[Reservations, Seating, Wheelchair Accessible,...","[Breakfast, Lunch, Dinner]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
2,[],4243420,RESTAURANT,restaurant,[Sit down],Raco del Pernil,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,35,...,,"[Mediterranean, Spanish]",[],[Restaurants],"[Reservations, Outdoor Seating, Seating, Wheel...","[Lunch, Dinner]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
3,[],17712371,RESTAURANT,restaurant,[Café],Tandem Cafe Girona,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,23,...,,"[Cafe, Seafood, Mediterranean, Spanish]",[],[Restaurants],"[Reservations, Seating, Wheelchair Accessible,...","[Breakfast, Lunch, Dinner, Brunch, Drinks]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
4,[],12717475,RESTAURANT,restaurant,[],Blessing Restaurant,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,11,...,,"[Latin, Spanish]",[],[Restaurants],"[Reservations, Seating, Wheelchair Accessible,...","[Lunch, Dinner, Brunch]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...



📋 Estructura del primer restaurante:
  reviewTags: list = []...
  id: str = 12455037...
  type: str = RESTAURANT...
  category: str = restaurant...
  subcategories: list = ['Sit down']...
  name: str = Bellavista Restaurant...
  locationString: str = Girona, Province of Girona, Catalonia...
  description: str = Our Bellavista Restaurant offers local and Mediterranean cuisine made with fresh...
  image: str = https://media-cdn.tripadvisor.com/media/photo-m/1280/19/87/b6/01/restaurante-pal...
  photoCount: int = 15...
  rankingPosition: int = 231...
  rating: int = 4...
  rawRanking: float = 3.1246275901794434...
  phone: str = +34 872 08 06 70...
  address: str = Pujada Polvorins, 1, 17004 Girona Spain...
  addressObj: dict = {'street1': 'Pujada Polvorins, 1', 'street2': None, 'city': 'Girona', 'state': N...
  localName: str = Restaurante Bellavista...
  localAddress: NoneType = None...
  localLangCode: str = es...
  email: str = pbellavista@ac-hotels.com...
  latitude: float = 41.9789

In [6]:
if dataset:
    # Flatten every scraped item into one flat record per restaurant
    restaurantes_data = []

    for item in dataset:
        # 'dishes' may be missing, None or a list; always end up with a string.
        platos = item.get('dishes') or ''
        if isinstance(platos, list):
            # str() and the truthiness filter keep join() safe for None / non-string entries
            platos = ', '.join(str(plato) for plato in platos if plato)

        # extract_cuisines already returns a string, so no further conversion is needed
        tipo_cocina = extract_cuisines(item.get('cuisines', []))

        restaurante = {
            'tripadvisor_id' : item.get('id', ''),
            'nombre' : item.get('name',''),
            'telefono' : item.get('phone',''),
            'direccion' : item.get('address',''),
            'localizacion' : item.get('locationString',''),
            'email' : item.get('email',''),
            'latitud' : item.get('latitude',''),
            'longitud' : item.get('longitude',''),
            'website' : extract_url_path(item.get('website','')),
            'rating' : item.get('rating',''),
            'ranking' : item.get('rankingPosition',''),
            # `or ''` prevents a present-but-None price level from becoming the text 'None'
            'rango_precio': str(item.get('priceLevel') or ''),
            'platos' : platos,  # string, not list
            'tipo_cocina': tipo_cocina
        }
        restaurantes_data.append(restaurante)

    df_clean = pd.DataFrame(restaurantes_data)
    print(f"✅ Datos procesados: {len(df_clean)} restaurantes")

    # Sanity check: the dishes column must hold plain strings
    print(f"🔍 Tipo de platos en primera fila: {type(df_clean['platos'].iloc[0])}")
    print(f"🔍 Valor de platos en primera fila: {df_clean['platos'].iloc[0]}")

    print("\n📋 Preview de datos limpios:")
    display(df_clean.head())
else:
    print("No hay datos para procesar")

✅ Datos procesados: 20 restaurantes
🔍 Tipo de platos en primera fila: <class 'str'>
🔍 Valor de platos en primera fila: 

📋 Preview de datos limpios:


Unnamed: 0,tripadvisor_id,nombre,telefono,direccion,localizacion,email,latitud,longitud,website,rating,ranking,rango_precio,platos,tipo_cocina
0,12455037,Bellavista Restaurant,+34 872 08 06 70,"Pujada Polvorins, 1, 17004 Girona Spain","Girona, Province of Girona, Catalonia",pbellavista@ac-hotels.com,41.97896,2.82933,http://www.espanol.marriott.com/hotels/hotel-i...,4.0,231,$$ - $$$,,"Mediterranean, Spanish, Fusion"
1,7660694,Savoy,+34 972 20 38 00,"Calle Nou, 3, 17001 Girona Spain","Girona, Province of Girona, Catalonia",info@savoygirona.com,41.98267,2.82253,http://savoygirona.com,3.5,147,$$ - $$$,"Salad, Cocido","Cafe, Mediterranean, Spanish"
2,4243420,Raco del Pernil,+34 972 21 36 56,"Plaza Poeta Marquina Num. 5, 17002 Girona Spain","Girona, Province of Girona, Catalonia",,41.98117,2.81796,http://www.racodelpernil.com,4.0,198,$$ - $$$,"Tapas, Prosciutto, Cocido, Coca, Pernil","Mediterranean, Spanish"
3,17712371,Tandem Cafe Girona,+34 872 99 37 64,"Carrer de Pau Vila i Dinares, 6, 17007 Girona ...","Girona, Province of Girona, Catalonia",tandemcafe.girona@gmail.com,41.972656,2.816484,,3.5,317,$$ - $$$,Cocido,"Cafe, Seafood, Mediterranean, Spanish"
4,12717475,Blessing Restaurant,+34 872 21 27 90,"Carrer Barcelona, 48, 17002 Girona Spain","Girona, Province of Girona, Catalonia",garciamidence27@hotmail.es,41.97742,2.81725,https://blessingrestaurantct.com/,4.0,227,$,Cocido,"Latin, Spanish"


In [13]:
# Persist the cleaned restaurants into the Restaurantes table.
# NOTE(review): the last recorded run of this cell failed with
# "'str' object has no attribute 'exists'"; presumably the database helper
# expects a pathlib.Path rather than a str for the DB path - verify in
# src/utils/database.py.
if 'df_clean' in locals() and not df_clean.empty:
    conn = None
    try:
        print("💾 Guardando en base de datos...")
        conn = get_db_connection()

        # Simple check: show which database file the connection is attached to
        cursor = conn.cursor()
        cursor.execute("PRAGMA database_list;")
        db_info = cursor.fetchall()
        print("📋 Base de datos conectada:", db_info[0][2])

        # Only insert when the target table already exists
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='Restaurantes'")
        if cursor.fetchone():
            print("✅ Tabla Restaurantes encontrada")

            # Insert the rows and commit them
            rows_inserted = insert_dataframe_to_table(df_clean, 'Restaurantes', conn)
            conn.commit()
            print(f"✅ {rows_inserted} filas insertadas")
        else:
            print("❌ Tabla Restaurantes no existe")

    except Exception as e:
        print(f"❌ Error guardando en BD: {e}")
        if conn:
            conn.rollback()
    finally:
        # Close on every path; previously the connection leaked whenever an error occurred.
        if conn:
            conn.close()
else:
    print("No hay datos limpios para guardar")

💾 Guardando en base de datos...
📍 Ruta absoluta del archivo: /Users/administrator/MASTER_Data_Science/TFM/tfme-horeca-intelligence/src/utils/database.py
📍 Project root: /Users/administrator/MASTER_Data_Science/TFM/tfme-horeca-intelligence
📍 DB path: tfm_database.db
❌ Error guardando en BD: 'str' object has no attribute 'exists'
