In [1]:
import sys
import os
from pathlib import Path
import pandas as pd
import json
from apify_client import ApifyClient

# Path setup: project_root is the grandparent of the current working directory.
# It is appended to sys.path so the project-local imports below can resolve.
# NOTE(review): this depends on where the kernel was started -- confirm the
# notebook always runs from a directory two levels below the project root.
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))

# Project-local configuration values and database helpers
from config import APIFY_API_KEY, DB_PATH
from src.utils.database import get_db_connection, insert_dataframe_to_table

# Apify client, built from the API key exposed by the config module
client = ApifyClient(APIFY_API_KEY)

# TripAdvisor search-result pages passed to the actor as start URLs.
# Entries that are commented out are currently disabled.
SEARCH_URLS = [
    #"https://www.tripadvisor.es/Restaurants-g187499-oa120-Girona_Province_of_Girona_Catalonia.html", 
    "https://www.tripadvisor.es/Restaurants-g187499-oa150-Girona_Province_of_Girona_Catalonia.html",
    #"https://www.tripadvisor.es/Restaurants-g187499-oa180-Girona_Province_of_Girona_Catalonia.html",
]

In [2]:
def run_apify_actor_slow(search_urls=None, max_items=20, wait_secs=600):
    """Run the TripAdvisor Apify actor with conservative settings and return its items.

    The actor is started with a single concurrent request, a residential
    Spanish proxy and a pause between requests, to reduce the chance of
    being blocked. The exact meaning of each input field is defined by the
    "maxcopell/tripadvisor" actor, not by this notebook.

    Args:
        search_urls: Iterable of start URLs. Defaults to the module-level
            SEARCH_URLS when omitted.
        max_items: Value sent as the actor's "maxItems" input.
        wait_secs: Maximum number of seconds to wait for the run to finish.

    Returns:
        The list of dataset items produced by the run, or None when the run
        could not be started, produced no items, or an error occurred.
    """
    urls = SEARCH_URLS if search_urls is None else list(search_urls)

    print("🚀 Ejecutando actor de Apify (modo lento)...")
    print(f"📋 Número de URLs: {len(urls)}")

    try:
        run = client.actor("maxcopell/tripadvisor").call(
            run_input={
                "startUrls": [{"url": url} for url in urls],
                "maxItems": max_items,
                "proxyConfiguration": {
                    "useApifyProxy": True,
                    "apifyProxyGroups": ["RESIDENTIAL"],
                    "apifyProxyCountry": "ES"
                },
                "maxConcurrency": 1,
                "minTimeBetweenRequests": 10000,  # presumably milliseconds -- confirm in the actor docs
                "searchLanguage": "es",
                "reviews": False,
            },
            wait_secs=wait_secs
        )

        # call() may return no run object at all; fail with a clear message
        # instead of a TypeError on the subscript below.
        if not run:
            print("❌ El actor no devolvió información de la ejecución")
            return None

        # When wait_secs elapses, or the run fails, the returned run is not in
        # the SUCCEEDED state. Whatever is in the dataset is still collected
        # (best effort), but the user is warned that it may be partial.
        status = run.get("status")
        if status and status != "SUCCEEDED":
            print(f"⚠️ La ejecución terminó con estado {status}; los datos pueden estar incompletos")

        print("📦 Recopilando datos...")
        dataset = client.dataset(run["defaultDatasetId"]).list_items().items

        if not dataset:
            print("❌ No se encontraron datos")
            return None

        print(f"✅ Se obtuvieron {len(dataset)} restaurantes")
        return dataset

    except Exception as e:
        # Deliberate best effort: report the problem and let the notebook continue.
        print(f"❌ Error ejecutando el actor de Apify: {e}")
        return None

In [3]:
# Run the actor and keep its result: a list of items, or None on failure or no data
dataset = run_apify_actor_slow()

🚀 Ejecutando actor de Apify (modo lento)...
📋 Número de URLs: 1


[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> Status: RUNNING, Message: 
[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> 2025-09-21T18:10:30.018Z ACTOR: Pulling Docker image of build 0EU8Lvjqh6AkaDl65 from registry.
[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> 2025-09-21T18:10:30.021Z ACTOR: Creating Docker container.
[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> 2025-09-21T18:10:30.202Z ACTOR: Starting Docker container.
[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> 2025-09-21T18:10:30.432Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c npm run start:prod --silent
[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> 2025-09-21T18:10:33.078Z [32mINFO[39m  System info[90m {"apifyVersion":"3.1.15","apifyClientVersion":"2.8.4","crawleeVersion":"3.7.2","osType":"Linux","nodeVersion":"v20.19.4"}[39m
[36m[apify.tripadvisor runId:KQnGLjeX9HRD4hhr2][0m -> 2025-09-21T18:10:33.207Z [32mINFO[39m 

📦 Recopilando datos...
✅ Se obtuvieron 20 restaurantes


In [4]:
def extract_url_path(full_url):
    """Return whatever follows the last 'tripadvisor.com' in the URL.

    Falsy values and URLs that do not contain 'tripadvisor.com' are
    returned unchanged.
    """
    marker = 'tripadvisor.com'
    if not full_url or marker not in full_url:
        return full_url
    # rpartition keeps only the text after the final occurrence of the marker
    _, _, tail = full_url.rpartition(marker)
    return tail

def extract_cuisines(cuisines_data):
    """Convert cuisine data into a single comma-separated string.

    Args:
        cuisines_data: A list whose elements are plain values or dicts with a
            'name' key, any other single value, or a falsy value.

    Returns:
        '' for falsy input; ', '-joined names for a list; str(cuisines_data)
        for any other value. Dict entries without a usable 'name', and
        None/empty entries, are skipped so the result never contains dangling
        separators.
    """
    if not cuisines_data:
        return ''

    if not isinstance(cuisines_data, list):
        return str(cuisines_data)

    names = []
    for entry in cuisines_data:
        # Decide per element rather than from the first one only, so mixed
        # lists and dicts with a missing or None name cannot break str.join.
        name = entry.get('name') if isinstance(entry, dict) else entry
        if name is None or name == '':
            continue
        names.append(str(name))
    return ', '.join(names)

In [5]:
# Explore the scraped data: column overview, first rows, and the field types
# of the first item.
print("🔍 EXPLORACIÓN DE DATOS OBTENIDOS")
print("="*50)

if dataset:
    # Build a DataFrame from the raw items, for exploration only
    df_raw = pd.DataFrame(dataset)
    print(f"📊 Total de items: {len(df_raw)}")
    print(f"📋 Columnas: {list(df_raw.columns)}")
    
    # Show the first rows
    print("\n👀 Primeras filas:")
    display(df_raw.head())
    
    # Inspect the first item: each key with its Python type and the first
    # 80 characters of its string representation
    print("\n📋 Estructura del primer restaurante:")
    first_item = dataset[0]
    for key, value in first_item.items():
        print(f"  {key}: {type(value).__name__} = {str(value)[:80]}...")
else:
    # dataset is None or empty
    print("No hay datos para explorar")

🔍 EXPLORACIÓN DE DATOS OBTENIDOS
📊 Total de items: 20
📋 Columnas: ['id', 'type', 'category', 'subcategories', 'name', 'locationString', 'description', 'image', 'photoCount', 'rankingPosition', 'rating', 'rawRanking', 'phone', 'address', 'addressObj', 'localName', 'localAddress', 'localLangCode', 'email', 'latitude', 'longitude', 'webUrl', 'website', 'rankingString', 'rankingDenominator', 'rankingSource', 'neighborhoodLocations', 'nearestMetroStations', 'ancestorLocations', 'ratingHistogram', 'numberOfReviews', 'isClosed', 'isLongClosed', 'openNowText', 'priceLevel', 'dishes', 'hours', 'menuWebUrl', 'ownersTopReasons', 'isNearbyResult', 'reviewTags', 'photos', 'travelerChoiceAward', 'cuisines', 'dietaryRestrictions', 'establishmentTypes', 'features', 'mealTypes', 'isClaimedIcon', 'isClaimedText', 'orderOnline', 'input']

👀 Primeras filas:


Unnamed: 0,id,type,category,subcategories,name,locationString,description,image,photoCount,rankingPosition,...,travelerChoiceAward,cuisines,dietaryRestrictions,establishmentTypes,features,mealTypes,isClaimedIcon,isClaimedText,orderOnline,input
0,32976830,RESTAURANT,restaurant,[],Osteria Del Ponte,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,1,225,...,,[],[],[Restaurants],[],"[Lunch, Dinner]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
1,11549212,RESTAURANT,restaurant,[],Nelson,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,14,135,...,,"[Mediterranean, European, Spanish]","[Vegetarian friendly, Vegan options]",[Restaurants],"[Reservations, Seating, Highchairs Available, ...","[Breakfast, Lunch, Dinner]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
2,7389399,RESTAURANT,restaurant,[],El Forn,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,8,142,...,,"[Mediterranean, Spanish]","[Vegetarian friendly, Vegan options]",[Restaurants],"[Reservations, Outdoor Seating, Seating, Highc...","[Lunch, Dinner]",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
3,23177354,RESTAURANT,restaurant,[Sit down],Da Vinci Girona,"Girona, Province of Girona, Catalonia",Restaurant-cafeteria with a large outdoor terr...,https://media-cdn.tripadvisor.com/media/photo-...,74,156,...,,"[Pizza, Cafe, Mediterranean, Spanish, Dining b...",[],[Restaurants],"[Reservations, Outdoor Seating, Seating, Telev...","[Breakfast, Lunch, Dinner, Brunch, Late Night,...",False,,[],https://www.tripadvisor.es/Restaurants-g187499...
4,2715456,RESTAURANT,restaurant,[Sit down],La Pedra,"Girona, Province of Girona, Catalonia",,https://media-cdn.tripadvisor.com/media/photo-...,10,206,...,,"[Bar, Mediterranean, Spanish, Pub]",[],[Restaurants],"[Seating, Table Service]",[],False,,[],https://www.tripadvisor.es/Restaurants-g187499...



📋 Estructura del primer restaurante:
  id: str = 32976830...
  type: str = RESTAURANT...
  category: str = restaurant...
  subcategories: list = []...
  name: str = Osteria Del Ponte...
  locationString: str = Girona, Province of Girona, Catalonia...
  description: NoneType = None...
  image: str = https://media-cdn.tripadvisor.com/media/photo-w/2f/f0/f4/9d/caption.jpg...
  photoCount: int = 1...
  rankingPosition: int = 225...
  rating: float = 4.8...
  rawRanking: float = 3.1296956539154053...
  phone: str = +34 972 51 91 49...
  address: str = Calle De Les Hortes 1, 17001 Girona Spain...
  addressObj: dict = {'street1': 'Calle De Les Hortes 1', 'street2': None, 'city': 'Girona', 'state':...
  localName: str = Osteria Del Ponte...
  localAddress: str = Calle De Les Hortes 1, 17001...
  localLangCode: str = es...
  email: NoneType = None...
  latitude: float = 41.984566...
  longitude: float = 2.823535...
  webUrl: str = https://www.tripadvisor.com/Restaurant_Review-g187499-d32976830

In [6]:
def clean_price_level(price_level):
    """Return the price level as a string, or '' when it is missing (None)."""
    # Defined once at cell level instead of being re-created on every loop iteration.
    return '' if price_level is None else str(price_level)


if dataset:
    # Flatten every scraped item into one record matching the database columns
    restaurantes_data = []

    for item in dataset:
        # 'dishes' may be absent, None, a list or a plain string. Normalize it
        # to text and stringify list elements so str.join cannot fail on
        # non-string entries.
        platos = item.get('dishes') or ''
        if isinstance(platos, list):
            platos = ', '.join(str(plato) for plato in platos)

        # extract_cuisines already returns a string, so no further list handling is needed
        tipo_cocina = extract_cuisines(item.get('cuisines', []))

        restaurante = {
            'tripadvisor_id': item.get('id', ''),
            'nombre': item.get('name', ''),
            'telefono': item.get('phone', ''),
            'direccion': item.get('address', ''),
            'localizacion': item.get('locationString', ''),
            'email': item.get('email', ''),
            'latitud': item.get('latitude', ''),
            'longitud': item.get('longitude', ''),
            # NOTE(review): extract_url_path only shortens URLs containing
            # 'tripadvisor.com', yet it is applied to the restaurant's own
            # website while 'webUrl' is stored untouched -- confirm this is intended.
            'website': extract_url_path(item.get('website', '')),
            'rating': item.get('rating', ''),
            'ranking': item.get('rankingPosition', ''),
            'rango_precio': clean_price_level(item.get('priceLevel', '')),
            'platos': platos,
            'tipo_cocina': tipo_cocina,
            'tripadvisor_web': item.get('webUrl', '')
        }
        restaurantes_data.append(restaurante)

    df_clean = pd.DataFrame(restaurantes_data)

    # Extra safety net: guarantee the price range column holds strings only
    df_clean['rango_precio'] = df_clean['rango_precio'].fillna('').astype(str)

    print(f"✅ Datos procesados: {len(df_clean)} restaurantes")

✅ Datos procesados: 20 restaurantes


In [7]:
# Persist the cleaned restaurants into the 'Restaurantes' table.
# NOTE(review): re-running this cell calls the insert again with the same rows;
# whether duplicates are rejected depends on insert_dataframe_to_table and the
# table schema -- confirm.
if 'df_clean' in locals() and not df_clean.empty:
    conn = None
    try:
        print("💾 Guardando en base de datos...")
        conn = get_db_connection()

        # Simple sanity check: print the file path of the connected database
        cursor = conn.cursor()
        cursor.execute("PRAGMA database_list;")
        db_info = cursor.fetchall()
        print("📋 Base de datos conectada:", db_info[0][2])

        # Only insert when the target table already exists
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='Restaurantes'")
        if cursor.fetchone():
            print("✅ Tabla Restaurantes encontrada")

            # Insert the rows and commit them
            rows_inserted = insert_dataframe_to_table(df_clean, 'Restaurantes', conn)
            conn.commit()
            print(f"✅ {rows_inserted} filas insertadas")
        else:
            print("❌ Tabla Restaurantes no existe")

    except Exception as e:
        print(f"❌ Error guardando en BD: {e}")
        if conn:
            conn.rollback()
    finally:
        # Close on every path, including after an error. Previously the
        # connection was left open whenever an exception was raised.
        if conn:
            conn.close()
else:
    print("No hay datos limpios para guardar")

💾 Guardando en base de datos...
📍 Conectando a: /Users/administrator/MASTER_Data_Science/TFM/tfme-horeca-intelligence/tfm_database.db
📍 ¿Existe el archivo? True
✅ Conexión a la BD SQLite exitosa.
📋 Base de datos conectada: /Users/administrator/MASTER_Data_Science/TFM/tfme-horeca-intelligence/tfm_database.db
✅ Tabla Restaurantes encontrada
✅ 20 filas insertadas en la tabla 'Restaurantes'.
✅ 20 filas insertadas
