In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import geopandas as gpd
from shapely.geometry import Point
import warnings
import re
warnings.filterwarnings('ignore')

In [2]:
class TourismDataPreprocessor:
    def __init__(self):
        self.scalers = {}
        self.encoders = {}
        
    def load_and_clean_datasets(self, file_paths):
        """
        Read every CSV listed in ``file_paths`` and return the frames by name.

        Args:
            file_paths: mapping with the keys 'activities' and 'un_tourism'
                (iterables of CSV paths, concatenated row-wise with a fresh
                index) and 'reviews', 'commuting_zones', 'movement_data',
                'search_trends', 'monthly_interest' (a single CSV path each).

        Returns:
            dict mapping each of those keys to the loaded DataFrame.
        """
        print("Cargando datasets principales...")

        loaded = {}

        # Activities come split across several files; stack them into one frame.
        activity_frames = [pd.read_csv(path) for path in file_paths['activities']]
        loaded['activities'] = pd.concat(activity_frames, ignore_index=True)

        # Reviews (single file).
        loaded['reviews'] = pd.read_csv(file_paths['reviews'])

        # UN tourism data is also split across several files.
        un_frames = [pd.read_csv(path) for path in file_paths['un_tourism']]
        loaded['un_tourism'] = pd.concat(un_frames, ignore_index=True)

        # Remaining single-file datasets, read in the original order.
        single_file_keys = (
            'commuting_zones',
            'movement_data',
            'search_trends',
            'monthly_interest',
        )
        for key in single_file_keys:
            loaded[key] = pd.read_csv(file_paths[key])

        return loaded
    
    def match_reviews_to_activities(self, activities_df, sentiment_df):
        """
        Match reviews to activities by title and build an expanded review dataset.

        A review is attached to an activity when both share the same normalized
        city and the lower-cased review text contains the activity's cleaned,
        lower-cased title as a literal substring. One output row is produced per
        (activity, review) match, so a review can appear more than once.

        Args:
            activities_df: DataFrame with at least 'titulo' and 'ciudad'. The
                columns 'id', 'precio' and 'rating' are used when present
                (fallbacks: 'item_<index label>', 0 and 3.0).
            sentiment_df: DataFrame with at least 'texto' and 'ciudad'.

        Returns:
            DataFrame with the matched review rows ('ciudad' normalized, the
            raw value kept in 'ciudad_original') plus the columns 'item_id',
            'precio', 'rating_actividad' and 'titulo_actividad'. If nothing
            matches, an empty DataFrame with the expected columns is returned.

        Neither input frame is modified; both are copied first.
        """
        print("Emparejando reviews con actividades...")

        # Missing columns are only reported here; 'titulo', 'ciudad' and 'texto'
        # are still accessed below and will raise KeyError if absent.
        required_activity_cols = ['titulo', 'ciudad', 'id', 'precio', 'rating']
        required_sentiment_cols = ['texto', 'ciudad', 'sentimiento', 'confianza']

        missing_activity_cols = set(required_activity_cols) - set(activities_df.columns)
        missing_sentiment_cols = set(required_sentiment_cols) - set(sentiment_df.columns)

        if missing_activity_cols:
            print(f"Columnas faltantes en activities: {missing_activity_cols}")
        if missing_sentiment_cols:
            print(f"Columnas faltantes en sentiment: {missing_sentiment_cols}")

        # Normalize city names on private copies of both frames.
        print("Normalizando nombres de ciudades...")
        activities_df = activities_df.copy()
        sentiment_df = sentiment_df.copy()

        # Known spelling variants -> canonical city name (built once, not per call).
        city_mappings = {
            'barcelona': 'Barcelona',
            'barcelona1': 'Barcelona',
            'madrid': 'Madrid',
            'malaga': 'Malaga',
            'málaga': 'Malaga',
            'sevilla': 'Sevilla',
            'valencia': 'Valencia',
            'tenerife': 'Tenerife',
            'gran canaria': 'Gran Canaria',
            'canaria': 'Gran Canaria',
            'canarias': 'Gran Canaria',
            'mallorca': 'Mallorca',
            'palma de mallorca': 'Mallorca',
            'palma': 'Mallorca'
        }

        def normalize_city_name(city_name):
            """Return the canonical city name; unknown names pass through unchanged."""
            if pd.isna(city_name):
                return 'Unknown'
            return city_mappings.get(str(city_name).lower().strip(), city_name)

        activities_df['ciudad_original'] = activities_df['ciudad']
        sentiment_df['ciudad_original'] = sentiment_df['ciudad']

        activities_df['ciudad'] = activities_df['ciudad'].apply(normalize_city_name)
        sentiment_df['ciudad'] = sentiment_df['ciudad'].apply(normalize_city_name)

        print("Ciudades después de normalización:")
        print(f"  Activities: {sorted(activities_df['ciudad'].unique())}")
        print(f"  Sentiment: {sorted(sentiment_df['ciudad'].unique())}")

        # Lower-case the review texts once instead of once per activity.
        # Non-string entries become None and therefore never match.
        review_texts = sentiment_df['texto'].map(
            lambda text: text.lower() if isinstance(text, str) else None
        ).astype(object)
        review_cities = sentiment_df['ciudad']

        # city -> (row positions of that city's reviews, their lower-cased texts).
        # Positions (not index labels) are used so duplicate labels are harmless.
        city_cache = {}

        # Punctuation stripped from titles before searching.
        title_cleaner = str.maketrans('', '', '!¡¿?()[]{}*+^$|\\')

        matched_reviews = []
        total_matches = 0
        successful_matches = 0
        errors_count = 0

        total_activities = len(activities_df)
        print(f"Procesando {total_activities} actividades...")

        # Progress is driven by the row position: the index label may be
        # non-numeric or non-contiguous.
        for position, (idx, activity) in enumerate(activities_df.iterrows()):
            if position % 1000 == 0:
                print(f"Progreso: {position}/{total_activities} ({position/total_activities*100:.1f}%) - Matches: {total_matches}")

            try:
                raw_title = activity['titulo']
                # A missing title would otherwise be searched as the text 'nan'.
                if pd.isna(raw_title):
                    continue

                activity_title_clean = (
                    str(raw_title).lower().strip().translate(title_cleaner).strip()
                )
                # Length is checked AFTER cleaning: an empty pattern would match
                # every review of the city.
                if len(activity_title_clean) < 3:
                    continue

                activity_city = activity['ciudad']
                item_id = activity.get('id', f'item_{idx}')

                if activity_city not in city_cache:
                    city_positions = np.flatnonzero(
                        (review_cities == activity_city).to_numpy(dtype=bool)
                    )
                    city_cache[activity_city] = (
                        city_positions,
                        review_texts.iloc[city_positions],
                    )
                city_positions, city_texts = city_cache[activity_city]
                if len(city_positions) == 0:
                    continue

                # Literal (non-regex) substring search within the city's reviews.
                hits = city_texts.str.contains(
                    activity_title_clean, na=False, regex=False
                ).to_numpy(dtype=bool)
                if not hits.any():
                    continue

                matching_reviews_subset = sentiment_df.iloc[city_positions[hits]].copy()

                # Attach the activity's attributes to each matching review.
                matching_reviews_subset['item_id'] = item_id
                matching_reviews_subset['precio'] = activity.get('precio', 0)
                matching_reviews_subset['rating_actividad'] = activity.get('rating', 3.0)
                matching_reviews_subset['titulo_actividad'] = activity['titulo']

                matched_reviews.append(matching_reviews_subset)
                total_matches += len(matching_reviews_subset)
                successful_matches += 1

            except Exception as e:
                # Best effort: a problematic activity is counted and skipped.
                errors_count += 1
                if errors_count < 10:
                    print(f"  Error general procesando actividad {idx}: {e}")
                continue

        # Consolidate results.
        if matched_reviews:
            all_matched_reviews = pd.concat(matched_reviews, ignore_index=True)

            print("\nRESULTADOS DEL EMPAREJAMIENTO:")
            print(f"   Reviews emparejadas: {len(all_matched_reviews):,}")
            print(f"   Actividades con reviews: {all_matched_reviews['item_id'].nunique()}")
            print(f"   Tasa de cobertura: {successful_matches/len(activities_df)*100:.1f}%")
            print(f"   Actividades procesadas exitosamente: {successful_matches}")
            print(f"   Errores encontrados: {errors_count}")

            # Additional statistics.
            reviews_per_activity = all_matched_reviews.groupby('item_id').size()
            print(f"   Reviews promedio por actividad: {reviews_per_activity.mean():.1f}")
            print(f"   Actividad con más reviews: {reviews_per_activity.max()}")

            # Per-city statistics.
            city_stats = all_matched_reviews.groupby('ciudad').agg({
                'item_id': 'nunique',
                'texto': 'count'
            }).rename(columns={'item_id': 'actividades', 'texto': 'reviews'})

            print("\n   Distribución por ciudad:")
            for city, stats in city_stats.iterrows():
                print(f"     {city}: {stats['actividades']} actividades, {stats['reviews']} reviews")

        else:
            print("\nNO SE ENCONTRARON EMPAREJAMIENTOS")
            print(f"   Actividades procesadas: {len(activities_df)}")
            print(f"   Reviews disponibles: {len(sentiment_df)}")
            print(f"   Errores encontrados: {errors_count}")

            # Dump a few samples to help debugging; str() guards against
            # non-string values such as NaN.
            print("\nANÁLISIS DETALLADO:")

            sample_titles = activities_df['titulo'].head(10).tolist()
            print("   Muestra de títulos de actividades:")
            for i, title in enumerate(sample_titles):
                print(f"     {i+1}. {str(title)[:100]}...")

            sample_texts = sentiment_df['texto'].head(5).tolist()
            print("   Muestra de textos de reviews:")
            for i, text in enumerate(sample_texts):
                print(f"     {i+1}. {str(text)[:100]}...")

            # Empty frame with the expected columns so callers can proceed.
            all_matched_reviews = pd.DataFrame(columns=[
                'texto', 'ciudad', 'categoria', 'fecha', 'fuente',
                'sentimiento', 'confianza', 'descripcion_sencilla',
                'item_id', 'precio', 'rating_actividad', 'titulo_actividad'
            ])

        return all_matched_reviews
    
    def test_matching_algorithm(self, activities_df, sentiment_df, sample_size=100):
        """
        Función de prueba para validar el algoritmo de emparejamiento
        """
        print(f"\nPRUEBA DE ALGORITMO DE EMPAREJAMIENTO")
        print("=" * 50)
        
        # Normalizar ciudades primero
        activities_test = activities_df.copy()
        sentiment_test = sentiment_df.copy()
        
        def normalize_city_name(city_name):
            if pd.isna(city_name):
                return 'Unknown'
            city_str = str(city_name).lower().strip()
            city_mappings = {
                'barcelona': 'Barcelona', 'barcelona1': 'Barcelona',
                'madrid': 'Madrid', 'malaga': 'Malaga', 'málaga': 'Malaga',
                'sevilla': 'Sevilla', 'valencia': 'Valencia', 'tenerife': 'Tenerife',
                'gran canaria': 'Gran Canaria', 'canaria': 'Gran Canaria', 'canarias': 'Gran Canaria',
                'mallorca': 'Mallorca', 'palma de mallorca': 'Mallorca', 'palma': 'Mallorca'
            }
            return city_mappings.get(city_str, city_name)
        
        activities_test['ciudad'] = activities_test['ciudad'].apply(normalize_city_name)
        sentiment_test['ciudad'] = sentiment_test['ciudad'].apply(normalize_city_name)
        
        # Tomar muestra pequeña para prueba
        sample_activities = activities_test.head(sample_size)
        
        print(f"Probando con {len(sample_activities)} actividades...")
        print(f"Cities en activities: {sorted(sample_activities['ciudad'].unique())}")
        print(f"Cities en sentiment: {sorted(sentiment_test['ciudad'].unique())}")
        
        matches_found = 0
        test_results = []
        
        for idx, activity in sample_activities.iterrows():
            activity_title = str(activity['titulo']).lower().strip()
            activity_city = activity['ciudad']
            
            # Limpiar título
            import string
            translator = str.maketrans('', '', '!¡¿?()[]{}*+^$|\\')
            activity_title_clean = activity_title.translate(translator)
            
            # Buscar matches
            city_reviews = sentiment_test[sentiment_test['ciudad'] == activity_city]
            
            if len(city_reviews) > 0:
                # Método simple: buscar título completo
                full_matches = city_reviews[
                    city_reviews['texto'].str.lower().str.contains(activity_title_clean, na=False, regex=False)
                ]
                
                # Método alternativo: buscar palabras clave
                words = [w for w in activity_title_clean.split() if len(w) > 3]
                word_matches = pd.DataFrame()
                
                if words:
                    mask = city_reviews['ciudad'] == activity_city
                    for word in words[:3]:  # Solo primeras 3 palabras importantes
                        try:
                            word_mask = city_reviews['texto'].str.lower().str.contains(word, na=False, regex=False)
                            mask = mask & word_mask
                        except:
                            continue
                    word_matches = city_reviews[mask]
                
                total_matches = len(full_matches) + len(word_matches)
                if total_matches > 0:
                    matches_found += 1
                
                test_results.append({
                    'titulo': activity['titulo'][:50],
                    'ciudad': activity_city,
                    'reviews_ciudad': len(city_reviews),
                    'matches_exactos': len(full_matches),
                    'matches_palabras': len(word_matches),
                    'total_matches': total_matches
                })
        
        print(f"\nResultados de la prueba:")
        print(f"   Actividades con matches: {matches_found}/{len(sample_activities)} ({matches_found/len(sample_activities)*100:.1f}%)")
        
        # Mostrar algunos ejemplos
        successful_matches = [r for r in test_results if r['total_matches'] > 0]
        if successful_matches:
            print(f"\n   Ejemplos de matches exitosos:")
            for i, match in enumerate(successful_matches[:5]):
                print(f"     {i+1}. {match['titulo']} ({match['ciudad']}) - {match['total_matches']} matches")
        
        failed_matches = [r for r in test_results if r['total_matches'] == 0 and r['reviews_ciudad'] > 0]
        if failed_matches:
            print(f"\n   Ejemplos de matches fallidos (con reviews disponibles):")
            for i, fail in enumerate(failed_matches[:5]):
                print(f"     {i+1}. {fail['titulo']} ({fail['ciudad']}) - {fail['reviews_ciudad']} reviews disponibles")
        
        return matches_found > 0
    
    def create_synthetic_user_ids(self, reviews_df):
        """
        Assign synthetic user ids by clustering similar, temporally close reviews.

        Reviews are grouped by city + source ('ciudad' + '_' + 'fuente'). Within
        a group they are ordered by date, and consecutive reviews share a user
        id while, relative to the previous review, the date is at most 7 days
        apart, the numeric sentiment differs by at most 0.5 and 'confianza'
        differs by at most 0.1. Otherwise a new user id starts. Ids are
        'user_0', 'user_1', ... and never shared between groups.

        Args:
            reviews_df: DataFrame with the columns 'ciudad', 'fuente', 'fecha',
                'sentimiento' and 'confianza'.

        Returns:
            A copy of ``reviews_df`` with the added columns 'fecha_comentario'
            (parsed 'fecha', unparseable values become NaT), 'grupo_base',
            'user_id' and 'sentimiento_numerico'. The input is not modified.
        """
        print("Creando user_ids sintéticos...")

        reviews_df = reviews_df.copy()
        # The source column is 'fecha'; the parsed version is 'fecha_comentario'.
        reviews_df['fecha_comentario'] = pd.to_datetime(reviews_df['fecha'], errors='coerce')

        # Base groups: city + source.
        reviews_df['grupo_base'] = (
            reviews_df['ciudad'].astype(str) + '_' +
            reviews_df['fuente'].astype(str)
        )

        # Placeholder keeps the column order; filled in after clustering.
        reviews_df['user_id'] = None

        # Categorical sentiment -> number for comparison; unknown labels count as 0.
        sentiment_to_numeric = {
            'negativo': -1,
            'neutro': 0,
            'positivo': 1
        }
        reviews_df['sentimiento_numerico'] = reviews_df['sentimiento'].map(sentiment_to_numeric).fillna(0)

        # Work on row positions (fresh 0..n-1 index) so duplicate or arbitrary
        # index labels in the input cannot confuse the assignment.
        comparison_cols = reviews_df[
            ['grupo_base', 'fecha_comentario', 'sentimiento_numerico', 'confianza']
        ].reset_index(drop=True)

        assigned_ids = np.empty(len(reviews_df), dtype=object)
        user_id_counter = 0

        # sort=False keeps the groups in order of first appearance.
        for _, grupo in comparison_cols.groupby('grupo_base', sort=False):
            grupo = grupo.sort_values('fecha_comentario')

            rows = zip(
                grupo.index,
                grupo['fecha_comentario'].tolist(),
                grupo['sentimiento_numerico'].tolist(),
                grupo['confianza'].tolist(),
            )

            previous = None
            for position, fecha, sentimiento, confianza in rows:
                if previous is not None:
                    prev_fecha, prev_sentimiento, prev_confianza = previous

                    try:
                        time_diff = abs((fecha - prev_fecha).days)
                    except Exception:
                        # If the gap cannot be computed, assume a large one.
                        time_diff = 365

                    sentiment_diff = abs(sentimiento - prev_sentimiento)
                    confidence_diff = abs(confianza - prev_confianza)

                    # A NaN difference (e.g. missing date) fails the comparison
                    # and therefore starts a new user.
                    same_user = (
                        time_diff <= 7 and
                        sentiment_diff <= 0.5 and
                        confidence_diff <= 0.1
                    )
                    if not same_user:
                        user_id_counter += 1

                assigned_ids[position] = f"user_{user_id_counter}"
                previous = (fecha, sentimiento, confianza)

            # The next group always starts with a fresh user id.
            user_id_counter += 1

        # Single positional assignment instead of one .loc write per review.
        reviews_df['user_id'] = assigned_ids

        print(f"Creados {user_id_counter} user_ids sintéticos")
        return reviews_df
    
    def create_activity_features(self, activities_df, matched_reviews_df):
        """
        Build enriched per-activity features from the matched reviews.

        Args:
            activities_df: DataFrame with 'precio' and 'rating', plus 'item_id'
                or 'id' (used as 'item_id' when 'item_id' is absent).
            matched_reviews_df: DataFrame with 'item_id', 'sentimiento',
                'confianza' and 'rating_actividad'; may be empty.

        Returns:
            New DataFrame with the activity columns plus:
              * 'avg_sentiment', 'sentiment_std', 'review_count' (0 when the
                activity has no reviews) and 'avg_confidence' (0.5 when none);
              * 'confidence_std', 'avg_rating', 'rating_count' (NaN when the
                activity has no reviews);
              * 'precio_categoria': 'Bajo' [0, 50], 'Medio' (50, 100],
                'Alto' (100, 200], 'Premium' (200, inf);
              * 'rating_normalizado': min-max scaled 'rating', or 0.5 when all
                ratings are equal.

        Neither input frame is modified.
        """
        print("Creando features de actividades...")

        # Derive item_id from 'id' when needed (assign returns a new frame).
        if 'item_id' not in activities_df.columns and 'id' in activities_df.columns:
            activities_df = activities_df.assign(item_id=activities_df['id'])

        if not matched_reviews_df.empty and 'item_id' in matched_reviews_df.columns:
            # Categorical sentiment -> number; unknown labels count as 0.
            sentiment_to_numeric = {
                'negativo': -1,
                'neutro': 0,
                'positivo': 1
            }
            # assign() works on a copy, so the caller's frame is left untouched.
            reviews = matched_reviews_df.assign(
                sentimiento_numerico=matched_reviews_df['sentimiento']
                .map(sentiment_to_numeric)
                .fillna(0)
            )

            review_metrics = reviews.groupby('item_id').agg(
                avg_sentiment=('sentimiento_numerico', 'mean'),
                sentiment_std=('sentimiento_numerico', 'std'),
                review_count=('sentimiento_numerico', 'count'),
                avg_confidence=('confianza', 'mean'),
                confidence_std=('confianza', 'std'),
                avg_rating=('rating_actividad', 'mean'),
                rating_count=('rating_actividad', 'count'),
            ).reset_index()

            # Left merge keeps activities that have no reviews.
            enriched_activities = activities_df.merge(
                review_metrics, on='item_id', how='left'
            )

            # Defaults for activities without reviews (and for the std of a
            # single review, which is undefined).
            fill_defaults = {
                'avg_sentiment': 0,
                'sentiment_std': 0,
                'review_count': 0,
                'avg_confidence': 0.5,
            }
            for column, default in fill_defaults.items():
                enriched_activities[column] = enriched_activities[column].fillna(default)
        else:
            enriched_activities = activities_df.copy()
            enriched_activities['avg_sentiment'] = 0
            enriched_activities['sentiment_std'] = 0
            enriched_activities['review_count'] = 0
            enriched_activities['avg_confidence'] = 0.5
            # Same schema as the merge branch, where these stay NaN for
            # activities without reviews.
            enriched_activities['confidence_std'] = np.nan
            enriched_activities['avg_rating'] = np.nan
            enriched_activities['rating_count'] = np.nan

        # Price bands. include_lowest=True puts free activities (precio == 0)
        # into 'Bajo' instead of leaving them uncategorised.
        enriched_activities['precio_categoria'] = pd.cut(
            enriched_activities['precio'],
            bins=[0, 50, 100, 200, float('inf')],
            labels=['Bajo', 'Medio', 'Alto', 'Premium'],
            include_lowest=True
        )

        # Min-max normalize the rating; constant ratings map to the midpoint.
        rating_min = enriched_activities['rating'].min()
        rating_max = enriched_activities['rating'].max()
        if rating_max > rating_min:
            enriched_activities['rating_normalizado'] = (
                enriched_activities['rating'] - rating_min
            ) / (rating_max - rating_min)
        else:
            enriched_activities['rating_normalizado'] = 0.5

        return enriched_activities
    
    def create_temporal_features(self, monthly_interest_df):
        """
        Build long-format seasonal features from the monthly interest table.

        Args:
            monthly_interest_df: wide DataFrame with a 'date' column and one
                column per city: 'Barcelona', 'Madrid', 'Malaga', 'Sevilla',
                'Valencia', 'Tenerife', 'Gran Canaria', 'Palma de Mallorca'.

        Returns:
            DataFrame with one row per (date, city) and the columns 'date',
            'mes', 'año', 'trimestre', 'estacion', 'ciudad' and
            'interes_turistico'.

        Raises:
            KeyError: if any of the expected city columns is missing.

        The input frame is not modified.
        """
        print("Creando features temporales...")

        # Work on a copy so the caller's frame does not gain helper columns.
        monthly = monthly_interest_df.copy()

        monthly['date'] = pd.to_datetime(monthly['date'])

        # Calendar parts.
        monthly['mes'] = monthly['date'].dt.month
        monthly['año'] = monthly['date'].dt.year
        monthly['trimestre'] = monthly['date'].dt.quarter

        # Month -> season; any month not listed here (Sep-Nov, and missing
        # months) falls back to 'Otoño'.
        season_by_month = {
            12: 'Invierno', 1: 'Invierno', 2: 'Invierno',
            3: 'Primavera', 4: 'Primavera', 5: 'Primavera',
            6: 'Verano', 7: 'Verano', 8: 'Verano',
        }
        monthly['estacion'] = monthly['mes'].apply(
            lambda month: season_by_month.get(month, 'Otoño')
        )

        # NOTE(review): city names here use 'Palma de Mallorca', whereas the
        # review/activity matching normalizes to 'Mallorca' - confirm before
        # joining on 'ciudad'.
        cities = ['Barcelona', 'Madrid', 'Malaga', 'Sevilla', 'Valencia',
                  'Tenerife', 'Gran Canaria', 'Palma de Mallorca']

        # Wide -> long: one row per date and city.
        temporal_features = pd.melt(
            monthly,
            id_vars=['date', 'mes', 'año', 'trimestre', 'estacion'],
            value_vars=cities,
            var_name='ciudad',
            value_name='interes_turistico'
        )

        return temporal_features
    
    def create_geographical_features(self, commuting_zones_df):
        """
        Build standardized geographical features from the commuting-zone table.

        Adds population density ('win_population' / 'area') and road density
        ('win_roads_km' / 'area'), then standardizes 'win_population',
        'win_roads_km', 'area' and both densities with a StandardScaler. The
        fitted scaler is stored in ``self.scalers['geo_features']``.

        Args:
            commuting_zones_df: DataFrame with the numeric columns
                'win_population', 'win_roads_km' and 'area'.

        Returns:
            A copy of the input with the two density columns added and the five
            feature columns replaced by their standardized values. Rows whose
            'area' is 0 get NaN densities.
        """
        print("Creando features geográficas...")

        geo_features = commuting_zones_df.copy()

        # Dividing by a zero area would produce +/-inf, which StandardScaler
        # rejects. Treat a zero area as missing for the density computation:
        # the scaler ignores NaN when fitting and keeps it when transforming.
        safe_area = geo_features['area'].replace(0, np.nan)

        # Population density.
        geo_features['densidad_poblacion'] = (
            geo_features['win_population'] / safe_area
        )

        # Road density.
        geo_features['densidad_carreteras'] = (
            geo_features['win_roads_km'] / safe_area
        )

        # Standardize the geographical features and keep the fitted scaler.
        scaler = StandardScaler()
        geo_cols = ['win_population', 'win_roads_km', 'area',
                    'densidad_poblacion', 'densidad_carreteras']

        geo_features[geo_cols] = scaler.fit_transform(geo_features[geo_cols])
        self.scalers['geo_features'] = scaler

        return geo_features
    
    def create_tourism_context_features(self, un_tourism_df):
        """
        Crea features de contexto turístico desde datos ONU con análisis específico por tipo de turismo
        """
        print("Creando features de contexto turístico...")
        
        # Verificar columnas disponibles y mapear a nombres esperados
        print(f"Columnas disponibles en UN tourism: {list(un_tourism_df.columns)}")
        
        # Mapeo flexible de nombres de columnas
        column_mapping = {}
        for col in un_tourism_df.columns:
            col_lower = col.lower()
            if 'year' in col_lower or 'año' in col_lower:
                column_mapping['Year'] = col
            elif 'country' in col_lower or 'pais' in col_lower or 'país' in col_lower:
                column_mapping['Country'] = col
            elif 'category' in col_lower or 'categoria' in col_lower or 'categoría' in col_lower:
                column_mapping['Category'] = col
            elif 'indicator' in col_lower or 'indicador' in col_lower:
                column_mapping['Indicator'] = col
            elif 'value' in col_lower or 'valor' in col_lower:
                column_mapping['Value'] = col
        
        print(f"Mapeo de columnas: {column_mapping}")
        
        # Si no encontramos las columnas necesarias, crear features por defecto
        required_cols = ['Year', 'Country', 'Category', 'Indicator', 'Value']
        missing_cols = [col for col in required_cols if col not in column_mapping]
        
        if missing_cols:
            print(f"Columnas faltantes en datos de turismo ONU: {missing_cols}")
            print("Creando features de turismo por defecto...")
            
            # Crear features por defecto
            return {
                'spain_domestic': {},
                'international_inbound': {},
                'combined_features': {
                    'spending_ratio': 1.0,
                    'price_sensitivity_factor': 1.0,
                    'tourism_volume_ratio': 1.0,
                    'demand_pressure_factor': 1.0,
                    'domestic_tourism_strength': 0,
                    'local_preference_factor': 1.0
                }
            }
        
        # Renombrar columnas para trabajar con ellas
        un_tourism_renamed = un_tourism_df.rename(columns={v: k for k, v in column_mapping.items()})
        
        # Filtrar datos más recientes (últimos 3 años)
        current_year = datetime.now().year
        recent_data = un_tourism_renamed[un_tourism_renamed['Year'] >= current_year - 3]
        
        if recent_data.empty:
            print("No hay datos de turismo en los últimos 3 años, usando todos los datos disponibles")
            recent_data = un_tourism_renamed
        
        # Separar datos de España (turismo doméstico e inbound) vs otros países (outbound)
        spain_data = recent_data[recent_data['Country'].str.contains('Spain', case=False, na=False)].copy()
        other_countries_data = recent_data[~recent_data['Country'].str.contains('Spain', case=False, na=False)].copy()
        
        print(f"Datos de España: {len(spain_data)} registros")
        print(f"Datos de otros países: {len(other_countries_data)} registros")
        
        # Procesar datos de España
        spain_metrics = self._process_spain_tourism_data(spain_data)
        
        # Procesar datos de otros países (outbound hacia España)
        outbound_metrics = self._process_outbound_tourism_data(other_countries_data)
        
        # Combinar métricas
        tourism_context = {
            'spain_domestic': spain_metrics,
            'international_inbound': outbound_metrics,
            'combined_features': self._combine_tourism_features(spain_metrics, outbound_metrics)
        }
        
        return tourism_context
    
    def _process_spain_tourism_data(self, spain_data):
        """
        Procesa datos de turismo doméstico e inbound de España
        """
        metrics = {}
        
        # Categorías principales para España
        categories_of_interest = ['domestic', 'inbound arrivals', 'inbound arrivals by region', 
                                'inbound expenditure']
        
        for category in categories_of_interest:
            category_data = spain_data[
                spain_data['Category'].str.contains(category, case=False, na=False)
            ]
            
            if not category_data.empty:
                # Agregar por indicador
                category_metrics = category_data.groupby('Indicator').agg({
                    'Value': ['mean', 'sum', 'std', 'count']
                }).reset_index()
                
                category_metrics.columns = ['Indicator', f'{category}_mean', f'{category}_sum', 
                                          f'{category}_std', f'{category}_count']
                
                metrics[category] = category_metrics
        
        return metrics
    
    def _process_outbound_tourism_data(self, outbound_data):
        """
        Procesa datos de turismo outbound de otros países (potenciales visitantes a España)
        """
        metrics = {}
        
        # Categorías de outbound
        outbound_categories = ['outbound departures', 'outbound expenditure']
        
        for category in outbound_categories:
            category_data = outbound_data[
                outbound_data['Category'].str.contains(category, case=False, na=False)
            ]
            
            if not category_data.empty:
                # Agregar por país y indicador para entender patrones de gasto
                country_metrics = category_data.groupby(['Country', 'Indicator']).agg({
                    'Value': ['mean', 'sum', 'std']
                }).reset_index()
                
                country_metrics.columns = ['Country', 'Indicator', f'{category}_mean', 
                                         f'{category}_sum', f'{category}_std']
                
                # Calcular métricas globales
                global_metrics = category_data.groupby('Indicator').agg({
                    'Value': ['mean', 'sum', 'std', 'count']
                }).reset_index()
                
                global_metrics.columns = ['Indicator', f'{category}_global_mean', 
                                        f'{category}_global_sum', f'{category}_global_std',
                                        f'{category}_global_count']
                
                metrics[category] = {
                    'by_country': country_metrics,
                    'global': global_metrics
                }
        
        return metrics
    
    def _combine_tourism_features(self, spain_metrics, outbound_metrics):
        """
        Combina métricas de turismo para crear features de contexto
        """
        combined_features = {}
        
        # Ratio de gasto outbound vs inbound (indica poder adquisitivo de visitantes)
        try:
            if ('inbound expenditure' in spain_metrics and 
                'outbound expenditure' in outbound_metrics):
                
                inbound_spending = spain_metrics['inbound expenditure']
                outbound_spending = outbound_metrics['outbound expenditure']['global']
                
                if not inbound_spending.empty and not outbound_spending.empty:
                    avg_inbound = inbound_spending['inbound expenditure_mean'].mean()
                    avg_outbound = outbound_spending['outbound expenditure_global_mean'].mean()
                    
                    combined_features['spending_ratio'] = avg_outbound / max(avg_inbound, 1)
                    combined_features['price_sensitivity_factor'] = min(2.0, max(0.5, 
                        1.0 + (combined_features['spending_ratio'] - 1.0) * 0.1))
        except:
            combined_features['spending_ratio'] = 1.0
            combined_features['price_sensitivity_factor'] = 1.0
        
        # Volumen de turismo (arrivals vs departures)
        try:
            if ('inbound arrivals' in spain_metrics and 
                'outbound departures' in outbound_metrics):
                
                inbound_arrivals = spain_metrics['inbound arrivals']
                outbound_departures = outbound_metrics['outbound departures']['global']
                
                if not inbound_arrivals.empty and not outbound_departures.empty:
                    avg_arrivals = inbound_arrivals['inbound arrivals_mean'].mean()
                    avg_departures = outbound_departures['outbound departures_global_mean'].mean()
                    
                    combined_features['tourism_volume_ratio'] = avg_arrivals / max(avg_departures, 1)
                    combined_features['demand_pressure_factor'] = min(2.0, max(0.5,
                        1.0 + (combined_features['tourism_volume_ratio'] - 1.0) * 0.05))
        except:
            combined_features['tourism_volume_ratio'] = 1.0
            combined_features['demand_pressure_factor'] = 1.0
        
        # Factor de turismo doméstico
        try:
            if 'domestic' in spain_metrics:
                domestic_tourism = spain_metrics['domestic']
                if not domestic_tourism.empty:
                    combined_features['domestic_tourism_strength'] = domestic_tourism['domestic_mean'].mean()
                    combined_features['local_preference_factor'] = min(1.5, max(0.7,
                        1.0 + (combined_features['domestic_tourism_strength'] / 1000000 - 1.0) * 0.1))
        except:
            combined_features['domestic_tourism_strength'] = 0
            combined_features['local_preference_factor'] = 1.0
        
        return combined_features
    
    def create_user_item_matrix(self, matched_reviews_df, activities_df, storage_path="AI_Recomendador"):
        """
        Crea matriz usuario-item usando storage en disco para manejar datasets grandes
        """
        import os
        import h5py
        from pathlib import Path
        
        print("Creando matriz usuario-item con storage en disco...")
        
        if matched_reviews_df.empty or 'user_id' not in matched_reviews_df.columns:
            print("No hay datos de reviews matched o user_ids, creando matriz vacía")
            return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
        
        # Crear directorio de storage
        storage_dir = Path(storage_path)
        storage_dir.mkdir(exist_ok=True)
        
        print(f"Usando directorio de storage: {storage_dir.absolute()}")
        
        # Convertir sentimiento categórico a numérico
        sentiment_to_numeric = {
            'negativo': -1,
            'neutro': 0,
            'positivo': 1
        }
        
        matched_reviews_df['sentimiento_numerico'] = matched_reviews_df['sentimiento'].map(sentiment_to_numeric)
        
        # Convertir sentimiento categórico a pseudo-rating [1-5] 
        def sentiment_to_rating(sentiment_cat):
            if sentiment_cat == 'negativo':
                return np.random.uniform(1, 2)  # Rating entre 1-2
            elif sentiment_cat == 'positivo':
                return np.random.uniform(4, 5)  # Rating entre 4-5
            else:  # neutro
                return 3  # Rating neutro
        
        matched_reviews_df['pseudo_rating'] = matched_reviews_df['sentimiento'].apply(sentiment_to_rating)
        
        # Obtener usuarios e items únicos
        unique_users = matched_reviews_df['user_id'].unique()
        unique_items = matched_reviews_df['item_id'].unique()
        
        print(f"Dimensiones de matriz: {len(unique_users):,} usuarios x {len(unique_items):,} items")
        
        # Crear mapeos de índices
        user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
        
        # Guardar mapeos para uso posterior
        mappings_file = storage_dir / 'user_item_mappings.pkl'
        import pickle
        with open(mappings_file, 'wb') as f:
            pickle.dump({
                'user_to_idx': user_to_idx,
                'item_to_idx': item_to_idx,
                'users': unique_users,
                'items': unique_items
            }, f)
        
        print(f"Mapeos guardados en {mappings_file}")
        
        # MÉTODO EFICIENTE: Procesar en chunks y guardar en formato HDF5
        chunk_size = 50000  # Procesar de 50k reviews a la vez
        total_chunks = len(matched_reviews_df) // chunk_size + 1
        
        # Archivos de storage
        ratings_file = storage_dir / 'user_item_ratings.h5'
        sentiment_file = storage_dir / 'user_item_sentiment.h5'
        confidence_file = storage_dir / 'user_item_confidence.h5'
        
        # Inicializar archivos HDF5
        with h5py.File(ratings_file, 'w') as f:
            ratings_dataset = f.create_dataset('ratings', 
                                             (len(unique_users), len(unique_items)), 
                                             dtype='float32', 
                                             fillvalue=0.0,
                                             compression='gzip')
            
        with h5py.File(sentiment_file, 'w') as f:
            sentiment_dataset = f.create_dataset('sentiment', 
                                               (len(unique_users), len(unique_items)), 
                                               dtype='float32', 
                                               fillvalue=0.0,
                                               compression='gzip')
            
        with h5py.File(confidence_file, 'w') as f:
            confidence_dataset = f.create_dataset('confidence', 
                                                (len(unique_users), len(unique_items)), 
                                                dtype='float32', 
                                                fillvalue=0.0,
                                                compression='gzip')
        
        print("Procesando reviews en chunks...")
        
        # Procesar por chunks
        for chunk_idx in range(total_chunks):
            start_idx = chunk_idx * chunk_size
            end_idx = min((chunk_idx + 1) * chunk_size, len(matched_reviews_df))
            
            if start_idx >= len(matched_reviews_df):
                break
                
            chunk = matched_reviews_df.iloc[start_idx:end_idx]
            print(f"Procesando chunk {chunk_idx + 1}/{total_chunks} ({len(chunk)} reviews)")
            
            # Agrupar por usuario-item y calcular métricas
            chunk_aggregated = chunk.groupby(['user_id', 'item_id']).agg({
                'pseudo_rating': 'mean',
                'sentimiento_numerico': 'mean',
                'confianza': 'mean'
            }).reset_index()
            
            # Actualizar matrices en disco
            with h5py.File(ratings_file, 'r+') as f:
                ratings_dataset = f['ratings']
                for _, row in chunk_aggregated.iterrows():
                    user_idx = user_to_idx[row['user_id']]
                    item_idx = item_to_idx[row['item_id']]
                    ratings_dataset[user_idx, item_idx] = row['pseudo_rating']
            
            with h5py.File(sentiment_file, 'r+') as f:
                sentiment_dataset = f['sentiment']
                for _, row in chunk_aggregated.iterrows():
                    user_idx = user_to_idx[row['user_id']]
                    item_idx = item_to_idx[row['item_id']]
                    sentiment_dataset[user_idx, item_idx] = row['sentimiento_numerico']
            
            with h5py.File(confidence_file, 'r+') as f:
                confidence_dataset = f['confidence']
                for _, row in chunk_aggregated.iterrows():
                    user_idx = user_to_idx[row['user_id']]
                    item_idx = item_to_idx[row['item_id']]
                    confidence_dataset[user_idx, item_idx] = row['confianza']
        
        # Crear DataFrames pequeños de muestra para uso inmediato (top 1000 usuarios más activos)
        print("Creando matrices de muestra para uso inmediato...")
        
        user_activity = matched_reviews_df['user_id'].value_counts()
        top_users = user_activity.head(1000).index.tolist()
        
        sample_reviews = matched_reviews_df[matched_reviews_df['user_id'].isin(top_users)]
        
        # Crear matrices de muestra en memoria
        sample_user_item_matrix = sample_reviews.pivot_table(
            index='user_id',
            columns='item_id',
            values='pseudo_rating',
            aggfunc='mean',
            fill_value=0
        )
        
        sample_user_item_sentiment = sample_reviews.pivot_table(
            index='user_id',
            columns='item_id',
            values='sentimiento_numerico',
            aggfunc='mean',
            fill_value=0
        )
        
        sample_user_item_confidence = sample_reviews.pivot_table(
            index='user_id',
            columns='item_id',
            values='confianza',
            aggfunc='mean',
            fill_value=0
        )
        
        print(f"Matrices creadas y guardadas en {storage_dir}")
        print(f"   Archivos principales:")
        print(f"     - {ratings_file} (ratings completos)")
        print(f"     - {sentiment_file} (sentimientos completos)")  
        print(f"     - {confidence_file} (confianza completa)")
        print(f"     - {mappings_file} (mapeos usuario-item)")
        print(f"   Matrices de muestra en memoria: {sample_user_item_matrix.shape[0]} usuarios x {sample_user_item_matrix.shape[1]} items")
        
        # Estadísticas
        print(f"Distribución de sentimientos:")
        sentiment_counts = matched_reviews_df['sentimiento'].value_counts()
        for sentiment, count in sentiment_counts.items():
            print(f"   {sentiment}: {count:,} ({count/len(matched_reviews_df)*100:.1f}%)")
        
        return sample_user_item_matrix, sample_user_item_sentiment, sample_user_item_confidence
    
    def load_user_item_matrices_from_storage(self, storage_path="AI_Recomendador", user_subset=None, item_subset=None):
        """
        Carga matrices usuario-item desde storage en disco
        """
        import h5py
        import pickle
        from pathlib import Path
        
        storage_dir = Path(storage_path)
        
        # Cargar mapeos
        mappings_file = storage_dir / 'user_item_mappings.pkl'
        with open(mappings_file, 'rb') as f:
            mappings = pickle.load(f)
        
        user_to_idx = mappings['user_to_idx']
        item_to_idx = mappings['item_to_idx']
        
        # Determinar subset de usuarios e items
        if user_subset is None:
            user_indices = list(range(len(mappings['users'])))
        else:
            user_indices = [user_to_idx[u] for u in user_subset if u in user_to_idx]
        
        if item_subset is None:
            item_indices = list(range(len(mappings['items'])))
        else:
            item_indices = [item_to_idx[i] for i in item_subset if i in item_to_idx]
        
        print(f"Cargando subset: {len(user_indices)} usuarios x {len(item_indices)} items")
        
        # Cargar datos desde HDF5
        ratings_file = storage_dir / 'user_item_ratings.h5'
        
        with h5py.File(ratings_file, 'r') as f:
            ratings_data = f['ratings'][user_indices, :][:, item_indices]
        
        # Convertir a DataFrame
        user_labels = [mappings['users'][i] for i in user_indices]
        item_labels = [mappings['items'][i] for i in item_indices]
        
        ratings_df = pd.DataFrame(ratings_data, index=user_labels, columns=item_labels)
        
    def prepare_training_data_for_deep_learning(self, matched_reviews_df, storage_path="AI_Recomendador", sample_size=30000):
        """
        Prepara datos de entrenamiento de manera eficiente para RTX 3050 Ti (4GB VRAM)
        """
        print(f"Preparando datos de entrenamiento optimizados para RTX 3050 Ti (muestra de {sample_size:,} reviews)...")
        
        if matched_reviews_df.empty:
            print("No hay reviews emparejadas disponibles")
            return pd.DataFrame()
        
        # Muestra estratificada más pequeña para 4GB VRAM
        if len(matched_reviews_df) > sample_size:
            # Priorizar usuarios más activos y reviews de alta confianza
            high_confidence_df = matched_reviews_df[matched_reviews_df['confianza'] >= 0.7]
            if len(high_confidence_df) >= sample_size:
                sample_df = high_confidence_df.sample(sample_size).reset_index(drop=True)
            else:
                # Completar con reviews de menor confianza
                remaining_needed = sample_size - len(high_confidence_df)
                low_confidence_df = matched_reviews_df[matched_reviews_df['confianza'] < 0.7]
                additional_sample = low_confidence_df.sample(min(len(low_confidence_df), remaining_needed))
                sample_df = pd.concat([high_confidence_df, additional_sample]).reset_index(drop=True)
        else:
            sample_df = matched_reviews_df.copy()
        
        print(f"Muestra final optimizada: {len(sample_df):,} reviews")
        
        # Convertir sentimientos categóricos
        sentiment_to_numeric = {
            'negativo': -1,
            'neutro': 0,
            'positivo': 1
        }
        
        sample_df['sentimiento_numerico'] = sample_df['sentimiento'].map(sentiment_to_numeric)
        
        # Crear mapeos más pequeños para GPU
        unique_users = sample_df['user_id'].unique()[:10000]  # Limitar usuarios para VRAM
        sample_df = sample_df[sample_df['user_id'].isin(unique_users)]
        
        unique_users = sample_df['user_id'].unique()
        unique_items = sample_df['item_id'].unique()
        unique_cities = sample_df['ciudad'].unique()
        
        user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
        city_to_idx = {city: idx for idx, city in enumerate(unique_cities)}
        
        # Guardar mapeos para el modelo
        from pathlib import Path
        import pickle
        
        storage_dir = Path(storage_path)
        storage_dir.mkdir(exist_ok=True)
        
        training_mappings = {
            'user_to_idx': user_to_idx,
            'item_to_idx': item_to_idx,
            'city_to_idx': city_to_idx,
            'idx_to_user': {v: k for k, v in user_to_idx.items()},
            'idx_to_item': {v: k for k, v in item_to_idx.items()},
            'idx_to_city': {v: k for k, v in city_to_idx.items()}
        }
        
        mappings_file = storage_dir / 'training_mappings_gpu_optimized.pkl'
        with open(mappings_file, 'wb') as f:
            pickle.dump(training_mappings, f)
        
        print(f"Mapeos de entrenamiento GPU optimizados guardados en {mappings_file}")
        print(f"Dimensiones optimizadas: {len(unique_users):,} usuarios, {len(unique_items):,} items, {len(unique_cities)} ciudades")
        
        return sample_df, training_mappings
    
    def save_processed_data_efficiently(self, processed_data, output_path, storage_path="AI_Recomendador"):
        """
        Guarda datos procesados de manera eficiente usando referencias a archivos en disco
        """
        print(f"Guardando datos procesados de manera eficiente...")
        
        from pathlib import Path
        import pickle
        
        # Crear estructura de datos optimizada
        efficient_data = {
            'city_features': processed_data['city_features'],
            'matched_reviews_summary': {
                'total_reviews': len(processed_data.get('matched_reviews', [])),
                'total_users': processed_data.get('matched_reviews', pd.DataFrame()).get('user_id', pd.Series()).nunique(),
                'total_items': processed_data.get('matched_reviews', pd.DataFrame()).get('item_id', pd.Series()).nunique(),
                'cities': list(processed_data.get('matched_reviews', pd.DataFrame()).get('ciudad', pd.Series()).unique())
            },
            'storage_path': str(Path(storage_path).absolute()),
            'matrix_files': {
                'ratings': str(Path(storage_path) / 'user_item_ratings.h5'),
                'sentiment': str(Path(storage_path) / 'user_item_sentiment.h5'), 
                'confidence': str(Path(storage_path) / 'user_item_confidence.h5'),
                'mappings': str(Path(storage_path) / 'user_item_mappings.pkl')
            },
            'scalers': self.scalers,
            'encoders': self.encoders,
            'processing_metadata': {
                'timestamp': pd.Timestamp.now(),
                'total_activities_processed': len(processed_data.get('raw_datasets', {}).get('activities', [])),
                'total_reviews_original': len(processed_data.get('raw_datasets', {}).get('reviews', [])),
                'matching_success_rate': processed_data['matched_reviews_summary']['total_reviews'] / len(processed_data.get('raw_datasets', {}).get('reviews', [1])) if processed_data.get('raw_datasets', {}).get('reviews') is not None else 0
            }
        }
        
        # Guardar archivo principal (pequeño)
        with open(output_path, 'wb') as f:
            pickle.dump(efficient_data, f)
        
        # Guardar muestra de matched_reviews para análisis rápido
        if 'matched_reviews' in processed_data and not processed_data['matched_reviews'].empty:
            sample_reviews = processed_data['matched_reviews'].sample(min(10000, len(processed_data['matched_reviews'])))
            sample_file = Path(output_path).parent / 'matched_reviews_sample.pkl'
            sample_reviews.to_pickle(sample_file)
            efficient_data['sample_reviews_file'] = str(sample_file)
        
        print(f"Datos guardados exitosamente!")
        print(f"   Archivo principal: {output_path}")
        print(f"   Matrices grandes en: {storage_path}")
        print(f"   Espacio utilizado en disco: ~{self._estimate_storage_size(storage_path)} GB")
        
        return efficient_data
    
    def _estimate_storage_size(self, storage_path):
        """
        Estima el tamaño de almacenamiento utilizado
        """
        from pathlib import Path
        
        total_size = 0
        storage_dir = Path(storage_path)
        
        if storage_dir.exists():
            for file_path in storage_dir.rglob('*'):
                if file_path.is_file():
                    total_size += file_path.stat().st_size
        
        return round(total_size / (1024**3), 2)  # Convert to GB
    
    def prepare_deep_learning_features(self, datasets):
        """
        Prepara todas las features para el modelo de deep learning
        """
        print("Preparando features para deep learning...")
        
        # Validación inicial de datasets
        print("Validando datasets...")
        if 'activities' not in datasets or datasets['activities'].empty:
            raise ValueError("Dataset de actividades vacío o no encontrado")
        
        if 'reviews' not in datasets or datasets['reviews'].empty:
            raise ValueError("Dataset de reviews vacío o no encontrado")
        
        print(f"  Activities dataset: {len(datasets['activities'])} registros")
        print(f"  Reviews dataset: {len(datasets['reviews'])} registros")
        
        # Paso 1: Emparejar reviews con actividades
        try:
            matched_reviews = self.match_reviews_to_activities(
                datasets['activities'], datasets['reviews']
            )
        except Exception as e:
            print(f"Error en emparejamiento: {e}")
            print("Creando dataset vacío para continuar procesamiento...")
            matched_reviews = pd.DataFrame()
        
        # Paso 2: Crear user_ids sintéticos (solo si hay datos emparejados)
        if not matched_reviews.empty:
            try:
                print("Generando user_ids sintéticos...")
                matched_reviews = self.create_synthetic_user_ids(matched_reviews)
            except Exception as e:
                print(f"Error creando user_ids sintéticos: {e}")
                # Asignar user_ids simples como fallback
                matched_reviews['user_id'] = range(len(matched_reviews))
        else:
            print("Sin datos emparejados, saltando generación de user_ids")
        
        # Paso 3: Obtener datasets procesados
        try:
            activities = self.create_activity_features(datasets['activities'], matched_reviews)
        except Exception as e:
            print(f"Error creando features de actividades: {e}")
            activities = datasets['activities'].copy()
        
        try:
            temporal_features = self.create_temporal_features(datasets['monthly_interest'])
        except Exception as e:
            print(f"Error creando features temporales: {e}")
            temporal_features = pd.DataFrame()
        
        try:
            geo_features = self.create_geographical_features(datasets['commuting_zones'])
        except Exception as e:
            print(f"Error creando features geográficas: {e}")
            geo_features = pd.DataFrame()
        
        try:
            tourism_context = self.create_tourism_context_features(datasets['un_tourism'])
        except Exception as e:
            print(f"Error creando features de turismo: {e}")
            tourism_context = {}
        
        # Paso 4: Crear features consolidadas por ciudad
        city_features = {}
        
        cities = ['Barcelona', 'Madrid', 'Malaga', 'Sevilla', 'Valencia', 
                 'Tenerife', 'Gran Canaria', 'Palma de Mallorca']
        
        for city in cities:
            print(f"Procesando {city}...")
            
            try:
                # Features de actividades para la ciudad
                city_activities = activities[activities['ciudad'] == city]
                
                # Features temporales
                city_temporal = temporal_features[temporal_features['ciudad'] == city] if not temporal_features.empty else pd.DataFrame()
                
                # Features geográficas (buscar coincidencia por nombre)
                city_geo = None
                if not geo_features.empty:
                    geo_matches = geo_features[geo_features['name'].str.contains(city, case=False, na=False)]
                    city_geo = geo_matches.iloc[0] if len(geo_matches) > 0 else None
                
                # Features de reviews matched para la ciudad
                city_reviews = matched_reviews[matched_reviews['ciudad'] == city] if not matched_reviews.empty else pd.DataFrame()
                
                # Análisis de correlación clima-sentimiento
                try:
                    climate_correlation = self._analyze_climate_sentiment_correlation(city_reviews)
                except Exception as e:
                    print(f"    Error en análisis clima-sentimiento para {city}: {e}")
                    climate_correlation = {'correlation': 0, 'climate_factors': {}}
                
                # Consolidar features
                city_data = {
                    'activities': city_activities,
                    'temporal': city_temporal,
                    'geo': city_geo,
                    'tourism_context': tourism_context,
                    'reviews': city_reviews,
                    'climate_sentiment_correlation': climate_correlation
                }
                
                city_features[city] = city_data
                
            except Exception as e:
                print(f"    Error procesando {city}: {e}")
                # Crear estructura mínima para la ciudad
                city_features[city] = {
                    'activities': pd.DataFrame(),
                    'temporal': pd.DataFrame(),
                    'geo': None,
                    'tourism_context': {},
                    'reviews': pd.DataFrame(),
                    'climate_sentiment_correlation': {'correlation': 0, 'climate_factors': {}}
                }
        
        print(f"Features preparadas para {len(city_features)} ciudades")
        if not matched_reviews.empty:
            print(f"{len(matched_reviews)} reviews emparejadas disponibles para entrenamiento")
        else:
            print("Sin reviews emparejadas - el modelo entrenará solo con datos base")
        
        return city_features, matched_reviews
    
    def _analyze_climate_sentiment_correlation(self, city_reviews):
        """
        Analiza correlación entre clima y sentimiento categórico usando las 35 combinaciones específicas
        """
        if city_reviews.empty or 'descripcion_sencilla' not in city_reviews.columns:
            return {'correlation': 0, 'climate_factors': {}}
        
        # Convertir sentimiento categórico a numérico
        sentiment_to_numeric = {
            'negativo': -1,
            'neutro': 0,
            'positivo': 1
        }
        
        city_reviews['sentimiento_numerico'] = city_reviews['sentimiento'].map(sentiment_to_numeric)
        
        # Definir categorías principales basadas en las 35 combinaciones
        def categorize_detailed_weather(description):
            """
            Categoriza las 35 descripciones climáticas en grupos principales
            """
            if pd.isna(description):
                return 'Sin datos'
            
            desc = str(description).lower()
            
            # Categorías principales por condiciones del cielo
            if 'soleado' in desc:
                sky_condition = 'Soleado'
            elif 'nublado' in desc:
                sky_condition = 'Nublado'
            elif 'intervalos' in desc:
                sky_condition = 'Parcialmente nublado'
            else:
                sky_condition = 'Indefinido'
            
            # Condiciones de precipitación
            if 'lluvia intensa' in desc:
                precipitation = 'Lluvia intensa'
            elif 'lluvia moderada' in desc:
                precipitation = 'Lluvia moderada'
            elif 'lloviznas' in desc:
                precipitation = 'Lloviznas'
            elif 'sin lluvia' in desc:
                precipitation = 'Sin lluvia'
            else:
                precipitation = 'Sin especificar'
            
            # Condiciones de temperatura
            if 'muy caluroso' in desc:
                temperature = 'Muy caluroso'
            elif 'cálido' in desc or 'calido' in desc:
                temperature = 'Cálido'
            elif 'fresco y agradable' in desc:
                temperature = 'Fresco y agradable'
            elif 'frío' in desc or 'frio' in desc:
                temperature = 'Frío'
            else:
                temperature = 'Temperatura normal'
            
            # Viento
            wind = 'Con viento' if 'viento' in desc else 'Sin viento especificado'
            
            return {
                'sky_condition': sky_condition,
                'precipitation': precipitation,
                'temperature': temperature,
                'wind': wind,
                'full_description': description
            }
        
        # Aplicar categorización detallada
        city_reviews['weather_details'] = city_reviews['descripcion_sencilla'].apply(categorize_detailed_weather)
        
        # Extraer categorías principales
        city_reviews['sky_condition'] = city_reviews['weather_details'].apply(lambda x: x.get('sky_condition', 'Sin datos') if isinstance(x, dict) else 'Sin datos')
        city_reviews['precipitation'] = city_reviews['weather_details'].apply(lambda x: x.get('precipitation', 'Sin datos') if isinstance(x, dict) else 'Sin datos')
        city_reviews['temperature'] = city_reviews['weather_details'].apply(lambda x: x.get('temperature', 'Sin datos') if isinstance(x, dict) else 'Sin datos')
        
        # Crear encoding numérico optimizado para las 35 combinaciones
        def calculate_weather_score(weather_details):
            """
            Score how favourable a categorised weather description is for tourism.

            The score is the sum of four weighted factors: sky (up to 0.40),
            precipitation (up to 0.35, may be negative for rain), temperature
            (up to 0.20) and a small wind bonus (0.02). The factor weights
            already make up the whole score, so no extra base offset is added:
            starting from 0.5 pushed the raw sum up to 1.47 and the final
            clamp collapsed almost every dry-weather combination to 1.0.

            Args:
                weather_details: dict with the optional keys 'sky_condition',
                    'precipitation', 'temperature' and 'wind'. Missing or
                    unrecognised values fall back to a mid-range default for
                    that factor.

            Returns:
                float in [0, 1]; 0.5 (neutral) when weather_details is not a
                dict.
            """
            if not isinstance(weather_details, dict):
                return 0.5

            # Sky factor (40% of the score)
            sky_scores = {
                'Soleado': 0.4,
                'Parcialmente nublado': 0.25,
                'Nublado': 0.1,
                'Indefinido': 0.2
            }

            # Precipitation factor (35% of the score); rain is penalised
            precip_scores = {
                'Sin lluvia': 0.35,
                'Lloviznas': 0.1,
                'Lluvia moderada': -0.1,
                'Lluvia intensa': -0.25,
                'Sin especificar': 0.15
            }

            # Temperature factor (20% of the score)
            temp_scores = {
                'Fresco y agradable': 0.2,
                'Cálido': 0.15,
                'Temperatura normal': 0.1,
                'Frío': 0.05,
                'Muy caluroso': 0.05
            }

            # With every factor unknown the defaults add up to 0.45, i.e. close
            # to the neutral 0.5 returned for non-dict input.
            score = sky_scores.get(weather_details.get('sky_condition', ''), 0.2)
            score += precip_scores.get(weather_details.get('precipitation', ''), 0.15)
            score += temp_scores.get(weather_details.get('temperature', ''), 0.1)

            # Wind factor: a small bonus when wind is mentioned
            if weather_details.get('wind', '') == 'Con viento':
                score += 0.02

            # The worst combinations sum slightly below zero; keep within [0, 1]
            return max(0.0, min(1.0, score))
        
        city_reviews['weather_numeric'] = city_reviews['weather_details'].apply(calculate_weather_score)
        
        # Calcular correlación usando valores numéricos
        correlation = city_reviews[['sentimiento_numerico', 'weather_numeric']].corr().iloc[0, 1]
        correlation = correlation if not pd.isna(correlation) else 0
        
        # Análisis detallado por cada combinación específica
        specific_combinations = {}
        unique_descriptions = city_reviews['descripcion_sencilla'].value_counts()
        
        for description, count in unique_descriptions.items():
            if count >= 3:  # Solo analizar combinaciones con suficientes datos
                desc_reviews = city_reviews[city_reviews['descripcion_sencilla'] == description]
                sentiment_dist = desc_reviews['sentimiento'].value_counts(normalize=True)
                
                specific_combinations[description] = {
                    'total_reviews': count,
                    'sentiment_distribution': sentiment_dist.to_dict(),
                    'avg_confidence': desc_reviews['confianza'].mean(),
                    'positive_ratio': sentiment_dist.get('positivo', 0),
                    'negative_ratio': sentiment_dist.get('negativo', 0),
                    'neutral_ratio': sentiment_dist.get('neutro', 0),
                    'weather_score': desc_reviews['weather_numeric'].iloc[0],
                    'dominant_sentiment': sentiment_dist.idxmax() if not sentiment_dist.empty else 'neutro'
                }
        
        # Análisis por categorías principales
        climate_factors = {}
        
        # Por condición del cielo
        for sky_condition in city_reviews['sky_condition'].unique():
            if sky_condition != 'Sin datos':
                sky_reviews = city_reviews[city_reviews['sky_condition'] == sky_condition]
                if not sky_reviews.empty:
                    sentiment_dist = sky_reviews['sentimiento'].value_counts(normalize=True)
                    climate_factors[f'sky_{sky_condition.lower().replace(" ", "_")}'] = {
                        'total_reviews': len(sky_reviews),
                        'sentiment_distribution': sentiment_dist.to_dict(),
                        'avg_confidence': sky_reviews['confianza'].mean(),
                        'positive_ratio': sentiment_dist.get('positivo', 0),
                        'negative_ratio': sentiment_dist.get('negativo', 0)
                    }
        
        # Por precipitación
        for precipitation in city_reviews['precipitation'].unique():
            if precipitation != 'Sin datos':
                precip_reviews = city_reviews[city_reviews['precipitation'] == precipitation]
                if not precip_reviews.empty:
                    sentiment_dist = precip_reviews['sentimiento'].value_counts(normalize=True)
                    climate_factors[f'precip_{precipitation.lower().replace(" ", "_")}'] = {
                        'total_reviews': len(precip_reviews),
                        'sentiment_distribution': sentiment_dist.to_dict(),
                        'avg_confidence': precip_reviews['confianza'].mean(),
                        'positive_ratio': sentiment_dist.get('positivo', 0),
                        'negative_ratio': sentiment_dist.get('negativo', 0)
                    }
        
        # Por temperatura
        for temperature in city_reviews['temperature'].unique():
            if temperature != 'Sin datos':
                temp_reviews = city_reviews[city_reviews['temperature'] == temperature]
                if not temp_reviews.empty:
                    sentiment_dist = temp_reviews['sentimiento'].value_counts(normalize=True)
                    climate_factors[f'temp_{temperature.lower().replace(" ", "_")}'] = {
                        'total_reviews': len(temp_reviews),
                        'sentiment_distribution': sentiment_dist.to_dict(),
                        'avg_confidence': temp_reviews['confianza'].mean(),
                        'positive_ratio': sentiment_dist.get('positivo', 0),
                        'negative_ratio': sentiment_dist.get('negativo', 0)
                    }
        
        # Identificar combinaciones extremas (muy positivas o muy negativas)
        extreme_combinations = {}
        for desc, data in specific_combinations.items():
            if data['total_reviews'] >= 5:  # Solo combinaciones con datos suficientes
                pos_ratio = data['positive_ratio']
                neg_ratio = data['negative_ratio']
                
                if pos_ratio > 0.7:
                    extreme_combinations[desc] = {'type': 'very_positive', 'ratio': pos_ratio}
                elif neg_ratio > 0.5:
                    extreme_combinations[desc] = {'type': 'very_negative', 'ratio': neg_ratio}
                elif pos_ratio > 0.5 and 'lluvia' in desc.lower():
                    extreme_combinations[desc] = {'type': 'rain_resilient', 'ratio': pos_ratio}
        
        return {
            'correlation': correlation,
            'climate_factors': climate_factors,
            'specific_combinations': specific_combinations,
            'extreme_combinations': extreme_combinations,
            'weather_impact_strength': abs(correlation),
            'sentiment_distribution_overall': city_reviews['sentimiento'].value_counts(normalize=True).to_dict(),
            'total_unique_weather_conditions': len(unique_descriptions),
            'weather_diversity_score': len(unique_descriptions) / len(city_reviews) if len(city_reviews) > 0 else 0
        }
    
    def save_processed_data(self, processed_data, output_path):
        """
        Guarda datos procesados
        """
        print(f"Guardando datos procesados en {output_path}")
        
        with open(output_path, 'wb') as f:
            pickle.dump({
                'processed_data': processed_data,
                'scalers': self.scalers,
                'encoders': self.encoders
            }, f)
        
        print("Datos guardados exitosamente!")

In [5]:
if __name__ == "__main__":
    # Driver script: load the raw CSVs, build features, match reviews to
    # activities, build the user-item matrices and persist the results.

    # Input file locations, keyed by the names load_and_clean_datasets() reads
    file_paths = {
        'activities': ['atracciones_civitatis_procesado.csv', 'booking_atracciones_limpios.csv', 'booking_hoteles_limpio.csv',
                      'detripadvisor_procesado.csv', 'getyourguide_procesado.csv'],
        'reviews': 'comentarios_final_definitivo_con_descripcion.csv',
        'un_tourism': ['inbound_arrivals_by_region.csv', 'inbound_arrivals.csv', 'inbound_expenditure.csv',
                      'outbound_departures.csv', 'outbound_expenditure.csv', 'tourism_domestic_trips.csv'],
        'commuting_zones': 'data-for-good-at-meta-commuting-zones-march-2023-ciudades.csv',
        'movement_data': 'movements_spain_cities_all.csv',
        'search_trends': 'busquedas_relacionadas_actuales.csv',
        'monthly_interest': 'interes_turistico_mensual_por_ciudad.csv'
    }

    # Storage location for the large datasets.
    # Raw string: "\A" is not a valid escape sequence, so the plain literal
    # triggers an invalid-escape warning (SyntaxWarning on Python 3.12+).
    # The resulting value is unchanged.
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    STORAGE_PATH = r"E:\AI Recomendador"

    # Initialise the preprocessor
    preprocessor = TourismDataPreprocessor()

    print("INICIANDO PROCESAMIENTO OPTIMIZADO PARA DATASETS GRANDES")
    print("=" * 60)

    # Step 1: load datasets
    print("\n1. Cargando datasets...")
    datasets = preprocessor.load_and_clean_datasets(file_paths)

    # Step 2: build features (this call also returns the matched reviews)
    print("\n2. Procesando features y emparejando reviews...")
    processed_features, matched_reviews = preprocessor.prepare_deep_learning_features(datasets)

    if not matched_reviews.empty:
        print(f"\n3. Emparejamiento exitoso: {len(matched_reviews):,} reviews emparejadas")

        # Step 4: build the user-item matrices, passing the on-disk storage path
        print("\n4. Creando matrices usuario-item (usando storage en disco)...")
        sample_matrices = preprocessor.create_user_item_matrix(
            matched_reviews, datasets['activities'], storage_path=STORAGE_PATH
        )

        # Step 5: prepare the training data
        print("\n5. Preparando datos de entrenamiento...")
        training_sample, training_mappings = preprocessor.prepare_training_data_for_deep_learning(
            matched_reviews, storage_path=STORAGE_PATH, sample_size=100000
        )

        # Step 6: bundle everything and save it
        print("\n6. Guardando datos procesados...")
        all_processed_data = {
            'city_features': processed_features,
            'user_item_matrix': sample_matrices[0],  # in-memory sample only
            'user_sentiment_matrix': sample_matrices[1],
            'user_confidence_matrix': sample_matrices[2],
            'matched_reviews': matched_reviews,
            'training_sample': training_sample,
            'training_mappings': training_mappings,
            'raw_datasets': datasets
        }

        efficient_data = preprocessor.save_processed_data_efficiently(
            all_processed_data, 'processed_tourism_data_efficient.pkl', STORAGE_PATH
        )

        # Final report: output files
        print("\nPROCESAMIENTO COMPLETADO EXITOSAMENTE")
        print("=" * 60)
        print("Archivos principales:")
        print("   • processed_tourism_data_efficient.pkl (metadatos y referencias)")
        print(f"   • {STORAGE_PATH}/ (matrices completas en HDF5)")
        print("   • matched_reviews_sample.pkl (muestra para análisis rápido)")

        # Final report: summary statistics
        print("\nEstadísticas finales:")
        print(f"   • Reviews emparejadas: {len(matched_reviews):,}")
        print(f"   • Usuarios sintéticos: {matched_reviews['user_id'].nunique():,}")
        print(f"   • Actividades cubiertas: {matched_reviews['item_id'].nunique():,}")
        print(f"   • Ciudades procesadas: {len(processed_features)}")
        print(f"   • Storage utilizado: ~{preprocessor._estimate_storage_size(STORAGE_PATH)} GB")

        # Usage hint for reloading subsets of the stored matrices later
        print("\nPara cargar matrices específicas posteriormente:")
        print("   subset_matrix = preprocessor.load_user_item_matrices_from_storage(")
        print(f"       storage_path='{STORAGE_PATH}',")
        print("       user_subset=['user_1', 'user_2', ...],")
        print("       item_subset=['item_1', 'item_2', ...]")
        print("   )")

    else:
        print("\nNo se encontraron emparejamientos. Revisar datos de entrada.")

        # No matches: save the basic data with empty placeholder matrices
        basic_processed_data = {
            'city_features': processed_features,
            'user_item_matrix': pd.DataFrame(),
            'user_sentiment_matrix': pd.DataFrame(),
            'user_confidence_matrix': pd.DataFrame(),
            'matched_reviews': pd.DataFrame(),
            'raw_datasets': datasets
        }

        preprocessor.save_processed_data(basic_processed_data, 'processed_tourism_data_basic.pkl')

INICIANDO PROCESAMIENTO OPTIMIZADO PARA DATASETS GRANDES

1. Cargando datasets...
Cargando datasets principales...

2. Procesando features y emparejando reviews...
Preparando features para deep learning...
Validando datasets...
  Activities dataset: 32669 registros
  Reviews dataset: 855745 registros
Emparejando reviews con actividades...
Normalizando nombres de ciudades...
Ciudades después de normalización:
  Activities: ['Barcelona', 'Gran Canaria', 'Madrid', 'Malaga', 'Mallorca', 'Sevilla', 'Tenerife', 'Valencia']
  Sentiment: ['Barcelona', 'Gran Canaria', 'Madrid', 'Malaga', 'Mallorca', 'Sevilla', 'Tenerife', 'Valencia']
Procesando 32669 actividades...
Progreso: 0/32669 (0.0%) - Matches: 0
Progreso: 1000/32669 (3.1%) - Matches: 4118
Progreso: 2000/32669 (6.1%) - Matches: 9329
Progreso: 3000/32669 (9.2%) - Matches: 13625
Progreso: 4000/32669 (12.2%) - Matches: 22823
Progreso: 5000/32669 (15.3%) - Matches: 26761
Progreso: 6000/32669 (18.4%) - Matches: 28235
Progreso: 7000/32669 (21.4

KeyboardInterrupt: 