In [4]:
import schedule
import time
import logging
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
import sqlite3
from typing import Dict, List, Optional
import argparse
import tensorflow as tf

In [6]:
# Configure the first visible GPU. TensorFlow only accepts these settings
# before the device has been initialized, and documents RuntimeError /
# ValueError for calls it cannot honor, so the block is guarded to keep the
# cell safe to re-run in a live kernel.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        # Hard cap on the memory TensorFlow may allocate on this device
        # (memory_limit is expressed in MB in the TensorFlow API).
        # NOTE(review): the TensorFlow GPU guide presents memory growth and a
        # hard memory limit as alternative strategies - confirm both are wanted.
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3584)]
        )
    except (RuntimeError, ValueError) as gpu_config_error:
        # print() rather than logging: logging is configured in a later cell,
        # and calling logging.warning() first would install a default root
        # handler, turning that later basicConfig() call into a no-op.
        print(f"No se pudo aplicar la configuración de GPU: {gpu_config_error}")

In [8]:
# Root logger setup: INFO and above, written both to a log file in the
# working directory and to the console stream.
_log_handlers = [
    logging.FileHandler('tourism_recommender_fast.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [10]:
class TourismRecommenderPipeline:
    """
    Pipeline optimized for an RTX 3050 Ti.

    Driven by a JSON config file, it runs three stages: SQLite schema setup,
    data preprocessing (delegated to ``TourismDataPreprocessor``) and model
    training or re-loading (delegated to ``TourismRecommenderModel``).
    """

    # Model capacity, reduced for 4GB VRAM. Shared by the training path and the
    # model re-loading path so the two cannot drift apart.
    EMBEDDING_DIM = 32
    DENSE_UNITS = 64

    # Reduced feature dimensions handed to build_hybrid_model.
    CONTEXTUAL_FEATURES_DIM = 10
    WEATHER_FEATURES_DIM = 15

    # Training settings.
    TRAINING_SAMPLE_SIZE = 30000
    EPOCHS = 30              # reduced for speed
    BATCH_SIZE = 128         # optimal for 4GB VRAM
    VALIDATION_SPLIT = 0.2

    # A saved model younger than this many days is reused instead of retrained.
    MAX_MODEL_AGE_DAYS = 3

    def __init__(self, config_path: str):
        """
        Load the configuration and initialize the pipeline state.

        Args:
            config_path: Path to a JSON file providing ``database.path``,
                ``model.path``, ``data_paths`` and, optionally,
                ``storage_path`` (defaults to ``'AI_Recomendador'``).

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If a required key is missing.
        """
        self.config = self._load_config(config_path)
        self.db_path = self.config['database']['path']
        self.model_path = self.config['model']['path']
        self.data_paths = self.config['data_paths']
        self.storage_path = self.config.get('storage_path', 'AI_Recomendador')

        # Pipeline state
        self.last_training = None
        self.last_data_update = None

        # Collaborators created lazily by the processing / training stages.
        self.data_preprocessor = None
        self.recommender_model = None

        logging.info("Pipeline RTX 3050 Ti inicializado")

    def _load_config(self, config_path: str) -> dict:
        """Read and return the JSON configuration stored at ``config_path``."""
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(config_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def setup_database(self):
        """
        Create the compact result tables if they do not exist yet.

        Creates ``recommendations_compact`` and ``model_metrics_compact`` in
        the SQLite database at ``self.db_path``. Safe to call repeatedly.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS recommendations_compact (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT,
                    city TEXT,
                    item_id TEXT,
                    final_score REAL,
                    timestamp DATETIME,
                    boost_factors TEXT
                )
            ''')

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS model_metrics_compact (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    model_version TEXT,
                    timestamp DATETIME,
                    val_loss REAL,
                    val_accuracy REAL,
                    training_time_minutes REAL,
                    gpu_used BOOLEAN
                )
            ''')

            conn.commit()
        finally:
            # Always release the connection, even if a statement failed.
            conn.close()
        logging.info("Base de datos compacta configurada")

    def data_ingestion_and_processing_fast(self):
        """
        Load, preprocess and persist the datasets listed in ``self.data_paths``.

        Returns:
            A dict with the keys ``city_features``, ``user_item_matrix``,
            ``user_sentiment_matrix``, ``user_confidence_matrix``,
            ``matched_reviews``, ``training_sample``, ``training_mappings`` and
            ``raw_datasets``; or ``None`` when no matched reviews were found.

        Raises:
            Exception: Any error from the preprocessing stage is logged and
                re-raised unchanged.
        """
        start_time = time.time()
        logging.info("Iniciando procesamiento rápido de datos")

        try:
            # Imported lazily: project-local module, only needed by this stage.
            from fixed_data_prep_pipeline import TourismDataPreprocessor

            self.data_preprocessor = TourismDataPreprocessor()

            # Load datasets
            datasets = self.data_preprocessor.load_and_clean_datasets(self.data_paths)

            # Feature preparation
            processed_features, matched_reviews = self.data_preprocessor.prepare_deep_learning_features(datasets)

            # Nothing to build matrices or training data from.
            if matched_reviews.empty:
                logging.warning("No se encontraron reviews emparejadas")
                return None

            # Build the matrices using on-disk storage
            sample_matrices = self.data_preprocessor.create_user_item_matrix(
                matched_reviews, datasets['activities'], storage_path=self.storage_path
            )

            # Prepare the training sample and its id mappings
            training_sample, training_mappings = self.data_preprocessor.prepare_training_data_for_deep_learning(
                matched_reviews, storage_path=self.storage_path, sample_size=self.TRAINING_SAMPLE_SIZE
            )

            processed_data = {
                'city_features': processed_features,
                'user_item_matrix': sample_matrices[0],
                'user_sentiment_matrix': sample_matrices[1],
                'user_confidence_matrix': sample_matrices[2],
                'matched_reviews': matched_reviews,
                'training_sample': training_sample,
                'training_mappings': training_mappings,
                'raw_datasets': datasets
            }

            # Persist the processed bundle
            self.data_preprocessor.save_processed_data_efficiently(
                processed_data, 'processed_tourism_data_fast.pkl', self.storage_path
            )

            self.last_data_update = datetime.now()
            duration = time.time() - start_time

            logging.info(f"Procesamiento rápido completado en {duration/60:.1f} minutos")
            return processed_data

        except Exception as e:
            logging.error(f"Error en procesamiento: {e}")
            raise

    def _model_is_recent(self) -> bool:
        """
        Tell whether a saved model exists and is younger than MAX_MODEL_AGE_DAYS.

        The age is measured from ``self.last_training`` when this process has
        trained a model; otherwise from the model file's modification time, so
        that the check also works in a freshly started process.
        """
        model_file = Path(self.model_path)
        if not model_file.exists():
            return False

        trained_at = self.last_training
        if trained_at is None:
            trained_at = datetime.fromtimestamp(model_file.stat().st_mtime)

        return (datetime.now() - trained_at).days < self.MAX_MODEL_AGE_DAYS

    def model_training_fast(self, processed_data: dict, retrain: bool = False):
        """
        Train the recommender model, or reuse a recent saved one.

        Args:
            processed_data: Output of ``data_ingestion_and_processing_fast``;
                the keys ``training_sample`` and ``training_mappings`` are read.
            retrain: When True, always train even if a recent model exists.

        Returns:
            The ``TourismRecommenderModel`` wrapper holding the trained or
            loaded model.

        Raises:
            ValueError: If ``processed_data`` holds no training sample.
            Exception: Any other training error is logged and re-raised.
        """
        start_time = time.time()
        logging.info("Iniciando entrenamiento optimizado para RTX 3050 Ti")

        try:
            # Check whether retraining is needed at all
            if not retrain and self._model_is_recent():
                logging.info("Modelo reciente encontrado, saltando entrenamiento")
                existing_model = self._load_existing_model_fast()
                if existing_model is not None:
                    return existing_model
                # A model that cannot be loaded must not abort the run:
                # fall through and train a new one.
                logging.warning("No se pudo cargar el modelo existente; se entrenará uno nuevo")

            # Imported lazily: project-local module, only needed by this stage.
            from deep_learning_recommender import TourismRecommenderModel

            self.recommender_model = TourismRecommenderModel(
                embedding_dim=self.EMBEDDING_DIM,
                dense_units=self.DENSE_UNITS
            )

            # Use the training data prepared by the processing stage
            training_data = processed_data.get('training_sample', pd.DataFrame())
            training_mappings = processed_data.get('training_mappings', {})

            if training_data.empty:
                raise ValueError("No hay datos de entrenamiento preparados")

            # Prepare the data for the model
            training_df = self.recommender_model.prepare_training_data({
                'training_sample': training_data,
                'training_mappings': training_mappings
            })

            # Build the compact model; sizes come from the id mappings.
            self.recommender_model.build_hybrid_model(
                num_users=len(training_mappings['user_to_idx']),
                num_items=len(training_mappings['item_to_idx']),
                num_cities=len(training_mappings['city_to_idx']),
                contextual_features_dim=self.CONTEXTUAL_FEATURES_DIM,
                weather_features_dim=self.WEATHER_FEATURES_DIM
            )

            logging.info("Iniciando entrenamiento con configuración RTX 3050 Ti...")

            history = self.recommender_model.train_model(
                training_df,
                epochs=self.EPOCHS,
                batch_size=self.BATCH_SIZE,
                validation_split=self.VALIDATION_SPLIT
            )

            # Save the trained model
            self.recommender_model.save_model(self.model_path)

            # Metrics: best validation values over all epochs.
            metrics = {
                'val_loss': min(history.history['val_loss']),
                'val_accuracy': max(history.history['val_interaction_accuracy']),
                'training_time': (time.time() - start_time) / 60,
                # Record whether TensorFlow sees a GPU instead of assuming one.
                'gpu_used': bool(tf.config.experimental.list_physical_devices('GPU'))
            }

            self._save_model_metrics_fast(metrics)

            self.last_training = datetime.now()
            duration = time.time() - start_time

            logging.info(f"Entrenamiento completado en {duration/60:.1f} minutos")
            logging.info(f"Mejor val_loss: {metrics['val_loss']:.4f}")

            return self.recommender_model

        except Exception as e:
            logging.error(f"Error en entrenamiento: {e}")
            raise

    def _load_existing_model_fast(self):
        """
        Load the saved Keras model and its id mappings from disk.

        Returns:
            The populated ``TourismRecommenderModel`` wrapper, or ``None`` if
            anything fails while loading (the error is logged).
        """
        try:
            from deep_learning_recommender import TourismRecommenderModel

            # Load the model
            model = tf.keras.models.load_model(self.model_path)

            # Load the mappings.
            # NOTE(security): pickle.load can execute arbitrary code; only
            # files written by this pipeline should ever be placed here.
            mappings_path = Path(self.storage_path) / 'training_mappings_gpu_optimized.pkl'
            with open(mappings_path, 'rb') as f:
                mappings = pickle.load(f)

            # Rebuild the wrapper object around the loaded model
            self.recommender_model = TourismRecommenderModel(
                embedding_dim=self.EMBEDDING_DIM,
                dense_units=self.DENSE_UNITS
            )
            self.recommender_model.model = model
            self.recommender_model.user_to_idx = mappings['user_to_idx']
            self.recommender_model.item_to_idx = mappings['item_to_idx']
            self.recommender_model.city_to_idx = mappings['city_to_idx']

            logging.info("Modelo existente cargado rápidamente")
            return self.recommender_model

        except Exception as e:
            logging.error(f"Error cargando modelo existente: {e}")
            return None

    def _save_model_metrics_fast(self, metrics: dict):
        """
        Insert one row of training metrics into ``model_metrics_compact``.

        Args:
            metrics: May contain ``val_loss``, ``val_accuracy``,
                ``training_time`` (minutes) and ``gpu_used``; missing entries
                are stored as 0 / False.
        """
        # One clock reading so the version label and the timestamp agree.
        now = datetime.now()

        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                INSERT INTO model_metrics_compact
                (model_version, timestamp, val_loss, val_accuracy, training_time_minutes, gpu_used)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                f"fast_v{now.strftime('%Y%m%d_%H%M')}",
                # Explicit ISO text: the same format sqlite3's default datetime
                # adapter produced, without relying on that deprecated adapter.
                now.isoformat(sep=' '),
                metrics.get('val_loss', 0),
                metrics.get('val_accuracy', 0),
                metrics.get('training_time', 0),
                metrics.get('gpu_used', False)
            ))
            conn.commit()
        finally:
            # Always release the connection, even if the insert failed.
            conn.close()

    def run_fast_pipeline(self, retrain_model: bool = False):
        """
        Run the full pipeline: database setup, data processing, model training.

        Args:
            retrain_model: Forwarded to ``model_training_fast`` as ``retrain``.

        Returns:
            True when every stage completed; False when a stage produced no
            result or raised (the error is logged with its traceback).
        """
        logging.info("=== INICIANDO PIPELINE RÁPIDO RTX 3050 Ti ===")

        total_start_time = time.time()

        try:
            # 1. Database setup
            self.setup_database()

            # 2. Data processing
            processed_data = self.data_ingestion_and_processing_fast()

            if processed_data is None:
                logging.error("No se pudieron procesar los datos")
                return False

            # 3. Model training
            model = self.model_training_fast(processed_data, retrain=retrain_model)

            if model is None:
                logging.error("No se pudo entrenar/cargar el modelo")
                return False

            total_duration = time.time() - total_start_time

            logging.info("=== PIPELINE RÁPIDO COMPLETADO EXITOSAMENTE ===")
            logging.info(f"Tiempo total: {total_duration/60:.1f} minutos")

            # Final statistics
            training_sample = processed_data.get('training_sample', pd.DataFrame())
            if not training_sample.empty:
                logging.info(f"Reviews de entrenamiento: {len(training_sample):,}")
                logging.info(f"Usuarios: {training_sample['user_id'].nunique():,}")
                logging.info(f"Items: {training_sample['item_id'].nunique():,}")

            return True

        except Exception as e:
            total_duration = time.time() - total_start_time
            # The exception is swallowed here, so keep its traceback in the log.
            logging.error(f"ERROR EN PIPELINE: {e}", exc_info=True)
            logging.error(f"Tiempo antes del error: {total_duration/60:.1f} minutos")
            return False

In [12]:
def main(argv: Optional[List[str]] = None):
    """
    Command-line entry point: ensure a config file exists, then run the pipeline.

    Args:
        argv: Argument list to parse; ``None`` (the default) parses the
            process arguments. Unrecognized arguments are ignored rather than
            aborting, so the function also works when the host process was
            started with extra arguments (e.g. a notebook kernel's ``-f``
            connection-file argument).

    Recognized options:
        --config   Path of the JSON config (default ``config_fast.json``). A
                   default config is written there when the file is missing.
        --retrain  Force retraining.
    """
    parser = argparse.ArgumentParser(description='Tourism Recommender Fast Pipeline')
    parser.add_argument('--config', default='config_fast.json', help='Configuración')
    parser.add_argument('--retrain', action='store_true', help='Forzar reentrenamiento')

    # parse_known_args instead of parse_args: the latter calls sys.exit() on
    # any argument it does not know.
    args, ignored_args = parser.parse_known_args(argv)
    if ignored_args:
        logging.info(f"Argumentos no reconocidos ignorados: {ignored_args}")

    # Create the default configuration if none exists
    if not Path(args.config).exists():
        default_config = {
            "database": {"path": "tourism_recommender_fast.db"},
            "model": {"path": "tourism_model_rtx3050ti.h5"},
            "storage_path": "AI_Recomendador",
            "data_paths": {
                "activities": ["activities_1.csv", "activities_2.csv", "activities_3.csv",
                              "activities_4.csv", "activities_5.csv"],
                "reviews": "sentiment_analysis_with_weather.csv",
                "un_tourism": ["un_tourism_1.csv", "un_tourism_2.csv", "un_tourism_3.csv",
                              "un_tourism_4.csv", "un_tourism_5.csv", "un_tourism_6.csv",
                              "un_tourism_7.csv"],
                "commuting_zones": "meta_commuting_zones.csv",
                "movement_data": "meta_movement_data.csv",
                "search_trends": "google_trends_searches.csv",
                "monthly_interest": "google_trends_monthly_interest.csv"
            }
        }

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(args.config, 'w', encoding='utf-8') as f:
            json.dump(default_config, f, indent=2)

        logging.info(f"Configuración optimizada creada: {args.config}")

    # Run the fast pipeline
    pipeline = TourismRecommenderPipeline(args.config)

    success = pipeline.run_fast_pipeline(retrain_model=args.retrain)

    if success:
        print("\nPIPELINE RÁPIDO COMPLETADO CON ÉXITO!")
        print("Optimizado para RTX 3050 Ti")
        print("Modelo y datos guardados en disco")
        print("Métricas guardadas en BD")
    else:
        print("\nPIPELINE FALLÓ - Revisar logs")

In [None]:
# Script entry point: run the pipeline only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()