In [6]:
import duckdb
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

import pandas as pd # Only used for date range and holiday generation
import holidays # pip install holidays
import datetime

In [7]:
GOLD_VENTAS_SEMANALES_CLUSTERED_LIGHTGBM = '../data/gold_ventas_semanales_clustered_lgbm.parquet'
GOLD_VENTAS_SEMANALES_CLUSTERED = '../data/gold_ventas_semanales_training_clustered.parquet'


In [9]:


def feature_engineering_lightgbm(parquet_path: str, holiday_country: str = 'ES') -> pa.Table:
    """
    Reads a Parquet file containing time series data and generates features
    using PyArrow and DuckDB.

    Args:
        parquet_path: Path to the input Parquet file.
                        Expected columns: establecimiento, material, week, weekly_volume,
                                        has_promo, is_covid_period, cluster_label (optional).
        holiday_country: Country code for the holidays library (e.g., 'ES').

    Returns:
        A PyArrow Table containing the original data plus the generated features.
        Returns None if an error occurs during loading or processing.
    """
    try:
        # --- 1. Load Data with PyArrow ---
        print(f"Loading data from: {parquet_path}")
        arrow_table = pq.read_table(parquet_path)
        print(f"Initial table loaded with shape: ({arrow_table.num_rows}, {arrow_table.num_columns})")
        print("Initial schema:")
        print(arrow_table.schema)

        # Validate essential columns
        required_cols = {'establecimiento', 'material', 'week', 'weekly_volume'}
        if not required_cols.issubset(arrow_table.schema.names):
            missing = required_cols - set(arrow_table.schema.names)
            print(f"Error: Missing required columns: {missing}")
            return None

        # Ensure 'week' is a date type
        week_col_idx = arrow_table.schema.get_field_index('week')
        if week_col_idx != -1 and not pa.types.is_temporal(arrow_table.schema.field('week').type):
            print("Attempting to cast 'week' column to date32...")
            try:
                # Try casting, assuming it's string or similar. Adjust if needed.
                date_col = pc.cast(arrow_table.column('week'), pa.date32())
                arrow_table = arrow_table.set_column(week_col_idx, pa.field('week', pa.date32()), date_col)
                print("'week' column successfully cast to date32.")
            except Exception as e:
                print(f"Error casting 'week' column to date: {e}. Please ensure it's in a parseable format.")
                return None
        elif week_col_idx == -1:
                print("Error: 'week' column not found.")
                return None


    except Exception as e:
        print(f"Error loading Parquet file '{parquet_path}': {e}")
        return None

    try:
        # --- 2. Prepare Holiday Data ---
        print("Generating holiday features...")
        # Determine year range from data
        min_date_pa = pc.min(arrow_table.column('week')).as_py()
        max_date_pa = pc.max(arrow_table.column('week')).as_py()

        # Handle potential NaT dates if column is empty or has issues
        if min_date_pa is None or max_date_pa is None:
                print("Error: Could not determine date range from 'week' column.")
                return None

        # Ensure they are date objects if they are datetime
        if isinstance(min_date_pa, datetime.datetime):
            min_date_pa = min_date_pa.date()
        if isinstance(max_date_pa, datetime.datetime):
            max_date_pa = max_date_pa.date()

        years = list(range(min_date_pa.year, max_date_pa.year + 1))

        # Get holidays using the library
        country_holidays = holidays.country_holidays(holiday_country, years=years)
        all_holiday_dates = {**country_holidays} # Combine dicts

        # Create a PyArrow table with holiday dates for joining in DuckDB
        holiday_dates_list = sorted(list(all_holiday_dates.keys()))
        holidays_table = pa.Table.from_pydict({
            'holiday_date': pa.array(holiday_dates_list, type=pa.date32()),
            'is_holiday_flag': pa.array([1] * len(holiday_dates_list), type=pa.int8())
        })
        print(f"Generated {holidays_table.num_rows} holiday entries for years {min(years)}-{max(years)}.")

        # --- 3. Connect to DuckDB & Register Tables ---
        print("Connecting to DuckDB and registering tables...")
        con = duckdb.connect(database=':memory:', read_only=False)
        con.register('sales_data', arrow_table)
        con.register('holidays_data', holidays_table)

        # --- 4. Build and Execute Feature Engineering SQL Query ---
        print("Building and executing feature engineering SQL query...")
        # Define rolling window sizes and lag sizes
        rolling_windows = [4, 8, 12, 52]
        lags = [1, 2, 4, 8, 12, 26, 52]

        # Start building the SQL query using CTEs
        sql_parts = []
        sql_parts.append("""
    WITH InputData AS (
    -- Select and potentially cast types if needed
    SELECT
        establecimiento,
        material,
        week,
        CAST(weekly_volume AS DOUBLE) AS weekly_volume, -- Use double for calculations
        try_cast(has_promo as TINYINT) as has_promo, -- Ensure correct types
        try_cast(is_covid_period as TINYINT) as is_covid_period,
        cluster_label -- Assumes this column exists from clustering step
    FROM sales_data
    ),
    DateFeatures AS (
    -- Calculate basic and cyclical date features
    SELECT
        *,
        CAST(strftime(week, '%Y') AS INTEGER) AS year,
        CAST(strftime(week, '%m') AS INTEGER) AS month,
        CAST(strftime(week, '%W') AS INTEGER) AS week_of_year, -- ISO week number
        sin(2 * pi() * CAST(strftime(week, '%m') AS INTEGER) / 12.0) AS month_sin,
        cos(2 * pi() * CAST(strftime(week, '%m') AS INTEGER) / 12.0) AS month_cos,
        sin(2 * pi() * CAST(strftime(week, '%W') AS INTEGER) / 52.0) AS week_of_year_sin,
        cos(2 * pi() * CAST(strftime(week, '%W') AS INTEGER) / 52.0) AS week_of_year_cos,
        -- Add day_of_year sin/cos if needed
    FROM InputData
    ),
    HolidayFeatures AS (
    -- Join with holiday data to get flags
    SELECT
        d.*,
        -- Flag if the week *starts* on a holiday date (adjust logic if needed)
        COALESCE(h.is_holiday_flag, 0) AS is_holiday_exact_date,
        -- Flag if *any* day within the week ending on 'week' is a holiday
        -- This requires looking back 6 days. Max window function helps.
        MAX(COALESCE(h.is_holiday_flag, 0)) OVER (
            PARTITION BY establecimiento, material
            ORDER BY week
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) AS is_holiday_in_week
    FROM DateFeatures d
    LEFT JOIN holidays_data h ON d.week = h.holiday_date -- Join on exact date first
    ),
    LagFeatures AS (
    -- Calculate lag features for weekly_volume
    SELECT
        *,
    """)
        # Add lag features dynamically
        lag_cols = []
        for lag in lags:
            lag_cols.append(f"        LAG(weekly_volume, {lag}) OVER (PARTITION BY establecimiento, material ORDER BY week) AS volume_lag_{lag}")
        sql_parts.append(",\n".join(lag_cols))
        sql_parts.append("""
    FROM HolidayFeatures
    ),
    RollingWindowFeatures AS (
    -- Calculate rolling window features
    SELECT
        *,
    """)
        # Add rolling window features dynamically
        rolling_cols = []
        base_col = 'weekly_volume'
        # Use shift(1) equivalent by adjusting the window frame
        # ROWS BETWEEN N PRECEDING AND 1 PRECEDING excludes current row
        for w in rolling_windows:
            rolling_cols.extend([
                f"        AVG({base_col}) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_mean_{w}w",
                f"        MEDIAN({base_col}) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_median_{w}w",
                f"        STDDEV_SAMP({base_col}) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_std_{w}w",
                f"        MIN({base_col}) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_min_{w}w",
                f"        MAX({base_col}) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_max_{w}w",
                # Stats on non-zero values within the window
                f"        COUNT(CASE WHEN {base_col} > 0 THEN 1 ELSE NULL END) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_count_nonzero_{w}w",
                f"        AVG(CASE WHEN {base_col} > 0 THEN {base_col} ELSE NULL END) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_mean_nonzero_{w}w",
                f"        MEDIAN(CASE WHEN {base_col} > 0 THEN {base_col} ELSE NULL END) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_median_nonzero_{w}w",
                f"        STDDEV_SAMP(CASE WHEN {base_col} > 0 THEN {base_col} ELSE NULL END) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS volume_roll_std_nonzero_{w}w",
                # Ratio of non-zero weeks in window
                f"        CAST(COUNT(CASE WHEN {base_col} > 0 THEN 1 ELSE NULL END) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING) AS DOUBLE) / "
                f"          NULLIF(COUNT(*) OVER (PARTITION BY establecimiento, material ORDER BY week ROWS BETWEEN {w-1} PRECEDING AND 1 PRECEDING), 0) AS volume_roll_ratio_nonzero_{w}w"
            ])
        sql_parts.append(",\n".join(rolling_cols))
        sql_parts.append("""
    FROM LagFeatures
    ),
    IntermittencyFeatures AS (
    -- Calculate time since last sale
    SELECT
        *,
        -- Find the date of the previous sale
        LAG(CASE WHEN weekly_volume > 0 THEN week ELSE NULL END IGNORE NULLS) OVER (PARTITION BY establecimiento, material ORDER BY week) AS last_sale_week,
        -- Calculate difference in days
        (week - LAG(CASE WHEN weekly_volume > 0 THEN week ELSE NULL END IGNORE NULLS) OVER (PARTITION BY establecimiento, material ORDER BY week)) AS days_since_last_sale
    FROM RollingWindowFeatures
    )
    -- Final Selection: Select all original columns and newly created features
    SELECT * FROM IntermittencyFeatures ORDER BY establecimiento, material, week;
    """)

        final_sql = "\n".join(sql_parts)
        # print("\n--- Generated SQL Query ---")
        # print(final_sql)
        # print("--- End SQL Query ---")

        # Execute the query
        result_arrow_table = con.execute(final_sql).arrow()
        print(f"Feature engineering completed. Result table shape: ({result_arrow_table.num_rows}, {result_arrow_table.num_columns})")

        # --- 5. Post-processing and Cleanup ---
        con.close() # Close DuckDB connection

        # Handle potential NaNs/Infs introduced (though COALESCE in SQL is preferred)
        # Example: Fill NaNs in 'days_since_last_sale' for the first sale
        if 'days_since_last_sale' in result_arrow_table.schema.names:
                days_col_idx = result_arrow_table.schema.get_field_index('days_since_last_sale')
                days_col = result_arrow_table.column('days_since_last_sale')
                # Fill with a large number or 0, depending on desired meaning
                # Using fill_null which is simpler than pc.if_else for simple replacement
                filled_days_col = days_col.fill_null(0) # Fill initial NaNs with 0 days
                result_arrow_table = result_arrow_table.set_column(days_col_idx, pa.field('days_since_last_sale', filled_days_col.type), filled_days_col)

        # Fill NaNs in rolling features (often occur at the start)
        # Example: fill rolling std with 0, rolling means/medians with 0 or forward fill
        for col_name in result_arrow_table.schema.names:
                if 'roll_' in col_name:
                    col_idx = result_arrow_table.schema.get_field_index(col_name)
                    col = result_arrow_table.column(col_name)
                    # Simple fill with 0 for demonstration
                    # More sophisticated filling might be needed (e.g., ffill within groups - harder without pandas)
                    if pa.types.is_floating(col.type) or pa.types.is_integer(col.type):
                        filled_col = col.fill_null(0)
                        result_arrow_table = result_arrow_table.set_column(col_idx, pa.field(col_name, filled_col.type), filled_col)


        print("Final Schema after feature engineering:")
        print(result_arrow_table.schema)
        print(f"Final table shape: ({result_arrow_table.num_rows}, {result_arrow_table.num_columns})")
        pq.write_table(result_arrow_table, GOLD_VENTAS_SEMANALES_CLUSTERED_LIGHTGBM)
        print(f"Saved features table to {GOLD_VENTAS_SEMANALES_CLUSTERED_LIGHTGBM}")

        # Para asegurarnos que se guarda correctamente, comprobamos si el archivo existe
        if os.path.exists(GOLD_VENTAS_SEMANALES_CLUSTERED_LIGHTGBM):
            print(f"Confirmed: Features file exists at {GOLD_VENTAS_SEMANALES_CLUSTERED_LIGHTGBM}")
        else:
            print(f"Warning: Features file was not created at {GOLD_VENTAS_SEMANALES_CLUSTERED_LIGHTGBM}")




        return result_arrow_table

    except Exception as e:
        print(f"An error occurred during feature engineering: {e}")
        if 'con' in locals() and con:
            con.close()
        return None

In [11]:
processed_table = feature_engineering_lightgbm(GOLD_VENTAS_SEMANALES_CLUSTERED)

Loading data from: ../data/gold_ventas_semanales_training_clustered.parquet
Initial table loaded with shape: (25674861, 7)
Initial schema:
establecimiento: string
material: string
week: date32[day]
has_promo: int32
weekly_volume: double
is_covid_period: int32
cluster_label: int32
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1109
Generating holiday features...
Generated 26 holiday entries for years 2022-2024.
Connecting to DuckDB and registering tables...
Building and executing feature engineering SQL query...
Feature engineering completed. Result table shape: (25674861, 65)
Final Schema after feature engineering:
establecimiento: string
material: string
week: date32[day]
weekly_volume: double
has_promo: int8
is_covid_period: int8
cluster_label: int32
year: int32
month: int32
week_of_year: int32
month_sin: double
month_cos: double
week_of_year_sin: double
week_of_year_cos: double
is_holiday_exact_date: int8
is_holiday_in_week: int8
v

In [3]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import joblib
import os
import gc
import warnings
import lightgbm as lgb

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


In [None]:

PARQUET_FILE = '../data/gold_ventas_semanales_clustered_lgbm.parquet'
OUTPUT_DIR = '../models/lightgbm'
METRICS_FILE = os.path.join(OUTPUT_DIR, 'all_cluster_metrics.csv')
PREDICTIONS_FILE = os.path.join(OUTPUT_DIR, 'all_series_predictions.csv')
PLOTS_DIR = os.path.join(OUTPUT_DIR, 'prediction_plots_by_series')
MODELS_DIR = os.path.join(OUTPUT_DIR, 'trained_cluster_models')

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

TARGET_VARIABLE = 'weekly_volume'
SERIES_ID_COLS = ['establecimiento', 'material']
CLUSTER_COL = 'cluster_label'
DATE_COL = 'week'

parquet_meta = pq.read_metadata(PARQUET_FILE)
ALL_COLS = [col.name for col in parquet_meta.schema]

FEATURES_COLS = [
    col for col in ALL_COLS
    if col not in [TARGET_VARIABLE, DATE_COL, CLUSTER_COL, 'last_sale_week'] 
]

CATEGORICAL_FEATURES = ['establecimiento', 'material', 'cluster_label']
CATEGORICAL_FEATURES_FOR_MODEL = [f for f in ['establecimiento', 'material'] if f in FEATURES_COLS]
NUMERICAL_FEATURES = [col for col in FEATURES_COLS if col not in CATEGORICAL_FEATURES_FOR_MODEL]

def calculate_mase(y_true_train, y_true_test, y_pred_test):
    y_true_train = np.array(y_true_train).flatten()
    y_true_test = np.array(y_true_test).flatten()
    y_pred_test = np.array(y_pred_test).flatten()

    if len(y_true_train) < 2:
        return np.nan

    naive_forecast_error_train = np.mean(np.abs(np.diff(y_true_train)))

    if naive_forecast_error_train < 1e-9:
         model_mae_test = mean_absolute_error(y_true_test, y_pred_test)
         return np.inf if model_mae_test > 1e-9 else 0.0

    model_mae_test = mean_absolute_error(y_true_test, y_pred_test)

    return model_mae_test / naive_forecast_error_train


def evaluate_model(y_true_train, y_true_test, y_pred_test, label="Cluster"):
    metrics = {
        f'{label}_mae': mean_absolute_error(y_true_test, y_pred_test),
        f'{label}_rmse': np.sqrt(mean_squared_error(y_true_test, y_pred_test)),
        f'{label}_mape': mean_absolute_percentage_error(y_true_test, y_pred_test) if np.all(np.abs(y_true_test) > 1e-9) else np.nan,
        f'{label}_r2': r2_score(y_true_test, y_pred_test),
        f'{label}_mase': calculate_mase(y_true_train, y_true_test, y_pred_test)
    }
    mape_key = f'{label}_mape'
    if mape_key in metrics and np.isinf(metrics[mape_key]):
         metrics[mape_key] = np.nan
    return metrics


def plot_predictions(dates_test, y_true_test, y_pred_test, estab, material, cluster_id, filepath):
    plt.figure(figsize=(15, 6))
    plt.plot(dates_test, y_true_test, label='Real', marker='.', linestyle='-')
    plt.plot(dates_test, y_pred_test, label=f'Predicción (Cluster {cluster_id})', marker='x', linestyle='--')
    plt.title(f'Predicción vs Real - {estab} / {material} (Cluster {cluster_id})')
    plt.xlabel('Semana')
    plt.ylabel('Volumen Semanal')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(filepath)
    plt.close()

try:
    print(f"Leyendo columna de clusters '{CLUSTER_COL}' del archivo: {PARQUET_FILE}")
    pf = pq.ParquetFile(PARQUET_FILE)
    cluster_labels_df = pf.read(columns=[CLUSTER_COL]).to_pandas()
    unique_clusters = cluster_labels_df[CLUSTER_COL].unique()
    unique_clusters = [c for c in unique_clusters if pd.notna(c)] 
    print(f"Encontrados {len(unique_clusters)} clusters únicos.")
    del cluster_labels_df
    gc.collect()
except Exception as e:
    print(f"Error al leer la columna de clusters del Parquet: {e}")
    exit()

all_cluster_metrics = []
header_preds = SERIES_ID_COLS + [DATE_COL, 'actual_volume', 'predicted_volume', CLUSTER_COL]
pd.DataFrame(columns=header_preds).to_csv(PREDICTIONS_FILE, index=False)

N_SPLITS_CV = 5 
N_ITER_HPT = 10 
SCORING_METRIC_HPT = 'neg_mean_absolute_error' 

param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2', 0.7, 1.0] 
}
try:
    pf = pq.ParquetFile(PARQUET_FILE)
    # Leer solo la columna del cluster para eficiencia
    cluster_labels_df = pf.read(columns=[CLUSTER_COL]).to_pandas()
    unique_clusters = cluster_labels_df[CLUSTER_COL].unique()
    # Remover posibles NaNs si existen
    unique_clusters = [c for c in unique_clusters if pd.notna(c)] 
    print(f"Encontrados {len(unique_clusters)} clusters únicos.")
    del cluster_labels_df # Liberar memoria
    gc.collect()
except Exception as e:
    print(f"Error al leer la columna de clusters del Parquet: {e}")
    # Intentar cargar todo si es necesario (cuidado con la memoria)
    # O manejar el error de otra forma
    exit()


# --- 4. Procesamiento por Cluster (Entrenamiento, CV, HPT, Evaluación) ---

all_cluster_metrics = []
# El archivo de predicciones se inicializa una vez
header_preds = SERIES_ID_COLS + [DATE_COL, 'actual_volume', 'predicted_volume', CLUSTER_COL]
pd.DataFrame(columns=header_preds).to_csv(PREDICTIONS_FILE, index=False)


# Configuración de CV y HPT (igual que antes)
N_SPLITS_CV = 5 
N_ITER_HPT = 10 
SCORING_METRIC_HPT = 'neg_mean_absolute_error' 

# Define el espacio de búsqueda de hiperparámetros para RandomForestRegressor (igual que antes)
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2', 0.7, 1.0] 
}

# Inicializar archivo de métricas de cluster
header_metrics = [CLUSTER_COL] + [f'Cluster_{m}' for m in ['mae', 'rmse', 'mape', 'r2', 'mase']] + ['best_params']
pd.DataFrame(columns=header_metrics).to_csv(METRICS_FILE, index=False)


print(f"\nIniciando procesamiento para {len(unique_clusters)} clusters...")
# Replace tqdm with print statements for progress tracking
for cluster_count, cluster_id in enumerate(unique_clusters):
    print(f"\n--- Procesando Cluster {cluster_count+1}/{len(unique_clusters)}: ID = {cluster_id} ---")
    
    try:
        # 4.1. Cargar datos para el cluster actual
        print(f"Cargando datos para el cluster {cluster_id}...")
        filters = [(CLUSTER_COL, '=', cluster_id)]
        cluster_df = pd.read_parquet(PARQUET_FILE, filters=filters)
        
        # Asegurar orden temporal GLOBAL dentro del cluster para CV
        # Primero por serie, luego por fecha para mantener bloques de series juntos
        cluster_df = cluster_df.sort_values(by=SERIES_ID_COLS + [DATE_COL]).reset_index(drop=True)
        print(f"Cluster {cluster_id}: {len(cluster_df)} filas cargadas.")

        # Mínimo de datos para entrenar/validar (ajusta según necesidad)
        if len(cluster_df) < (N_SPLITS_CV + 2) * 2: # Un umbral más alto para clusters
            print(f"Datos insuficientes para CV en cluster {cluster_id} ({len(cluster_df)} filas). Saltando cluster.")
            continue
            
        # 4.2. Preparar X e y para el cluster
        X_cluster = cluster_df[FEATURES_COLS]
        y_cluster = cluster_df[TARGET_VARIABLE]
        
        # Identificadores y fechas para post-procesamiento (predicciones, gráficos)
        ids_cluster = cluster_df[SERIES_ID_COLS + [CLUSTER_COL]]
        dates_cluster = cluster_df[DATE_COL]

        # 4.3. Preprocesamiento Pipeline
        # Incluye Imputación para numéricas y OneHotEncoding para categóricas
        
        # Pipeline para numéricas: Imputar NaNs (ej. con mediana)
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median'))
        ])

        # Pipeline para categóricas: Imputar NaNs (con constante como 'missing') y luego OneHotEncode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
        ])

        # Crear el preprocesador con ColumnTransformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, NUMERICAL_FEATURES),
                ('cat', categorical_transformer, CATEGORICAL_FEATURES_FOR_MODEL)
            ], 
            remainder='passthrough' # Mantiene columnas no especificadas si FEATURES_COLS no fue exhaustivo
        )
        
        # 4.4. Separar un conjunto de Test Final (Hold-out) para el CLUSTER
        print(f"Realizando split Train/Test para el cluster {cluster_id}...")
        test_size_ratio = 0.2 # Usar el último 20% como test
        n_rows = len(cluster_df)
        split_point = int(n_rows * (1 - test_size_ratio))

        # Asegurarse que hay suficientes datos para train y test
        if split_point < N_SPLITS_CV + 1 or n_rows - split_point < 1:
             print(f"Datos insuficientes para split Train/Test adecuado en cluster {cluster_id}. Saltando.")
             continue
             
        X_train_val, X_test = X_cluster.iloc[:split_point], X_cluster.iloc[split_point:]
        y_train_val, y_test = y_cluster.iloc[:split_point], y_cluster.iloc[split_point:]
        
        # Guardar IDs y fechas correspondientes al conjunto de test para el guardado final
        ids_test = ids_cluster.iloc[split_point:]
        dates_test = dates_cluster.iloc[split_point:]
        y_true_train_for_mase = y_train_val.values # Para cálculo de MASE agregado

        print(f"Cluster {cluster_id}: Train/Val={len(X_train_val)}, Test={len(X_test)}")

        # 4.5. Configurar TimeSeriesSplit para CV dentro del conjunto Train/Val del CLUSTER
        # test_size debería ser una fracción razonable de los datos de train/val
        cv_test_size = max(1, len(X_train_val) // (N_SPLITS_CV + 1)) 
        tscv = TimeSeriesSplit(n_splits=N_SPLITS_CV, gap=0, test_size=cv_test_size)

        # 4.6. Configurar y Ejecutar RandomizedSearchCV
        lgbm = lgb.LGBMRegressor(random_state=42, n_jobs=-1) # LGBM maneja mejor n_jobs=-1

        pipeline = Pipeline([
            ('preprocess', preprocessor),
            ('regressor', lgbm) # <-- Usar LGBM
        ])

                # Ajustar el grid de parámetros para que se aplique al paso 'regressor'
        param_dist_pipeline = {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__learning_rate': [0.05, 0.1, 0.2],
            'regressor__num_leaves': [20, 31, 40], # Relacionado con max_depth pero no igual
            'regressor__max_depth': [-1, 10, 20], # -1 es sin límite
            'regressor__min_child_samples': [20, 50, 100],
            'regressor__subsample': [0.7, 0.8, 0.9], # Muestreo de filas
            'regressor__colsample_bytree': [0.7, 0.8, 0.9] # Muestreo de features
        }

        # Configura RandomizedSearchCV con el nuevo pipeline y param_dist
        search = RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=param_dist_pipeline,
            n_iter=N_ITER_HPT, # Puedes empezar con 10-20 para LGBM
            cv=tscv,
            scoring=SCORING_METRIC_HPT,
            n_jobs=1, # El Search sigue secuencial
            refit=True,
            random_state=42,
            verbose=10
        )

        print(f"Iniciando HPT con LightGBM para cluster {cluster_id}...")
        search.fit(X_train_val, y_train_val)
        
        best_model_cluster = search.best_estimator_
        best_params_cluster = search.best_params_
        print(f"Mejores parámetros encontrados para cluster {cluster_id}: {best_params_cluster}")
        
        # 4.7. Predicción en el Conjunto de Test (Hold-out) del CLUSTER
        print(f"Realizando predicciones en el conjunto de test del cluster {cluster_id}...")
        y_pred_test_cluster = best_model_cluster.predict(X_test)
        
        # 4.8. Calcular Métricas AGREGADAS para el CLUSTER
        print(f"Calculando métricas agregadas para el cluster {cluster_id}...")
        cluster_metrics = evaluate_model(y_true_train_for_mase, y_test.values, y_pred_test_cluster, label="Cluster")
        
        # Añadir ID de cluster y parámetros
        metrics_row = {CLUSTER_COL: cluster_id}
        metrics_row.update(cluster_metrics)
        metrics_row['best_params'] = str(best_params_cluster) # Guardar como string
        all_cluster_metrics.append(metrics_row)
        
        # 4.9. Guardar Métricas del Cluster (append)
        pd.DataFrame([metrics_row]).to_csv(METRICS_FILE, mode='a', header=False, index=False)

        # 4.10. Preparar y Guardar Predicciones a Nivel de SERIE (append)
        predictions_df = pd.DataFrame({
            'establecimiento': ids_test['establecimiento'],
            'material': ids_test['material'],
            DATE_COL: dates_test,
            'actual_volume': y_test.values,
            'predicted_volume': y_pred_test_cluster,
            CLUSTER_COL: ids_test[CLUSTER_COL]
        })
        predictions_df.to_csv(PREDICTIONS_FILE, mode='a', header=False, index=False)
        
        # 4.11. Generar y Guardar Gráficos POR SERIE dentro del cluster
        print(f"Generando gráficos para las series del cluster {cluster_id}...")
        unique_series_in_test = ids_test[SERIES_ID_COLS].drop_duplicates().values.tolist()
        
        for series_idx, (estab, material) in enumerate(unique_series_in_test):
             print(f"Generando gráfico {series_idx+1}/{len(unique_series_in_test)} para cluster {cluster_id}")
             # Filtrar las predicciones y reales para esta serie específica del test set
             mask = (predictions_df['establecimiento'] == estab) & (predictions_df['material'] == material)
             series_pred_df = predictions_df[mask]
             
             if not series_pred_df.empty:
                 plot_filename = os.path.join(PLOTS_DIR, f'pred_vs_actual_{estab}_{material}_cluster{cluster_id}.png')
                 plot_predictions(
                     series_pred_df[DATE_COL], 
                     series_pred_df['actual_volume'], 
                     series_pred_df['predicted_volume'], 
                     estab, 
                     material, 
                     cluster_id, 
                     plot_filename
                 )

        # 4.12. Guardar el Modelo Entrenado del CLUSTER
        model_filename = os.path.join(MODELS_DIR, f'lgbm_model_cluster_{cluster_id}.joblib')
        joblib.dump(best_model_cluster, model_filename)
        print(f"Modelo del cluster {cluster_id} guardado en: {model_filename}")

    except Exception as e:
        print(f"ERROR procesando el cluster {cluster_id}: {e}")
        # Opcional: guardar información del error en el log de métricas
        error_row = {CLUSTER_COL: cluster_id}
        error_row.update({k: 'ERROR' for k in header_metrics if k != CLUSTER_COL and k != 'best_params'})
        error_row['best_params'] = str(e) # Guardar el mensaje de error
        pd.DataFrame([error_row]).to_csv(METRICS_FILE, mode='a', header=False, index=False)

    finally:
        # Liberar memoria explícitamente
        del cluster_df, X_cluster, y_cluster, ids_cluster, dates_cluster
        del X_train_val, X_test, y_train_val, y_test, ids_test, dates_test
        if 'pipeline' in locals(): del pipeline
        if 'search' in locals(): del search
        if 'best_model_cluster' in locals(): del best_model_cluster
        if 'y_pred_test_cluster' in locals(): del y_pred_test_cluster
        if 'predictions_df' in locals(): del predictions_df
        gc.collect()

print("\n--- Proceso Completado ---")
print(f"Resultados guardados en el directorio: {OUTPUT_DIR}")
print(f"Métricas de cluster consolidadas: {METRICS_FILE}")
print(f"Predicciones por serie consolidadas: {PREDICTIONS_FILE}")
print(f"Gráficos individuales por serie en: {PLOTS_DIR}")
print(f"Modelos de cluster entrenados en: {MODELS_DIR}")

# Puedes cargar y analizar los resultados consolidados al final si lo deseas
# metrics_final_df = pd.read_csv(METRICS_FILE)
# print("\nResumen de Métricas (promedio sobre todos los clusters):")
# print(metrics_final_df[[col for col in metrics_final_df.columns if 'Cluster_' in col]].mean(numeric_only=True))

IndentationError: unexpected indent (1370517386.py, line 103)

In [None]:

PARQUET_FILE = '../data/gold_ventas_semanales_clustered_lgbm.parquet'
OUTPUT_DIR = '../models/lightgbm'
METRICS_FILE = os.path.join(OUTPUT_DIR, 'all_cluster_metrics.csv')
PREDICTIONS_FILE = os.path.join(OUTPUT_DIR, 'all_series_predictions.csv')
PLOTS_DIR = os.path.join(OUTPUT_DIR, 'prediction_plots_by_series')
MODELS_DIR = os.path.join(OUTPUT_DIR, 'trained_cluster_models')

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

TARGET_VARIABLE = 'weekly_volume'
SERIES_ID_COLS = ['establecimiento', 'material']
CLUSTER_COL = 'cluster_label'
DATE_COL = 'week'

parquet_meta = pq.read_metadata(PARQUET_FILE)
ALL_COLS = [col.name for col in parquet_meta.schema]

FEATURES_COLS = [
    col for col in ALL_COLS
    if col not in [TARGET_VARIABLE, DATE_COL, CLUSTER_COL, 'last_sale_week'] 
]

CATEGORICAL_FEATURES = ['establecimiento', 'material', 'cluster_label']
CATEGORICAL_FEATURES_FOR_MODEL = [f for f in ['establecimiento', 'material'] if f in FEATURES_COLS]
NUMERICAL_FEATURES = [col for col in FEATURES_COLS if col not in CATEGORICAL_FEATURES_FOR_MODEL]

def calculate_mase(y_true_train, y_true_test, y_pred_test):
    y_true_train = np.array(y_true_train).flatten()
    y_true_test = np.array(y_true_test).flatten()
    y_pred_test = np.array(y_pred_test).flatten()

    if len(y_true_train) < 2:
        return np.nan

    naive_forecast_error_train = np.mean(np.abs(np.diff(y_true_train)))

    if naive_forecast_error_train < 1e-9:
         model_mae_test = mean_absolute_error(y_true_test, y_pred_test)
         return np.inf if model_mae_test > 1e-9 else 0.0

    model_mae_test = mean_absolute_error(y_true_test, y_pred_test)

    return model_mae_test / naive_forecast_error_train


def evaluate_model(y_true_train, y_true_test, y_pred_test, label="Cluster"):
    metrics = {
        f'{label}_mae': mean_absolute_error(y_true_test, y_pred_test),
        f'{label}_rmse': np.sqrt(mean_squared_error(y_true_test, y_pred_test)),
        f'{label}_mape': mean_absolute_percentage_error(y_true_test, y_pred_test) if np.all(np.abs(y_true_test) > 1e-9) else np.nan,
        f'{label}_r2': r2_score(y_true_test, y_pred_test),
        f'{label}_mase': calculate_mase(y_true_train, y_true_test, y_pred_test)
    }
    mape_key = f'{label}_mape'
    if mape_key in metrics and np.isinf(metrics[mape_key]):
         metrics[mape_key] = np.nan
    return metrics


def plot_predictions(dates_test, y_true_test, y_pred_test, estab, material, cluster_id, filepath):
    plt.figure(figsize=(15, 6))
    plt.plot(dates_test, y_true_test, label='Real', marker='.', linestyle='-')
    plt.plot(dates_test, y_pred_test, label=f'Predicción (Cluster {cluster_id})', marker='x', linestyle='--')
    plt.title(f'Predicción vs Real - {estab} / {material} (Cluster {cluster_id})')
    plt.xlabel('Semana')
    plt.ylabel('Volumen Semanal')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(filepath)
    plt.close()

try:
    print(f"Leyendo columna de clusters '{CLUSTER_COL}' del archivo: {PARQUET_FILE}")
    pf = pq.ParquetFile(PARQUET_FILE)
    cluster_labels_df = pf.read(columns=[CLUSTER_COL]).to_pandas()
    unique_clusters = cluster_labels_df[CLUSTER_COL].unique()
    unique_clusters = [c for c in unique_clusters if pd.notna(c)] 
    print(f"Encontrados {len(unique_clusters)} clusters únicos.")
    del cluster_labels_df
    gc.collect()
except Exception as e:
    print(f"Error al leer la columna de clusters del Parquet: {e}")
    exit()

all_cluster_metrics = []
header_preds = SERIES_ID_COLS + [DATE_COL, 'actual_volume', 'predicted_volume', CLUSTER_COL]
pd.DataFrame(columns=header_preds).to_csv(PREDICTIONS_FILE, index=False)

N_SPLITS_CV = 5 
N_ITER_HPT = 10 
SCORING_METRIC_HPT = 'neg_mean_absolute_error' 

param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2', 0.7, 1.0] 
}
try:
    pf = pq.ParquetFile(PARQUET_FILE)
    # Leer solo la columna del cluster para eficiencia
    cluster_labels_df = pf.read(columns=[CLUSTER_COL]).to_pandas()
    unique_clusters = cluster_labels_df[CLUSTER_COL].unique()
    # Remover posibles NaNs si existen
    unique_clusters = [c for c in unique_clusters if pd.notna(c)] 
    print(f"Encontrados {len(unique_clusters)} clusters únicos.")
    del cluster_labels_df # Liberar memoria
    gc.collect()
except Exception as e:
    print(f"Error al leer la columna de clusters del Parquet: {e}")
    # Intentar cargar todo si es necesario (cuidado con la memoria)
    # O manejar el error de otra forma
    exit()


# --- 4. Procesamiento por Cluster (Entrenamiento, CV, HPT, Evaluación) ---

all_cluster_metrics = []
# El archivo de predicciones se inicializa una vez
header_preds = SERIES_ID_COLS + [DATE_COL, 'actual_volume', 'predicted_volume', CLUSTER_COL]
pd.DataFrame(columns=header_preds).to_csv(PREDICTIONS_FILE, index=False)


# Configuración de CV y HPT (igual que antes)
N_SPLITS_CV = 5 
N_ITER_HPT = 10 
SCORING_METRIC_HPT = 'neg_mean_absolute_error' 

# Define el espacio de búsqueda de hiperparámetros para RandomForestRegressor (igual que antes)
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2', 0.7, 1.0] 
}

# Inicializar archivo de métricas de cluster
header_metrics = [CLUSTER_COL] + [f'Cluster_{m}' for m in ['mae', 'rmse', 'mape', 'r2', 'mase']] + ['best_params']
pd.DataFrame(columns=header_metrics).to_csv(METRICS_FILE, index=False)


print(f"\nIniciando procesamiento para {len(unique_clusters)} clusters...")
# Replace tqdm with print statements for progress tracking
for cluster_count, cluster_id in enumerate(unique_clusters):
    print(f"\n--- Procesando Cluster {cluster_count+1}/{len(unique_clusters)}: ID = {cluster_id} ---")
    
    try:
        # 4.1. Cargar datos para el cluster actual
        print(f"Cargando datos para el cluster {cluster_id}...")
        filters = [(CLUSTER_COL, '=', cluster_id)]
        cluster_df = pd.read_parquet(PARQUET_FILE, filters=filters)
        
        # Asegurar orden temporal GLOBAL dentro del cluster para CV
        # Primero por serie, luego por fecha para mantener bloques de series juntos
        cluster_df = cluster_df.sort_values(by=SERIES_ID_COLS + [DATE_COL]).reset_index(drop=True)
        print(f"Cluster {cluster_id}: {len(cluster_df)} filas cargadas.")

        # Mínimo de datos para entrenar/validar (ajusta según necesidad)
        if len(cluster_df) < (N_SPLITS_CV + 2) * 2: # Un umbral más alto para clusters
            print(f"Datos insuficientes para CV en cluster {cluster_id} ({len(cluster_df)} filas). Saltando cluster.")
            continue
            
        # 4.2. Preparar X e y para el cluster
        X_cluster = cluster_df[FEATURES_COLS]
        y_cluster = cluster_df[TARGET_VARIABLE]
        
        # Identificadores y fechas para post-procesamiento (predicciones, gráficos)
        ids_cluster = cluster_df[SERIES_ID_COLS + [CLUSTER_COL]]
        dates_cluster = cluster_df[DATE_COL]

        # 4.3. Preprocesamiento Pipeline
        # Incluye Imputación para numéricas y OneHotEncoding para categóricas
        
        # Pipeline para numéricas: Imputar NaNs (ej. con mediana)
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median'))
        ])

        # Pipeline para categóricas: Imputar NaNs (con constante como 'missing') y luego OneHotEncode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
        ])

        # Crear el preprocesador con ColumnTransformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, NUMERICAL_FEATURES),
                ('cat', categorical_transformer, CATEGORICAL_FEATURES_FOR_MODEL)
            ], 
            remainder='passthrough' # Mantiene columnas no especificadas si FEATURES_COLS no fue exhaustivo
        )
        
        # 4.4. Separar un conjunto de Test Final (Hold-out) para el CLUSTER
        print(f"Realizando split Train/Test para el cluster {cluster_id}...")
        test_size_ratio = 0.2 # Usar el último 20% como test
        n_rows = len(cluster_df)
        split_point = int(n_rows * (1 - test_size_ratio))

        # Asegurarse que hay suficientes datos para train y test
        if split_point < N_SPLITS_CV + 1 or n_rows - split_point < 1:
             print(f"Datos insuficientes para split Train/Test adecuado en cluster {cluster_id}. Saltando.")
             continue
             
        X_train_val, X_test = X_cluster.iloc[:split_point], X_cluster.iloc[split_point:]
        y_train_val, y_test = y_cluster.iloc[:split_point], y_cluster.iloc[split_point:]
        
        # Guardar IDs y fechas correspondientes al conjunto de test para el guardado final
        ids_test = ids_cluster.iloc[split_point:]
        dates_test = dates_cluster.iloc[split_point:]
        y_true_train_for_mase = y_train_val.values # Para cálculo de MASE agregado

        print(f"Cluster {cluster_id}: Train/Val={len(X_train_val)}, Test={len(X_test)}")

        # 4.5. Configurar TimeSeriesSplit para CV dentro del conjunto Train/Val del CLUSTER
        # test_size debería ser una fracción razonable de los datos de train/val
        cv_test_size = max(1, len(X_train_val) // (N_SPLITS_CV + 1)) 
        tscv = TimeSeriesSplit(n_splits=N_SPLITS_CV, gap=0, test_size=cv_test_size)

        # 4.6. Configurar y Ejecutar RandomizedSearchCV
        lgbm = lgb.LGBMRegressor(random_state=42, n_jobs=-1) # LGBM maneja mejor n_jobs=-1

        pipeline = Pipeline([
            ('preprocess', preprocessor),
            ('regressor', lgbm) # <-- Usar LGBM
        ])

                # Ajustar el grid de parámetros para que se aplique al paso 'regressor'
        param_dist_pipeline = {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__learning_rate': [0.05, 0.1, 0.2],
            'regressor__num_leaves': [20, 31, 40], # Relacionado con max_depth pero no igual
            'regressor__max_depth': [-1, 10, 20], # -1 es sin límite
            'regressor__min_child_samples': [20, 50, 100],
            'regressor__subsample': [0.7, 0.8, 0.9], # Muestreo de filas
            'regressor__colsample_bytree': [0.7, 0.8, 0.9] # Muestreo de features
        }

        # Configura RandomizedSearchCV con el nuevo pipeline y param_dist
        search = RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=param_dist_pipeline,
            n_iter=N_ITER_HPT, # Puedes empezar con 10-20 para LGBM
            cv=tscv,
            scoring=SCORING_METRIC_HPT,
            n_jobs=1, # El Search sigue secuencial
            refit=True,
            random_state=42,
            verbose=10
        )

        print(f"Iniciando HPT con LightGBM para cluster {cluster_id}...")
        search.fit(X_train_val, y_train_val)
        
        best_model_cluster = search.best_estimator_
        best_params_cluster = search.best_params_
        print(f"Mejores parámetros encontrados para cluster {cluster_id}: {best_params_cluster}")
        
        # 4.7. Predicción en el Conjunto de Test (Hold-out) del CLUSTER
        print(f"Realizando predicciones en el conjunto de test del cluster {cluster_id}...")
        y_pred_test_cluster = best_model_cluster.predict(X_test)
        
        # 4.8. Calcular Métricas AGREGADAS para el CLUSTER
        print(f"Calculando métricas agregadas para el cluster {cluster_id}...")
        cluster_metrics = evaluate_model(y_true_train_for_mase, y_test.values, y_pred_test_cluster, label="Cluster")
        
        # Añadir ID de cluster y parámetros
        metrics_row = {CLUSTER_COL: cluster_id}
        metrics_row.update(cluster_metrics)
        metrics_row['best_params'] = str(best_params_cluster) # Guardar como string
        all_cluster_metrics.append(metrics_row)
        
        # 4.9. Guardar Métricas del Cluster (append)
        pd.DataFrame([metrics_row]).to_csv(METRICS_FILE, mode='a', header=False, index=False)

        # 4.10. Preparar y Guardar Predicciones a Nivel de SERIE (append)
        predictions_df = pd.DataFrame({
            'establecimiento': ids_test['establecimiento'],
            'material': ids_test['material'],
            DATE_COL: dates_test,
            'actual_volume': y_test.values,
            'predicted_volume': y_pred_test_cluster,
            CLUSTER_COL: ids_test[CLUSTER_COL]
        })
        predictions_df.to_csv(PREDICTIONS_FILE, mode='a', header=False, index=False)
        
        # 4.11. Generar y Guardar Gráficos POR SERIE dentro del cluster
        print(f"Generando gráficos para las series del cluster {cluster_id}...")
        unique_series_in_test = ids_test[SERIES_ID_COLS].drop_duplicates().values.tolist()
        
        # Limitar a un máximo de 1000 gráficos por cluster
        max_plots = 1000
        for series_idx, (estab, material) in enumerate(unique_series_in_test[:max_plots]):
             print(f"Generando gráfico {series_idx+1}/{min(len(unique_series_in_test), max_plots)} para cluster {cluster_id}")
             # Filtrar las predicciones y reales para esta serie específica del test set
             mask = (predictions_df['establecimiento'] == estab) & (predictions_df['material'] == material)
             series_pred_df = predictions_df[mask]
             
             if not series_pred_df.empty:
                 plot_filename = os.path.join(PLOTS_DIR, f'pred_vs_actual_{estab}_{material}_cluster{cluster_id}.png')
                 plot_predictions(
                     series_pred_df[DATE_COL], 
                     series_pred_df['actual_volume'], 
                     series_pred_df['predicted_volume'], 
                     estab, 
                     material, 
                     cluster_id, 
                     plot_filename
                 )

        # 4.12. Guardar el Modelo Entrenado del CLUSTER
        model_filename = os.path.join(MODELS_DIR, f'lgbm_model_cluster_{cluster_id}.joblib')
        joblib.dump(best_model_cluster, model_filename)
        print(f"Modelo del cluster {cluster_id} guardado en: {model_filename}")

    except Exception as e:
        print(f"ERROR procesando el cluster {cluster_id}: {e}")
        # Opcional: guardar información del error en el log de métricas
        error_row = {CLUSTER_COL: cluster_id}
        error_row.update({k: 'ERROR' for k in header_metrics if k != CLUSTER_COL and k != 'best_params'})
        error_row['best_params'] = str(e) # Guardar el mensaje de error
        pd.DataFrame([error_row]).to_csv(METRICS_FILE, mode='a', header=False, index=False)

    finally:
        # Liberar memoria explícitamente
        del cluster_df, X_cluster, y_cluster, ids_cluster, dates_cluster
        del X_train_val, X_test, y_train_val, y_test, ids_test, dates_test
        if 'pipeline' in locals(): del pipeline
        if 'search' in locals(): del search
        if 'best_model_cluster' in locals(): del best_model_cluster
        if 'y_pred_test_cluster' in locals(): del y_pred_test_cluster
        if 'predictions_df' in locals(): del predictions_df
        gc.collect()

print("\n--- Proceso Completado ---")
print(f"Resultados guardados en el directorio: {OUTPUT_DIR}")
print(f"Métricas de cluster consolidadas: {METRICS_FILE}")
print(f"Predicciones por serie consolidadas: {PREDICTIONS_FILE}")
print(f"Gráficos individuales por serie en: {PLOTS_DIR}")
print(f"Modelos de cluster entrenados en: {MODELS_DIR}")

# Puedes cargar y analizar los resultados consolidados al final si lo deseas
# metrics_final_df = pd.read_csv(METRICS_FILE)
# print("\nResumen de Métricas (promedio sobre todos los clusters):")
# print(metrics_final_df[[col for col in metrics_final_df.columns if 'Cluster_' in col]].mean(numeric_only=True))

Leyendo columna de clusters 'cluster_label' del archivo: ../data/gold_ventas_semanales_clustered_lgbm.parquet
Encontrados 7 clusters únicos.
Encontrados 7 clusters únicos.

Iniciando procesamiento para 7 clusters...

--- Procesando Cluster 1/7: ID = 2 ---
Cargando datos para el cluster 2...
Cluster 2: 16151645 filas cargadas.
Realizando split Train/Test para el cluster 2...
Cluster 2: Train/Val=12921316, Test=3230329
Iniciando HPT con LightGBM para cluster 2...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START regressor__colsample_bytree=0.8, regressor__learning_rate=0.05, regressor__max_depth=10, regressor__min_child_samples=50, regressor__n_estimators=200, regressor__num_leaves=31, regressor__subsample=0.9
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.460972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 