# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import lightgbm as lgb
import matplotlib.pyplot as plt
import os
from datetime import timedelta

# --- Initial configuration ---
print("Iniciando el pipeline simplificado para predicción de demanda con LightGBM...")

# Input CSV and model output directory are resolved relative to the current
# working directory.
current_dir = os.getcwd()
data_path = os.path.join(current_dir, "../data/raw/Dataframe_Final_Data_LSTM.csv")
models_dir = os.path.join(current_dir, "../data/models/")

# Create the model directory up front; a pre-existing directory is fine.
os.makedirs(models_dir, exist_ok=True)

# Fail fast when the input CSV is missing.
data_file_present = os.path.exists(data_path)
if not data_file_present:
    raise FileNotFoundError(f"El archivo {data_path} no se encuentra.")

# Parameters: the single item/store series to model and the requested number
# of past timesteps per input sequence.
item_id, store_id = 'HOBBIES_1_001', 'CA_1'
initial_seq_length = 3
print(f"Entrenando modelo para item_id: {item_id}, store_id: {store_id}")

# --- Step 1: load and prepare the data ---
df = pd.read_csv(data_path, parse_dates=['date'])

# Keep only the rows of the selected item/store pair, ordered by date.
df_filtered = df.loc[(df['item_id'] == item_id) & (df['store_id'] == store_id)].copy()
df_filtered = df_filtered.sort_values('date')

# Abort when the filters match nothing.
if df_filtered.empty:
    raise ValueError(f"No hay datos para item_id: {item_id} y store_id: {store_id}. Verifica los filtros.")
print(f"Número de filas para item_id={item_id} y store_id={store_id}: {len(df_filtered)}")

# Fill calendar gaps: every day between the first and last observed date that
# has no row gets a synthetic row with zero sales.
date_range = pd.date_range(start=df_filtered['date'].min(), end=df_filtered['date'].max())
missing_dates = date_range.difference(df_filtered['date'])
if len(missing_dates) > 0:
    print(f"Faltan {len(missing_dates)} fechas. Rellenando con ventas = 0.")

    # Synthetic rows use the mean observed price, or 0 when no price is known.
    if df_filtered['sell_price'].isna().all():
        fill_price = 0
    else:
        fill_price = df_filtered['sell_price'].mean()

    df_missing = pd.DataFrame({
        'date': missing_dates,
        'item_id': item_id,
        'store_id': store_id,
        'sales': 0,
        'snap_CA': 0,
        'sell_price': fill_price,
        'event_name_1': 'NoEvent',
        'event_type_1': 'NoType',
        'event_name_2': 'NoEvent',
        'event_type_2': 'NoType',
        'wm_yr_wk': 0,
        'weekday': missing_dates.day_name(),
        'wday': missing_dates.dayofweek + 1,
        'month': missing_dates.month,
        'year': missing_dates.year
    })
    df_filtered = (
        pd.concat([df_filtered, df_missing])
        .sort_values('date')
        .reset_index(drop=True)
    )

# Fill remaining gaps in the original columns: mean price for sell_price,
# 0 for the SNAP flag.
mean_sell_price = df_filtered['sell_price'].mean()
df_filtered['sell_price'] = df_filtered['sell_price'].fillna(mean_sell_price)
df_filtered['snap_CA'] = df_filtered['snap_CA'].fillna(0)
print(f"Número de filas después de rellenar valores faltantes: {len(df_filtered)}")

# Smooth sales spikes with a simple 7-day moving average.
# NOTE(review): center=True averages in up to 3 *future* days, so the smoothed
# target (and every lag/rolling feature derived from it) contains look-ahead
# information. Reported metrics are therefore optimistic; consider a trailing
# window (center=False) if an honest out-of-sample estimate is needed.
df_filtered['sales_smoothed'] = df_filtered['sales'].rolling(window=7, min_periods=1, center=True).mean()
df_filtered['sales'] = df_filtered['sales_smoothed'].fillna(df_filtered['sales'])

# Log-transform sales to compress the distribution (inverted later with expm1).
df_filtered['sales'] = np.log1p(df_filtered['sales'])

# Lag features of the (smoothed, log-transformed) sales series.
df_filtered['lag_1'] = df_filtered['sales'].shift(1)
df_filtered['lag_7'] = df_filtered['sales'].shift(7)
df_filtered['lag_14'] = df_filtered['sales'].shift(14)

# Trailing 7-day window statistics (the window includes the current row).
sales_window_7 = df_filtered['sales'].rolling(window=7)
df_filtered['rolling_mean_7'] = sales_window_7.mean()
df_filtered['rolling_std_7'] = sales_window_7.std()
df_filtered['rolling_min_7'] = sales_window_7.min()
df_filtered['rolling_max_7'] = sales_window_7.max()

# Calendar features derived from the date.
df_filtered['day_of_week'] = df_filtered['date'].dt.dayofweek
df_filtered['is_weekend'] = df_filtered['day_of_week'].isin([5, 6]).astype(int)
df_filtered['month'] = df_filtered['date'].dt.month

# Event flag. 'NoEvent' is the placeholder this script writes into
# event_name_1 for gap-filled dates, so it must count as "no event" just like
# a missing value; otherwise every synthetic day would be flagged as an event.
has_event = df_filtered['event_name_1'].notna() & (df_filtered['event_name_1'] != 'NoEvent')
df_filtered['event'] = has_event.astype(int)
df_filtered['snap'] = df_filtered['snap_CA']

# Relative day-over-day price change. A previous price of 0 yields +/-inf,
# which fillna() would not remove and MinMaxScaler rejects, so map it to NaN
# here and let the final fillna(0) below neutralize it.
df_filtered['price_change'] = df_filtered['sell_price'].pct_change().replace([np.inf, -np.inf], np.nan)
df_filtered['week_of_year'] = df_filtered['date'].dt.isocalendar().week

# Convert week_of_year to a plain integer column explicitly.
df_filtered['week_of_year'] = pd.to_numeric(df_filtered['week_of_year'], errors='coerce').fillna(0).astype(int)

# One-hot encode the primary event name/type (with an extra NaN indicator).
df_filtered = pd.get_dummies(df_filtered, columns=['event_name_1', 'event_type_1'], dummy_na=True)

# Replace the NaN values produced by shift/rolling/pct_change with 0.
df_filtered = df_filtered.fillna(0)

# Re-check the row count once preprocessing is done.
available_rows = len(df_filtered)
print(f"Número de filas después de preprocesamiento: {available_rows}")

# Shrink seq_length when the series is too short for the requested window.
if available_rows <= 1:
    raise ValueError(f"No hay suficientes datos para procesar. Filas disponibles: {available_rows}.")
elif available_rows - 1 < initial_seq_length:
    seq_length = max(1, available_rows - 1)
    print(f"Advertencia: seq_length ajustado a {seq_length} debido a datos insuficientes.")
else:
    seq_length = initial_seq_length

# Normalize data.
# NOTE(review): the scaler is fitted on the full series, i.e. including rows
# that later end up in the validation/test splits.
scaler = MinMaxScaler(feature_range=(0, 1))

# The target must be column 0: the sequence builder below and the later
# de-normalization both read the target from index 0. Column order in
# df_filtered depends on the CSV layout, so pin 'sales' first explicitly
# instead of relying on it happening to precede the other features.
target_col = 'sales'
base_features = {
    'lag_1', 'lag_7', 'lag_14',
    'rolling_mean_7', 'rolling_std_7', 'rolling_min_7', 'rolling_max_7',
    'day_of_week', 'is_weekend', 'month', 'event', 'snap',
    'sell_price', 'price_change', 'week_of_year',
}
features = [target_col] + [
    col for col in df_filtered.columns
    if col in base_features or 'event_name_1' in col or 'event_type_1' in col
]

# Drop near-constant features. The target is exempt: removing it would
# silently turn some other feature into the prediction target.
features_to_use = [target_col]
for feat in features[1:]:
    if df_filtered[feat].var() > 0.01:  # variance threshold
        features_to_use.append(feat)
    else:
        print(f"Eliminando característica con baja varianza: {feat}")

# Scale the selected features to [0, 1].
df_filtered[features_to_use] = scaler.fit_transform(df_filtered[features_to_use])

# Select the model columns.
data_df = df_filtered[features_to_use]
print("Dtypes de data_df:")
print(data_df.dtypes)
print(f"Número de características después de filtrar: {len(features_to_use)}")

# Force a purely numeric float32 matrix of shape (rows, features).
data = data_df.values.astype(np.float32)
print(f"Forma de data: {data.shape}")

# Build sliding windows: X[i] holds rows i .. i+seq_length-1 (all features),
# y[i] is the target (column 0) of the row right after that window.
n_samples = len(data) - seq_length
if n_samples <= 0:
    raise ValueError(f"No se pueden generar secuencias. Filas disponibles: {len(data)}, secuencia requerida: {seq_length}.")
X = np.array([data[i:i + seq_length, :] for i in range(n_samples)], dtype=np.float32)
y = data[seq_length:, 0].astype(np.float32)

print(f"Forma de X después de generación: {X.shape}")
if len(X.shape) != 3:
    raise ValueError(f"X no tiene 3 dimensiones. Forma actual: {X.shape}.")

# --- Step 2: chronological train / validation / test split ---
# Cut points at 70% and 85% of the sequences; no shuffling, so the order of
# the series is preserved across the three parts.
train_split = int(0.7 * len(X))
val_split = int(0.85 * len(X))

# Both the training part and the validation part must be non-empty.
if not (train_split != 0 and val_split != train_split):
    raise ValueError(f"No hay suficientes secuencias para dividir. Número de secuencias: {len(X)}")

train_part = slice(None, train_split)
val_part = slice(train_split, val_split)
test_part = slice(val_split, None)

X_train, y_train = X[train_part], y[train_part]
X_val, y_val = X[val_part], y[val_part]
X_test, y_test = X[test_part], y[test_part]

# --- Step 3: LightGBM model ---
# Flatten each (seq_length, n_features) window into one row and wrap it in a
# DataFrame so the model keeps feature names. Column feat_{i}_{j} is feature j
# at timestep i of the window.
feature_names = [f'feat_{i}_{j}' for i in range(seq_length) for j in range(len(features_to_use))]
X_train_lgb = pd.DataFrame(X_train.reshape((X_train.shape[0], -1)), columns=feature_names)
X_val_lgb = pd.DataFrame(X_val.reshape((X_val.shape[0], -1)), columns=feature_names)
X_test_lgb = pd.DataFrame(X_test.reshape((X_test.shape[0], -1)), columns=feature_names)

# Hyper-parameter grid explored by the search.
lgb_params = {
    'num_leaves': [5, 10, 15],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'feature_fraction': [0.9, 1.0],
    'bagging_fraction': [0.9, 1.0],
    'min_data_in_leaf': [1, 3]
}

# LightGBM only applies bagging_fraction when bagging_freq is non-zero; with
# the default bagging_freq=0 the 'bagging_fraction' axis of the grid would be
# a no-op that merely doubles the number of fits. Re-sample every iteration.
lgb_model = lgb.LGBMRegressor(random_state=42, metric='l2', num_threads=16, bagging_freq=1)

# Use up to 10 expanding-window folds, but never so many that a test fold has
# fewer than 2 samples: TimeSeriesSplit sizes each test fold as
# n_samples // (n_splits + 1), and R² is undefined on a single sample.
n_cv_splits = min(10, len(X_train_lgb) // 2 - 1)
if n_cv_splits < 2:
    raise ValueError(
        f"No hay suficientes secuencias de entrenamiento para la validación cruzada temporal: {len(X_train_lgb)}"
    )

grid_search = GridSearchCV(
    lgb_model,
    lgb_params,
    cv=TimeSeriesSplit(n_splits=n_cv_splits),
    scoring='r2',
    verbose=1,
)
grid_search.fit(X_train_lgb, y_train)

# Keep the best estimator (refit by GridSearchCV on the full training split).
lgb_model = grid_search.best_estimator_
print(f"Mejores parámetros LightGBM: {grid_search.best_params_}")

# --- Step 4: predictions ---
# Validation split (predictions are computed here but not scored below).
y_val_true = y_val
y_val_pred = lgb_model.predict(X_val_lgb)

# Test split.
y_test_true = y_test
y_test_pred = lgb_model.predict(X_test_lgb)

# De-normalize. The scaler was fitted on all selected features, so each
# single-column vector is padded with zero columns up to the fitted width,
# inverse-transformed, and only column 0 (the target) is kept.
n_padding_cols = len(features_to_use) - 1
y_test_2d = y_test.reshape(-1, 1)
y_pred_2d = y_test_pred.reshape(-1, 1)
zeros_test = np.zeros((y_test_2d.shape[0], n_padding_cols))
zeros_pred = np.zeros((y_pred_2d.shape[0], n_padding_cols))
y_test_to_transform = np.hstack((y_test_2d, zeros_test))
y_pred_to_transform = np.hstack((y_pred_2d, zeros_pred))

# Undo the min-max scaling, then the log1p transform, in a single expression.
y_test_original = np.expm1(scaler.inverse_transform(y_test_to_transform)[:, 0])
y_pred_original = np.expm1(scaler.inverse_transform(y_pred_to_transform)[:, 0])

# Metrics on the original sales scale.
mse = mean_squared_error(y_test_original, y_pred_original)
mae = mean_absolute_error(y_test_original, y_pred_original)
r2 = r2_score(y_test_original, y_pred_original)

# NWRMSLE with uniform weights: root of the weighted mean squared log error.
weights = np.ones_like(y_test_original)
log_error = np.log1p(y_pred_original) - np.log1p(y_test_original)
nwrmsle = np.sqrt(np.sum(weights * log_error**2) / np.sum(weights))

report_lines = [
    "\nMétricas de Error (LightGBM):",
    f"- MSE: {mse:.2f}",
    f"- MAE: {mae:.2f}",
    f"- R²: {r2:.2f}",
    f"- NWRMSLE: {nwrmsle:.2f}",
]
print("\n".join(report_lines))

# Visualization: actual vs. predicted sales over the test period.
# Sequence k targets row k + seq_length, so the test targets start at row
# val_split + seq_length of the prepared frame.
test_dates = df_filtered['date'].iloc[val_split + seq_length:].values

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_dates, y_test_original, label='Ventas Reales', color='blue')
ax.plot(test_dates, y_pred_original, label='Predicciones LightGBM', color='orange')
ax.set_title(f'Predicciones vs. Realidad ({item_id}, {store_id})')
ax.set_xlabel('Fecha')
ax.set_ylabel('Ventas')
ax.legend()

# The image is written to the current working directory.
fig.savefig('predictions_vs_reality_lgb.png')
print("Gráfico guardado como 'predictions_vs_reality_lgb.png'")

# --- Step 5: future predictions (recursive 30-step forecast) ---
# Start from the most recent test window and feed each prediction back in.
X_last_lgb = X_test_lgb.iloc[-1:].copy()
n_features = len(features_to_use)

# Work on the window in its (seq_length, n_features) layout. The flattened
# model input is timestep-major, so advancing one day means dropping the
# oldest *row* of this matrix, not shifting the flat vector by one element.
window = X_last_lgb.to_numpy().reshape(seq_length, n_features)

predictions = []
for _ in range(30):
    # Rebuild a one-row DataFrame so predict() sees the training feature names.
    current_sequence_lgb = pd.DataFrame(window.reshape(1, -1), columns=X_last_lgb.columns)
    next_pred = lgb_model.predict(current_sequence_lgb)[0]
    predictions.append(next_pred)

    # Build the new most-recent timestep: the target (column 0) is the value
    # just predicted.
    # NOTE(review): the remaining features (lags, rolling stats, calendar,
    # price, events) are not known for future days and are simply carried
    # forward from the last timestep. This is an approximation; recompute
    # them per step if a more faithful forecast is required.
    next_step = window[-1].copy()
    next_step[0] = next_pred
    window = np.vstack((window[1:], next_step))

# De-normalize predictions: pad to the scaler's fitted width, invert the
# min-max scaling, keep the target column, then undo the log1p transform.
predictions = np.array(predictions).reshape(-1, 1)
predictions_to_transform = np.hstack((predictions, np.zeros((predictions.shape[0], n_features - 1))))
predictions_original = scaler.inverse_transform(predictions_to_transform)[:, 0]
predictions_original = np.expm1(predictions_original)

# Future dates: the 30 days following the last date in the prepared frame.
last_date = df_filtered['date'].max()
future_dates = [last_date + timedelta(days=i) for i in range(1, 31)]

print("\nPredicciones para los próximos 30 días:")
for date, pred in zip(future_dates, predictions_original):
    print(f"Predicción para {date.strftime('%Y-%m-%d')}: {pred:.2f} unidades")

# Optional: persist the trained model. The scikit-learn wrapper (LGBMRegressor)
# has no save_model(); it is a method of the underlying Booster (`booster_`).
# lgb_model.booster_.save_model(os.path.join(models_dir, f'lgb_optimized_{item_id}_{store_id}.txt'))
# print(f"Modelo guardado en {models_dir}")