In [17]:
import pandas as pd
import tensorflow as tf
import numpy as np
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Read the sell-in file, sum `tn` per (periodo, product_id) and parse `periodo`
# (given as YYYYMM) into timestamps.
df = (
    pd.read_csv("../../datasets/sell-in.txt.gz", sep="\t")
    .groupby(by=["periodo", "product_id"])
    .agg({"tn": "sum"})
    .reset_index()
    .assign(periodo=lambda frame: pd.to_datetime(frame["periodo"], format="%Y%m"))
)

# Wide matrix: one row per product, one column per period; (product, period)
# pairs with no record are filled with 0.
df_pivot = df.pivot(index="product_id", columns="periodo", values="tn").fillna(0)
print(f"Shape de la matriz: {df_pivot.shape}")  # (n_products, n_periods)

# The long-format frame is no longer needed; release it.
del df
gc.collect()


Shape de la matriz: (1233, 36)


0

In [18]:
def create_windows_with_lags_as_features(data, window_size=12, horizon=2, lags=[1, 3, 6]):
    """
    Build sliding windows whose timesteps carry the value and its lagged values.

    For every timestep in a window the feature vector is
    ``[data[t], data[t - lags[0]], data[t - lags[1]], ...]``. The target of a
    window is the value ``horizon`` steps after the window's last timestep.

    Parameters
    ----------
    data : 1-D sequence of numbers (list or array).
    window_size : number of timesteps per window.
    horizon : how many steps after the end of the window the target lies.
    lags : offsets (in steps) of the lagged features; may be empty. The default
        list is never mutated.

    Returns
    -------
    X : ndarray of shape (n_samples, window_size, 1 + len(lags))
    y : ndarray of shape (n_samples, 1)

    When the series is too short to yield a single window, empty arrays with
    the shapes above (n_samples == 0) are returned instead of raising.
    """
    series = np.asarray(data)
    lag_list = list(lags)
    max_lag = max(lag_list, default=0)  # tolerate an empty lag list
    n_features = 1 + len(lag_list)  # current value + lags

    # NOTE(review): the earliest index read is start_idx - window_size - max_lag
    # == horizon - 1, so the first (horizon - 1) usable positions are skipped.
    # Kept as-is so the number of samples produced does not change.
    start_idx = window_size + max_lag + horizon - 1
    n_samples = max(len(series) - start_idx - (horizon - 1), 0)

    X = np.zeros((n_samples, window_size, n_features))
    y = np.zeros((n_samples, 1))
    if n_samples == 0:
        return X, y

    sample_ids = np.arange(n_samples)
    # base[i, t] is the series index of timestep t in window i.
    base = (start_idx - window_size) + sample_ids[:, None] + np.arange(window_size)[None, :]
    # Offset 0 is the current value; the remaining offsets are the lags.
    offsets = np.array([0] + lag_list)
    X[:] = series[base[:, :, None] - offsets[None, None, :]]
    y[:, 0] = series[start_idx + horizon - 1 + sample_ids]

    return X, y


In [19]:
# Standardise every product's series on its own: one StandardScaler is fitted
# per row of df_pivot and kept in `scalers`, keyed by product id.
from sklearn.preprocessing import StandardScaler

scalers = {}
scaled_data = np.zeros_like(df_pivot.values)

for fila, (producto, valores) in enumerate(zip(df_pivot.index, df_pivot.values)):
    escalador = StandardScaler()
    # StandardScaler works on 2-D input, so the row is fed as a single column
    # and flattened back before being stored.
    scaled_data[fila] = escalador.fit_transform(valores.reshape(-1, 1)).ravel()
    scalers[producto] = escalador

In [20]:
# Usar numpy para crear ventanas de manera vectorizada
def create_windows(data, window_size=12, horizon=2):
    """
    Cut an array into fixed-length input windows and look-ahead targets.

    Window ``i`` holds ``data[i : i + window_size]`` and its target is
    ``data[i + window_size + horizon - 1]``.

    Returns
    -------
    X : ndarray of shape (n_samples, window_size, 1)
    y : ndarray of shape (n_samples, 1)
    """
    n_samples = data.shape[0] - window_size - horizon + 1
    X = np.zeros((n_samples, window_size, 1))
    y = np.zeros((n_samples, 1))

    starts = np.arange(n_samples)
    # Row i of window_idx lists the positions that make up window i.
    window_idx = starts[:, None] + np.arange(window_size)[None, :]
    X[:] = data[window_idx].reshape(n_samples, window_size, 1)
    y[:] = data[starts + window_size + horizon - 1].reshape(n_samples, 1)

    return X, y

# Lag offsets used as extra features at every timestep.
lags = [1, 6, 12]

# Build the windows product by product, remembering which product produced
# each sample so the split below can be made by product.
X, y, productos = [], [], []
for producto, serie in zip(df_pivot.index, scaled_data):
    X_prod, y_prod = create_windows_with_lags_as_features(serie, lags=lags)
    X.append(X_prod)
    y.append(y_prod)
    productos += [producto] * len(X_prod)

X = np.vstack(X)
y = np.vstack(y)

# Hold out 20% of the products: every window of a product lands on the same
# side of the split.
productos_unicos = df_pivot.index.unique()
productos_train, productos_test = train_test_split(productos_unicos, test_size=0.2, random_state=42)

# Boolean masks over the samples, one entry per window.
ids_train = set(productos_train)
ids_test = set(productos_test)
train_mask = [p in ids_train for p in productos]
test_mask = [p in ids_test for p in productos]

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau



# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable when this cell is re-run.
tf.keras.utils.set_random_seed(42)

# Two stacked LSTM layers followed by a small dense head with a single output.
# The input shape is taken from the training tensor so the network always
# matches the window length and the number of lag features in X_train.
model = Sequential([
    LSTM(128, activation='tanh', input_shape=X_train.shape[1:], return_sequences=True),
    BatchNormalization(),
    Dropout(0.2),
    LSTM(64, activation='tanh'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

# Adam with an explicit starting learning rate; ReduceLROnPlateau lowers it below.
optimizer = Adam(learning_rate=0.001)

# Both callbacks watch the validation loss: stop after 15 epochs without
# improvement (restoring the best weights) and cut the learning rate by 10x
# after 5 epochs without improvement.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)
]

model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# NOTE(review): the held-out products (X_test, y_test) are used as validation
# data, so early stopping and LR scheduling are tuned on them; they are not an
# untouched test set.
history = model.fit(
    X_train, y_train,
    epochs=200,  # upper bound; early stopping normally ends training sooner
    batch_size=256,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    verbose=1
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200


In [22]:
def predecir_t2(product_id, ultimos_12_meses, lags=[1, 3, 6], window_size=12):
    """
    Predict one value for a product from its most recent history.

    The trailing ``window_size`` values of ``ultimos_12_meses`` form the input
    window. Any values supplied before them are used only to fill the lag
    features; a lag that reaches before the first supplied value is filled
    with 0.0. Passing exactly ``window_size`` values therefore zero-fills every
    lag that looks past the start of the window.

    Parameters
    ----------
    product_id : key into the module-level ``scalers`` dict.
    ultimos_12_meses : sequence of raw (unscaled) values, oldest first, with at
        least ``window_size`` entries.
    lags : lag offsets used as extra features; should match the lags the model
        was trained with. The default list is never mutated.
    window_size : number of timesteps fed to the model.

    Returns
    -------
    The model output mapped back to the original scale with the product's
    scaler (``pred * scale_ + mean_``), or ``None`` when the product has no
    scaler or any other error occurs (a message is printed in both cases).
    """
    try:
        scaler = scalers[product_id]
        # Scale the whole supplied history with the product's own scaler.
        serie = np.array(ultimos_12_meses, dtype=np.float32).reshape(-1, 1)
        if len(serie) < window_size:
            raise ValueError(
                f"se requieren al menos {window_size} valores, se recibieron {len(serie)}"
            )
        serie_scaled = scaler.transform(serie).flatten()

        # The window is the trailing `window_size` values; anything earlier is
        # lag history only.
        first_step = len(serie_scaled) - window_size

        X_new = []
        for idx in range(first_step, len(serie_scaled)):
            features = [serie_scaled[idx]]
            for lag in lags:
                if idx - lag >= 0:
                    features.append(serie_scaled[idx - lag])
                else:
                    # Not enough history for this lag: fall back to 0.0.
                    features.append(0.0)
            X_new.append(features)

        X_new = np.array(X_new).reshape(1, window_size, 1 + len(lags))
        y_pred_scaled = model.predict(X_new, verbose=0)
        # Undo the standardisation applied by the product's scaler.
        y_pred = (y_pred_scaled * scaler.scale_) + scaler.mean_

        return y_pred
    except KeyError:
        print(f"Producto {product_id} no encontrado en los scalers")
        return None
    except Exception as e:
        print(f"Error al predecir para {product_id}: {str(e)}")
        return None


In [23]:
def predecir_todos(productos_a_predecir, n_workers=4, feature_lags=None):
    """
    Predict for many products using a thread pool.

    Parameters
    ----------
    productos_a_predecir : iterable of product ids (row labels of ``df_pivot``).
    n_workers : maximum number of worker threads.
    feature_lags : lag offsets forwarded to ``predecir_t2``. When omitted, the
        module-level ``lags`` list (the one the training windows were built
        with) is used, so inference features match training features.

    Returns
    -------
    dict mapping each product id to its prediction, or to NaN when the
    prediction failed (the failure is printed rather than silently dropped).
    """
    # Imported here because no visible cell of the notebook imports it.
    from concurrent.futures import ThreadPoolExecutor

    if feature_lags is None:
        feature_lags = lags

    def predict_single(producto):
        # NOTE(review): only the last 12 periods are passed, so predecir_t2
        # zero-fills any lag that reaches before that window, whereas the
        # training windows used real lagged values. Supplying longer history
        # requires predecir_t2 to treat the trailing 12 values as the window;
        # confirm before changing.
        try:
            ultimos_12 = df_pivot.loc[producto].iloc[-12:].values
            pred = predecir_t2(producto, ultimos_12, lags=feature_lags)
            if pred is not None:
                return producto, pred[0][0]
        except Exception as exc:
            # Best effort: report the failure and fall through to NaN.
            print(f"Error al predecir para {producto}: {exc}")
        return producto, np.nan

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        results = list(executor.map(predict_single, productos_a_predecir))

    return dict(results)


In [24]:
# Download the tab-separated list of products to forecast and keep the distinct
# product ids, then predict for each of them.
productos_ok = (
    pd.read_csv(
        "https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt",
        sep="\t",
    )["product_id"]
    .unique()
)
predicciones = predecir_todos(productos_ok)

In [25]:
# Build the submission table column-wise so each column gets a proper dtype
# (stacking keys and values as rows and transposing leaves both as object).
df = pd.DataFrame(
    {
        "product_id": list(predicciones.keys()),
        "tn": list(predicciones.values()),
    }
)
df["product_id"] = df["product_id"].astype(int)
df.to_csv("../../results/LSTM_lags_1_6_12.csv", sep=",", index=False)