In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the input
# and output paths used by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the tab-separated input files on the mounted Drive.
ruta_datos, ruta_productos, ruta_stocks, ruta_predecir = (
    '/content/drive/MyDrive/Laboratorio_3/data/sell-in.txt',
    '/content/drive/MyDrive/Laboratorio_3/data/tb_productos.txt',
    '/content/drive/MyDrive/Laboratorio_3/data/tb_stocks.txt',
    '/content/drive/MyDrive/Laboratorio_3/data/product_id_apredecir201912.txt',
)

# Read the sell-in, product and stock files with identical parsing options
# (tab separator, '.' as decimal mark), in that order.
df, df_productos, df_stocks = (
    pd.read_csv(ruta, sep='\t', decimal='.')
    for ruta in (ruta_datos, ruta_productos, ruta_stocks)
)

# The list of product ids to predict only needs the tab separator.
df_predecir = pd.read_csv(ruta_predecir, sep='\t')

In [None]:
# Convert 'periodo' from its YYYYMM encoding to a datetime column.
# The dtype guard keeps this cell idempotent: once the column is already
# datetime, astype(str) would produce 'YYYY-MM-DD' strings that no longer
# match format='%Y%m', so re-running the cell would raise a ValueError.
if not pd.api.types.is_datetime64_any_dtype(df['periodo']):
    df['periodo'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')

# Integer YYYYMM key derived from the datetime column (e.g. 201912).
df['periodo_ym'] = df['periodo'].dt.strftime('%Y%m').astype(int)

# LGBM

In [None]:
# ---------------------------
# 1. Compute sales features
# ---------------------------
# The 12 calendar months of 2019 as YYYYMM integers (201901..201912).
# A plain range(201901, 202001) also yields 201913..202000, which are not
# valid periods: they would add 88 phantom zero-sales rows per product and
# corrupt the zero-count, min and std features.
ultimos_12m = [201900 + mes for mes in range(1, 13)]

# A month counts as "high sales" when its tn value exceeds this threshold.
UMBRAL_VENTA_ALTA_TN = 10

# Product x month grid, so that months without any sales row appear
# explicitly (and are later filled with zero).
productos = df_predecir[['product_id']].drop_duplicates()
meses = pd.DataFrame({'periodo_ym': ultimos_12m})
base_completa = productos.merge(meses, how='cross')

df_trim12 = df[df['periodo_ym'].isin(ultimos_12m)]

# Collapse to one row per (product, month) before joining onto the grid.
# NOTE(review): presumably the sell-in file can hold several rows per
# product and month (e.g. one per customer) -- confirm. If it is already
# unique on these keys this step changes nothing; if it is not, skipping it
# would make the "monthly" features below row-level statistics instead.
tn_mensual = (
    df_trim12
    .groupby(['product_id', 'periodo_ym'], as_index=False)['tn']
    .sum()
)

df_completo = base_completa.merge(tn_mensual,
                                   on=['product_id', 'periodo_ym'], how='left')
df_completo['tn'] = df_completo['tn'].fillna(0)

# Every product must have exactly one row per month in the window.
assert len(df_completo) == len(productos) * len(ultimos_12m)

# One row of features per product, computed over its 12 monthly values.
df_features = df_completo.groupby('product_id').agg(
    promedio_tn_12m=('tn', lambda x: x.sum() / len(ultimos_12m)),
    cantidad_ceros_12m=('tn', lambda x: (x == 0).sum()),
    std_tn_12m=('tn', 'std'),
    max_tn_12m=('tn', 'max'),
    min_tn_12m=('tn', 'min'),
    meses_con_ventas_altas=('tn', lambda x: (x > UMBRAL_VENTA_ALTA_TN).sum())
).reset_index()

assert df_features['product_id'].is_unique

# ---------------------------
# 2. Join product attributes (one row per product)
# ---------------------------
df_productos_nodup = df_productos.drop_duplicates(subset='product_id')
df_modelo = df_features.merge(df_productos_nodup, on='product_id', how='left')

# ---------------------------
# 3. Prepare features
# ---------------------------
# NOTE(review): the target (12-month average) is computed from the same
# window as std/max/min/zero-count, so the features carry information about
# the target itself. The test MSE below is therefore not a forecast error
# for a future month -- confirm this is the intended setup.
y = df_modelo['promedio_tn_12m']
X = df_modelo.drop(columns=['product_id', 'promedio_tn_12m'])

# One-hot encode the non-numeric product attributes.
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------
# 4. LGBM model
# ---------------------------
model = LGBMRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
print(f"📊 MSE Test: {mean_squared_error(y_test, y_pred_test):.2f}")

# ---------------------------
# 5. Prediction and export
# ---------------------------
# Predict for every product (train and test rows alike), with the columns
# aligned to the ones the model was fitted on.
X_pred = X.reindex(columns=X_train.columns, fill_value=0)
y_pred = model.predict(X_pred)

df_preds = df_modelo[['product_id']].copy()
df_preds['tn_predicho'] = y_pred

# Defensive de-duplication; df_features is already unique per product.
df_preds = df_preds.drop_duplicates(subset='product_id')

# Export
output_path = '/content/drive/MyDrive/Laboratorio_3/predicciones_lgbm_mejorado.csv'
df_preds.to_csv(output_path, index=False, sep=',', decimal='.')

print(f"✅ Exportado sin duplicados: {len(df_preds)} productos.")
print(f"📂 Archivo: {output_path}")
df_preds.head()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 562
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 30
[LightGBM] [Info] Start training from score 38.106888
📊 MSE Test: 1156.47
✅ Exportado sin duplicados: 780 productos.
📂 Archivo: /content/drive/MyDrive/Laboratorio_3/predicciones_lgbm_mejorado.csv


Unnamed: 0,product_id,tn_predicho
0,20001,671.50395
1,20002,671.50395
2,20003,471.204539
3,20004,471.204539
4,20005,471.204539
