In [1]:
import pandas as pd

In [None]:
df = pd.read_csv('../datasets/dt_fe2.csv',sep=',')

In [3]:
df.columns

Index(['product_id', 'customer_id', 'periodo', 'plan_precios_cuidados',
       'cust_request_qty', 'cust_request_tn', 'tn', 'cat1', 'cat2', 'cat3',
       'brand', 'sku_size', 'stock_final', 'periodo_dt', 'target', 'month',
       'year', 'quarter', 'semester', 'is_month_end', 'season',
       'size_vs_category', 'lag_1m', 'lag_2m', 'lag_3m', 'rolling_3m_mean',
       'annual_trend', 'seasonal_variation'],
      dtype='object')

# Pruebas sin CV

In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import os

# === 1. Carga de datos y preprocesamiento ===
# Asegurate de tener cargado tu DataFrame `df`

df = df.drop(['periodo_dt', 'descripcion'], axis=1, errors='ignore')
df_kgl = df[df["periodo"] == 201912]
df = df[~df["periodo"].isin([201911, 201912])]

# Codificar categóricas
cat_cols = ['cat1', 'cat2', 'cat3', 'brand', 'plan_precios_cuidados']
for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Separar features y target
X = df.drop(columns=["target"])
y = df["target"]

# Split fijo para validación (sin CV)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# === 2. Definición del objetivo para Optuna ===
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "n_jobs": -1,
        "seed": 42,
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    model = lgb.train(
        params,
        lgb.Dataset(X_train, label=y_train),
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    preds = model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)
    return rmse

# === 3. Configurar almacenamiento SQLite para Optuna ===
os.makedirs("optuna_storage", exist_ok=True)
DB_PATH = "optuna_storage/optuna_simple.db"
STUDY_NAME = "lightgbm_sin_cv"
storage_url = f"sqlite:///{DB_PATH}"

# === 4. Crear o cargar estudio ===
study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=storage_url,
    direction="minimize",
    load_if_exists=True
)

# === 5. Ejecutar optimización ===
study.optimize(objective, n_trials=50)

# === 6. Mostrar resultados ===
print("Mejores hiperparámetros encontrados:")
print(study.best_params)
print(f"Mejor RMSE: {study.best_value:.4f}")


[I 2025-05-29 19:03:47,655] A new study created in RDB with name: lightgbm_sin_cv
[I 2025-05-29 19:04:50,026] Trial 0 finished with value: 0.045658001287507084 and parameters: {'num_leaves': 105, 'learning_rate': 0.024315929579374472, 'feature_fraction': 0.7888657299143276, 'bagging_fraction': 0.4745455361773574, 'bagging_freq': 4, 'min_child_samples': 83, 'lambda_l1': 0.001559820251810067, 'lambda_l2': 0.6653187540435737, 'max_depth': 8}. Best is trial 0 with value: 0.045658001287507084.
[I 2025-05-29 19:05:06,729] Trial 1 finished with value: 0.04869099846492722 and parameters: {'num_leaves': 251, 'learning_rate': 0.08384975193198724, 'feature_fraction': 0.6185285582491427, 'bagging_fraction': 0.5367390396767875, 'bagging_freq': 3, 'min_child_samples': 20, 'lambda_l1': 0.6557882888668388, 'lambda_l2': 0.049556939238187936, 'max_depth': 3}. Best is trial 0 with value: 0.045658001287507084.
[I 2025-05-29 19:09:07,679] Trial 2 finished with value: 0.048768083748518584 and parameters: {'

Mejores hiperparámetros encontrados:
{'num_leaves': 232, 'learning_rate': 0.004648054506981842, 'feature_fraction': 0.5703341269914189, 'bagging_fraction': 0.5990528265244626, 'bagging_freq': 6, 'min_child_samples': 100, 'lambda_l1': 3.392866762405977e-06, 'lambda_l2': 0.0011444820553097982, 'max_depth': 11}
Mejor RMSE: 0.0434


In [8]:

# Entrenar con los mejores hiperparámetros
best_params = study.best_params.copy()
best_params.update({
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "n_jobs": -1,
    "seed": 42
})

# Entrenar una vez el modelo con esos parámetros (ej. sobre X_train si no querés usar todo)
model = lgb.train(
    best_params,
    lgb.Dataset(X_train, label=y_train),
    valid_sets=[lgb.Dataset(X_val, label=y_val)],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

cat_cols = ['cat1', 'cat2', 'cat3', 'brand', 'plan_precios_cuidados']
for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        df_kgl[col] = le.fit_transform(df_kgl[col].astype(str))

# Asegurar las mismas columnas
X_kgl = df_kgl[X.columns]  # Misma estructura

# === 9. Hacer predicción sobre nuevos datos ===
preds_kgl = model.predict(X_kgl)

# Mostrar o guardar resultados
print("Predicciones para periodo 201912:")
print(preds_kgl)

Predicciones para periodo 201912:
[-4.87869999e+01 -5.75757666e+00  5.17383229e+00 ...  1.37966385e-03
  2.17837551e-02  2.17837551e-02]


In [9]:
productos_ok = pd.read_csv("https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt", sep="\t")
result = pd.DataFrame({"product_id": X_kgl["product_id"], "tn": X_kgl["tn"],  "ypred": preds_kgl})
result["tn"] = result["ypred"] + result["tn"]
result = result[result["product_id"].isin(productos_ok["product_id"])]
result = result.groupby("product_id").agg({"tn":"sum"}).reset_index()
result

Unnamed: 0,product_id,tn
0,20001,1262.157240
1,20002,1059.660593
2,20003,785.257886
3,20004,647.145097
4,20005,494.370122
...,...,...
775,21263,0.404318
776,21265,0.396308
777,21266,0.394427
778,21267,0.077200


In [10]:
result.to_csv("results/resultados_lgbm_1opt.csv",sep=',', index=False)

# pruebas con CV

In [8]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
import sqlite3
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import os

# 1. Configuración de la base de datos SQLite para Optuna
DB_NAME = "optuna_studies.db"
STUDY_NAME = "primera_optimizacion"

# Crear directorio si no existe
os.makedirs("optuna_storage", exist_ok=True)
DB_PATH = f"optuna_storage/{DB_NAME}"

# 2. Preparación de datos (asegúrate de haber hecho el feature engineering primero)
# Eliminar variables no útiles para el modelo
df = df.drop(['periodo_dt', 'descripcion'], axis=1, errors='ignore')

df_kgl = df[df["periodo"].isin([201912])]

df = df.drop(df[df["periodo"].isin([201911,201912])].index,axis=0)

# Codificar variables categóricas
cat_cols = ['cat1', 'cat2', 'cat3', 'brand', 'plan_precios_cuidados',]
for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# Separar features y target
X = df.drop(['target'], axis=1)
y = df['target']

# 3. Configuración de la validación cruzada temporal
tscv = TimeSeriesSplit(n_splits=5)

# 4. Función objetivo para Optuna
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
        'seed': 42,
        
        # Parámetros a optimizar
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
    }
    
    # Validación cruzada
    val_scores = []
    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Dataset de LightGBM
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        
        # Entrenamiento
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
            num_boost_round=1000
        )
        
        # Predicción y evaluación
        preds = model.predict(X_val)
        val_score = np.sqrt(mean_squared_error(y_val, preds))
        val_scores.append(val_score)
    
    return np.mean(val_scores)

# 5. Configuración del almacenamiento en SQLite
def setup_optuna_storage():
    # Crear conexión a la base de datos
    conn = sqlite3.connect(DB_PATH)
    conn.close()
    
    # URL de almacenamiento para Optuna
    storage_url = f"sqlite:///{DB_PATH}"
    return storage_url

# 6. Función para cargar o crear estudio
def get_or_create_study(storage_url, study_name):
    try:
        # Intentar cargar estudio existente
        study = optuna.load_study(
            study_name=study_name,
            storage=storage_url
        )
        print(f"Estudio '{study_name}' cargado con {len(study.trials)} trials existentes")
    except:
        # Crear nuevo estudio si no existe
        study = optuna.create_study(
            study_name=study_name,
            storage=storage_url,
            direction="minimize",
            load_if_exists=True
        )
        print(f"Nuevo estudio '{study_name}' creado")
    return study

# 7. Ejecutar la optimización
def run_optimization():
    storage_url = setup_optuna_storage()
    study = get_or_create_study(storage_url, STUDY_NAME)
    
    # Configurar logger para ver progreso
    optuna.logging.set_verbosity(optuna.logging.INFO)
    
    # Ejecutar optimización
    study.optimize(objective, n_trials=50, timeout=3600)
    
    return study

# 8. Ejecutar todo el proceso
if __name__ == "__main__":
    study = run_optimization()
    
    # Mostrar resultados
    print("\nMejores parámetros encontrados:")
    print(study.best_params)
    print(f"Mejor RMSE: {study.best_value:.4f}")
    
    # 9. Entrenar modelo final con mejores parámetros
    best_params = study.best_params
    best_params.update({
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'seed': 42,
        'n_jobs': -1
    })
    
    final_model = lgb.LGBMRegressor(**best_params, n_estimators=1000)
    final_model.fit(X, y)
    
    # 10. Guardar modelo final
    import joblib
    joblib.dump(final_model, 'lightgbm_optimized_model.pkl')
    
    # 11. Visualizaciones
    print("\nVisualizaciones disponibles:")
    print(f"Base de datos de estudios: {DB_PATH}")
    
    # Exportar resultados a DataFrame
    trials_df = study.trials_dataframe()
    trials_df.to_csv("optuna_storage/trials_results.csv", index=False)
    print("Resultados de trials guardados en optuna_storage/trials_results.csv")

[I 2025-05-29 18:47:47,354] A new study created in RDB with name: primera_optimizacion


Nuevo estudio 'primera_optimizacion' creado


[W 2025-05-29 18:53:45,446] Trial 0 failed with parameters: {'num_leaves': 125, 'learning_rate': 0.0180870615296578, 'feature_fraction': 0.7737019415710031, 'bagging_fraction': 0.9085842649015098, 'bagging_freq': 2, 'min_child_samples': 52, 'lambda_l1': 3.935282838035174e-05, 'lambda_l2': 4.275372907991264e-05, 'max_depth': 11} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\carre\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\carre\AppData\Local\Temp\ipykernel_13592\1664033529.py", line 74, in objective
    model = lgb.train(
            ^^^^^^^^^^
  File "c:\Users\carre\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgbm\engine.py", line 307, in train
    booster.update(fobj=fobj)
  File "c:\Users\carre\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgb

KeyboardInterrupt: 