In [1]:
import os
import lightgbm as lgb
import pandas as pd
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import optuna
import random
from datetime import datetime
import pytz

# importo los .py
import sys
sys.path.append("auxiliares")
import config
import metricas
import lightgbm_aux
import extras

# Display settings: floats are rendered with two decimals and there is no cap
# on the number of columns shown when a DataFrame is displayed.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.options.display.max_columns = None

# Install a blanket "ignore" filter: every warning raised from here on is hidden.
warnings.filterwarnings(action="ignore")

In [2]:
####################################################
############# Set for each machine #################
# Every relative path used by this notebook is resolved against the directory
# chosen here. Instead of commenting/uncommenting a line per machine, set the
# LABO3_BASE_DIR environment variable; when it is not set, the previous
# hard-coded default is used, so existing runs behave exactly as before.
# Locations used so far:
#   C:/Users/herna/labo3_empresa3_repo/
#   C:/diego_tools/labo3/
#   /home/dcastro_contacto/buckets/b1/   (default)
os.chdir(os.environ.get("LABO3_BASE_DIR", "/home/dcastro_contacto/buckets/b1/"))
####################################################

In [3]:
# Input CSV read into df_sellout; relative path, so it is resolved against the
# working directory selected with os.chdir.
arch_entrada = "datasets/emp3_sellout_producto_fe.csv"
# Prefix of the per-run output folder (the run timestamp is appended to it).
carpeta_exp_prefijo = "exp/intermedia_1/"

In [4]:
def lgbm_error_rate(preds, train_data):
    """Custom LightGBM evaluation function based on ``metricas.error_rate``.

    Parameters
    ----------
    preds
        Predictions handed over by LightGBM for the dataset being evaluated.
    train_data
        Dataset object being evaluated; only ``get_label()`` is used, to
        obtain the true targets.

    Returns
    -------
    tuple
        ``('ER', value, False)``: metric name, the value returned by
        ``metricas.error_rate(labels, preds)``, and ``False`` for the
        "is higher better" flag expected from a ``feval`` callable.
    """
    er_value = metricas.error_rate(train_data.get_label(), preds)
    return ('ER', er_value, False)

In [5]:
def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation error rate.

    Hyperparameters are drawn from the ranges declared in ``config``. The model
    is trained on the notebook-level ``X_train``/``y_train`` and early-stopped
    on ``X_validate``/``y_validate`` using the custom ``ER`` metric
    (``lgbm_error_rate``). The parameters, best iteration, validation error and
    fitted model of every trial are appended to the ``GLOBAL_*_ITER`` lists so
    that later cells can reuse them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handle used to sample hyperparameter values.

    Returns
    -------
    The value of ``metricas.error_rate`` on the validation set.
    """
    print("****************", config.OBJECTIVE_PARAM, "****************")

    # suggest_float replaces the deprecated suggest_uniform (same name/low/high semantics).
    lgb_params = {
        'objective': config.OBJECTIVE_PARAM,
        'first_metric_only': True,
        'boost_from_average': True,
        'max_depth':-1,
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, config.L1_UPPER_PARAM),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, config.L2_UPPER_PARAM),
        'min_gain_to_split':0.0,
        'force_row_wise':True,
        'feature_pre_filter':False,
        'metric': "None",  # built-in metrics disabled; only the custom feval is evaluated
        'max_bin': config.MAX_BIN_PARAM,
        'learning_rate': trial.suggest_float('learning_rate', config.LEARNING_RATE_LOWER_PARAM, config.LEARNING_RATE_UPPER_PARAM),
        'num_leaves' : trial.suggest_int('num_leaves', config.NUM_LEAVES_LOWER_PARAM, config.NUM_LEAVES_UPPER_PARAM),
        'feature_fraction': trial.suggest_float('feature_fraction', config.FEATURE_FRACTION_LOWER_PARAM, config.FEATURE_FRACTION_UPPER_PARAM),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', config.MIN_DATA_IN_LEAF_LOWER_PARAM, config.MIN_DATA_IN_LEAF_UPPER_PARAM),
        # Search dimensions currently switched off:
        #'bagging_freq':trial.suggest_int('bagging_freq', 1, 10),
        #'bagging_fraction': trial.suggest_categorical('bagging_fraction',[0.7,0.75,0.8,0.85,0.9,0.95]),
        #'extra_trees':trial.suggest_categorical('extra_trees',[True,False]),
        'verbose':-100,
        'num_threads':-1
    }

    # A very large ceiling; early stopping is what actually ends training.
    lgb_params_num_boost_round = 10000
    # The "unruly" parameter: its value depends on another one (the learning
    # rate), so smaller learning rates are given more patience.
    lgb_params_early_stopping_rounds = int(50 + 5/lgb_params['learning_rate'])

    lgb_train = lgb.Dataset(data=X_train, label=y_train, feature_name=cols_entren)
    lgb_validate = lgb.Dataset(data=X_validate, label=y_validate, reference=lgb_train, feature_name=cols_entren)

    # Train using the selected parameters. Early stopping and periodic logging
    # are passed as callbacks: the early_stopping_rounds= / verbose_eval=
    # keyword arguments of lgb.train() were removed in LightGBM 4.0, while the
    # callbacks are accepted by both the 3.3+ and 4.x series.
    # The stopping rounds are deliberately NOT stored in lgb_params, because
    # that dict is reused later to retrain without a validation set.
    model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_validate],
        num_boost_round=lgb_params_num_boost_round,
        feval=lgbm_error_rate,
        callbacks=[
            lgb.early_stopping(stopping_rounds=lgb_params_early_stopping_rounds,
                               first_metric_only=True),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predictions on the validation set
    y_pred_validate = model.predict(X_validate)
    er_validate = metricas.error_rate(y_validate, y_pred_validate)

    # One entry per trial, appended in trial order.
    GLOBAL_PARAMS_ITER.append(lgb_params)
    GLOBAL_BEST_IT_ITER.append(model.best_iteration)
    GLOBAL_ER_VALIDATE_ITER.append(er_validate)
    GLOBAL_MODEL_ITER.append(model)

    return er_validate

In [6]:
##############################################

In [7]:
# Load the input dataset (path set in arch_entrada) and print its schema.
df_sellout = pd.read_csv(arch_entrada)
df_sellout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34707 entries, 0 to 34706
Data columns (total 86 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   product_id                    34707 non-null  int64  
 1   periodo                       34707 non-null  int64  
 2   tn                            34707 non-null  float64
 3   cero_ventas                   34707 non-null  int64  
 4   cust_request_qty              34707 non-null  float64
 5   cust_request_tn               34707 non-null  float64
 6   mes                           34707 non-null  int64  
 7   producto_estrella             34707 non-null  float64
 8   plan_precios_cuidados         34707 non-null  float64
 9   meses_historia_prod           34707 non-null  int64  
 10  cat1                          34707 non-null  object 
 11  cat2                          34707 non-null  object 
 12  cat3                          34707 non-null  object 
 13  s

In [8]:
# Product ids used below to restrict the validation (and holdout) rows;
# the file path comes from config.
df_prods_prediccion_entren=pd.read_csv(config.ARCH_PRODUCTOS_PREDICCION_ENTRENAMIENTO)
df_prods_prediccion_entren.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 868 entries, 0 to 867
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product_id  868 non-null    int64
dtypes: int64(1)
memory usage: 6.9 KB


In [9]:
### Categorical columns
# Rebinds df_sellout to the output of the project helper, so re-running this
# cell feeds the already-converted frame back into it.
# NOTE(review): presumably this encodes the object-dtype columns (cat1/cat2/cat3
# in the info() output) so LightGBM can consume them - confirm in extras.py.
df_sellout = extras.convertir_categoricas_prod(df_sellout)

In [10]:
### Dataset split
# Train: all rows whose period lies in [PERIODO_INICIO_PARAM, TRAIN_PERIODO_LIMITE_PARAM]
# (both ends inclusive).
df_train = df_sellout.loc[
    df_sellout.periodo.between(config.PERIODO_INICIO_PARAM, config.TRAIN_PERIODO_LIMITE_PARAM)
]
# Validate: only the validation period, and only the products listed for prediction.
df_validate = df_sellout.loc[
    (df_sellout.periodo == config.VALIDATE_PERIODO_PARAM)
    & df_sellout.product_id.isin(df_prods_prediccion_entren.product_id)
]

print("Periodos entrenar:", df_train.periodo.unique())
print("Periodos validar:", df_validate.periodo.unique())

# Keep validation rows ordered by product id.
df_validate = df_validate.sort_values("product_id")
print("product_id a validar:", df_validate.shape[0])

Periodos entrenar: [201701 201702 201703 201704 201705 201706 201707 201708 201709 201710
 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
 201809 201810]
Periodos validar: [201812]
product_id a validar: 868


In [11]:
### Build the training variables
# Columns excluded from the feature matrices: the target and the period.
cols_remover_entren = ["tn_mas_2", "periodo"]

# Train - Validate
X_train = df_train.drop(columns=cols_remover_entren)
X_validate = df_validate.drop(columns=cols_remover_entren)

# Target column used as the label for both sets.
y_train = df_train["tn_mas_2"]
y_validate = df_validate["tn_mas_2"]

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

print("\nX_validate:", X_validate.shape)
print("y_validate:", y_validate.shape)

# Feature names, passed to every lgb.Dataset built from these matrices.
cols_entren = list(X_train.columns)

X_train: (20016, 84)
y_train: (20016,)

X_validate: (868, 84)
y_validate: (868,)


In [None]:
### Bayesian optimisation
# Per-trial logs, reset on every run of this cell; objective() appends one
# entry to each list per trial.
GLOBAL_PARAMS_ITER, GLOBAL_BEST_IT_ITER = [], []
GLOBAL_ER_VALIDATE_ITER, GLOBAL_MODEL_ITER = [], []

# The objective returns the validation error rate, hence 'minimize'.
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=config.CANT_ITERACIONES_OPT_BAY_PARAM)
print('# Experimentos:', len(study.trials))
print('Mejor experimento:', study.best_trial.params)

[I 2023-12-11 15:32:54,239] A new study created in memory with name: no-name-dc46c6dc-bd13-4210-94a2-2e27527ac0c7


**************** regression ****************
Training until validation scores don't improve for 73 rounds
[100]	valid_0's ER: 56.03
[200]	valid_0's ER: 52.65
[300]	valid_0's ER: 51.49


[I 2023-12-11 15:32:55,529] Trial 0 finished with value: 50.95 and parameters: {'lambda_l1': 529.5886137116597, 'lambda_l2': 686.0466430830945, 'learning_rate': 0.2122572338860103, 'num_leaves': 885, 'feature_fraction': 0.7420799556258064, 'min_data_in_leaf': 6640}. Best is trial 0 with value: 50.95.


[400]	valid_0's ER: 51.14
Early stopping, best iteration is:
[388]	valid_0's ER: 50.95
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 69 rounds
[100]	valid_0's ER: 51.98


[I 2023-12-11 15:32:56,017] Trial 1 finished with value: 50.34 and parameters: {'lambda_l1': 56.44785272608288, 'lambda_l2': 758.1110294104332, 'learning_rate': 0.2558334893270708, 'num_leaves': 676, 'feature_fraction': 0.3935502779557843, 'min_data_in_leaf': 5882}. Best is trial 1 with value: 50.34.


[200]	valid_0's ER: 51.57
Early stopping, best iteration is:
[136]	valid_0's ER: 50.34
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 87 rounds
[100]	valid_0's ER: 27.58
[200]	valid_0's ER: 27.18
[300]	valid_0's ER: 27.08


[I 2023-12-11 15:32:59,917] Trial 2 finished with value: 26.79 and parameters: {'lambda_l1': 261.5439040527991, 'lambda_l2': 430.93800790985904, 'learning_rate': 0.13175642606589244, 'num_leaves': 339, 'feature_fraction': 0.5496702102724926, 'min_data_in_leaf': 211}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[252]	valid_0's ER: 26.79
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 76 rounds
[100]	valid_0's ER: 53.62
[200]	valid_0's ER: 50.35
[300]	valid_0's ER: 49.3


[I 2023-12-11 15:33:01,098] Trial 3 finished with value: 48.68 and parameters: {'lambda_l1': 599.4331164478059, 'lambda_l2': 200.30810075898265, 'learning_rate': 0.19224571993338335, 'num_leaves': 89, 'feature_fraction': 0.651432099918581, 'min_data_in_leaf': 6450}. Best is trial 2 with value: 26.79.


[400]	valid_0's ER: 48.9
Early stopping, best iteration is:
[394]	valid_0's ER: 48.68
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 95 rounds
[100]	valid_0's ER: 64.7
[200]	valid_0's ER: 61.03
[300]	valid_0's ER: 59.25
[400]	valid_0's ER: 57.85
[500]	valid_0's ER: 57.13
[600]	valid_0's ER: 56.51
[700]	valid_0's ER: 56.15
[800]	valid_0's ER: 55.29
[900]	valid_0's ER: 55.05
[1000]	valid_0's ER: 54.92
[1100]	valid_0's ER: 54.81


[I 2023-12-11 15:33:04,568] Trial 4 finished with value: 54.74 and parameters: {'lambda_l1': 498.10175393568744, 'lambda_l2': 453.3051282969073, 'learning_rate': 0.11014160073048335, 'num_leaves': 858, 'feature_fraction': 0.8510356295967927, 'min_data_in_leaf': 7328}. Best is trial 2 with value: 26.79.


[1200]	valid_0's ER: 54.93
Early stopping, best iteration is:
[1152]	valid_0's ER: 54.74
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 129 rounds
[100]	valid_0's ER: 61.73
[200]	valid_0's ER: 55.76
[300]	valid_0's ER: 53.64
[400]	valid_0's ER: 52.74
[500]	valid_0's ER: 51.75
[600]	valid_0's ER: 50.88
[700]	valid_0's ER: 50.2
[800]	valid_0's ER: 49.93
[900]	valid_0's ER: 49.5
[1000]	valid_0's ER: 49.21
[1100]	valid_0's ER: 49.04
Early stopping, best iteration is:
[1054]	valid_0's ER: 48.96
Evaluated only: ER


[I 2023-12-11 15:33:07,605] Trial 5 finished with value: 48.96 and parameters: {'lambda_l1': 490.2700212630581, 'lambda_l2': 780.860850500334, 'learning_rate': 0.0632607467644285, 'num_leaves': 545, 'feature_fraction': 0.7922195837617825, 'min_data_in_leaf': 6382}. Best is trial 2 with value: 26.79.


**************** regression ****************
Training until validation scores don't improve for 72 rounds
[100]	valid_0's ER: 51.89


[I 2023-12-11 15:33:08,378] Trial 6 finished with value: 49.68 and parameters: {'lambda_l1': 324.1094651871109, 'lambda_l2': 849.6595028011178, 'learning_rate': 0.2179000207288346, 'num_leaves': 911, 'feature_fraction': 0.6984803760601872, 'min_data_in_leaf': 5828}. Best is trial 2 with value: 26.79.


[200]	valid_0's ER: 51.18
Early stopping, best iteration is:
[167]	valid_0's ER: 49.68
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 114 rounds
[100]	valid_0's ER: 42.37
[200]	valid_0's ER: 41.19
[300]	valid_0's ER: 40.21
[400]	valid_0's ER: 39.64
[500]	valid_0's ER: 39.26
[600]	valid_0's ER: 39.3
[700]	valid_0's ER: 38.85
[800]	valid_0's ER: 38.71


[I 2023-12-11 15:33:14,497] Trial 7 finished with value: 38.7 and parameters: {'lambda_l1': 211.73708445895045, 'lambda_l2': 321.037280406139, 'learning_rate': 0.0775347801570902, 'num_leaves': 203, 'feature_fraction': 0.744232915445245, 'min_data_in_leaf': 1429}. Best is trial 2 with value: 26.79.


[900]	valid_0's ER: 38.96
Early stopping, best iteration is:
[799]	valid_0's ER: 38.7
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 83 rounds
[100]	valid_0's ER: 28.1
[200]	valid_0's ER: 27.11


[I 2023-12-11 15:33:18,840] Trial 8 finished with value: 26.96 and parameters: {'lambda_l1': 87.70821991550449, 'lambda_l2': 915.174910787163, 'learning_rate': 0.14995731255445555, 'num_leaves': 991, 'feature_fraction': 0.866822994130465, 'min_data_in_leaf': 302}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[183]	valid_0's ER: 26.96
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 80 rounds
[100]	valid_0's ER: 63.48
[200]	valid_0's ER: 59.97
[300]	valid_0's ER: 58.92
[400]	valid_0's ER: 58.18
[500]	valid_0's ER: 57.22
[600]	valid_0's ER: 56.83


[I 2023-12-11 15:33:20,002] Trial 9 finished with value: 56.4 and parameters: {'lambda_l1': 431.4779332348312, 'lambda_l2': 660.7879726653192, 'learning_rate': 0.1632431240329774, 'num_leaves': 236, 'feature_fraction': 0.307185965235211, 'min_data_in_leaf': 7407}. Best is trial 2 with value: 26.79.


[700]	valid_0's ER: 56.82
[800]	valid_0's ER: 56.83
Early stopping, best iteration is:
[761]	valid_0's ER: 56.4
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 225 rounds
[100]	valid_0's ER: 54.41
[200]	valid_0's ER: 55.69


[I 2023-12-11 15:33:21,135] Trial 10 finished with value: 54.23 and parameters: {'lambda_l1': 807.625198694137, 'lambda_l2': 100.43442111455403, 'learning_rate': 0.02854157393764714, 'num_leaves': 367, 'feature_fraction': 0.507623578896002, 'min_data_in_leaf': 3044}. Best is trial 2 with value: 26.79.


[300]	valid_0's ER: 55.29
Early stopping, best iteration is:
[111]	valid_0's ER: 54.23
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 89 rounds
[100]	valid_0's ER: 29.39
[200]	valid_0's ER: 27.24
[300]	valid_0's ER: 27.04


[I 2023-12-11 15:33:31,454] Trial 11 finished with value: 26.98 and parameters: {'lambda_l1': 17.487321963061447, 'lambda_l2': 956.2581532625129, 'learning_rate': 0.12630203928285452, 'num_leaves': 480, 'feature_fraction': 0.9797573430483999, 'min_data_in_leaf': 133}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[276]	valid_0's ER: 26.98
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 85 rounds
[100]	valid_0's ER: 28.09
[200]	valid_0's ER: 27.2
[300]	valid_0's ER: 26.95


[I 2023-12-11 15:33:39,361] Trial 12 finished with value: 26.83 and parameters: {'lambda_l1': 176.98621273492313, 'lambda_l2': 548.2771222984433, 'learning_rate': 0.140429580101097, 'num_leaves': 676, 'feature_fraction': 0.570148361335449, 'min_data_in_leaf': 44}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[246]	valid_0's ER: 26.83
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 66 rounds
[100]	valid_0's ER: 45.59
[200]	valid_0's ER: 43.02
[300]	valid_0's ER: 40.64
[400]	valid_0's ER: 39.93
[500]	valid_0's ER: 38.61
[600]	valid_0's ER: 37.31
[700]	valid_0's ER: 36.9
[800]	valid_0's ER: 36.47
[900]	valid_0's ER: 36.12
[1000]	valid_0's ER: 35.95
[1100]	valid_0's ER: 35.34
[1200]	valid_0's ER: 35.08
[1300]	valid_0's ER: 34.81


[I 2023-12-11 15:33:44,723] Trial 13 finished with value: 34.56 and parameters: {'lambda_l1': 216.17593135639936, 'lambda_l2': 513.5555537262323, 'learning_rate': 0.29494872834932206, 'num_leaves': 658, 'feature_fraction': 0.5448320434322357, 'min_data_in_leaf': 2181}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[1318]	valid_0's ER: 34.56
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 94 rounds
[100]	valid_0's ER: 55.23
[200]	valid_0's ER: 52.24
[300]	valid_0's ER: 49.45
[400]	valid_0's ER: 47.97
[500]	valid_0's ER: 47.13
[600]	valid_0's ER: 46.32
[700]	valid_0's ER: 45.55
[800]	valid_0's ER: 44.55
[900]	valid_0's ER: 43.73
[1000]	valid_0's ER: 43.07
[1100]	valid_0's ER: 42.48
[1200]	valid_0's ER: 42.06
[1300]	valid_0's ER: 41.66
[1400]	valid_0's ER: 41.49
[1500]	valid_0's ER: 41.25
[1600]	valid_0's ER: 40.98
[1700]	valid_0's ER: 40.74
[1800]	valid_0's ER: 40.66
[1900]	valid_0's ER: 40.58
[2000]	valid_0's ER: 40.3
[2100]	valid_0's ER: 40.08
[2200]	valid_0's ER: 39.83
[2300]	valid_0's ER: 39.53
[2400]	valid_0's ER: 39.49
[2500]	valid_0's ER: 39.13
[2600]	valid_0's ER: 38.77
[2700]	valid_0's ER: 38.68
[2800]	valid_0's ER: 38.46
[2900]	valid_0's ER: 38.39
[3000]	valid_0's ER: 38.16
[3100]	valid_

[I 2023-12-11 15:33:53,910] Trial 14 finished with value: 37.95 and parameters: {'lambda_l1': 244.245399549492, 'lambda_l2': 444.54673568900733, 'learning_rate': 0.11316404429442767, 'num_leaves': 699, 'feature_fraction': 0.5886501350375044, 'min_data_in_leaf': 4079}. Best is trial 2 with value: 26.79.


[3200]	valid_0's ER: 38.12
Early stopping, best iteration is:
[3118]	valid_0's ER: 37.95
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 83 rounds
[100]	valid_0's ER: 38.81
[200]	valid_0's ER: 36.26
[300]	valid_0's ER: 35.44


[I 2023-12-11 15:33:56,018] Trial 15 finished with value: 34.84 and parameters: {'lambda_l1': 332.7909324918515, 'lambda_l2': 580.455407297702, 'learning_rate': 0.1506775391413057, 'num_leaves': 405, 'feature_fraction': 0.4662579962725304, 'min_data_in_leaf': 1204}. Best is trial 2 with value: 26.79.


[400]	valid_0's ER: 35.19
Early stopping, best iteration is:
[340]	valid_0's ER: 34.84
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 109 rounds
[100]	valid_0's ER: 58.05
[200]	valid_0's ER: 53.46
[300]	valid_0's ER: 51.16
[400]	valid_0's ER: 49.85
[500]	valid_0's ER: 48.8
[600]	valid_0's ER: 48.5
[700]	valid_0's ER: 47.96
[800]	valid_0's ER: 46.86
[900]	valid_0's ER: 46.3
[1000]	valid_0's ER: 45.56
[1100]	valid_0's ER: 45.13
[1200]	valid_0's ER: 44.54
[1300]	valid_0's ER: 44.11
[1400]	valid_0's ER: 43.72
[1500]	valid_0's ER: 43.37
[1600]	valid_0's ER: 43.07
[1700]	valid_0's ER: 42.98
[1800]	valid_0's ER: 42.84
[1900]	valid_0's ER: 42.67
[2000]	valid_0's ER: 42.55
[2100]	valid_0's ER: 42.32
[2200]	valid_0's ER: 42.1
[2300]	valid_0's ER: 41.9
[2400]	valid_0's ER: 41.73
[2500]	valid_0's ER: 41.56
[2600]	valid_0's ER: 41.2
[2700]	valid_0's ER: 41.04
[2800]	valid_0's ER: 40.85
[2900]	valid_0's ER: 40.62
[3000]	valid_0's ER

[I 2023-12-11 15:34:10,152] Trial 16 finished with value: 38.72 and parameters: {'lambda_l1': 141.93083683685836, 'lambda_l2': 342.32172385893386, 'learning_rate': 0.08462641941642177, 'num_leaves': 268, 'feature_fraction': 0.6023815730035774, 'min_data_in_leaf': 3691}. Best is trial 2 with value: 26.79.


**************** regression ****************
Training until validation scores don't improve for 508 rounds
[100]	valid_0's ER: 60.96
[200]	valid_0's ER: 40.58
[300]	valid_0's ER: 38.01
[400]	valid_0's ER: 38.44
[500]	valid_0's ER: 39.23
[600]	valid_0's ER: 39.9
[700]	valid_0's ER: 40.25
[800]	valid_0's ER: 40.06


[I 2023-12-11 15:34:13,799] Trial 17 finished with value: 37.9 and parameters: {'lambda_l1': 7.644923959601499, 'lambda_l2': 571.9101787357646, 'learning_rate': 0.010916996559839964, 'num_leaves': 27, 'feature_fraction': 0.4505072233165552, 'min_data_in_leaf': 1136}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[340]	valid_0's ER: 37.9
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 78 rounds


[I 2023-12-11 15:34:14,198] Trial 18 finished with value: 45.9 and parameters: {'lambda_l1': 149.56917545598304, 'lambda_l2': 15.328250831138064, 'learning_rate': 0.17628692439598354, 'num_leaves': 528, 'feature_fraction': 0.256400199116692, 'min_data_in_leaf': 2157}. Best is trial 2 with value: 26.79.


Early stopping, best iteration is:
[18]	valid_0's ER: 45.9
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 85 rounds
[100]	valid_0's ER: 25.82
[200]	valid_0's ER: 25.42


[I 2023-12-11 15:34:20,254] Trial 19 finished with value: 25.35 and parameters: {'lambda_l1': 302.79268142588865, 'lambda_l2': 348.0497511898205, 'learning_rate': 0.13939286348275323, 'num_leaves': 785, 'feature_fraction': 0.6240015966120067, 'min_data_in_leaf': 57}. Best is trial 19 with value: 25.35.


Early stopping, best iteration is:
[173]	valid_0's ER: 25.35
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 93 rounds
[100]	valid_0's ER: 55.48
[200]	valid_0's ER: 52.42
[300]	valid_0's ER: 51


[I 2023-12-11 15:34:21,771] Trial 20 finished with value: 50.67 and parameters: {'lambda_l1': 340.63180055256964, 'lambda_l2': 321.7183155752097, 'learning_rate': 0.11520090665397575, 'num_leaves': 789, 'feature_fraction': 0.6501746144446783, 'min_data_in_leaf': 4735}. Best is trial 19 with value: 25.35.


[400]	valid_0's ER: 51.44
Early stopping, best iteration is:
[354]	valid_0's ER: 50.67
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 86 rounds
[100]	valid_0's ER: 26.37
[200]	valid_0's ER: 26


[I 2023-12-11 15:34:25,813] Trial 21 finished with value: 25.84 and parameters: {'lambda_l1': 270.2554533009711, 'lambda_l2': 409.22007842676766, 'learning_rate': 0.13628654033009288, 'num_leaves': 602, 'feature_fraction': 0.5730052661275433, 'min_data_in_leaf': 87}. Best is trial 19 with value: 25.35.


Early stopping, best iteration is:
[151]	valid_0's ER: 25.84
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 78 rounds


[I 2023-12-11 15:34:26,570] Trial 22 finished with value: 33.17 and parameters: {'lambda_l1': 302.95882817289805, 'lambda_l2': 416.73341782485204, 'learning_rate': 0.1773121397189496, 'num_leaves': 573, 'feature_fraction': 0.5163441787857, 'min_data_in_leaf': 734}. Best is trial 19 with value: 25.35.


Early stopping, best iteration is:
[19]	valid_0's ER: 33.17
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 89 rounds
[100]	valid_0's ER: 44.72
[200]	valid_0's ER: 42.86
[300]	valid_0's ER: 41.97
[400]	valid_0's ER: 41.02
[500]	valid_0's ER: 40.65
[600]	valid_0's ER: 39.99
[700]	valid_0's ER: 39.52
[800]	valid_0's ER: 39
[900]	valid_0's ER: 38.82
[1000]	valid_0's ER: 38.39
[1100]	valid_0's ER: 37.88
[1200]	valid_0's ER: 37.36
[1300]	valid_0's ER: 37.46
Early stopping, best iteration is:
[1238]	valid_0's ER: 37.17
Evaluated only: ER


[I 2023-12-11 15:34:34,357] Trial 23 finished with value: 37.17 and parameters: {'lambda_l1': 381.6155439948649, 'lambda_l2': 224.5519018321905, 'learning_rate': 0.1276649603321415, 'num_leaves': 354, 'feature_fraction': 0.6303185457384999, 'min_data_in_leaf': 2064}. Best is trial 19 with value: 25.35.


**************** regression ****************
Training until validation scores don't improve for 84 rounds


[I 2023-12-11 15:34:35,054] Trial 24 finished with value: 35.04 and parameters: {'lambda_l1': 254.54005780433621, 'lambda_l2': 359.92843251551136, 'learning_rate': 0.1469205875382575, 'num_leaves': 785, 'feature_fraction': 0.3911604996157467, 'min_data_in_leaf': 832}. Best is trial 19 with value: 25.35.


[100]	valid_0's ER: 36.35
Early stopping, best iteration is:
[24]	valid_0's ER: 35.04
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 108 rounds
[100]	valid_0's ER: 45.43
[200]	valid_0's ER: 43.25
[300]	valid_0's ER: 42.31
[400]	valid_0's ER: 42.09
[500]	valid_0's ER: 41.42


[I 2023-12-11 15:34:37,726] Trial 25 finished with value: 41.29 and parameters: {'lambda_l1': 405.07844560507317, 'lambda_l2': 248.67137821877387, 'learning_rate': 0.08489224528551911, 'num_leaves': 438, 'feature_fraction': 0.5469116886569488, 'min_data_in_leaf': 1804}. Best is trial 19 with value: 25.35.


Early stopping, best iteration is:
[488]	valid_0's ER: 41.29
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 97 rounds


[I 2023-12-11 15:34:38,454] Trial 26 finished with value: 50.96 and parameters: {'lambda_l1': 266.3684387590133, 'lambda_l2': 478.320169243446, 'learning_rate': 0.10528079034017737, 'num_leaves': 617, 'feature_fraction': 0.6744077086861325, 'min_data_in_leaf': 2876}. Best is trial 19 with value: 25.35.


[100]	valid_0's ER: 51.74
Early stopping, best iteration is:
[31]	valid_0's ER: 50.96
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 87 rounds
[100]	valid_0's ER: 31.07
[200]	valid_0's ER: 30.42


[I 2023-12-11 15:34:41,139] Trial 27 finished with value: 30.22 and parameters: {'lambda_l1': 110.7471986791673, 'lambda_l2': 404.76895653015515, 'learning_rate': 0.13503382980527495, 'num_leaves': 742, 'feature_fraction': 0.5915266785260339, 'min_data_in_leaf': 614}. Best is trial 19 with value: 25.35.


Early stopping, best iteration is:
[206]	valid_0's ER: 30.22
Evaluated only: ER
**************** regression ****************
Training until validation scores don't improve for 81 rounds
[100]	valid_0's ER: 27.15
[200]	valid_0's ER: 26.39
[300]	valid_0's ER: 26.2
[400]	valid_0's ER: 26.07
[500]	valid_0's ER: 26.05


In [None]:
# Feature Importance
# The best trial's number is used as a position in GLOBAL_MODEL_ITER; this holds
# as long as objective() appended exactly one model per trial of this study,
# in trial order.
best_model_index = study.best_trial.number
best_model = GLOBAL_MODEL_ITER[best_model_index]
# Project helper; its return value is written to feature_importance.csv when
# the results are saved.
feature_importance = lightgbm_aux.plot_lgb_importances(best_model, num=30, plot=True)

In [None]:
# When a holdout is used, build the variables needed to evaluate models on it later
if config.USAR_HOLDOUT_PARAM:
    # Dataframes
    # Train-all: periods in [PERIODO_INICIO_PARAM, TRAIN_ALL_PERIODO_LIMITE_PARAM], inclusive.
    df_train_all = df_sellout.loc[
        df_sellout.periodo.between(config.PERIODO_INICIO_PARAM, config.TRAIN_ALL_PERIODO_LIMITE_PARAM)
    ]
    # Holdout: the holdout period only, restricted to the products to predict,
    # ordered by product id.
    df_holdout = df_sellout.loc[
        (df_sellout.periodo == config.HOLDOUT_PERIODO_PARAM)
        & df_sellout.product_id.isin(df_prods_prediccion_entren.product_id)
    ].sort_values("product_id")
    print("product_id a testear:", df_holdout.shape[0])

    print("Periodos entrenar ALL:", df_train_all.periodo.unique())
    print("Periodos holdout:", df_holdout.periodo.unique())

    # Training variables (same excluded columns as for train/validate)
    X_train_all = df_train_all.drop(columns=cols_remover_entren)
    X_holdout = df_holdout.drop(columns=cols_remover_entren)

    y_train_all = df_train_all["tn_mas_2"]
    y_holdout = df_holdout["tn_mas_2"]
    print("\nX_train_all:", X_train_all.shape)
    print("y_train_all:", y_train_all.shape)

    print("\nX_holdout:", X_holdout.shape)
    print("y_holdout:", y_holdout.shape)

    # Dataset reused for every retraining done during holdout evaluation.
    lgbtrain_all = lgb.Dataset(data=X_train_all, label=y_train_all, feature_name=cols_entren)

In [None]:
# Run identifier: current Buenos Aires time formatted as YYYYMMDD_HHMMSS.
current_datetime = datetime.now(tz=pytz.timezone('America/Argentina/Buenos_Aires'))
exp_numero = current_datetime.strftime("%Y%m%d_%H%M%S")
exp_numero

In [None]:
# Save results
carpeta_exp = carpeta_exp_prefijo + exp_numero + "/"
# exist_ok=True replaces the check-then-create pair: no race between the two
# calls, and the cell can be re-run safely.
os.makedirs(carpeta_exp, exist_ok=True)

# Feature Importance
feature_importance.to_csv(carpeta_exp + "feature_importance.csv",index=False)

# Parameters: the run configuration stored next to the results.
parametros_nombres = ["PERIODO_INICIO_PARAM","TRAIN_PERIODO_LIMITE_PARAM","VALIDATE_PERIODO_PARAM","USAR_HOLDOUT_PARAM",
                      "TRAIN_ALL_PERIODO_LIMITE_PARAM","HOLDOUT_PERIODO_PARAM",
                      "MODELO_FINAL_PERIODO_LIMITE_PARAM","FUTURE_PERIODO_PARAM",
                      "ARCH_PRODUCTOS_PREDICCION_ENTRENAMIENTO","ARCH_PRODUCTOS_PREDICCION_FUTURE",
                      "NUM_LAGS_PARAM","FAMILIA_FEATURES_TEMP_PARAM",
                      "CANT_ITERACIONES_OPT_BAY_PARAM","CANT_SEMILLAS"]

# Values are looked up by name so the two columns can never get out of step
# (the previous hand-written list had to mirror the names list exactly).
parametros_valores = [getattr(config, nombre) for nombre in parametros_nombres]

parametros = pd.DataFrame(data={"nombre":parametros_nombres,"valor":parametros_valores})
parametros.to_csv(carpeta_exp + "parametros.csv",index=False)

# Bayesian optimisation: one row per trial
df_opt_bay = pd.DataFrame(data={"lgb_params":GLOBAL_PARAMS_ITER,"best_iteration":GLOBAL_BEST_IT_ITER,
                                "error_rate_validate":GLOBAL_ER_VALIDATE_ITER})

# Sort ascending by validation error; "index" then holds the rank (0 = best trial)
df_opt_bay = df_opt_bay.sort_values(by="error_rate_validate",ascending=True)
df_opt_bay["index"] = range(0,len(GLOBAL_PARAMS_ITER))

# Holdout
if config.USAR_HOLDOUT_PARAM:
    er_holdout_iter = [None] * len(df_opt_bay)
    er_holdout_semillerio_iter = [None] * len(df_opt_bay)

    # Clamp the upper bound to the number of trials actually available.
    # Otherwise a CANT_EVAL_HOLDOUT larger than the study raises IndexError
    # after the expensive retraining and opt_bay.xlsx is never written.
    limite_eval_holdout = min(config.CANT_EVAL_HOLDOUT, len(df_opt_bay))

    for i in range(config.OFFSET_EVAL_HOLDOUT, limite_eval_holdout):
        print("Iteración:",i)

        # i-th best trial: fetch its row once instead of on every use
        fila_trial = df_opt_bay.iloc[i]
        params_trial = fila_trial.lgb_params
        # lgb.train expects a plain int number of rounds
        rondas_trial = int(fila_trial.best_iteration)

        # Train-all model (without seed ensemble)
        model_train_all = lgb.train(params_trial, lgbtrain_all, num_boost_round=rondas_trial)
        y_pred_holdout = model_train_all.predict(X_holdout)
        er_holdout_iter[i] = metricas.error_rate(y_holdout,y_pred_holdout)

        # Seed ensemble ("semillerio") via the project helper, using CANT_SEMILLAS
        y_pred_holdout_semillerio = lightgbm_aux.semillerio(lgbtrain_all, params_trial, rondas_trial, X_holdout,config.CANT_SEMILLAS)
        er_holdout_semillerio_iter[i] = metricas.error_rate(y_holdout,y_pred_holdout_semillerio)

    # Trials that were not evaluated on the holdout keep None in both columns
    df_opt_bay["error_rate_holdout"]=er_holdout_iter
    df_opt_bay["error_rate_holdout_semillerio"]=er_holdout_semillerio_iter

df_opt_bay.to_excel(carpeta_exp +  "opt_bay.xlsx",index=False)

In [None]:
# End-of-run marker: printed only when every previous cell has been executed.
print("FINALIZADO")