In [1]:
import os
import lightgbm as lgb
import pandas as pd
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import optuna
import random
from datetime import datetime
import pytz

# importo los .py
import sys
sys.path.append("auxiliares")
import config
import metricas
import lightgbm_aux
import extras

# Display settings: render floats with two decimals and never truncate columns
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

In [2]:
####################################################
############# Set according to each machine ########
# The project root can be overridden through the LABO3_BASE_DIR environment
# variable, so the notebook can run on another machine without editing this
# cell. When the variable is not set, the original default below is used.
# Roots used previously on other machines:
#   C:/Users/herna/labo3_empresa3_repo/
#   C:/diego_tools/labo3/
os.chdir(os.environ.get("LABO3_BASE_DIR", "/home/dcastro_contacto/buckets/b1/"))
####################################################

In [3]:
# --- Manual clustering (active configuration) ---
# Output folder for experiments, prefix of the per-cluster input CSVs
# (the cluster index and ".csv" are appended when reading) and cluster count.
carpeta_base_exp = "exp/avanzada_1/"
prefijo_arch_entrada = "datasets/emp3_sellout_fe_cluster_manual_"
num_clusters = 94

# --- Time-warping clustering (alternative; uncomment to use it instead) ---
# NOTE(review): the input prefix below points at the "base" files while the
# active configuration reads "fe" files -- confirm which one is intended.
#prefijo_arch_entrada = "datasets/emp3_sellout_base_cluster_warp_"
#prefijo_arch_salida = "datasets/emp3_sellout_fe_cluster_warp_"
#num_clusters=94
#carpeta_base_exp = "exp/avanzada_2/"

In [4]:
def lgbm_error_rate(preds, train_data):
    """Custom LightGBM evaluation function reporting the project's error rate.

    Args:
        preds: Predictions produced by the booster for ``train_data``.
        train_data: Dataset object exposing ``get_label()`` with the true targets.

    Returns:
        A ``(name, value, is_higher_better)`` tuple: the metric is named
        ``'ER'``, its value comes from ``metricas.error_rate(labels, preds)``,
        and ``False`` marks it as a metric to minimise.
    """
    y_true = train_data.get_label()
    return 'ER', metricas.error_rate(y_true, preds), False

In [5]:
def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation error rate.

    Hyperparameters are sampled from the ranges defined in ``config``. The
    model is trained with early stopping on the validation set, scored with the
    custom ``ER`` metric (``lgbm_error_rate``).

    This function depends on notebook-level globals that the calling cell must
    define before ``study.optimize`` runs: ``X_train``, ``y_train``,
    ``X_validate``, ``y_validate``, ``cols_entren`` and the four
    ``GLOBAL_*_ITER`` lists. One entry per trial is appended to each of those
    lists (parameters, best iteration, validation error, trained model).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The value of ``metricas.error_rate`` on the validation set.
    """
    print("****************", config.OBJECTIVE_PARAM, "****************")

    lgb_params = {
        'objective': config.OBJECTIVE_PARAM,
        'first_metric_only': True,
        'boost_from_average': True,
        'max_depth': -1,
        # suggest_float replaces the deprecated suggest_uniform (same sampling).
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, config.L1_UPPER_PARAM),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, config.L2_UPPER_PARAM),
        'min_gain_to_split': 0.0,
        'force_row_wise': True,
        'feature_pre_filter': False,
        # Built-in metrics are disabled; only the custom feval is evaluated.
        'metric': "None",
        'max_bin': config.MAX_BIN_PARAM,
        'learning_rate': trial.suggest_float('learning_rate', config.LEARNING_RATE_LOWER_PARAM, config.LEARNING_RATE_UPPER_PARAM),
        'num_leaves': trial.suggest_int('num_leaves', config.NUM_LEAVES_LOWER_PARAM, config.NUM_LEAVES_UPPER_PARAM),
        'feature_fraction': trial.suggest_float('feature_fraction', config.FEATURE_FRACTION_LOWER_PARAM, config.FEATURE_FRACTION_UPPER_PARAM),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', config.MIN_DATA_IN_LEAF_LOWER_PARAM, config.MIN_DATA_IN_LEAF_UPPER_PARAM),
        #'bagging_freq':trial.suggest_int('bagging_freq', 1, 10),
        #'bagging_fraction': trial.suggest_categorical('bagging_fraction',[0.7,0.75,0.8,0.85,0.9,0.95]),
        #'extra_trees':trial.suggest_categorical('extra_trees',[True,False]),
        'verbose': -100,
        'num_threads': -1
    }

    # A very large cap; the effective number of rounds is set by early stopping.
    lgb_params_num_boost_round = 10000
    # The patience depends on the sampled learning rate: the smaller the
    # learning rate, the more rounds are allowed without improvement.
    lgb_params_early_stopping_rounds = int(50 + 5 / lgb_params['learning_rate'])

    lgb_train = lgb.Dataset(data=X_train, label=y_train, feature_name=cols_entren)
    lgb_validate = lgb.Dataset(data=X_validate, label=y_validate, reference=lgb_train, feature_name=cols_entren)

    # Train with the sampled parameters. Early stopping and periodic logging are
    # passed as callbacks because the `early_stopping_rounds` / `verbose_eval`
    # keyword arguments of lgb.train were removed in LightGBM 4.0.
    model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_validate],
        num_boost_round=lgb_params_num_boost_round,
        feval=lgbm_error_rate,
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=lgb_params_early_stopping_rounds,
                first_metric_only=True,
            ),
            lgb.log_evaluation(period=100),
        ],
    )

    # Score the trained model on the validation set
    y_pred_validate = model.predict(X_validate)
    er_validate = metricas.error_rate(y_validate, y_pred_validate)

    # Record this trial; the calling cell indexes these lists by trial number.
    GLOBAL_PARAMS_ITER.append(lgb_params)
    GLOBAL_BEST_IT_ITER.append(model.best_iteration)
    GLOBAL_ER_VALIDATE_ITER.append(er_validate)
    GLOBAL_MODEL_ITER.append(model)

    return er_validate

In [6]:
##############################################

In [7]:
# Experiment id: current time in Buenos Aires, formatted as YYYYMMDD_HHMMSS
zona_horaria = pytz.timezone('America/Argentina/Buenos_Aires')
current_datetime = datetime.now().astimezone(zona_horaria)
marca_tiempo = current_datetime.strftime("%Y%m%d %H%M%S")
exp_numero = marca_tiempo.replace(" ", "_")
exp_numero

'20231217_175220'

In [8]:
# Save the experiment metadata
carpeta_exp = carpeta_base_exp + exp_numero + "/"
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(carpeta_exp, exist_ok=True)

# Names of the `config` attributes recorded for this experiment
parametros_nombres = ["PERIODOS_EXCLUIR","PERIODO_INICIO_PARAM","TRAIN_PERIODO_LIMITE_PARAM",
                      "VALIDATE_PERIODO_PARAM","USAR_HOLDOUT_PARAM",
                      "TRAIN_ALL_PERIODO_LIMITE_PARAM","HOLDOUT_PERIODO_PARAM",
                      "MODELO_FINAL_PERIODO_LIMITE_PARAM","FUTURE_PERIODO_PARAM",
                      "ARCH_PRODUCTOS_PREDICCION_ENTRENAMIENTO","ARCH_PRODUCTOS_PREDICCION_FUTURE",
                      "NUM_LAGS_PARAM","FAMILIA_FEATURES_TEMP_PARAM","AMPLIA_FEATURES_PARAM",
                      "CANT_ITERACIONES_OPT_BAY_PARAM","CANT_SEMILLAS"]

# Values are looked up by name so the two lists cannot drift out of sync
parametros_valores = [getattr(config, nombre) for nombre in parametros_nombres]

parametros = pd.DataFrame(data={"nombre":parametros_nombres,"valor":parametros_valores})
parametros.to_csv(carpeta_exp + "parametros.csv",index=False)

In [9]:
# Per-cluster training: one Bayesian hyperparameter search per cluster file.
# Every result slot starts as None and stays None for clusters that have no
# train/validation rows or no completed trial.
er_validate_iter = [None] * num_clusters
er_holdout_semillerio_iter = [None] * num_clusters
params_iter = [None] * num_clusters
best_iter_iter = [None] * num_clusters
pred_iter = [None] * num_clusters
prod_iter = [None] * num_clusters

for i in range(num_clusters):
    print("=========================================== Cluster", i, "===========================================")

    # Load the cluster, apply the categorical conversion helper from `extras`
    # and drop the columns that are not used.
    df_sellout = (
        pd.read_csv(prefijo_arch_entrada + str(i) + ".csv")
        .pipe(extras.convertir_categoricas_prod_cust)
        .drop(columns=["periodo_fecha", "brand"])
    )

    ### Dataset split: training window (both ends inclusive) and validation period
    en_ventana_train = df_sellout.periodo.between(config.PERIODO_INICIO_PARAM, config.TRAIN_PERIODO_LIMITE_PARAM)
    df_train = df_sellout[en_ventana_train]
    df_validate = df_sellout[df_sellout.periodo == config.VALIDATE_PERIODO_PARAM]

    print("Periodos entrenar:", df_train.periodo.unique())
    print("Periodos validar:", df_validate.periodo.unique())

    df_validate = df_validate.sort_values(by="product_id")
    print("product_id a validar:", len(df_validate))

    ### Training variables: the target and the period are not features
    cols_remover_entren = ["tn_mas_2", "periodo"]

    X_train = df_train.drop(columns=cols_remover_entren)
    X_validate = df_validate.drop(columns=cols_remover_entren)

    y_train = df_train["tn_mas_2"]
    y_validate = df_validate["tn_mas_2"]

    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)

    print("\nX_validate:", X_validate.shape)
    print("y_validate:", y_validate.shape)

    cols_entren = X_train.columns.tolist()

    ### Bayesian optimisation
    # A cluster may have no training or validation rows; its slots stay None.
    if len(X_train) == 0 or len(X_validate) == 0:
        continue

    print("Optimizacion Bayesiana")
    # Per-trial records filled by `objective` (read there as globals)
    GLOBAL_PARAMS_ITER = []
    GLOBAL_BEST_IT_ITER = []
    GLOBAL_ER_VALIDATE_ITER = []
    GLOBAL_MODEL_ITER = []

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=config.CANT_ITERACIONES_OPT_BAY_PARAM)
    print('# Experimentos:', len(study.trials))

    # Without at least one completed trial there is no best trial to read.
    trials_completos = study.get_trials(states=[optuna.trial.TrialState.COMPLETE])
    if not trials_completos:
        continue

    print('Mejor experimento:', study.best_trial.params)

    # The trial number is used as the index into the per-trial records
    best_model_index = study.best_trial.number
    best_model = GLOBAL_MODEL_ITER[best_model_index]

    er_validate_iter[i] = study.best_trial.value
    params_iter[i] = GLOBAL_PARAMS_ITER[best_model_index]
    best_iter_iter[i] = GLOBAL_BEST_IT_ITER[best_model_index]

Periodos entrenar: [201701 201702 201703 201704 201705 201706 201707 201708 201709 201710
 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
 201809 201810 201811 201812 201901 201902 201903 201904 201905]
Periodos validar: [201907]
product_id a validar: 31900


[I 2023-12-17 20:53:11,150] A new study created in memory with name: no-name-15b87b98-f851-4cd4-897e-40f06a462015


X_train: (668859, 236)
y_train: (668859,)

X_validate: (31900, 236)
y_validate: (31900,)
Optimizacion Bayesiana
**************** regression ****************
Training until validation scores don't improve for 78 rounds


[W 2023-12-17 20:53:29,699] Trial 0 failed with parameters: {'lambda_l1': 921.1580591797339, 'lambda_l2': 979.1302561015495, 'learning_rate': 0.17675453316462442, 'num_leaves': 249, 'feature_fraction': 0.8910138722074481, 'min_data_in_leaf': 4513} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_166530/170116548.py", line 34, in objective
    model = lgb.train(lgb_params, lgb_train,
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/lightgbm/engine.py", line 292, in train
    booster.update(fobj=fobj)
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/lightgbm/basic.py", line 3021, in update
    _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
KeyboardInterrupt
[W 2023-12-17 20:53:29,701] Trial 0 failed with value None.


KeyboardInterrupt: 

In [None]:
# Summary of the Bayesian optimisation: one row per cluster
df_resultados_opt_bay = pd.DataFrame(data={"cluster":range(0,num_clusters),"error_validate":er_validate_iter,
                                   "lgbm_params":params_iter,"best_iteration":best_iter_iter})

# Excel cells cannot hold dict values, so the parameter dicts are written as
# their string representation (readable back with ast.literal_eval). Clusters
# without a result keep None, and the in-memory frame keeps the original dicts.
df_resultados_export = df_resultados_opt_bay.assign(
    lgbm_params=df_resultados_opt_bay["lgbm_params"].map(
        lambda params: params if params is None else str(params)
    )
)
df_resultados_export.to_excel(carpeta_exp +  "opt_bay.xlsx",index=False)

df_resultados_opt_bay

In [None]:
# Signal completion and display the experiment id (it names the output folder)
print("FINALIZADO")
exp_numero