In [1]:
import os
import lightgbm as lgb
import pandas as pd
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import optuna
import random

# importo los .py
import sys
sys.path.append("aux")
import transformaciones

pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', None)

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
####################################################
############# Setear segun cada maquina ############
#os.chdir("C:/Users/herna/labo3_empresa3_repo/")
#os.chdir("C:/diego_tools/labo3/")
os.chdir("/home/dcastro_contacto/buckets/b1/")
####################################################

In [3]:
arch_entrada = "datasets/emp3_sellout_train.csv"

arch_min_max_prod_cust = "datasets/emp3_min_max_prod_cust.csv"
arch_mean_std_prod_cust = "datasets/emp3_mean_std_prod_cust.csv"
arch_no_transf_prod_cust = "datasets/emp3_no_transf_prod_cust.csv"

In [4]:
CANT_ITERACIONES_OPT_BAY = 5
MAX_BIN_PARAM=255
#MAX_BIN_PARAM=1023

OBJECTIVE_PARAM = "tweedie"
#OBJECTIVE_PARAM = "regression"

GLOBAL_PROD_CUST_IDS = pd.DataFrame() #aca solamente se define, se setea mas abajo

tipos_transf = ['sin_transformacion', 'normalizacion', 'estandarizacion']
GLOBAL_TIPO_TRANSF = tipos_transf[0]

In [5]:
# Marco en train, holdout y descartar para considerarlo en las transformaciones
periodo_inicio=201701 #inclusive
train_periodo_limite = 201810 #inclusive
validate_periodo = 201812
train_all_periodo_limite = 201812 #inclusive
holdout_periodo = 201902

In [6]:
def leer_df_auxiliar_transf(tipo_transf):
    if tipo_transf=="normalizacion":
        return pd.read_csv(arch_min_max_prod_cust)
    elif tipo_transf=="estandarizacion":
        return pd.read_csv(arch_mean_std_prod_cust)
    else:
        return pd.read_csv(arch_no_transf_prod_cust)

In [7]:
############ DEFINO Vector Global de Productos #################
def actualizar_global_prod_custs(prod_cust_values, y_vector):
    # Ademas del id y los valores para "destransformar", se deja el tn original porque la destransformacion puede no ser
    # precisa para <productos,cliente> que no estaban en train
    GLOBAL_PROD_CUST_IDS = pd.DataFrame(data={"prod_cust":prod_cust_values, "tn_orig":np.array(y_vector)})
    GLOBAL_PROD_CUST_IDS = GLOBAL_PROD_CUST_IDS.merge(df_prod_cust_val1_val2,how="left",on="prod_cust")
    GLOBAL_PROD_CUST_IDS = GLOBAL_PROD_CUST_IDS.sort_values(by="prod_cust",ascending=True)
    
    return GLOBAL_PROD_CUST_IDS

In [8]:
def error_rate(y, y_pred):
    y_pred = np.array(y_pred)
    
    #Solamente destransformo la prediccion (para el y tomo el original de la columna tn_orig)
    df_calculo = GLOBAL_PROD_CUST_IDS.copy()
    df_calculo["y_pred"] = y_pred
    df_calculo["y_pred_destransformado"]=df_calculo.apply(lambda row: transformaciones.destransformar_valor(GLOBAL_TIPO_TRANSF,row["y_pred"],row["valor_1"],row["valor_2"]),axis=1)

    y = df_calculo.tn_orig
    y_pred = df_calculo.y_pred_destransformado
    
    #Las predicciones negativas se convierten a 0
    y_pred = np.maximum(y_pred, 0)
    
    dif_abs = sum(abs(y - y_pred))
    suma_real = sum(y)
    return round(100*dif_abs/suma_real,2)

def lgbm_error_rate(preds, train_data):
    labels = train_data.get_label()
    ret_val = error_rate(labels,preds)
    return 'ER', ret_val, False

In [9]:
def plot_lgb_importances(model, plot=False, num=10):
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=feat_imp[0:25])
        plt.title('feature')
        plt.tight_layout()
        plt.show(block=True)
    else:
        print(feat_imp.head(num))
    return feat_imp

In [10]:
def objective(trial):
    print("****************", OBJECTIVE_PARAM, "****************")
    
    lgb_params = {
        'objective': OBJECTIVE_PARAM,
        'first_metric_only': True,
        'boost_from_average': True,
        'max_depth':-1,
        'lambda_l1': 0.0,
        'lambda_l2': 0.0,
        'min_gain_to_split':0.0,
        'force_row_wise':True,
        'feature_pre_filter':False,
        'metric': "None",
        'max_bin': MAX_BIN_PARAM,
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01,0.05,0.1,0.2,0.5,0.9]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'feature_fraction': trial.suggest_categorical('feature_fraction', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        #'bagging_freq':trial.suggest_int('bagging_freq', 1, 10),
        #'bagging_fraction': trial.suggest_categorical('bagging_fraction',[0.7,0.75,0.8,0.85,0.9,0.95]),
        #'extra_trees':trial.suggest_categorical('extra_trees',[True,False]),
        'verbose':-100,
        'num_threads':-1
    }
    
    lgb_params_num_boost_round = 10000  #un numero muy grande, lo limita early_stopping_rounds
    lgb_params_early_stopping_rounds = int(50 + 5/lgb_params['learning_rate']) #  #el parametro discolo, que depende de otro
    
    lgb_train = lgb.Dataset(data=X_train, label=y_train, feature_name=cols)
    lgb_validate = lgb.Dataset(data=X_validate, label=y_validate, reference=lgb_train, feature_name=cols)
    
    #Train using selected parameters
    model = lgb.train(lgb_params, lgb_train,
                  valid_sets=[lgb_validate],
                  num_boost_round=lgb_params_num_boost_round,
                  early_stopping_rounds=lgb_params_early_stopping_rounds,
                  feval=lgbm_error_rate,
                  verbose_eval=100)
    
    y_pred_validate =model.predict(X_validate) #Create predictions on test set
    er_validate = error_rate(y_validate,y_pred_validate)
    
    GLOBAL_PARAMS_ITER.append(lgb_params)
    GLOBAL_BEST_IT_ITER.append(model.best_iteration)
    GLOBAL_ER_VALIDATE_ITER.append(er_validate)
    GLOBAL_MODEL_ITER.append(model)
    
    return er_validate

In [11]:
##############################################

In [12]:
df_sellout = pd.read_csv(arch_entrada)

### Antes de transformar, se guardan los y_validate y holdout originales (se van a usar en el vector global)
# Esto se debe a que la "destransformacion" no siempre es perfecta, considerando productos que no estuvieron en train
df_sellout_validate_orig = df_sellout[df_sellout.periodo==validate_periodo]
df_sellout_validate_orig = df_sellout_validate_orig.sort_values(by="prod_cust",ascending=True)
y_validate_orig = df_sellout_validate_orig.tn_mas_2_original
print("<prod-cust> a validar:", len(y_validate_orig))

df_sellout_holdout_orig = df_sellout[df_sellout.periodo==holdout_periodo]
df_sellout_holdout_orig = df_sellout_holdout_orig.sort_values(by="prod_cust",ascending=True)
y_holdout_orig = df_sellout_holdout_orig.tn_mas_2_original
print("<prod-cust> a testear:", len(y_holdout_orig))

df_sellout = df_sellout.drop(columns=["tn_mas_2_original"])
df_prod_cust_val1_val2 = leer_df_auxiliar_transf(GLOBAL_TIPO_TRANSF)

### Categoricas
#Transformo todas las categoricas
categories = ["plan_precios_cuidados","cat1","cat2","cat3","product_id","customer_id","prod_cust","cero_ventas"]

for c in categories: 
    df_sellout[c] = df_sellout[c].astype("category")   
    
### Se descartan variables no utiles para la prediccion
df_sellout = df_sellout.drop(columns=["brand","periodo_fecha"])

### Split de datasets
df_train = df_sellout[(df_sellout.periodo <= train_periodo_limite) & (df_sellout.periodo >= periodo_inicio)]
df_validate = df_sellout[df_sellout.periodo == validate_periodo]

df_train_all = df_sellout[(df_sellout.periodo <= train_all_periodo_limite) & (df_sellout.periodo >= periodo_inicio)]
df_holdout = df_sellout[df_sellout.periodo == holdout_periodo]

df_validate = df_validate.sort_values(by="prod_cust",ascending=True)
print("<prod-cust> a validar:", len(df_validate))

df_holdout = df_holdout.sort_values(by="prod_cust",ascending=True)
print("<prod-cust> a testear:", len(df_holdout))

print("Periodos entrenar:",df_train.periodo.unique())
print("Periodos validar:",df_validate.periodo.unique())
print("Periodos entrenar ALL:",df_train_all.periodo.unique())
print("Periodos holdout:",df_holdout.periodo.unique())

### Armado variables entrenamiento

cols_remover_entren = ["tn_mas_2","periodo"] #,"prod_cust","customer_id","product_id"
# Train - Validate
X_train = df_train.drop(columns=cols_remover_entren, axis=1)
X_validate = df_validate.drop(columns=cols_remover_entren, axis=1)

y_train = df_train.tn_mas_2
y_validate = df_validate.tn_mas_2

# Train All - Holdout
X_train_all = df_train_all.drop(columns=cols_remover_entren, axis=1)
X_holdout = df_holdout.drop(columns=cols_remover_entren, axis=1)

y_train_all = df_train_all.tn_mas_2
y_holdout = df_holdout.tn_mas_2

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

print("\nX_validate:", X_validate.shape)
print("y_validate:", y_validate.shape)

print("\nX_train_all:", X_train_all.shape)
print("y_train_all:", y_train_all.shape)

print("\nX_holdout:", X_holdout.shape)
print("y_holdout:", y_holdout.shape)

cols = X_train.columns.tolist()

### Optimizacion Bayesiana
#Antes de entrenar, se setea vector global para validate
GLOBAL_PROD_CUST_IDS = actualizar_global_prod_custs(df_validate.prod_cust, y_validate_orig)

GLOBAL_PARAMS_ITER = []
GLOBAL_BEST_IT_ITER = []
GLOBAL_ER_VALIDATE_ITER = []
GLOBAL_MODEL_ITER = []

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=CANT_ITERACIONES_OPT_BAY)
print('# Experimentos:', len(study.trials))
print('Mejor experimento:', study.best_trial.params)

best_model_index = study.best_trial.number
best_model = GLOBAL_MODEL_ITER[best_model_index]
plot_lgb_importances(best_model, num=30, plot=True)

lgbtrain_all = lgb.Dataset(data=X_train_all, label=y_train_all, feature_name=cols)

final_params = GLOBAL_PARAMS_ITER[best_model_index]
final_best_iter = GLOBAL_BEST_IT_ITER[best_model_index]
final_model = lgb.train(final_params, lgbtrain_all, num_boost_round=final_best_iter)

GLOBAL_PROD_CUST_IDS = actualizar_global_prod_custs(df_holdout.prod_cust, y_holdout_orig)

y_pred_holdout = final_model.predict(X_holdout)
er_holdout = error_rate(y_holdout,y_pred_holdout)
print("Error Holdout:", er_holdout)

prediction_with_prod_cust = pd.DataFrame(data={"prod_cust":X_holdout.prod_cust,"actual_value":y_holdout_orig,"predicted_value":y_pred_holdout})

<prod-cust> a validar: 3987
<prod-cust> a testear: 4024


[I 2023-12-03 22:26:14,028] A new study created in memory with name: no-name-a16e10ed-4e1d-41ee-937e-aaa4cf00b9cd


<prod-cust> a validar: 3987
<prod-cust> a testear: 4024
Periodos entrenar: [201701 201702 201703 201704 201705 201706 201707 201708 201709 201710
 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
 201809 201810]
Periodos validar: [201812]
Periodos entrenar ALL: [201701 201702 201703 201704 201705 201706 201707 201708 201709 201710
 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
 201809 201810 201811 201812]
Periodos holdout: [201902]
X_train: (71642, 55)
y_train: (71642,)

X_validate: (3987, 55)
y_validate: (3987,)

X_train_all: (79604, 55)
y_train_all: (79604,)

X_holdout: (4024, 55)
y_holdout: (4024,)
**************** tweedie ****************


[LightGBM] [Fatal] [tweedie]: at least one target label is negative
[W 2023-12-03 22:26:14,604] Trial 0 failed with parameters: {'learning_rate': 0.01, 'num_leaves': 791, 'feature_fraction': 0.4, 'min_data_in_leaf': 87} because of the following error: LightGBMError('[tweedie]: at least one target label is negative').
Traceback (most recent call last):
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_21248/619261924.py", line 34, in objective
    model = lgb.train(lgb_params, lgb_train,
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/lightgbm/engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/lightgbm/basic.py", line 2610, in __init__
    _safe_call(_LIB.LGBM_BoosterCreate(
  File "/home/dcastro_contacto/.local/lib/python3.10/site-packages/

LightGBMError: [tweedie]: at least one target label is negative