In [1]:
import os
import lightgbm as lgb
import pandas as pd
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import optuna
import random
from datetime import datetime
import pytz

# importo los .py
import sys
sys.path.append("auxiliares")
import config
import metricas
import lightgbm_aux
import extras

# Notebook-wide display settings: render floats with two decimals and never
# truncate the list of columns when a DataFrame is shown.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

In [2]:
####################################################
############# Set per machine ######################
# Every relative path used below (datasets/..., exp/...) is resolved against
# this working directory. Export LABO3_BASE_DIR to point the notebook at a
# different machine's folder without editing the code; when the variable is
# not set, the original default path is used.
# Paths used on other machines so far:
#   C:/Users/herna/labo3_empresa3_repo/
#   C:/diego_tools/labo3/
os.chdir(os.environ.get("LABO3_BASE_DIR", "/home/dcastro_contacto/buckets/b1/"))
####################################################

In [3]:
# --- Active configuration: manually defined clusters ---
carpeta_base_exp = "exp/avanzada_1/"  # root folder for this experiment family's outputs
num_clusters = 94                     # cluster files are read for indices 0 .. num_clusters-1
prefijo_arch_entrada = "datasets/emp3_sellout_fe_cluster_manual_"  # "<cluster index>.csv" is appended

# --- Alternative configuration: time-warping clusters ---
# To switch, comment out the block above and uncomment the lines below.
#prefijo_arch_entrada = "datasets/emp3_sellout_base_cluster_warp_"
#prefijo_arch_salida = "datasets/emp3_sellout_fe_cluster_warp_"
#num_clusters=94
#carpeta_base_exp = "exp/avanzada_2/"

In [4]:
def lgbm_error_rate(preds, train_data):
    """Custom LightGBM evaluation function built on ``metricas.error_rate``.

    Args:
        preds: Predictions for the dataset being evaluated.
        train_data: Dataset being evaluated; only ``get_label()`` is used.

    Returns:
        A ``(name, value, is_higher_better)`` tuple. The metric is reported
        under the name ``'ER'`` and flagged as lower-is-better.
    """
    error = metricas.error_rate(train_data.get_label(), preds)
    return ('ER', error, False)

In [5]:
def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation error rate.

    Reads the module-level ``X_train``, ``y_train``, ``X_validate``,
    ``y_validate`` and ``cols_entren`` prepared by the per-cluster training
    loop. As a side effect it appends the sampled parameters, the best
    iteration, the validation error and the fitted model to the
    ``GLOBAL_*_ITER`` lists (one entry per call); the training loop later
    indexes those lists by trial number.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``metricas.error_rate`` of the model's predictions on the validation set.
    """
    print("****************", config.OBJECTIVE_PARAM, "****************")

    # suggest_float replaces the deprecated suggest_uniform; the sampled
    # ranges and the order of the suggest_* calls are unchanged.
    lgb_params = {
        'objective': config.OBJECTIVE_PARAM,
        'first_metric_only': True,
        'boost_from_average': True,
        'max_depth':-1,
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, config.L1_UPPER_PARAM),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, config.L2_UPPER_PARAM),
        'min_gain_to_split':0.0,
        'force_row_wise':True,
        'feature_pre_filter':False,
        'metric': "None",  # no built-in metric; the custom feval below is the one evaluated
        'max_bin': config.MAX_BIN_PARAM,
        'learning_rate': trial.suggest_float('learning_rate', config.LEARNING_RATE_LOWER_PARAM, config.LEARNING_RATE_UPPER_PARAM),
        'num_leaves' : trial.suggest_int('num_leaves', config.NUM_LEAVES_LOWER_PARAM, config.NUM_LEAVES_UPPER_PARAM),
        'feature_fraction': trial.suggest_float('feature_fraction', config.FEATURE_FRACTION_LOWER_PARAM, config.FEATURE_FRACTION_UPPER_PARAM),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', config.MIN_DATA_IN_LEAF_LOWER_PARAM, config.MIN_DATA_IN_LEAF_UPPER_PARAM),
        # Search dimensions currently disabled:
        #'bagging_freq':trial.suggest_int('bagging_freq', 1, 10),
        #'bagging_fraction': trial.suggest_categorical('bagging_fraction',[0.7,0.75,0.8,0.85,0.9,0.95]),
        #'extra_trees':trial.suggest_categorical('extra_trees',[True,False]),
        'verbose':-100,
        'num_threads':-1
    }

    # Deliberately huge; early stopping decides how many rounds are really used.
    lgb_params_num_boost_round = 10000
    # Early-stopping patience depends on the sampled learning rate: the
    # smaller the rate, the more rounds without improvement are tolerated.
    lgb_params_early_stopping_rounds = int(50 + 5/lgb_params['learning_rate'])

    lgb_train = lgb.Dataset(data=X_train, label=y_train, feature_name=cols_entren)
    lgb_validate = lgb.Dataset(data=X_validate, label=y_validate, reference=lgb_train, feature_name=cols_entren)

    # Train with the sampled parameters.
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0; this call needs lightgbm < 4
    # or a migration to the early_stopping / log_evaluation callbacks.
    model = lgb.train(lgb_params, lgb_train,
                  valid_sets=[lgb_validate],
                  num_boost_round=lgb_params_num_boost_round,
                  early_stopping_rounds=lgb_params_early_stopping_rounds,
                  feval=lgbm_error_rate,
                  verbose_eval=100)

    # Score the validation set with the fitted model.
    y_pred_validate = model.predict(X_validate)
    er_validate = metricas.error_rate(y_validate, y_pred_validate)

    # Record this trial's artefacts for the training loop to pick up.
    GLOBAL_PARAMS_ITER.append(lgb_params)
    GLOBAL_BEST_IT_ITER.append(model.best_iteration)
    GLOBAL_ER_VALIDATE_ITER.append(er_validate)
    GLOBAL_MODEL_ITER.append(model)

    return er_validate

In [6]:
##############################################

In [7]:
# Load the product_ids that the holdout predictions are later restricted to
# (df_pred is filtered with this list), and print the frame's schema.
df_prods_prediccion_entren=pd.read_csv(config.ARCH_PRODUCTOS_PREDICCION_ENTRENAMIENTO)
df_prods_prediccion_entren.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 868 entries, 0 to 867
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product_id  868 non-null    int64
dtypes: int64(1)
memory usage: 6.9 KB


In [8]:
# Experiment id: the current Buenos Aires time formatted as YYYYMMDD_HHMMSS.
current_datetime = datetime.now().astimezone(
    pytz.timezone('America/Argentina/Buenos_Aires')
)
exp_numero = current_datetime.strftime("%Y%m%d_%H%M%S")
exp_numero

'20231219_202908'

In [9]:
# Persist the experiment metadata in its own folder.
carpeta_exp = carpeta_base_exp + exp_numero + "/"
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(carpeta_exp, exist_ok=True)

# Names of the config attributes recorded for this run.
parametros_nombres = ["PERIODOS_EXCLUIR","PERIODO_INICIO_PARAM","TRAIN_PERIODO_LIMITE_PARAM",
                      "VALIDATE_PERIODO_PARAM","USAR_HOLDOUT_PARAM",
                      "TRAIN_ALL_PERIODO_LIMITE_PARAM","HOLDOUT_PERIODO_PARAM",
                      "MODELO_FINAL_PERIODO_LIMITE_PARAM","FUTURE_PERIODO_PARAM",
                      "ARCH_PRODUCTOS_PREDICCION_ENTRENAMIENTO","ARCH_PRODUCTOS_PREDICCION_FUTURE",
                      "NUM_LAGS_PARAM","FAMILIA_FEATURES_TEMP_PARAM","AMPLIA_FEATURES_PARAM",
                      "CANT_ITERACIONES_OPT_BAY_PARAM","CANT_SEMILLAS"]

# Values are looked up by name so the two columns of parametros.csv cannot
# drift out of sync when a parameter is added or reordered.
parametros_valores = [getattr(config, nombre) for nombre in parametros_nombres]

parametros = pd.DataFrame(data={"nombre":parametros_nombres,"valor":parametros_valores})
parametros.to_csv(carpeta_exp + "parametros.csv",index=False)

In [None]:
# Per-cluster training: for every cluster file run an Optuna search and keep
# the best trial's validation error, LightGBM parameters and best iteration.
# Each list has one slot per cluster; a slot stays None when the cluster has
# no train/validation rows or no trial completed.
(er_validate_iter, er_holdout_semillerio_iter, params_iter,
 best_iter_iter, pred_iter, prod_iter) = ([None] * num_clusters for _ in range(6))

for i in range(num_clusters):
    print("=========================================== Cluster", i, "===========================================")

    # Read the cluster file, run it through the extras categorical-conversion
    # helper and discard the columns that are not used downstream.
    df_sellout = extras.convertir_categoricas_prod_cust(
        pd.read_csv(f"{prefijo_arch_entrada}{i}.csv")
    ).drop(columns=["periodo_fecha","brand"])

    # Train on [PERIODO_INICIO_PARAM, TRAIN_PERIODO_LIMITE_PARAM]; validate on
    # the single VALIDATE_PERIODO_PARAM period.
    df_train = df_sellout[
        (df_sellout.periodo >= config.PERIODO_INICIO_PARAM)
        & (df_sellout.periodo <= config.TRAIN_PERIODO_LIMITE_PARAM)
    ]
    df_validate = df_sellout[df_sellout.periodo == config.VALIDATE_PERIODO_PARAM]

    print("Periodos entrenar:",df_train.periodo.unique())
    print("Periodos validar:",df_validate.periodo.unique())

    df_validate = df_validate.sort_values("product_id")
    print("product_id a validar:", len(df_validate))

    # Target and period are removed from the feature matrices. These names are
    # module-level on purpose: objective() reads them as globals.
    cols_remover_entren = ["tn_mas_2","periodo"]
    X_train, y_train = df_train.drop(columns=cols_remover_entren), df_train["tn_mas_2"]
    X_validate, y_validate = df_validate.drop(columns=cols_remover_entren), df_validate["tn_mas_2"]

    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)

    print("\nX_validate:", X_validate.shape)
    print("y_validate:", y_validate.shape)

    cols_entren = list(X_train.columns)

    # A cluster may have no training or no validation rows; its slots stay None.
    if len(X_train) == 0 or len(X_validate) == 0:
        continue

    # Bayesian optimisation. objective() appends one entry per trial to the
    # GLOBAL_* lists, so they are reset for every cluster.
    # NOTE(review): the study is created without a seeded sampler, so results
    # presumably differ between runs — confirm whether that is intended.
    print("Optimizacion Bayesiana")
    GLOBAL_PARAMS_ITER, GLOBAL_BEST_IT_ITER = [], []
    GLOBAL_ER_VALIDATE_ITER, GLOBAL_MODEL_ITER = [], []

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=config.CANT_ITERACIONES_OPT_BAY_PARAM)
    print('# Experimentos:', len(study.trials))

    # Without a completed trial there is no best trial to read.
    if len(study.get_trials(states=[optuna.trial.TrialState.COMPLETE])) == 0:
        continue

    print('Mejor experimento:', study.best_trial.params)

    # The trial number doubles as the index into the per-trial GLOBAL_* lists.
    best_model_index = study.best_trial.number
    best_model = GLOBAL_MODEL_ITER[best_model_index]

    er_validate_iter[i] = study.best_trial.value
    params_iter[i] = GLOBAL_PARAMS_ITER[best_model_index]
    best_iter_iter[i] = GLOBAL_BEST_IT_ITER[best_model_index]



[I 2023-12-19 23:29:13,026] A new study created in memory with name: no-name-32df2164-e8cf-4792-8283-9ddf3cf9c946


Periodos entrenar: [201701 201702 201703 201704 201705 201706 201707 201708 201709 201710
 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
 201809 201810]
Periodos validar: [201812]
product_id a validar: 27900
X_train: (469490, 30)
y_train: (469490,)

X_validate: (27900, 30)
y_validate: (27900,)
Optimizacion Bayesiana
**************** tweedie ****************
Training until validation scores don't improve for 82 rounds
[100]	valid_0's ER: 81.8


[I 2023-12-19 23:29:19,785] Trial 0 finished with value: 81.58 and parameters: {'lambda_l1': 215.74659694559784, 'lambda_l2': 783.904179023682, 'learning_rate': 0.15249711615979497, 'num_leaves': 69, 'feature_fraction': 0.5127341536630576, 'min_data_in_leaf': 3470}. Best is trial 0 with value: 81.58.


Early stopping, best iteration is:
[106]	valid_0's ER: 81.58
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 71 rounds
[100]	valid_0's ER: 83.58


[I 2023-12-19 23:29:23,804] Trial 1 finished with value: 83.55 and parameters: {'lambda_l1': 842.481332865014, 'lambda_l2': 546.3460478326037, 'learning_rate': 0.23427388866163415, 'num_leaves': 343, 'feature_fraction': 0.9236629462988275, 'min_data_in_leaf': 3174}. Best is trial 0 with value: 81.58.


Early stopping, best iteration is:
[56]	valid_0's ER: 83.55
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 70 rounds
[100]	valid_0's ER: 75.36
[200]	valid_0's ER: 72.79
[300]	valid_0's ER: 72.16
[400]	valid_0's ER: 71.43
Early stopping, best iteration is:
[378]	valid_0's ER: 71.34
Evaluated only: ER


[I 2023-12-19 23:30:49,450] Trial 2 finished with value: 71.34 and parameters: {'lambda_l1': 39.560078670138, 'lambda_l2': 457.20676991807466, 'learning_rate': 0.2464459782075642, 'num_leaves': 475, 'feature_fraction': 0.49486486704925625, 'min_data_in_leaf': 440}. Best is trial 2 with value: 71.34.


**************** tweedie ****************
Training until validation scores don't improve for 507 rounds
[100]	valid_0's ER: 122.82
[200]	valid_0's ER: 101.78
[300]	valid_0's ER: 92.97
[400]	valid_0's ER: 88.25
[500]	valid_0's ER: 86.42
[600]	valid_0's ER: 85.21
[700]	valid_0's ER: 84.98
[800]	valid_0's ER: 84.74
[900]	valid_0's ER: 84.18
[1000]	valid_0's ER: 83.78
[1100]	valid_0's ER: 83.4
[1200]	valid_0's ER: 83.16
[1300]	valid_0's ER: 83.09
[1400]	valid_0's ER: 82.98
[1500]	valid_0's ER: 82.78
[1600]	valid_0's ER: 82.63
[1700]	valid_0's ER: 82.51
[1800]	valid_0's ER: 82.39
[1900]	valid_0's ER: 82.3
[2000]	valid_0's ER: 82.28
[2100]	valid_0's ER: 82.27
[2200]	valid_0's ER: 82.27
[2300]	valid_0's ER: 82.27
[2400]	valid_0's ER: 82.27
Early stopping, best iteration is:
[1935]	valid_0's ER: 82.26
Evaluated only: ER


[I 2023-12-19 23:32:01,238] Trial 3 finished with value: 82.26 and parameters: {'lambda_l1': 392.93059491809635, 'lambda_l2': 989.5535512754971, 'learning_rate': 0.010928734493769408, 'num_leaves': 53, 'feature_fraction': 0.596354406261195, 'min_data_in_leaf': 6827}. Best is trial 2 with value: 71.34.


**************** tweedie ****************
Training until validation scores don't improve for 71 rounds
[100]	valid_0's ER: 84.84


[I 2023-12-19 23:32:05,307] Trial 4 finished with value: 84.84 and parameters: {'lambda_l1': 646.2805659580777, 'lambda_l2': 313.8262399844448, 'learning_rate': 0.22873554813786653, 'num_leaves': 875, 'feature_fraction': 0.7356104128928871, 'min_data_in_leaf': 6268}. Best is trial 2 with value: 71.34.


Early stopping, best iteration is:
[72]	valid_0's ER: 84.84
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 82 rounds
[100]	valid_0's ER: 77.61
[200]	valid_0's ER: 76.78
Early stopping, best iteration is:
[149]	valid_0's ER: 76.53
Evaluated only: ER


[I 2023-12-19 23:32:32,649] Trial 5 finished with value: 76.53 and parameters: {'lambda_l1': 206.9861572628313, 'lambda_l2': 764.8180068950719, 'learning_rate': 0.1545960071781213, 'num_leaves': 806, 'feature_fraction': 0.44967553101801727, 'min_data_in_leaf': 313}. Best is trial 2 with value: 71.34.


**************** tweedie ****************
Training until validation scores don't improve for 76 rounds
[100]	valid_0's ER: 81.16


[I 2023-12-19 23:32:37,640] Trial 6 finished with value: 81.15 and parameters: {'lambda_l1': 494.3541757182491, 'lambda_l2': 45.283582965313315, 'learning_rate': 0.18673621050803846, 'num_leaves': 601, 'feature_fraction': 0.7604038528469432, 'min_data_in_leaf': 1826}. Best is trial 2 with value: 71.34.


Early stopping, best iteration is:
[69]	valid_0's ER: 81.15
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 102 rounds
[100]	valid_0's ER: 80.03
[200]	valid_0's ER: 78.47
Early stopping, best iteration is:
[194]	valid_0's ER: 78.4
Evaluated only: ER


[I 2023-12-19 23:32:53,968] Trial 7 finished with value: 78.41 and parameters: {'lambda_l1': 721.0029088366725, 'lambda_l2': 220.15671886835986, 'learning_rate': 0.09454760448433763, 'num_leaves': 371, 'feature_fraction': 0.31772286108688147, 'min_data_in_leaf': 814}. Best is trial 2 with value: 71.34.


**************** tweedie ****************
Training until validation scores don't improve for 74 rounds
[100]	valid_0's ER: 84.24


[I 2023-12-19 23:32:58,312] Trial 8 finished with value: 84.15 and parameters: {'lambda_l1': 555.1083439567553, 'lambda_l2': 357.6560501390165, 'learning_rate': 0.20757779260435422, 'num_leaves': 306, 'feature_fraction': 0.6517002619962594, 'min_data_in_leaf': 7533}. Best is trial 2 with value: 71.34.


Early stopping, best iteration is:
[80]	valid_0's ER: 84.15
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 165 rounds
[100]	valid_0's ER: 85.57
[200]	valid_0's ER: 80.78
[300]	valid_0's ER: 80.24
[400]	valid_0's ER: 80.12
[500]	valid_0's ER: 80.2
Early stopping, best iteration is:
[362]	valid_0's ER: 80
Evaluated only: ER


[I 2023-12-19 23:33:17,302] Trial 9 finished with value: 80.0 and parameters: {'lambda_l1': 126.00834798991578, 'lambda_l2': 273.8119942548254, 'learning_rate': 0.043439334028378476, 'num_leaves': 576, 'feature_fraction': 0.5864025985867315, 'min_data_in_leaf': 4607}. Best is trial 2 with value: 71.34.


# Experimentos: 10
Mejor experimento: {'lambda_l1': 39.560078670138, 'lambda_l2': 457.20676991807466, 'learning_rate': 0.2464459782075642, 'num_leaves': 475, 'feature_fraction': 0.49486486704925625, 'min_data_in_leaf': 440}


[I 2023-12-19 23:33:19,571] A new study created in memory with name: no-name-c5c53f86-4e3e-467e-95cc-cebf58a390f3


Periodos entrenar: [201701 201702 201703 201704 201705 201706 201707 201708 201709 201710
 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
 201809 201810]
Periodos validar: [201812]
product_id a validar: 19530
X_train: (361369, 30)
y_train: (361369,)

X_validate: (19530, 30)
y_validate: (19530,)
Optimizacion Bayesiana
**************** tweedie ****************
Training until validation scores don't improve for 84 rounds
[100]	valid_0's ER: 89.38


[I 2023-12-19 23:33:23,920] Trial 0 finished with value: 88.62 and parameters: {'lambda_l1': 422.7467015667554, 'lambda_l2': 788.8263607082303, 'learning_rate': 0.14391381547345433, 'num_leaves': 767, 'feature_fraction': 0.2314000402155701, 'min_data_in_leaf': 2067}. Best is trial 0 with value: 88.62.


Early stopping, best iteration is:
[78]	valid_0's ER: 88.62
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 87 rounds
[100]	valid_0's ER: 86.64


[I 2023-12-19 23:33:27,364] Trial 1 finished with value: 85.79 and parameters: {'lambda_l1': 451.11968563446646, 'lambda_l2': 346.4172181115993, 'learning_rate': 0.1325550717142388, 'num_leaves': 141, 'feature_fraction': 0.34590146109697995, 'min_data_in_leaf': 3431}. Best is trial 1 with value: 85.79.


Early stopping, best iteration is:
[48]	valid_0's ER: 85.79
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 67 rounds
[100]	valid_0's ER: 83.96


[I 2023-12-19 23:33:30,499] Trial 2 finished with value: 83.92 and parameters: {'lambda_l1': 645.8076489007032, 'lambda_l2': 823.3147412806402, 'learning_rate': 0.28927821986479346, 'num_leaves': 80, 'feature_fraction': 0.7076255385017669, 'min_data_in_leaf': 1839}. Best is trial 2 with value: 83.92.


Early stopping, best iteration is:
[53]	valid_0's ER: 83.92
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 80 rounds
[100]	valid_0's ER: 84


[I 2023-12-19 23:33:37,562] Trial 3 finished with value: 83.7 and parameters: {'lambda_l1': 830.1774861541536, 'lambda_l2': 123.960421637055, 'learning_rate': 0.16663232312298892, 'num_leaves': 943, 'feature_fraction': 0.768251157742325, 'min_data_in_leaf': 931}. Best is trial 3 with value: 83.7.


Early stopping, best iteration is:
[76]	valid_0's ER: 83.7
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 96 rounds
[100]	valid_0's ER: 83.71
[200]	valid_0's ER: 82.49
[300]	valid_0's ER: 82.1


[I 2023-12-19 23:33:47,882] Trial 4 finished with value: 81.98 and parameters: {'lambda_l1': 10.977997867606447, 'lambda_l2': 416.76018755470335, 'learning_rate': 0.10848088089505181, 'num_leaves': 735, 'feature_fraction': 0.5838364075132563, 'min_data_in_leaf': 6976}. Best is trial 4 with value: 81.98.


Early stopping, best iteration is:
[234]	valid_0's ER: 81.98
Evaluated only: ER
**************** tweedie ****************
Training until validation scores don't improve for 105 rounds
[100]	valid_0's ER: 84.6


In [None]:
# One row per cluster summarising the Bayesian optimisation results; clusters
# that were not trained have missing values in the result columns.
df_opt_bay = pd.DataFrame({
    "cluster": range(num_clusters),
    "error_validate": er_validate_iter,
    "lgb_params": params_iter,
    "best_iteration": best_iter_iter,
})
df_opt_bay

In [None]:
# Holdout evaluation: for each cluster, retrain on every training period with
# the best hyperparameters found by the optimisation and score the holdout
# period. Row-level results are accumulated across clusters in the lists below.
prod_cust_iter = []
prod_iter = []
cust_iter = []
y_iter = []
y_pred_iter = []
err_holdout_cluster_iter = []

# Defined here rather than relying on the value left behind by the last
# iteration of the training loop, so this cell does not depend on hidden state.
cols_remover_entren = ["tn_mas_2","periodo"]

if config.USAR_HOLDOUT_PARAM:

    for i in range(0,num_clusters):
        print("******************************************* CLUSTER",i,"*******************************************")

        df_sellout = pd.read_csv(prefijo_arch_entrada + str(i) + ".csv")

        # Same preparation as in the training loop.
        df_sellout = extras.convertir_categoricas_prod_cust(df_sellout)
        df_sellout = df_sellout.drop(columns=["periodo_fecha","brand"])

        print("# Productos:",len(df_sellout.product_id.unique()))
        print("# Clientes:",len(df_sellout.customer_id.unique()))

        # Train on [PERIODO_INICIO_PARAM, TRAIN_ALL_PERIODO_LIMITE_PARAM];
        # evaluate on the single HOLDOUT_PERIODO_PARAM period.
        df_train_all = df_sellout[(df_sellout.periodo <= config.TRAIN_ALL_PERIODO_LIMITE_PARAM) & (df_sellout.periodo >= config.PERIODO_INICIO_PARAM)]
        df_holdout = df_sellout[df_sellout.periodo == config.HOLDOUT_PERIODO_PARAM]

        print("Periodos entrenar ALL:",df_train_all.periodo.unique())
        print("Periodos holdout:",df_holdout.periodo.unique())

        # Feature matrices and targets.
        X_train_all = df_train_all.drop(columns=cols_remover_entren)
        X_holdout = df_holdout.drop(columns=cols_remover_entren)

        y_train_all = df_train_all.tn_mas_2
        y_holdout = df_holdout.tn_mas_2
        print("\nX_train_all:", X_train_all.shape)
        print("y_train_all:", y_train_all.shape)

        print("\nX_holdout:", X_holdout.shape)
        print("y_holdout:", y_holdout.shape)

        if len(X_train_all)>0:
            lgb_params_cluster = df_opt_bay.iloc[i].lgb_params
            # The optimisation produced parameters for this cluster...
            if lgb_params_cluster is not None:
                # Feature names come from this cluster's own matrix instead of
                # the cols_entren left over from the training cell.
                lgbtrain_all = lgb.Dataset(data=X_train_all, label=y_train_all, feature_name=X_train_all.columns.tolist())

                # Seed-ensemble prediction from the lightgbm_aux helper.
                y_pred_holdout_semillerio = lightgbm_aux.semillerio(lgbtrain_all, lgb_params_cluster, int(df_opt_bay.iloc[i].best_iteration), X_holdout,config.CANT_SEMILLAS)
            # ...or it could not be trained: fall back to a constant prediction.
            else:
                print("IMPUTACION!!!")
                # Predict the mean of the training `tn` column for every holdout row.
                # NOTE(review): this averages the `tn` feature, not the
                # `tn_mas_2` target — confirm that is intended.
                media_imputada = X_train_all.tn.mean()
                y_pred_holdout_semillerio = np.full(len(X_holdout.prod_cust),media_imputada)

            er_holdout_semillerio= metricas.error_rate(y_holdout,y_pred_holdout_semillerio)
            print("Error holdout semillerio:",er_holdout_semillerio, "(Toneladas cluster: ", y_holdout.sum(),")")

            prod_cust_iter.extend(X_holdout.prod_cust.reset_index(drop=True))
            prod_iter.extend(X_holdout.product_id.reset_index(drop=True))
            cust_iter.extend(X_holdout.customer_id.reset_index(drop=True))
            y_iter.extend(y_holdout)
            y_pred_iter.extend(y_pred_holdout_semillerio)
            err_holdout_cluster_iter.append(er_holdout_semillerio)
        else:
            # No training rows: keep the per-cluster error list aligned with
            # the cluster index.
            err_holdout_cluster_iter.append(None)

In [None]:
# Attach the per-cluster holdout error and persist the optimisation summary.
# The holdout cell appends one entry per cluster in order, but leaves the list
# empty when config.USAR_HOLDOUT_PARAM is falsy; assigning a shorter list
# directly raises a length-mismatch ValueError and the summary is never saved.
# Missing trailing entries are therefore padded with None.
faltantes = len(df_opt_bay) - len(err_holdout_cluster_iter)
df_opt_bay["error_holdout"] = err_holdout_cluster_iter + [None] * faltantes
df_opt_bay.to_excel(carpeta_exp +  "opt_bay.xlsx",index=False)

df_opt_bay

In [None]:
# Row-level holdout predictions across all clusters, restricted to the
# product_ids listed in df_prods_prediccion_entren.
df_pred = pd.DataFrame({
    "prod_cust": prod_cust_iter,
    "product_id": prod_iter,
    "customer_id": cust_iter,
    "tn_real": y_iter,
    "tn_pred": y_pred_iter,
})
df_pred = df_pred[df_pred["product_id"].isin(df_prods_prediccion_entren["product_id"])]

# Overall error rate; the value is also embedded in the output file name.
er_holdout_general = metricas.error_rate(df_pred["tn_real"], df_pred["tn_pred"])
print("ER Holdout General:",er_holdout_general)
df_pred.to_csv(f"{carpeta_exp}general_prediction_{er_holdout_general}.csv", index=False)

In [None]:
# Aggregate real and predicted tons per product and score at product level;
# the error value is embedded in the output file name.
df_pred_prod = df_pred.groupby("product_id")[["tn_real", "tn_pred"]].sum().reset_index()
er_holdout_prod = metricas.error_rate(df_pred_prod["tn_real"], df_pred_prod["tn_pred"])
print("ER Holdout por Producto:",er_holdout_prod)
df_pred_prod.to_csv(f"{carpeta_exp}prod_prediction_{er_holdout_prod}.csv", index=False)

In [None]:
# Error rate restricted to the rows where both the product and the customer
# belong to the "star" lists below.
star_products = [20001, 20002, 20003, 20004, 20005, 20006, 20007, 20009, 20011, 20032]
star_customers = [10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10009, 10011, 10012, 10013]
df_pred_star = df_pred[(df_pred.product_id.isin(star_products))&(df_pred.customer_id.isin(star_customers))]

er_holdout_star= metricas.error_rate(df_pred_star.tn_real,df_pred_star.tn_pred)
print("ER Holdout por Producto-Cliente Estrellas:",er_holdout_star)
# Save the star subset itself: the original wrote df_pred_prod (the
# per-product aggregate) under this product-customer file name.
df_pred_star.to_csv(carpeta_exp +  "prod_cust_prediction_" + str(er_holdout_star) + ".csv",index=False)

In [None]:
# Signal that the notebook ran to the end and echo the experiment id, which
# names the output folder under carpeta_base_exp.
print("FINALIZADO")
exp_numero