In [1]:
import os

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split, KFold
from sklearn import datasets
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV,PredefinedSplit
from sklearn.metrics import classification_report, confusion_matrix 
import lightgbm
from lightgbm import LGBMRegressor
import numpy as np
from skopt import BayesSearchCV, space, plots
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from time import time
import pprint

import pandas as pd

import warnings

# Ignore all warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and data-handling warnings raised by the libraries
# imported above; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
####################################################
############# Set this for each machine ############
# The notebook reads and writes its CSV files with relative paths, so the
# working directory must point at the dataset folder. Set the LABO3_DATA_DIR
# environment variable to override the default location without editing
# this cell.
#os.chdir("C:/Users/herna/labo3_empresa3_repo/datasets")
os.chdir(os.environ.get("LABO3_DATA_DIR", "C:/diego_tools/labo3/dataset"))
####################################################

In [3]:
def error_rate(y, y_pred):
    """Return the total absolute error as a percentage of the total actual value.

    Computes ``100 * sum(|y - y_pred|) / sum(y)`` rounded to two decimals.

    Parameters
    ----------
    y : array-like
        Actual values (list, numpy array or pandas Series).
    y_pred : array-like
        Predicted values, compared position by position with ``y``.

    Returns
    -------
    float
        The percentage error, or NaN when the actual values sum to zero
        (the ratio is undefined in that case).

    Notes
    -----
    The sum of the actual values is printed on every call, so using this
    function as a scorer prints one line per scored split.
    """
    # Convert once so plain lists work too and the arithmetic is vectorised
    # instead of going through Python's element-by-element builtin sum().
    actual = np.asarray(y)
    predicted = np.asarray(y_pred)

    dif_abs = np.abs(actual - predicted).sum()
    suma_real = actual.sum()
    print("Y REAL:" ,suma_real)

    if suma_real == 0:
        # Avoid a division by zero; the metric has no meaning without volume.
        return float("nan")
    return round(float(100 * dif_abs / suma_real), 2)

In [4]:
def report_perf(optimizer, X, y, title="model", callbacks=None):
    """
    Fit a hyper-parameter search object, print a summary and return the best parameters.

    optimizer = a sklearn or a skopt optimizer exposing fit(), cv_results_,
                best_score_, best_index_ and best_params_
    X = the training set
    y = our target
    title = a string label for the experiment
    callbacks = optional callbacks forwarded to optimizer.fit(). They are passed
                as ``callback=`` when fit() declares that parameter (the keyword
                used by skopt's BayesSearchCV) and as ``callbacks=`` otherwise,
                which is the keyword this function has always used.

    Returns the optimizer's ``best_params_``.
    """
    import inspect  # local import: only needed to look at the fit() signature

    start = time()

    if callbacks is None:
        optimizer.fit(X, y)
    elif "callback" in inspect.signature(optimizer.fit).parameters:
        # The optimizer takes search-level callbacks under the singular keyword;
        # sending them as ``callbacks=`` would route them into **fit_params.
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y, callbacks=callbacks)

    # Stop the clock right after fitting so the report only measures the search.
    elapsed = time() - start

    cv_results = optimizer.cv_results_
    best_score = optimizer.best_score_
    # Std of the test score across CV splits for the winning candidate.
    best_score_std = cv_results['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_

    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           + u"\u00B1"+" %.3f") % (elapsed,
                                   len(cv_results['params']),
                                   best_score,
                                   best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [5]:
# Load the sell-out dataset; the path is relative, so it is resolved against
# the current working directory.
df_sellout = pd.read_csv("emp3_sellout_base_period_product_FE_sin_norm.csv")

In [6]:
# Preview the first rows as a quick sanity check of the load.
df_sellout.head()

Unnamed: 0,periodo,product_id,tn,cust_request_qty,cust_request_tn,plan_precios_cuidados,cat1,cat2,cat3,sku_size,...,lag_mean_tn_q2,lag_sum_tn_q3,lag_mean_tn_q3,lag_sum_tn_q4,lag_mean_tn_q4,lag_trend_q1,lag_trend_q2,lag_trend_q3,lag_trend_q4,tn_mas_2
0,2019-04-01,20001,1647.63848,478,1757.73271,0,HC,ROPA LAVADO,Liquido,3000.0,...,1864.966707,4710.04632,1570.01544,3695.97419,1231.991397,-97.44151,404.255815,15.86777,50.246465,1109.93769
1,2019-04-01,20002,1287.62346,454,1360.44402,0,HC,ROPA LAVADO,Liquido,3000.0,...,1384.918527,3093.52244,1031.174147,3136.4297,1045.476567,91.580995,184.51787,11.58332,-17.309555,928.36431
2,2019-04-01,20003,565.33774,282,569.69482,0,FOODS,ADEREZOS,Mayonesa,475.0,...,1096.696177,2524.53935,841.513117,2210.57046,736.85682,163.364545,271.75671,-128.05728,52.372575,662.38654
3,2019-04-01,20004,466.70901,346,468.21007,0,FOODS,ADEREZOS,Mayonesa,240.0,...,732.52744,2538.70653,846.23551,1700.72775,566.90925,-54.216855,112.053045,-147.72016,81.83381,667.19411
4,2019-04-01,20005,624.9988,327,629.64621,0,FOODS,ADEREZOS,Mayonesa,120.0,...,578.546193,2139.00521,713.001737,1681.15422,560.38474,-62.314745,260.55329,-129.717215,-25.603695,876.39696


In [7]:
# Summarise the frame: number of rows and columns, dtypes and memory usage.
df_sellout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11420 entries, 0 to 11419
Columns: 343 entries, periodo to tn_mas_2
dtypes: bool(1), float64(322), int64(16), object(4)
memory usage: 29.8+ MB


In [8]:
# Count the missing values in each column.
df_sellout.isna().sum()

periodo               0
product_id            0
tn                    0
cust_request_qty      0
cust_request_tn       0
                   ... 
lag_trend_q1          0
lag_trend_q2          0
lag_trend_q3          0
lag_trend_q4          0
tn_mas_2            261
Length: 343, dtype: int64

In [9]:
# Keep only the rows where `tn_mas_2` is present, printing the row count
# before and after so the number of discarded rows is visible.
print(len(df_sellout))
has_target = df_sellout["tn_mas_2"].notna()
df_sellout = df_sellout.loc[has_target]
print(len(df_sellout))

11420
11159


In [10]:
# Collect every column whose name contains the substring "lag" and remove
# them all from the frame in a single drop.
lag_cols = [name for name in df_sellout.columns if "lag" in name]
df_sellout = df_sellout.drop(lag_cols, axis="columns")

In [11]:
# Print each column name on its own line to see which columns are in the frame.
for column_name in df_sellout.columns:
    print(column_name)

periodo
product_id
tn
cust_request_qty
cust_request_tn
plan_precios_cuidados
cat1
cat2
cat3
sku_size
temp_media
temp_max_media
temp_min_media
IPC
promedio_mens_dolar_venta
catastrofe
tn_mas_2


In [12]:
# Convert all the categorical columns to the pandas "category" dtype.
categories = ["plan_precios_cuidados", "cat1", "cat2", "cat3", "product_id"]

# A single astype call with a column -> dtype mapping casts them in one pass.
df_sellout = df_sellout.astype({name: "category" for name in categories})

In [13]:
# Look at the first `periodo` values and the dtype of the column.
df_sellout.periodo.head()

0    2019-04-01
1    2019-04-01
2    2019-04-01
3    2019-04-01
4    2019-04-01
Name: periodo, dtype: object

In [14]:
# Time-based split on `periodo`. The values are "YYYY-MM-DD" strings, so plain
# string comparison orders them chronologically.
#   - df_train_test: every period up to and including 2018-12-01, except
#     2018-11-01 (NOTE(review): presumably dropped to leave a gap before the
#     2018-12-01 rows, since the target looks two periods ahead - confirm).
#   - df_holdout: the single period 2019-02-01.
# Each subset is an explicit copy, so columns can be added to it later without
# writing into a filtered view of `df_sellout`.
in_train_test = (df_sellout.periodo <= "2018-12-01") & (df_sellout.periodo != "2018-11-01")
df_train_test = df_sellout[in_train_test].copy()
df_holdout = df_sellout[df_sellout.periodo == "2019-02-01"].copy()

In [15]:
# List the distinct periods that ended up in the train/test frame.
df_train_test.periodo.unique()

array(['2018-12-01', '2018-10-01', '2018-09-01', '2018-08-01',
       '2018-07-01', '2018-06-01', '2018-05-01', '2018-04-01'],
      dtype=object)

In [16]:
# List the distinct periods that ended up in the holdout frame.
df_holdout.periodo.unique()

array(['2019-02-01'], dtype=object)

# Hyper-parameter grid sampled by the randomized search (3^4 = 81 combinations).
model_tuning_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# Search settings, defined here so the cell runs on a fresh kernel.
# NOTE(review): both values are placeholders - adjust them to the intended
# seed and search budget (param_iterations must not exceed the grid size).
seed = 42
param_iterations = 20

# Since we want to use a predefined Test/Val set, we use PredefinedSplit and
# pass it as the CV parameter: rows labelled 0 form the test fold and rows
# labelled -1 are only ever used for training.
# `assign` returns a new frame, so the cell can be re-run safely and no
# filtered slice is modified in place.
df_train_test = df_train_test.assign(
    test=np.where(df_train_test.periodo == "2018-12-01", 0, -1)
)

test_fold = df_train_test["test"].to_numpy()
ps = PredefinedSplit(test_fold)

# Features: everything except the target, the period label and the fold marker.
X = df_train_test.drop(columns=["tn_mas_2", "periodo", "test"])
y = df_train_test.tn_mas_2

opt = RandomizedSearchCV(
    LGBMRegressor(),
    model_tuning_params,
    # error_rate is a loss, so the scorer negates it (higher score = better).
    scoring=make_scorer(error_rate, greater_is_better=False),
    cv=ps,
    verbose=5,
    return_train_score=True,
    random_state=seed,
    n_iter=param_iterations,
)

opt.fit(X, y)

d = pd.DataFrame(opt.cv_results_)
best_score = opt.best_score_
best_score_std = d.iloc[opt.best_index_].std_test_score
best_params = opt.best_params_
# best_score is the negated error_rate; flip the sign back for display.
print("Best CV error rate: %.2f (std %.2f)" % (-best_score, best_score_std))
pprint.pprint(best_params)

# Persist the 50 best candidates for later inspection.
d.sort_values(by='rank_test_score').head(50).to_csv("opt_results.csv")

# Refit a regressor (the target is continuous) with the best parameters on the
# training rows and score it on the test fold defined above.
# NOTE(review): train/test here follow the `test` marker column; df_holdout is
# left untouched - confirm this is the intended evaluation split.
is_train = test_fold == -1
X_train, y_train = X.loc[is_train], y.loc[is_train]
X_test, y_test = X.loc[~is_train], y.loc[~is_train]

model = LGBMRegressor(**best_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_result = error_rate(y_test, y_pred)
test_result

In [17]:
#lightgbm.plot_importance(model, max_num_features = 30, height=1)
