### Objetivo: modelo de previsão de vendas
##### Autor: Yan Sym

#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from hyperopt import fmin, hp, tpe
from math import sqrt
import lightgbm as lgb
import warnings
import random
import gc

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
%matplotlib inline

#### Variáveis globais e parâmetros

In [2]:
random_seed = 42
n_fold = 5

params = {'num_leaves': 8,
          'learning_rate': 0.05,
          'subsample': 0.85,
          'feature_fraction': 0.85,
          'boosting_type': 'gbdt',
          'n_jobs': 3,
          'max_depth': -1,
          "metric": 'rmse',
          'random_state': 42,
          'reg_lambda': 10}

lista_vars_explicativas = ['cod_municipio',
'mes',
'qtde_media',
'qtde_soma',
'qtde_max',
'qtde_min',
'feature_04',
'feature_08',
'feature_09',
'feature_14',
'feature_18',
'feature_04_vezes_06',
'receita_target_lag_12_meses',
'receita_loja_std_3_meses',
'receita_target_media_movel_3_meses',
'receita_total_loja_anomes',
'receita_loja_media_movel_3_meses',
'soma_quantidade_items_loja_media_movel_3_meses',
'receita_loja_lag_1']

#### Métodos

In [3]:
# métricas de erro
def mae_score(true, pred):
    return mean_absolute_error(true, pred)

def mse_score(true, pred):
    return mean_squared_error(true, pred)

def rmse_score(true, pred):
    return (sqrt(mean_squared_error(true, pred)))

#### Leitura de bases

In [4]:
df_lojas = pd.read_csv('lojas_atuais.csv')
print (df_lojas.shape)
df_lojas.head()

(3130, 21)


Unnamed: 0,cod_loja,cod_ap,cod_municipio,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18
0,1,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,
1,2,5300108005004,5300108,,FEATURE_02_VALUE_03,True,22.0,FEATURE_05_VALUE_03,27,0.215967,0.197287,1562.674817,3.670883,,,,,,,,
2,3,3550308005052,3550308,,FEATURE_02_VALUE_03,True,45.66,FEATURE_05_VALUE_03,6,0.190102,0.17914,5593.905463,2.815883,,,,,,,,
3,4,3145604003002,3145604,,FEATURE_02_VALUE_04,False,30.0,FEATURE_05_VALUE_03,3,0.141988,0.143677,12.824022,7.218132,,,,,,,,
4,5,1600303005010,1600303,FEATURE_01_VALUE_10,FEATURE_02_VALUE_02,False,62.49,FEATURE_05_VALUE_03,30,0.107619,0.103083,325.53112,1.972944,FEATURE_11_VALUE_02,FEATURE_12_VALUE_01,100.0,88.1,100.0,100.0,88.5,93.8


In [5]:
df_faturamento = pd.read_csv('faturamento_lojas_atuais.csv')
print (df_faturamento.shape)
df_faturamento.head()

(1003094, 6)


Unnamed: 0,cod_loja,ano,mes,categoria,qtde,receita
0,1,2016,1,CATEG_01,301.0,9164.86
1,1,2016,2,CATEG_01,187.0,8175.55
2,1,2016,3,CATEG_01,242.0,10521.67
3,1,2016,4,CATEG_01,104.0,4560.91
4,1,2016,5,CATEG_01,100.0,4263.3


#### engenharia de variáveis explicativas

In [6]:
df_lojas['feature_04_vezes_06'] = df_lojas['feature_04'] * df_lojas['feature_06']

#### Agrupamento de produtos por loja

In [7]:
df_faturamento['ano_mes'] = df_faturamento['ano'].map(str) + df_faturamento['mes'].map(str).str.zfill(2)
df_faturamento['ano_mes'] = pd.to_datetime(df_faturamento['ano_mes'], format='%Y%m')

In [8]:
df_quantidade_anomes = pd.DataFrame(df_faturamento.groupby(['cod_loja', 'ano', 'mes']).\
                                    agg({'qtde': ['sum', 'mean', 'max', 'min']})).reset_index()

df_quantidade_anomes.columns = ['cod_loja', 'ano', 'mes', 'qtde_soma', 'qtde_media', 'qtde_min', 'qtde_max']

df_faturamento_medio_loja_anomes = pd.DataFrame(df_faturamento.groupby(['cod_loja', 'ano', 'mes'])['receita'].sum()).reset_index()
df_faturamento_medio_loja_anomes.columns = ['cod_loja', 'ano', 'mes', 'receita_total_loja_anomes']

df_faturamento = pd.merge(df_faturamento_medio_loja_anomes,
                          df_quantidade_anomes,
                          on=['cod_loja', 'ano', 'mes'],
                          how='inner')

In [9]:
df_faturamento['ano_mes'] = df_faturamento['ano'].map(str) + df_faturamento['mes'].map(str).str.zfill(2)
df_faturamento['ano_mes'] = pd.to_datetime(df_faturamento['ano_mes'], format='%Y%m')
df_faturamento['ano_mes'].value_counts(dropna=False)

2017-12-01    3123
2017-11-01    3108
2017-10-01    3100
2017-08-01    3093
2017-09-01    3092
2017-07-01    3091
2017-06-01    3088
2017-01-01    3087
2017-05-01    3087
2017-03-01    3084
2016-12-01    3083
2017-02-01    3083
2017-04-01    3078
2016-11-01    3064
2016-10-01    3049
2016-08-01    3048
2016-09-01    3048
2016-07-01    3046
2016-06-01    3041
2016-05-01    3036
2016-04-01    3031
2016-01-01    3029
2016-03-01    3028
2016-02-01    3027
Name: ano_mes, dtype: int64

In [10]:
df_faturamento.head()

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,qtde_soma,qtde_media,qtde_min,qtde_max,ano_mes
0,1,2016,1,441833.37,13034.0,931.0,3820.0,24.0,2016-01-01
1,1,2016,2,395340.22,11411.0,815.071429,2637.0,24.0,2016-02-01
2,1,2016,3,587810.18,15030.0,1073.571429,4251.0,71.0,2016-03-01
3,1,2016,4,534618.84,12726.0,909.0,3688.0,7.0,2016-04-01
4,1,2016,5,595477.92,13978.0,998.428571,4345.0,84.0,2016-05-01


In [11]:
df_faturamento['receita_m1'] = df_faturamento.groupby('cod_loja')['receita_total_loja_anomes'].transform(lambda x: x.shift(-1))
df_faturamento['receita_m2'] = df_faturamento.groupby('cod_loja')['receita_total_loja_anomes'].transform(lambda x: x.shift(-2))
df_faturamento['receita_m3'] = df_faturamento.groupby('cod_loja')['receita_total_loja_anomes'].transform(lambda x: x.shift(-3))
df_faturamento['receita_target'] = (df_faturamento['receita_m1'] + df_faturamento['receita_m2'] + df_faturamento['receita_m3'])/3

In [12]:
df_media_movel1 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['qtde_soma'].transform(lambda x: x.rolling(3).mean()))

df_media_movel2 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_total_loja_anomes'].transform(lambda x: x.rolling(3).std()))

df_media_movel3 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_total_loja_anomes'].transform(lambda x: x.rolling(3).mean()))

df_media_movel4 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_total_loja_anomes'].transform(lambda x: x.shift(6)))

df_media_movel5 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_total_loja_anomes'].transform(lambda x: x.shift(12)))

df_media_movel6 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_target'].transform(lambda x: x.rolling(3).mean()))

df_media_movel7 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_target'].transform(lambda x: x.rolling(6).mean()))

df_media_movel8 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_target'].transform(lambda x: x.shift(12)))

df_media_movel9 = pd.DataFrame(df_faturamento.groupby(['cod_loja'])\
                              ['receita_target'].transform(lambda x: x.shift(1)))

df_faturamento['soma_quantidade_items_loja_media_movel_3_meses'] = df_media_movel1['qtde_soma'].values.tolist()
df_faturamento['receita_loja_std_3_meses'] = df_media_movel2['receita_total_loja_anomes'].values.tolist()
df_faturamento['receita_loja_media_movel_3_meses'] = df_media_movel3['receita_total_loja_anomes'].values.tolist()
df_faturamento['receita_loja_lag_6_meses'] = df_media_movel4['receita_total_loja_anomes'].values.tolist()
df_faturamento['receita_loja_lag_12_meses'] = df_media_movel5['receita_total_loja_anomes'].values.tolist()
df_faturamento['receita_target_media_movel_3_meses'] = df_media_movel6['receita_target'].values.tolist()
df_faturamento['receita_target_media_movel_6_meses'] = df_media_movel7['receita_target'].values.tolist()
df_faturamento['receita_target_lag_12_meses'] = df_media_movel8['receita_target'].values.tolist()
df_faturamento['receita_loja_lag_1'] = df_media_movel9['receita_target'].values.tolist()

df_faturamento.head(10)

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,qtde_soma,qtde_media,qtde_min,qtde_max,ano_mes,receita_m1,receita_m2,receita_m3,receita_target,soma_quantidade_items_loja_media_movel_3_meses,receita_loja_std_3_meses,receita_loja_media_movel_3_meses,receita_loja_lag_6_meses,receita_loja_lag_12_meses,receita_target_media_movel_3_meses,receita_target_media_movel_6_meses,receita_target_lag_12_meses,receita_loja_lag_1
0,1,2016,1,441833.37,13034.0,931.0,3820.0,24.0,2016-01-01,395340.22,587810.18,534618.84,505923.08,,,,,,,,,
1,1,2016,2,395340.22,11411.0,815.071429,2637.0,24.0,2016-02-01,587810.18,534618.84,595477.92,572635.646667,,,,,,,,,505923.08
2,1,2016,3,587810.18,15030.0,1073.571429,4251.0,71.0,2016-03-01,534618.84,595477.92,589947.05,573347.936667,13158.333333,100428.687433,474994.59,,,550635.554444,,,572635.646667
3,1,2016,4,534618.84,12726.0,909.0,3688.0,7.0,2016-04-01,595477.92,589947.05,563814.03,583079.666667,13055.666667,99391.933058,505923.08,,,576354.416667,,,573347.936667
4,1,2016,5,595477.92,13978.0,998.428571,4345.0,84.0,2016-05-01,589947.05,563814.03,625181.36,592980.813333,13911.333333,33145.991477,572635.646667,,,583136.138889,,,583079.666667
5,1,2016,6,589947.05,14693.0,1049.5,3894.0,95.0,2016-06-01,563814.03,625181.36,583509.12,590834.836667,13799.0,33654.194794,573347.936667,,,588965.105556,569800.33,,592980.813333
6,1,2016,7,563814.03,13570.0,969.285714,3437.0,90.0,2016-07-01,625181.36,583509.12,611797.3,606829.26,14080.333333,16912.161242,583079.666667,441833.37,,596881.636667,586618.026667,,590834.836667
7,1,2016,8,625181.36,13889.0,992.071429,4159.0,81.0,2016-08-01,583509.12,611797.3,876921.63,690742.683333,14050.666667,30795.942717,592980.813333,395340.22,,629468.926667,606302.532778,,606829.26
8,1,2016,9,583509.12,13309.0,950.642857,3924.0,90.0,2016-09-01,611797.3,876921.63,1257613.0,915443.976667,13589.333333,31332.680883,590834.836667,587810.18,,737671.973333,663318.539444,,690742.683333
9,1,2016,10,611797.3,14709.0,1131.461538,3907.0,79.0,2016-10-01,876921.63,1257613.0,404108.77,846214.466667,13969.0,21275.689477,606829.26,534618.84,,817467.042222,707174.339444,,915443.976667


In [13]:
df_faturamento.shape

(73644, 22)

In [14]:
df_faturamento.head(100)

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,qtde_soma,qtde_media,qtde_min,qtde_max,ano_mes,receita_m1,receita_m2,receita_m3,receita_target,soma_quantidade_items_loja_media_movel_3_meses,receita_loja_std_3_meses,receita_loja_media_movel_3_meses,receita_loja_lag_6_meses,receita_loja_lag_12_meses,receita_target_media_movel_3_meses,receita_target_media_movel_6_meses,receita_target_lag_12_meses,receita_loja_lag_1
0,1,2016,1,441833.37,13034.0,931.000000,3820.0,24.0,2016-01-01,395340.22,587810.18,534618.84,505923.080000,,,,,,,,,
1,1,2016,2,395340.22,11411.0,815.071429,2637.0,24.0,2016-02-01,587810.18,534618.84,595477.92,572635.646667,,,,,,,,,505923.080000
2,1,2016,3,587810.18,15030.0,1073.571429,4251.0,71.0,2016-03-01,534618.84,595477.92,589947.05,573347.936667,13158.333333,100428.687433,474994.590000,,,550635.554444,,,572635.646667
3,1,2016,4,534618.84,12726.0,909.000000,3688.0,7.0,2016-04-01,595477.92,589947.05,563814.03,583079.666667,13055.666667,99391.933058,505923.080000,,,576354.416667,,,573347.936667
4,1,2016,5,595477.92,13978.0,998.428571,4345.0,84.0,2016-05-01,589947.05,563814.03,625181.36,592980.813333,13911.333333,33145.991477,572635.646667,,,583136.138889,,,583079.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4,2017,12,173683.70,4148.0,319.076923,947.0,13.0,2017-12-01,,,,,2380.333333,68932.752823,94818.060000,59034.85,168548.71,,,47384.846667,
96,5,2016,1,248834.01,5800.0,414.285714,2040.0,7.0,2016-01-01,182297.02,210579.83,202545.65,198474.166667,,,,,,,,,
97,5,2016,2,182297.02,3896.0,278.285714,1059.0,13.0,2016-02-01,210579.83,202545.65,343893.54,252339.673333,,,,,,,,,198474.166667
98,5,2016,3,210579.83,3920.0,280.000000,1287.0,4.0,2016-03-01,202545.65,343893.54,256717.01,267718.733333,4538.666667,33392.790308,213903.620000,,,239510.857778,,,252339.673333


In [15]:
df_faturamento.shape

(73644, 22)

In [16]:
df_faturamento['ano_mes'].unique()

array(['2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',
       '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',
       '2016-05-01T00:00:00.000000000', '2016-06-01T00:00:00.000000000',
       '2016-07-01T00:00:00.000000000', '2016-08-01T00:00:00.000000000',
       '2016-09-01T00:00:00.000000000', '2016-10-01T00:00:00.000000000',
       '2016-11-01T00:00:00.000000000', '2016-12-01T00:00:00.000000000',
       '2017-01-01T00:00:00.000000000', '2017-02-01T00:00:00.000000000',
       '2017-03-01T00:00:00.000000000', '2017-04-01T00:00:00.000000000',
       '2017-05-01T00:00:00.000000000', '2017-06-01T00:00:00.000000000',
       '2017-07-01T00:00:00.000000000', '2017-08-01T00:00:00.000000000',
       '2017-09-01T00:00:00.000000000', '2017-10-01T00:00:00.000000000',
       '2017-11-01T00:00:00.000000000', '2017-12-01T00:00:00.000000000'],
      dtype='datetime64[ns]')

#### junta variáveis explicativas

In [17]:
df_faturamento = pd.merge(df_faturamento,
                          df_lojas,
                          on='cod_loja',
                          how='left')

In [18]:
df_faturamento.shape

(73644, 43)

In [19]:
df_faturamento.head()

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,qtde_soma,qtde_media,qtde_min,qtde_max,ano_mes,receita_m1,receita_m2,receita_m3,receita_target,soma_quantidade_items_loja_media_movel_3_meses,receita_loja_std_3_meses,receita_loja_media_movel_3_meses,receita_loja_lag_6_meses,receita_loja_lag_12_meses,receita_target_media_movel_3_meses,receita_target_media_movel_6_meses,receita_target_lag_12_meses,receita_loja_lag_1,cod_ap,cod_municipio,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_04_vezes_06
0,1,2016,1,441833.37,13034.0,931.0,3820.0,24.0,2016-01-01,395340.22,587810.18,534618.84,505923.08,,,,,,,,,,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,674.56
1,1,2016,2,395340.22,11411.0,815.071429,2637.0,24.0,2016-02-01,587810.18,534618.84,595477.92,572635.646667,,,,,,,,,505923.08,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,674.56
2,1,2016,3,587810.18,15030.0,1073.571429,4251.0,71.0,2016-03-01,534618.84,595477.92,589947.05,573347.936667,13158.333333,100428.687433,474994.59,,,550635.554444,,,572635.646667,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,674.56
3,1,2016,4,534618.84,12726.0,909.0,3688.0,7.0,2016-04-01,595477.92,589947.05,563814.03,583079.666667,13055.666667,99391.933058,505923.08,,,576354.416667,,,573347.936667,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,674.56
4,1,2016,5,595477.92,13978.0,998.428571,4345.0,84.0,2016-05-01,589947.05,563814.03,625181.36,592980.813333,13911.333333,33145.991477,572635.646667,,,583136.138889,,,583079.666667,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,674.56


#### separa base em treino e teste

In [20]:
df_train = df_faturamento.loc[df_faturamento['ano_mes'] <= '2017-09-01']
df_test = df_faturamento.loc[df_faturamento['ano_mes'] == '2017-12-01']

In [21]:
df_train.shape

(64313, 43)

In [22]:
df_train = df_train.loc[(~df_train['receita_target'].isnull())]

In [23]:
df_train.shape

(64283, 43)

In [24]:
df_test.shape

(3123, 43)

In [25]:
df_train['ano_mes'].unique()

array(['2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',
       '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',
       '2016-05-01T00:00:00.000000000', '2016-06-01T00:00:00.000000000',
       '2016-07-01T00:00:00.000000000', '2016-08-01T00:00:00.000000000',
       '2016-09-01T00:00:00.000000000', '2016-10-01T00:00:00.000000000',
       '2016-11-01T00:00:00.000000000', '2016-12-01T00:00:00.000000000',
       '2017-01-01T00:00:00.000000000', '2017-02-01T00:00:00.000000000',
       '2017-03-01T00:00:00.000000000', '2017-04-01T00:00:00.000000000',
       '2017-05-01T00:00:00.000000000', '2017-06-01T00:00:00.000000000',
       '2017-07-01T00:00:00.000000000', '2017-08-01T00:00:00.000000000',
       '2017-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [26]:
df_test['ano_mes'].unique()

array(['2017-12-01T00:00:00.000000000'], dtype='datetime64[ns]')

#### Divide dados em variáveis explicativas e target

In [27]:
X_train = df_train[lista_vars_explicativas].copy()
y_train = df_train[['receita_target']].copy()

X_test = df_test[lista_vars_explicativas].copy()
y_test = df_test[['receita_target']].copy()

print (f'X_train: {X_train.shape}')
print (f'y_train: {y_train.shape}')
print (f'X_test: {X_test.shape}')
print (f'y_test: {y_test.shape}')

X_train: (64283, 19)
y_train: (64283, 1)
X_test: (3123, 19)
y_test: (3123, 1)


#### Out of time Cross Validation

In [28]:
folds = TimeSeriesSplit(n_splits=n_fold)

splits = folds.split(X_train, y_train)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X_train.shape[0])
columns = X_train.columns.tolist()

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
mean_score = []
for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold:',fold_n+1)
    X_train_tmp, X_valid_tmp = X_train[columns].iloc[train_index], X_train[columns].iloc[valid_index]
    y_train_tmp, y_valid_tmp = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train_tmp, label=y_train_tmp)
    dvalid = lgb.Dataset(X_valid_tmp, label=y_valid_tmp)
    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], early_stopping_rounds = 50, verbose_eval=100)
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    y_pred_valid = clf.predict(X_valid_tmp,num_iteration=clf.best_iteration)
    y_oof[valid_index] = y_pred_valid
    val_score = np.sqrt(mean_squared_error(y_pred_valid, y_valid_tmp))
    print(f'val rmse score is {val_score}')
    mean_score.append(val_score)
    y_preds += clf.predict(X_test[columns], num_iteration=clf.best_iteration)/n_fold
    del X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp
    gc.collect()
print('mean rmse score over folds is',np.mean(mean_score))
df_test['pred'] = y_preds

Fold: 1
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 14136.6	valid_1's rmse: 15784.1
[200]	training's rmse: 10162	valid_1's rmse: 11659.4
[300]	training's rmse: 9123.75	valid_1's rmse: 10978.4
[400]	training's rmse: 8499.23	valid_1's rmse: 10530.3
[500]	training's rmse: 8053.37	valid_1's rmse: 10319.5
[600]	training's rmse: 7687.09	valid_1's rmse: 10169.5
[700]	training's rmse: 7386.92	valid_1's rmse: 10062.8
[800]	training's rmse: 7124.3	valid_1's rmse: 9941.86
[900]	training's rmse: 6867.06	valid_1's rmse: 9806.57
[1000]	training's rmse: 6640.9	valid_1's rmse: 9720.92
[1100]	training's rmse: 6431.33	valid_1's rmse: 9641.34
[1200]	training's rmse: 6232.53	valid_1's rmse: 9558.56
[1300]	training's rmse: 6074.24	valid_1's rmse: 9515.04
[1400]	training's rmse: 5909.95	valid_1's rmse: 9474.14
[1500]	training's rmse: 5763.42	valid_1's rmse: 9425.81
[1600]	training's rmse: 5631.35	valid_1's rmse: 9387.76
[1700]	training's rmse: 5502.99	valid_1's rmse: 

[3500]	training's rmse: 4678.84	valid_1's rmse: 7918.2
[3600]	training's rmse: 4637.59	valid_1's rmse: 7909.92
[3700]	training's rmse: 4599.51	valid_1's rmse: 7894.51
[3800]	training's rmse: 4559.32	valid_1's rmse: 7883.44
[3900]	training's rmse: 4522.18	valid_1's rmse: 7872.59
[4000]	training's rmse: 4486.26	valid_1's rmse: 7863.85
[4100]	training's rmse: 4453.3	valid_1's rmse: 7854.81
[4200]	training's rmse: 4418	valid_1's rmse: 7846.24
[4300]	training's rmse: 4387.01	valid_1's rmse: 7838.2
Early stopping, best iteration is:
[4338]	training's rmse: 4374.25	valid_1's rmse: 7833.64
val rmse score is 7833.640399740041
Fold: 4
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 13133.2	valid_1's rmse: 12751.9
[200]	training's rmse: 9347.21	valid_1's rmse: 9384.68
[300]	training's rmse: 8434.19	valid_1's rmse: 8757.94
[400]	training's rmse: 7935.12	valid_1's rmse: 8439.67
[500]	training's rmse: 7592.34	valid_1's rmse: 8308.65
[600]	training's rmse: 7291.75	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['pred'] = y_preds


#### Prepara dados de teste para output

In [29]:
df_resultados = df_test.copy()
df_resultados['pred'] = y_preds
df_resultados = df_resultados.reset_index()
df_resultados = df_resultados[['cod_loja', 'pred']].copy()
df_resultados.columns = ['cod_loja', 'faturamento']

In [30]:
df_resultados.head(100)

Unnamed: 0,cod_loja,faturamento
0,1,282916.965038
1,2,83572.813836
2,3,230192.906579
3,4,88639.154186
4,5,207483.192548
...,...,...
95,112,103023.708435
96,113,235201.327003
97,114,101710.280181
98,115,88547.795238


In [31]:
df_resultados.shape

(3123, 2)

In [32]:
#df_resultados['faturamento'] = 0

In [33]:
df_resultados.shape

(3123, 2)

In [34]:
df_resultados.head()

Unnamed: 0,cod_loja,faturamento
0,1,282916.965038
1,2,83572.813836
2,3,230192.906579
3,4,88639.154186
4,5,207483.192548


#### Salva resultado das predições em um csv

In [35]:
df_resultados.to_csv('desafio_1.csv', sep=';', index=False)

#### Importância das variáveis explicativas

In [36]:
feature_importances

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5
0,cod_municipio,1739,1792,1487,174,455
1,mes,4211,4799,4082,881,1634
2,qtde_media,1189,1073,741,83,178
3,qtde_soma,1134,1070,693,145,308
4,qtde_max,1154,1062,746,71,183
5,qtde_min,1982,1788,1386,167,457
6,feature_04,1264,1097,871,142,323
7,feature_08,957,805,668,39,152
8,feature_09,1294,1331,1052,79,242
9,feature_14,969,946,724,63,104
