### Objetivo: modelo de previsão de vendas
##### Autor: Yan Sym

#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from simpledbf import Dbf5
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from hyperopt import fmin, hp, tpe
from math import sqrt
import lightgbm as lgb
import warnings
import random
import gc

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
%matplotlib inline

#### Variáveis globais e parâmetros

In [2]:
random_seed = 42

#### Métodos

In [3]:
class CatEncoder():
    
    def __init__(self):
        self.dic = {}
        self.rev_dic = {}
        
    def fit(self, vet):
        uniques = vet.unique()
        for a, b in enumerate(uniques):
            self.dic[b] = a
            self.rev_dic[a] = b
        return self
    
    def check(self, vet):
        if type(vet) == list:
            return pd.Series(vet)
        return vet
    
    def transform(self, vet):
        vet = self.check(vet)
        return vet.map(self.dic).replace(np.nan, -1).astype(int)
    
    def inverse_transform(sekf, vet):
        vet = self.check(vet)
        return vet.map(self.rev_dic).replace(np.nan, 'NaN')
    
# métricas de erro
def mae_score(true, pred):
    return mean_absolute_error(true, pred)

def mse_score(true, pred):
    return mean_squared_error(true, pred)

def rmse_score(true, pred):
    return (sqrt(mean_squared_error(true, pred)))

#### Leitura de bases

In [4]:
df_lojas = pd.read_csv('lojas_atuais.csv')
print (df_lojas.shape)

df_lojas['cod_municipio_6_digitos'] = df_lojas['cod_municipio'].astype(str)

lista_municipios_6_digitos = []

for municipio in df_lojas['cod_municipio'].values.tolist():
    municipio = str(municipio)
    if len(municipio) > 6:
        lista_municipios_6_digitos.append(int(municipio[:-1]))
    else:
        lista_municipios_6_digitos.append(municipio)
    
df_lojas['cod_municipio_6_digitos'] = lista_municipios_6_digitos
df_lojas['cod_municipio_6_digitos'] = df_lojas['cod_municipio_6_digitos'].astype(int)
df_lojas.head()

(3130, 21)


Unnamed: 0,cod_loja,cod_ap,cod_municipio,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,cod_municipio_6_digitos
0,1,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550
1,2,5300108005004,5300108,,FEATURE_02_VALUE_03,True,22.0,FEATURE_05_VALUE_03,27,0.215967,0.197287,1562.674817,3.670883,,,,,,,,,530010
2,3,3550308005052,3550308,,FEATURE_02_VALUE_03,True,45.66,FEATURE_05_VALUE_03,6,0.190102,0.17914,5593.905463,2.815883,,,,,,,,,355030
3,4,3145604003002,3145604,,FEATURE_02_VALUE_04,False,30.0,FEATURE_05_VALUE_03,3,0.141988,0.143677,12.824022,7.218132,,,,,,,,,314560
4,5,1600303005010,1600303,FEATURE_01_VALUE_10,FEATURE_02_VALUE_02,False,62.49,FEATURE_05_VALUE_03,30,0.107619,0.103083,325.53112,1.972944,FEATURE_11_VALUE_02,FEATURE_12_VALUE_01,100.0,88.1,100.0,100.0,88.5,93.8,160030


In [5]:
# base de dados pública do IBGE (relativos a 2016)
df = Dbf5('RENDABR10.dbf')
df_municipios = df.to_dataframe()
df_municipios = df_municipios.groupby(['MUNCOD']).mean().reset_index()
print (df_municipios.shape)
df_municipios.head()

(5565, 12)


Unnamed: 0,MUNCOD,NUMRENDA,DENRENDA,DENCRIREND,NUMPOBRES,NUMPOBRESX,NUMCRIPOB,NUMCRIPOBX,NUMDESOCUP,DENDESOCUP,NUMTRABINF,DENTRABINF
0,110001,2254135.0,4819.4,1266.8,2333.6,1259.2,810.0,445.4,100.6,2008.6,65.6,581.2
1,110002,12080140.0,17953.2,4970.6,5574.6,2304.4,2227.6,953.8,407.6,8789.4,223.2,2197.2
2,110003,562952.8,1260.6,324.2,607.2,260.0,206.4,94.8,13.8,553.2,18.6,143.4
3,110004,11233380.0,15628.2,3926.6,5016.8,2302.4,1748.2,788.0,452.8,7714.0,248.6,1764.0
4,110005,1875595.0,3388.8,841.8,1253.6,506.6,449.8,189.6,87.8,1676.6,50.2,388.6


In [6]:
df_pib = pd.read_excel('df_ibge_2015.xlsx')
df_pib.shape

(5570, 18)

In [7]:
for column in df_municipios.columns.tolist():
    print (column)
    if column == 'SITUACAO':
        continue
        
    if column == 'NUMRENDA':
        df_municipios[column] = df_municipios[column].astype(float)
    else:
        df_municipios[column] = df_municipios[column].astype(int)
        
df_join_municipios = pd.merge(df_lojas, df_municipios, how='left', left_on='cod_municipio_6_digitos', right_on='MUNCOD')
print (df_join_municipios.shape)
df_join_municipios.head()

MUNCOD
NUMRENDA
DENRENDA
DENCRIREND
NUMPOBRES
NUMPOBRESX
NUMCRIPOB
NUMCRIPOBX
NUMDESOCUP
DENDESOCUP
NUMTRABINF
DENTRABINF
(3130, 34)


Unnamed: 0,cod_loja,cod_ap,cod_municipio,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,cod_municipio_6_digitos,MUNCOD,NUMRENDA,DENRENDA,DENCRIREND,NUMPOBRES,NUMPOBRESX,NUMCRIPOB,NUMCRIPOBX,NUMDESOCUP,DENDESOCUP,NUMTRABINF,DENTRABINF
0,1,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,412550.0,36533530.0,43865.0,11050.0,6136.0,1731.0,2393.0,618.0,1089.0,23967.0,451.0,4888.0
1,2,5300108005004,5300108,,FEATURE_02_VALUE_03,True,22.0,FEATURE_05_VALUE_03,27,0.215967,0.197287,1562.674817,3.670883,,,,,,,,,530010,530010.0,706081300.0,423966.0,101062.0,77681.0,28689.0,29735.0,10787.0,18031.0,230356.0,2892.0,43877.0
2,3,3550308005052,3550308,,FEATURE_02_VALUE_03,True,45.66,FEATURE_05_VALUE_03,6,0.190102,0.17914,5593.905463,2.815883,,,,,,,,,355030,355030.0,2636974000.0,1862101.0,388246.0,372964.0,177763.0,125227.0,53378.0,70980.0,985472.0,11427.0,172724.0
3,4,3145604003002,3145604,,FEATURE_02_VALUE_04,False,30.0,FEATURE_05_VALUE_03,3,0.141988,0.143677,12.824022,7.218132,,,,,,,,,314560,314560.0,4566713.0,7857.0,1649.0,2050.0,551.0,708.0,213.0,194.0,3996.0,36.0,764.0
4,5,1600303005010,1600303,FEATURE_01_VALUE_10,FEATURE_02_VALUE_02,False,62.49,FEATURE_05_VALUE_03,30,0.107619,0.103083,325.53112,1.972944,FEATURE_11_VALUE_02,FEATURE_12_VALUE_01,100.0,88.1,100.0,100.0,88.5,93.8,160030,160030.0,54577620.0,79093.0,24747.0,31536.0,15347.0,13028.0,6728.0,4089.0,36036.0,866.0,10682.0


In [8]:
# base de dados pública do ibge, com informações socioeconômicas de cada município
df_pib = pd.read_excel('df_ibge_2015.xlsx')
df_join_municipios = pd.merge(df_join_municipios, df_pib, how='left', left_on='cod_municipio', right_on='cod_municipio')
print (df_join_municipios.shape)
df_join_municipios.head()

(3130, 51)


Unnamed: 0,cod_loja,cod_ap,cod_municipio,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,cod_municipio_6_digitos,MUNCOD,NUMRENDA,DENRENDA,DENCRIREND,NUMPOBRES,NUMPOBRESX,NUMCRIPOB,NUMCRIPOBX,NUMDESOCUP,DENDESOCUP,NUMTRABINF,DENTRABINF,uf,nome,flag_amazonia,flag_semiarido,flag_sp,valor_agropecuaria,valor_industria,valor_servicos,valor_administracao,valor_adicionado,valor_impostos,valor_pib,valor_pib_per_capta,atividade_1,atividade_2,atividade_3,populacao
0,1,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,412550.0,36533530.0,43865.0,11050.0,6136.0,1731.0,2393.0,618.0,1089.0,23967.0,451.0,4888.0,PR,São José dos Pinhais,0.0,0.0,0.0,360437.265,6289830.0,8617474.0,1573847.0,16841590.0,5739605.0,22581190.0,73427.61,Demais serviços,Indústrias de transformação,Comércio e reparação de veículos automotores e...,297895.0
1,2,5300108005004,5300108,,FEATURE_02_VALUE_03,True,22.0,FEATURE_05_VALUE_03,27,0.215967,0.197287,1562.674817,3.670883,,,,,,,,,530010,530010.0,706081300.0,423966.0,101062.0,77681.0,28689.0,29735.0,10787.0,18031.0,230356.0,2892.0,43877.0,DF,Brasília,0.0,0.0,0.0,828313.642,8443861.0,108287600.0,98002590.0,215562300.0,29120430.0,244682800.0,80502.47,"Administração, defesa, educação e saúde públic...",Demais serviços,Comércio e reparação de veículos automotores e...,2789761.0
2,3,3550308005052,3550308,,FEATURE_02_VALUE_03,True,45.66,FEATURE_05_VALUE_03,6,0.190102,0.17914,5593.905463,2.815883,,,,,,,,,355030,355030.0,2636974000.0,1862101.0,388246.0,372964.0,177763.0,125227.0,53378.0,70980.0,985472.0,11427.0,172724.0,SP,São Paulo,0.0,0.0,1.0,37648.699,59119810.0,474646000.0,42958610.0,576762100.0,122526200.0,699288400.0,57759.39,Demais serviços,Comércio e reparação de veículos automotores e...,"Administração, defesa, educação e saúde públic...",11967825.0
3,4,3145604003002,3145604,,FEATURE_02_VALUE_04,False,30.0,FEATURE_05_VALUE_03,3,0.141988,0.143677,12.824022,7.218132,,,,,,,,,314560,314560.0,4566713.0,7857.0,1649.0,2050.0,551.0,708.0,213.0,194.0,3996.0,36.0,764.0,MG,Oliveira,0.0,0.0,0.0,52492.165,116481.6,428152.2,166061.6,763187.5,84817.04,848004.6,20235.39,Demais serviços,"Administração, defesa, educação e saúde públic...",Comércio e reparação de veículos automotores e...,41562.0
4,5,1600303005010,1600303,FEATURE_01_VALUE_10,FEATURE_02_VALUE_02,False,62.49,FEATURE_05_VALUE_03,30,0.107619,0.103083,325.53112,1.972944,FEATURE_11_VALUE_02,FEATURE_12_VALUE_01,100.0,88.1,100.0,100.0,88.5,93.8,160030,160030.0,54577620.0,79093.0,24747.0,31536.0,15347.0,13028.0,6728.0,4089.0,36036.0,866.0,10682.0,AP,Macapá,1.0,0.0,0.0,57994.153,628824.4,4633651.0,3911056.0,9231525.0,763352.0,9994877.0,21054.88,"Administração, defesa, educação e saúde públic...",Demais serviços,Comércio e reparação de veículos automotores e...,456171.0


#### engenharia de variáveis explicativas

In [9]:
df_lojas['feature_04_vezes_06'] = df_lojas['feature_04'] * df_lojas['feature_06']

In [10]:
df_faturamento = pd.read_csv('faturamento_lojas_atuais.csv')
print (df_faturamento.shape)
df_faturamento.head()

(1003094, 6)


Unnamed: 0,cod_loja,ano,mes,categoria,qtde,receita
0,1,2016,1,CATEG_01,301.0,9164.86
1,1,2016,2,CATEG_01,187.0,8175.55
2,1,2016,3,CATEG_01,242.0,10521.67
3,1,2016,4,CATEG_01,104.0,4560.91
4,1,2016,5,CATEG_01,100.0,4263.3


#### Análise por produto de cada loja

In [11]:
df_faturamento = pd.DataFrame(df_faturamento.groupby(['cod_loja', 'ano', 'mes'])['receita'].sum()).reset_index()
df_faturamento.columns = ['cod_loja', 'ano', 'mes', 'receita_total_loja_anomes']

In [12]:
df_faturamento['ano_mes'] = df_faturamento['ano'].map(str) + df_faturamento['mes'].map(str).str.zfill(2)
df_faturamento['ano_mes'] = pd.to_datetime(df_faturamento['ano_mes'], format='%Y%m')
df_faturamento['ano_mes'].value_counts(dropna=False)

2017-12-01    3123
2017-11-01    3108
2017-10-01    3100
2017-08-01    3093
2017-09-01    3092
2017-07-01    3091
2017-06-01    3088
2017-01-01    3087
2017-05-01    3087
2017-03-01    3084
2016-12-01    3083
2017-02-01    3083
2017-04-01    3078
2016-11-01    3064
2016-10-01    3049
2016-08-01    3048
2016-09-01    3048
2016-07-01    3046
2016-06-01    3041
2016-05-01    3036
2016-04-01    3031
2016-01-01    3029
2016-03-01    3028
2016-02-01    3027
Name: ano_mes, dtype: int64

In [13]:
df_faturamento

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,ano_mes
0,1,2016,1,441833.37,2016-01-01
1,1,2016,2,395340.22,2016-02-01
2,1,2016,3,587810.18,2016-03-01
3,1,2016,4,534618.84,2016-04-01
4,1,2016,5,595477.92,2016-05-01
...,...,...,...,...,...
73639,3687,2017,8,71171.48,2017-08-01
73640,3687,2017,9,48284.66,2017-09-01
73641,3687,2017,10,61198.10,2017-10-01
73642,3687,2017,11,73877.02,2017-11-01


#### Análise por Loja

In [14]:
df_faturamento['receita_m1'] = df_faturamento.groupby('cod_loja')['receita_total_loja_anomes'].transform(lambda x: x.shift(-1))
df_faturamento['receita_m2'] = df_faturamento.groupby('cod_loja')['receita_total_loja_anomes'].transform(lambda x: x.shift(-2))
df_faturamento['receita_m3'] = df_faturamento.groupby('cod_loja')['receita_total_loja_anomes'].transform(lambda x: x.shift(-3))
df_faturamento['receita_target'] = (df_faturamento['receita_m1'] + df_faturamento['receita_m2'] + df_faturamento['receita_m3'])/3

In [15]:
df_faturamento.head()

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,ano_mes,receita_m1,receita_m2,receita_m3,receita_target
0,1,2016,1,441833.37,2016-01-01,395340.22,587810.18,534618.84,505923.08
1,1,2016,2,395340.22,2016-02-01,587810.18,534618.84,595477.92,572635.646667
2,1,2016,3,587810.18,2016-03-01,534618.84,595477.92,589947.05,573347.936667
3,1,2016,4,534618.84,2016-04-01,595477.92,589947.05,563814.03,583079.666667
4,1,2016,5,595477.92,2016-05-01,589947.05,563814.03,625181.36,592980.813333


In [16]:
df_faturamento.shape

(73644, 9)

In [17]:
df_faturamento.head(100)

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,ano_mes,receita_m1,receita_m2,receita_m3,receita_target
0,1,2016,1,441833.37,2016-01-01,395340.22,587810.18,534618.84,505923.080000
1,1,2016,2,395340.22,2016-02-01,587810.18,534618.84,595477.92,572635.646667
2,1,2016,3,587810.18,2016-03-01,534618.84,595477.92,589947.05,573347.936667
3,1,2016,4,534618.84,2016-04-01,595477.92,589947.05,563814.03,583079.666667
4,1,2016,5,595477.92,2016-05-01,589947.05,563814.03,625181.36,592980.813333
...,...,...,...,...,...,...,...,...,...
95,4,2017,12,173683.70,2017-12-01,,,,
96,5,2016,1,248834.01,2016-01-01,182297.02,210579.83,202545.65,198474.166667
97,5,2016,2,182297.02,2016-02-01,210579.83,202545.65,343893.54,252339.673333
98,5,2016,3,210579.83,2016-03-01,202545.65,343893.54,256717.01,267718.733333


In [18]:
df_faturamento.loc[(df_faturamento['receita_target'].isnull()) & (df_faturamento['ano_mes'] <= '2017-06-01')]

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,ano_mes,receita_m1,receita_m2,receita_m3,receita_target
29862,1456,2017,5,226281.63,2017-05-01,113736.8,206277.76,,
48683,2393,2017,3,67995.07,2017-03-01,44922.34,107451.14,,
66441,3275,2017,3,16317.19,2017-03-01,16238.59,31449.19,,
66442,3275,2017,4,16238.59,2017-04-01,31449.19,,,
66443,3275,2017,5,31449.19,2017-05-01,,,,
69627,3459,2017,2,9672.27,2017-02-01,4307.34,35331.13,,
69628,3459,2017,3,4307.34,2017-03-01,35331.13,,,
69645,3460,2017,4,674.4,2017-04-01,117.6,60096.08,,
69646,3460,2017,5,117.6,2017-05-01,60096.08,,,
71104,3550,2016,12,167726.99,2016-12-01,2385.46,38768.8,,


In [19]:
df_faturamento = df_faturamento.loc[(~df_faturamento['receita_target'].isnull())]

In [20]:
df_faturamento.shape

(64283, 9)

In [21]:
df_faturamento['ano_mes'].unique()

array(['2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',
       '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',
       '2016-05-01T00:00:00.000000000', '2016-06-01T00:00:00.000000000',
       '2016-07-01T00:00:00.000000000', '2016-08-01T00:00:00.000000000',
       '2016-09-01T00:00:00.000000000', '2016-10-01T00:00:00.000000000',
       '2016-11-01T00:00:00.000000000', '2016-12-01T00:00:00.000000000',
       '2017-01-01T00:00:00.000000000', '2017-02-01T00:00:00.000000000',
       '2017-03-01T00:00:00.000000000', '2017-04-01T00:00:00.000000000',
       '2017-05-01T00:00:00.000000000', '2017-06-01T00:00:00.000000000',
       '2017-07-01T00:00:00.000000000', '2017-08-01T00:00:00.000000000',
       '2017-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

#### junta variáveis explicativas

In [22]:
df_faturamento = pd.merge(df_faturamento,
                          df_lojas,
                          on='cod_loja',
                          how='left')

In [23]:
df_faturamento.shape

(64283, 31)

In [24]:
df_faturamento.head()

Unnamed: 0,cod_loja,ano,mes,receita_total_loja_anomes,ano_mes,receita_m1,receita_m2,receita_m3,receita_target,cod_ap,cod_municipio,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,cod_municipio_6_digitos,feature_04_vezes_06
0,1,2016,1,441833.37,2016-01-01,395340.22,587810.18,534618.84,505923.08,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,674.56
1,1,2016,2,395340.22,2016-02-01,587810.18,534618.84,595477.92,572635.646667,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,674.56
2,1,2016,3,587810.18,2016-03-01,534618.84,595477.92,589947.05,573347.936667,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,674.56
3,1,2016,4,534618.84,2016-04-01,595477.92,589947.05,563814.03,583079.666667,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,674.56
4,1,2016,5,595477.92,2016-05-01,589947.05,563814.03,625181.36,592980.813333,4125506005002,4125506,,FEATURE_02_VALUE_04,True,21.76,FEATURE_05_VALUE_04,31,0.168922,0.167659,115.958848,2.349821,,,,,,,,,412550,674.56


In [25]:
df_faturamento['random_var'] = [np.random.normal() for i in range(len(df_faturamento))]

#### separa base em treino e teste

In [26]:
df_train = df_faturamento.loc[df_faturamento['ano_mes'] <= '2017-06-01']
df_test = df_faturamento.loc[df_faturamento['ano_mes'] >= '2017-07-01']

In [27]:
df_train.shape

(55024, 32)

In [28]:
df_test.shape

(9259, 32)

In [29]:
df_train['ano_mes'].unique()

array(['2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',
       '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',
       '2016-05-01T00:00:00.000000000', '2016-06-01T00:00:00.000000000',
       '2016-07-01T00:00:00.000000000', '2016-08-01T00:00:00.000000000',
       '2016-09-01T00:00:00.000000000', '2016-10-01T00:00:00.000000000',
       '2016-11-01T00:00:00.000000000', '2016-12-01T00:00:00.000000000',
       '2017-01-01T00:00:00.000000000', '2017-02-01T00:00:00.000000000',
       '2017-03-01T00:00:00.000000000', '2017-04-01T00:00:00.000000000',
       '2017-05-01T00:00:00.000000000', '2017-06-01T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [30]:
df_test['ano_mes'].unique()

array(['2017-07-01T00:00:00.000000000', '2017-08-01T00:00:00.000000000',
       '2017-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

#### Desenvolvimento do modelo

In [31]:
lista_vars_explicativas = ['cod_municipio',
'populacao',
'mes',
'random_var',
'feature_04_vezes_06',
'feature_02',
'feature_04',
'feature_05',
'feature_06',
'feature_09',
'feature_13',
'feature_14',
'feature_17',
'feature_18',
'uf',
'valor_servicos',
'NUMPOBRES']

In [32]:
len(lista_vars_explicativas)

17

### Correlações

In [33]:
df_check = df_train[lista_vars_explicativas + ['receita_target']].copy()
df_check[df_check.columns].corr()['receita_target'][:].sort_values(ascending=False)

KeyError: "['NUMPOBRES', 'uf', 'valor_servicos', 'populacao'] not in index"

### Divide dados em variáveis explicativas e target

In [None]:
X_train = df_train[lista_vars_explicativas].copy()
y_train = df_train[['receita_target']].copy()

X_test = df_test[lista_vars_explicativas].copy()
y_test = df_test[['receita_target']].copy()

print (f'X_train: {X_train.shape}')
print (f'y_train: {y_train.shape}')
print (f'X_test: {X_test.shape}')
print (f'y_test: {y_test.shape}')

In [None]:
cat_cols = [col for col in X_train.dtypes[X_train.dtypes == 'object'].index]
print (cat_cols)

In [None]:
# converte variáveis categóricas
mapeamento = {}
for col in cat_cols:
    ce = CatEncoder()
    print (f'Criando mapeamento para coluna {col}')
    ce.fit(X_train[col].astype(str))
    mapeamento[col] = ce
     
for col in cat_cols:
    ce = mapeamento[col]
    X_train[col] = ce.transform(X_train[col].astype(str))

for col in cat_cols:
    ce = mapeamento[col]
    X_test[col] = ce.transform(X_test[col].astype(str))

In [None]:
n_fold = 5
folds = TimeSeriesSplit(n_splits=n_fold)

params = {'num_leaves': 10,
          'learning_rate': 0.03,
          'subsample': 0.7,
          'feature_fraction': 0.7,
          'boosting_type': 'gbdt',
          'n_jobs': 3,
          'max_depth': -1,
          'metric': 'rmse',
          'random_state': 42,
          'reg_lambda':2}

splits = folds.split(X_train, y_train)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X_train.shape[0])
columns = X_train.columns.tolist()

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
mean_score = []
for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold:',fold_n+1)
    X_train_tmp, X_valid_tmp = X_train[columns].iloc[train_index], X_train[columns].iloc[valid_index]
    y_train_tmp, y_valid_tmp = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train_tmp, label=y_train_tmp, categorical_feature=cat_cols)
    dvalid = lgb.Dataset(X_valid_tmp, label=y_valid_tmp, categorical_feature=cat_cols)
    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], early_stopping_rounds=50, verbose_eval=100)
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    y_pred_valid = clf.predict(X_valid_tmp,num_iteration=clf.best_iteration)
    y_oof[valid_index] = y_pred_valid
    val_score = np.sqrt(mean_squared_error(y_pred_valid, y_valid_tmp))
    print(f'val rmse score is {val_score}')
    mean_score.append(val_score)
    y_preds += clf.predict(X_test[columns], num_iteration=clf.best_iteration)/n_fold
    del X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp
    gc.collect()
print('mean rmse score over folds is',np.mean(mean_score))
df_test['pred'] = y_preds

print ('Valor estimado:', y_preds.sum())
print ('Valor realizado:', y_test.sum())
print ('MAE:', mae_score(y_test, y_preds))
print ('MSE:', mse_score(y_test, y_preds))
print ('RMSE:', rmse_score(y_test, y_preds))

#### Importância das variáveis explicativas do último fold

In [None]:
i = 1
for a,b in sorted(list(zip(clf.feature_importance(), X_train.columns))):
    print (i, a, b, sep=' ')
    i+=1

In [None]:
i = 1
for a,b in sorted(list(zip(clf.feature_importance(), X_train.columns))):
    if (a > 0):
        print (b)
        i+1

#### erro de treino

In [None]:
df_preds = df_train.copy()
y_pred = clf.predict(X_train)
df_preds['true'] = y_train
df_preds['pred'] = y_pred
df_resultados = df_preds[['true', 'pred']].copy()
erro = rmse_score(df_preds['true'], df_preds['pred'])
print ('Erro de treino:', erro)

#### erro de teste

In [None]:
df_preds = df_test.copy()
y_pred = clf.predict(X_test)
df_preds['true'] = y_test
df_preds['pred'] = y_pred
df_resultados = df_preds[['true', 'pred']].copy()
erro = rmse_score(df_preds['true'], df_preds['pred'])
print ('Erro de teste:', erro)

In [None]:
df_preds['pred']

In [None]:
df_preds.shape

In [None]:
y_test.shape

In [None]:
df_preds.head()

#### Avaliação dos resultados

In [None]:
i = 0
while (i < 10):
    cod_loja = random.choice(df_preds['cod_loja'].unique())
    data_plot = df_preds.loc[df_preds['cod_loja'] == cod_loja]
    if (len(data_plot) > 0):
        plt.figure(figsize=(8,4))
        plt.plot(data_plot['ano_mes'], data_plot['true'], color='green', label='True')
        plt.plot(data_plot['ano_mes'], data_plot['pred'], color='red', label='Pred')
        plt.title(f'Loja: {cod_loja}')
        plt.show()
        i+=1

In [None]:
df_preds = df_test.copy()
y_pred = clf.predict(X_test)
df_preds['true'] = y_test
df_preds['pred'] = y_pred
df_resultados = df_preds[['true', 'pred']].copy()
erro = rmse_score(df_preds['true'], df_preds['pred'])
print ('Erro de teste:', erro)

In [None]:
df_preds['ano_mes'].unique()

In [None]:
df_faturamento['cod_tmp'] = df_faturamento['cod_loja'].astype(int)
               
i = 0
while (i < 10):
    cod_loja = np.random.randint(1, df_faturamento.cod_tmp.max() + 1)
    data_plot = df_faturamento.loc[df_faturamento['cod_tmp'] == cod_loja]
    if (len(data_plot) > 0):
        data_plot.plot(x='ano_mes', y='receita_target',
                       label = f"Loja: {cod_loja}",
                       figsize = (8,4),
                       title = "Receita Média");
        i+=1

In [None]:
i = 0
while (i < 10):
    cod_loja = random.choice(df_preds['cod_loja'].unique())
    data_plot = df_preds.loc[df_preds['cod_loja'] == cod_loja]
    if (len(data_plot) > 0):
        plt.figure(figsize=(8,4))
        plt.plot(data_plot['ano_mes'], data_plot['true'], color='green', label='True')
        plt.plot(data_plot['ano_mes'], data_plot['pred'], color='red', label='Pred')
        plt.title(f'Loja: {cod_loja}')
        plt.show()
        i+=1

In [None]:
df_expansao = pd.read_csv('cenarios_expansao.csv')
print (df_expansao.shape)
df_expansao.head()

In [None]:
df_expansao['feature_04_vezes_06'] = df_expansao['feature_04'] * df_expansao['feature_06']

In [None]:
df_expansao['mes'] = 12

In [None]:
X_test = df_expansao[lista_vars_explicativas].copy()

In [None]:
for col in cat_cols:
    ce = mapeamento[col]
    X_test[col] = ce.transform(X_test[col].astype(str))

In [None]:
df_preds = df_expansao.copy()
df_preds['pred'] = clf.predict(X_test)

In [None]:
df_preds

In [None]:
df_preds.shape

In [None]:
df_expansao.shape