# <div style="color:white;display:fill;border-radius:15px;background-color:#032137;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:180%">Template DS</p></div>

## <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">1. Importings</p></div>

## 1.1 Bibliotecas

In [46]:
import pandas as pd
import numpy as np
import math
import datetime

import plotly.express as px
from matplotlib            import pyplot as plt

import xgboost as xgb
from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from category_encoders     import CountEncoder


In [47]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.expand_frame_repr', False )
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 100)
plt.rcParams['figure.figsize'] = [16, 12]
plt.rcParams['font.size'] = 24

## 1.2 Dados

In [48]:
stores_raw = pd.read_csv('./data/stores.csv')
train_raw = pd.read_csv('./data/train.csv')
train_features_raw = pd.read_csv('./data/train_features.csv')
test_raw = pd.read_csv('./data/test.csv')
test_features_raw = pd.read_csv('./data/test_features.csv')


## 1.3 Funções auxiliares

In [49]:
def check_adjacent_value(df, column):
    return (df[column].shift(-1) == 1) | (df[column].shift(1) == 1 | (df[column] == 1))

def tratamento_dados(df):
    """
    Faz o tratamento dos dados aplicando todas as conversões necessárias.

    Args:
        df (DataFrame): Um DataFrame de Pandas a ser modificado

    Returns:
        DataFrame: Um DataFrame de Pandas modificado.
    """
    count_encoder = CountEncoder()
    
    # Imputando os valores de tamanho
    df['tamanho'] = df['tamanho'].apply(lambda x: 39778.0 if x <= 60 else 150046.423077 if x > 70 else 67256.666667)
    
    # Imputando zeros nos descontos ausentes
    df[['desconto_1', 'desconto_2', 'desconto_3', 'desconto_4', 'desconto_5']] = df[['desconto_1', 'desconto_2', 'desconto_3', 'desconto_4', 'desconto_5']].fillna(0)
    
    # Transformando a data em datetime
    df['data'] = df['data'] + '-2023'
    df['data'] = pd.to_datetime(df['data'], format= "%m-%d-%Y")
    
    # Distrinchando colunas de data
    df['dia'] = df['data'].dt.day
    df['mes'] = df['data'].dt.month
    df['semana_do_ano'] = df['data'].dt.isocalendar().week
    df['dia_da_semana'] = df['data'].dt.day_of_week
    
    # Convertendo a coluna feriado para variável binária
    df['feriado'] = df['feriado'].apply(lambda x: 1 if x == 'sim' else 0)
    
    # Imputando distâncias para competidores enormes no lugar dos NAs
    df['distancia_competidores'] = df['distancia_competidores'].apply(lambda x: 400000.0 if math.isnan(x) else x)
    
    # Criando uma coluna com a soma dos descontos oferecidos
    df['descontos'] = df[['desconto_1', 'desconto_2', 'desconto_3', 'desconto_4', 'desconto_5']].sum(axis=1)
    
    # Criando uma coluna boolena de desconto
    df['tem_desconto'] = df['descontos'].apply(lambda x: 1 if x != 0 else 0)
    
    # dia da semana
    df['dia_da_semana_sin'] = df['dia_da_semana'].apply( lambda x: np.sin( x * ( 2. * np.pi/7 ) ) )
    df['dia_da_semana_cos'] = df['dia_da_semana'].apply( lambda x: np.cos( x * ( 2. * np.pi/7 ) ) )

    # mês
    df['mes_sin'] = df['mes'].apply( lambda x: np.sin( x * ( 2. * np.pi/12 ) ) )
    df['mes_cos'] = df['mes'].apply( lambda x: np.cos( x * ( 2. * np.pi/12 ) ) )

    # dia 
    df['dia_sin'] = df['dia'].apply( lambda x: np.sin( x * ( 2. * np.pi/30 ) ) )
    df['dia_cos'] = df['dia'].apply( lambda x: np.cos( x * ( 2. * np.pi/30 ) ) )

    # semana_do_ano
    df['semana_do_ano_sin'] = df['semana_do_ano'].apply( lambda x: np.sin( x * ( 2. * np.pi/52 ) ) )
    df['semana_do_ano_cos'] = df['semana_do_ano'].apply( lambda x: np.cos( x * ( 2. * np.pi/52 ) ) )

    # Encoding
    le = LabelEncoder()
    df['tipo'] = le.fit_transform(df['tipo'])
    
    return df

def realizar_scalings(df):
    # Scalers
    rs = RobustScaler()
    mms= MinMaxScaler()
    df['tipo'] = mms.fit_transform(df[['tipo']].values)
    df['tamanho'] = mms.fit_transform(df[['tamanho']].values)
    df['distancia_competidores'] = rs.fit_transform(df[['distancia_competidores']].values)
    df['temperatura'] = rs.fit_transform(df[['temperatura']].values)
    df['combustivel'] = rs.fit_transform(df[['combustivel']].values)
    df['clientes'] = rs.fit_transform(df[['clientes']].values)
    
    return df

def cross_validation( x_training, kfold, model_name, model, verbose=False ):
    mae_list = []
    mape_list = []
    rmse_list = []
    for k in reversed( range( 1, kfold+1 ) ):
        if verbose:
            print( '\nKFold Number: {}'.format( k ) )
        # start and end date for validation 
        validation_start_date = x_training['data'].max() - datetime.timedelta( days=k*5*7)
        validation_end_date = x_training['data'].max() - datetime.timedelta( days=(k-1)*5*7)

        # filtering dataset
        training = x_training[x_training['data'] < validation_start_date]
        validation = x_training[(x_training['data'] >= validation_start_date) & (x_training['data'] <= validation_end_date)]

        # training and validation dataset
        # training
        xtraining = training[selected_columns]
        ytraining = training['vendas_semanais']

        # validation
        xvalidation = validation[selected_columns]
        yvalidation = validation['vendas_semanais']

        # model
        m = model.fit( xtraining, ytraining )

        # prediction
        yhat = m.predict( xvalidation )

        # performance
        m_result = ml_error( model_name, np.expm1( yvalidation ), np.expm1( yhat ) )

        # store performance of each kfold iteration
        mae_list.append(  m_result['MAE'] )
        mape_list.append( m_result['MAPE'] )
        rmse_list.append( m_result['RMSE'] )

    return pd.DataFrame( {'Model Name': model_name,
                          'MAE CV': np.round( np.mean( mae_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( mae_list ), 2 ).astype( str ),
                          'MAPE CV': np.round( np.mean( mape_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( mape_list ), 2 ).astype( str ),
                          'RMSE CV': np.round( np.mean( rmse_list ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( rmse_list ), 2 ).astype( str ) }, index=[0] )


def mean_percentage_error( y, yhat ):
    return np.mean( ( y - yhat ) / y )
     
    
def mean_absolute_percentage_error( y, yhat ):
    return np.mean( np.abs( ( y - yhat ) / y ) )

def ml_error( model_name, y, yhat ):
    mae = mean_absolute_error( y, yhat )
    mape = mean_absolute_percentage_error( y, yhat )
    rmse = np.sqrt( mean_squared_error( y, yhat ) )
    
    return pd.DataFrame( { 'Model Name': model_name, 
                           'MAE': mae, 
                           'MAPE': mape,
                           'RMSE': rmse }, index=[0] )

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">2. Data exploration and problem comprehension</p></div>


- Main goal/problem
- Sub-goals
- What will the finished product be?

## 2.1 Primeiras explorações

### Stores

In [50]:
stores_raw.shape

(45, 3)

In [51]:
stores_raw.isna().sum()

loja        0
tipo        0
tamanho    11
dtype: int64

In [52]:
stores_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   loja     45 non-null     int64  
 1   tipo     45 non-null     object 
 2   tamanho  34 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.2+ KB


In [53]:
stores_raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loja,45.0,23.0,13.133926,1.0,12.0,23.0,34.0,45.0
tamanho,34.0,126525.5,60178.808984,34875.0,76331.75,126172.5,177610.25,219622.0


### Train


In [54]:
train_raw.shape


(135385, 6)

In [55]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135385 entries, 0 to 135384
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               135385 non-null  int64  
 1   loja             135385 non-null  int64  
 2   setor            135385 non-null  int64  
 3   data             135385 non-null  object 
 4   vendas_semanais  122523 non-null  float64
 5   feriado          135385 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 6.2+ MB


### Train_features

In [56]:
train_features_raw.shape

(2070, 13)

In [57]:
train_features_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2070 entries, 0 to 2069
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   loja                    2070 non-null   int64  
 1   data                    2070 non-null   object 
 2   temperatura             1336 non-null   float64
 3   combustivel             1262 non-null   float64
 4   desconto_1              90 non-null     float64
 5   desconto_2              89 non-null     float64
 6   desconto_3              84 non-null     float64
 7   desconto_4              75 non-null     float64
 8   desconto_5              90 non-null     float64
 9   desemprego              2070 non-null   float64
 10  feriado                 2070 non-null   object 
 11  distancia_competidores  90 non-null     float64
 12  clientes                2070 non-null   int64  
dtypes: float64(9), int64(2), object(2)
memory usage: 210.4+ KB


In [58]:
train_features_raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loja,2070.0,23.0,12.990311,1.0,12.0,23.0,34.0,45.0
temperatura,1336.0,16.047276,10.730922,-18.922222,8.711111,17.538889,23.668056,37.588889
combustivel,1262.0,3.596067,0.286606,2.891,3.452,3.623,3.8065,4.211
desconto_1,90.0,8365.174333,7969.138443,5.64,3224.8675,6119.695,10845.145,34348.14
desconto_2,89.0,4888.274494,7607.164623,2.63,247.29,1090.92,7331.95,44021.61
desconto_3,84.0,212.091667,232.489814,1.32,57.0275,141.97,277.01,1134.49
desconto_4,75.0,2446.9244,3081.04566,38.35,670.685,1292.58,3180.255,20834.37
desconto_5,90.0,5297.985556,3756.863013,578.02,3062.5575,4614.91,6522.0225,27754.23
desemprego,2070.0,0.081183,0.018841,0.0442,0.07193,0.07866,0.08549,0.14021
distancia_competidores,90.0,16933.288889,14847.975897,1576.0,6071.0,12023.5,21748.5,68224.0


### Test


In [59]:
test_raw.shape


(18068, 5)

In [60]:
test_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18068 entries, 0 to 18067
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       18068 non-null  int64 
 1   loja     18068 non-null  int64 
 2   setor    18068 non-null  int64 
 3   data     18068 non-null  object
 4   feriado  18068 non-null  object
dtypes: int64(3), object(2)
memory usage: 705.9+ KB


In [61]:
test_raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,18068.0,285097.5,5215.926667,276064.0,280580.75,285097.5,289614.25,294131.0
loja,18068.0,22.173899,12.788277,1.0,11.0,22.0,33.0,45.0
setor,18068.0,44.723268,30.643778,1.0,18.0,38.0,74.0,99.0


### Test_features


In [62]:
test_features_raw.shape


(270, 13)

In [63]:
test_features_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   loja                    270 non-null    int64  
 1   data                    270 non-null    object 
 2   temperatura             168 non-null    float64
 3   combustivel             174 non-null    float64
 4   desconto_1              264 non-null    float64
 5   desconto_2              204 non-null    float64
 6   desconto_3              258 non-null    float64
 7   desconto_4              227 non-null    float64
 8   desconto_5              270 non-null    float64
 9   desemprego              270 non-null    float64
 10  feriado                 270 non-null    object 
 11  distancia_competidores  264 non-null    float64
 12  clientes                270 non-null    int64  
dtypes: float64(9), int64(2), object(2)
memory usage: 27.5+ KB


### Cópia de segurança

In [64]:
stores = stores_raw.copy()
train = train_raw.copy()
train_features = train_features_raw.copy()
test = test_raw.copy()
test_features = test_features_raw.copy()

## 2.2 Fundindo os dataframes

### Train


In [65]:
df_train_merged = train.merge(stores, how='left', on='loja')
df_train_merged = df_train_merged.merge(train_features, how='left', on=['loja', 'data', 'feriado'])
df_train_merged.sample(20)

Unnamed: 0,id,loja,setor,data,vendas_semanais,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes
91303,231982,8,13,08-12,23823.16,nao,eletrodomestico,155078.0,30.027778,3.638,,,,,,0.06425,,217
73513,214192,9,25,06-24,,nao,eletronico,125833.0,,,,,,,,0.0638,,289
41597,182276,34,28,04-15,276.95,nao,eletrodomestico,158114.0,,3.724,,,,,,0.10581,,86
65870,206549,11,27,06-10,1557.66,nao,eletrodomestico,,,,,,,,,0.07574,,798
11405,152084,9,9,01-28,,nao,eletronico,125833.0,,3.01,,,,,,0.06416,,989
35301,175980,39,71,04-01,5780.26,nao,eletrodomestico,184109.0,19.205556,3.524,,,,,,0.083,,287
99498,240177,34,25,08-26,4736.71,nao,eletrodomestico,158114.0,25.15,3.523,,,,,,0.10641,,515
82582,223261,20,52,07-22,2164.57,nao,eletrodomestico,,,3.811,,,,,,0.07274,,308
10584,151263,20,26,01-28,8744.56,nao,eletrodomestico,,-4.883333,,,,,,,0.07343,,676
48494,189173,27,72,04-29,51932.39,nao,eletrodomestico,,16.505556,4.117,,,,,,0.07725,,291


### Test


In [66]:
df_test_merged = test.merge(stores, how='left', on='loja')
df_test_merged = df_test_merged.merge(test_features, how='left', on=['loja', 'data', 'feriado'])
df_test_merged.sample(20)

Unnamed: 0,id,loja,setor,data,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes
12776,288840,15,41,12-23,nao,eletronico,123737.0,1.627778,3.587,260.78,7.27,1120.42,80.13,622.48,0.07866,4899.0,769
4983,281047,15,87,12-02,nao,eletronico,123737.0,,,3751.34,191.82,6485.06,1803.17,16136.43,0.07866,3634.0,944
15977,292041,22,46,12-30,sim,eletronico,119557.0,,3.402,6840.65,55072.53,35.75,388.44,5826.4,0.07706,12855.0,824
15115,291179,6,87,12-30,sim,eletrodomestico,202505.0,8.222222,,6098.54,82881.16,326.68,814.58,2728.99,0.06551,14468.0,807
11011,287075,35,83,12-16,nao,eletronico,103681.0,,,5115.93,0.27,10748.31,601.89,14608.23,0.08745,624.0,469
5146,281210,37,5,12-02,nao,outro,,11.983333,,1.5,,6.03,,1180.32,0.07716,3823.0,718
15580,291644,34,46,12-30,sim,eletrodomestico,158114.0,-1.755556,3.119,3312.59,21909.93,210.22,146.1,3299.3,0.10148,8326.0,233
17197,293261,37,59,12-30,sim,outro,,9.033333,,373.92,1057.77,1.5,10.08,741.34,0.07716,5006.0,553
2665,278729,26,20,11-25,sim,eletrodomestico,152513.0,,,362.32,9.0,44999.94,,660.57,0.07598,5064.0,191
1931,277995,16,72,11-25,sim,eletronico,57197.0,-0.338889,,90.28,126.47,34754.69,24.64,1560.32,0.06232,3620.0,407


### Re-inspecionando o conjunto de treino

In [67]:
df_train_merged.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,135385.0,208371.0,39082.427434,140679.0,174525.0,208371.0,242217.0,276063.0
loja,135385.0,22.210681,12.783303,1.0,11.0,22.0,33.0,45.0
setor,135385.0,44.150327,30.451533,1.0,18.0,37.0,72.0,99.0
vendas_semanais,122523.0,62991.39223,198445.286559,-1321.48,2228.58,8236.32,23589.14,999967.424022
tamanho,102058.0,132619.11031,57153.086978,34875.0,103681.0,128107.0,196321.0,219622.0
temperatura,87440.0,15.733733,10.735085,-18.922222,8.244444,17.283333,23.388889,37.588889
combustivel,82396.0,3.597816,0.285991,2.891,3.459,3.623,3.807,4.211
desconto_1,5926.0,8883.068046,7933.224879,5.64,3648.4,6756.03,10941.05,34348.14
desconto_2,5875.0,5213.345593,7754.955048,2.63,274.85,1229.96,7898.33,44021.61
desconto_3,5639.0,221.416875,232.376139,1.32,61.08,143.88,292.44,1134.49


In [68]:
df_train_merged = df_train_merged[df_train_merged['vendas_semanais']>0]

### Obtendo o número de setores e o tamanho de cada loja

In [69]:

aux = df_train_merged[['loja', 'setor', 'tamanho']].groupby('loja').agg({'setor':'nunique', 'tamanho':'mean'}).reset_index()
aux.head()

Unnamed: 0,loja,setor,tamanho
0,1,75,151315.0
1,2,75,202307.0
2,3,69,
3,4,74,
4,5,69,34875.0


## 2.3 Fazendo o primeiro tratamento de dados no conjunto de treino


In [70]:
# print(aux[aux['setor']<= 60][['tamanho']].mean())
# print(aux[(aux['setor'] > 60) & (aux['setor'] <= 70)][['tamanho']].mean())
# print(aux[aux['setor']> 70][['tamanho']].mean())

In [71]:
# df_train_merged['tamanho'] = df_train_merged['tamanho'].apply(lambda x: 39778.0 if x <= 60 else 150046.423077 if x > 70 else 67256.666667)

In [72]:
# df_train_merged[['desconto_1', 'desconto_2', 'desconto_3', 'desconto_4', 'desconto_5']] = df_train_merged[['desconto_1', 'desconto_2', 'desconto_3', 'desconto_4', 'desconto_5']].fillna(0)
# df_train_merged['data'] = df_train_merged['data'] + '-2023'
# df_train_merged['data'] = pd.to_datetime(df_train_merged['data'], format= "%m-%d-%Y")
# df_train_merged['feriado'] = df_train_merged['feriado'].apply(lambda x: 1 if x == 'sim' else 0)
# df_train_merged['distancia_competidores'] = df_train_merged['distancia_competidores'].apply(lambda x: 400000.0 if math.isnan(x) else x)

In [73]:
# df_train_merged['descontos'] = df_train_merged[['desconto_1', 'desconto_2', 'desconto_3', 'desconto_4', 'desconto_5']].sum(axis=1)

In [74]:
# df_train_merged['dia'] = df_train_merged['data'].dt.day
# df_train_merged['mes'] = df_train_merged['data'].dt.month
# df_train_merged['semana_do_ano'] = df_train_merged['data'].dt.isocalendar().week
# df_train_merged['ano_semana'] = df_train_merged['data'].dt.strftime('%Y-%U')

In [75]:
df_train = tratamento_dados(df_train_merged)

In [76]:
df_train

Unnamed: 0,id,loja,setor,data,vendas_semanais,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes,dia,mes,semana_do_ano,dia_da_semana,descontos,tem_desconto,dia_da_semana_sin,dia_da_semana_cos,mes_sin,mes_cos,dia_sin,dia_cos,semana_do_ano_sin,semana_do_ano_cos
0,140679,17,93,2023-01-07,6283.00,0,1,150046.423077,-14.316667,2.891,0.00,0.00,0.00,0.00,0.00,0.06866,400000.0,541,7,1,1,5,0.00,0,-0.974928,-0.222521,0.5,0.866025,0.994522,0.104528,0.120537,0.992709
1,140680,12,17,2023-01-07,10006.77,0,1,150046.423077,3.133333,3.287,0.00,0.00,0.00,0.00,0.00,0.14021,400000.0,463,7,1,1,5,0.00,0,-0.974928,-0.222521,0.5,0.866025,0.994522,0.104528,0.120537,0.992709
2,140681,3,25,2023-01-07,2658.57,0,1,67256.666667,11.861111,,0.00,0.00,0.00,0.00,0.00,0.07551,400000.0,901,7,1,1,5,0.00,0,-0.974928,-0.222521,0.5,0.866025,0.994522,0.104528,0.120537,0.992709
3,140682,42,1,2023-01-07,8836.00,0,2,150046.423077,,,0.00,0.00,0.00,0.00,0.00,0.08744,400000.0,176,7,1,1,5,0.00,0,-0.974928,-0.222521,0.5,0.866025,0.994522,0.104528,0.120537,0.992709
4,140683,26,9,2023-01-07,9526.27,0,0,150046.423077,-6.038889,3.193,0.00,0.00,0.00,0.00,0.00,0.07907,400000.0,124,7,1,1,5,0.00,0,-0.974928,-0.222521,0.5,0.866025,0.994522,0.104528,0.120537,0.992709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135380,276059,44,46,2023-11-18,5458.65,0,2,150046.423077,4.805556,,861.43,0.00,0.00,0.00,940.29,0.06078,5783.0,935,18,11,46,5,1801.72,1,-0.974928,-0.222521,-0.5,0.866025,-0.587785,-0.809017,-0.663123,0.748511
135381,276060,9,81,2023-11-18,2550.65,0,1,150046.423077,14.888889,,3648.40,2.63,21.12,1639.62,3623.62,0.06054,8673.0,338,18,11,46,5,8935.39,1,-0.974928,-0.222521,-0.5,0.866025,-0.587785,-0.809017,-0.663123,0.748511
135382,276061,27,36,2023-11-18,3155.09,0,0,67256.666667,10.788889,,32812.24,5487.82,143.88,567.08,14703.26,0.07906,55921.0,190,18,11,46,5,53714.28,1,-0.974928,-0.222521,-0.5,0.866025,-0.587785,-0.809017,-0.663123,0.748511
135383,276062,7,22,2023-11-18,9091.59,0,1,150046.423077,,,3221.47,1229.96,7.35,494.15,3947.57,0.08513,7495.0,483,18,11,46,5,8900.50,1,-0.974928,-0.222521,-0.5,0.866025,-0.587785,-0.809017,-0.663123,0.748511


In [77]:
df_train.isna().sum()

id                            0
loja                          0
setor                         0
data                          0
vendas_semanais               0
feriado                       0
tipo                          0
tamanho                       0
temperatura               43194
combustivel               47830
desconto_1                    0
desconto_2                    0
desconto_3                    0
desconto_4                    0
desconto_5                    0
desemprego                    0
distancia_competidores        0
clientes                      0
dia                           0
mes                           0
semana_do_ano                 0
dia_da_semana                 0
descontos                     0
tem_desconto                  0
dia_da_semana_sin             0
dia_da_semana_cos             0
mes_sin                       0
mes_cos                       0
dia_sin                       0
dia_cos                       0
semana_do_ano_sin             0
semana_d

## 2.4 Fazendo o primeiro tratamento de dados no conjunto de teste

In [78]:
df_test = tratamento_dados(df_test_merged)

In [79]:
df_test

Unnamed: 0,id,loja,setor,data,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes,dia,mes,semana_do_ano,dia_da_semana,descontos,tem_desconto,dia_da_semana_sin,dia_da_semana_cos,mes_sin,mes_cos,dia_sin,dia_cos,semana_do_ano_sin,semana_do_ano_cos
0,276064,20,16,2023-11-25,1,0,67256.666667,7.988889,3.492,335.66,80.00,101378.79,64.46,2251.98,0.07082,3419.0,143,25,11,47,5,104110.89,1,-0.974928,-0.222521,-5.000000e-01,0.866025,-8.660254e-01,0.5,-5.680647e-01,0.822984
1,276065,39,14,2023-11-25,1,0,150046.423077,19.088889,3.236,224.08,292.94,77126.16,77.74,4875.43,0.07716,573.0,516,25,11,47,5,82596.35,1,-0.974928,-0.222521,-5.000000e-01,0.866025,-8.660254e-01,0.5,-5.680647e-01,0.822984
2,276066,40,5,2023-11-25,1,0,150046.423077,0.422222,3.536,247.58,387.88,40362.07,47.00,1788.31,0.04420,3707.0,227,25,11,47,5,42832.84,1,-0.974928,-0.222521,-5.000000e-01,0.866025,-8.660254e-01,0.5,-5.680647e-01,0.822984
3,276067,24,92,2023-11-25,1,0,67256.666667,5.461111,3.689,2571.98,66.94,64304.51,221.93,3661.62,0.08454,6482.0,533,25,11,47,5,70826.98,1,-0.974928,-0.222521,-5.000000e-01,0.866025,-8.660254e-01,0.5,-5.680647e-01,0.822984
4,276068,22,20,2023-11-25,1,1,150046.423077,7.933333,,1649.31,0.00,70087.94,7.50,5266.90,0.07706,3032.0,551,25,11,47,5,77011.65,1,-0.974928,-0.222521,-5.000000e-01,0.866025,-8.660254e-01,0.5,-5.680647e-01,0.822984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18063,294127,11,94,2023-12-30,1,0,67256.666667,,,4241.32,58046.41,239.33,78.09,586.72,0.07197,12896.0,302,30,12,52,5,63191.87,1,-0.974928,-0.222521,-2.449294e-16,1.000000,-2.449294e-16,1.0,-2.449294e-16,1.000000
18064,294128,15,18,2023-12-30,1,1,150046.423077,-0.311111,3.566,3248.40,31122.20,111.35,605.88,3474.84,0.07866,8022.0,363,30,12,52,5,38562.67,1,-0.974928,-0.222521,-2.449294e-16,1.000000,-2.449294e-16,1.0,-2.449294e-16,1.000000
18065,294129,11,30,2023-12-30,1,0,67256.666667,,,4241.32,58046.41,239.33,78.09,586.72,0.07197,12896.0,302,30,12,52,5,63191.87,1,-0.974928,-0.222521,-2.449294e-16,1.000000,-2.449294e-16,1.0,-2.449294e-16,1.000000
18066,294130,37,46,2023-12-30,1,2,67256.666667,9.033333,,373.92,1057.77,1.50,10.08,741.34,0.07716,5006.0,553,30,12,52,5,2184.61,1,-0.974928,-0.222521,-2.449294e-16,1.000000,-2.449294e-16,1.0,-2.449294e-16,1.000000


## 2.5 Removendo os NAs sobressalentes

In [80]:
df_train.isna().sum()

id                            0
loja                          0
setor                         0
data                          0
vendas_semanais               0
feriado                       0
tipo                          0
tamanho                       0
temperatura               43194
combustivel               47830
desconto_1                    0
desconto_2                    0
desconto_3                    0
desconto_4                    0
desconto_5                    0
desemprego                    0
distancia_competidores        0
clientes                      0
dia                           0
mes                           0
semana_do_ano                 0
dia_da_semana                 0
descontos                     0
tem_desconto                  0
dia_da_semana_sin             0
dia_da_semana_cos             0
mes_sin                       0
mes_cos                       0
dia_sin                       0
dia_cos                       0
semana_do_ano_sin             0
semana_d

In [81]:
# df_train_clean = df_train.dropna(axis=0)
# df_train_clean

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">3. Hypothesis Mental Map Creation</p></div>


- Mental map for hypothesis and questions
- Hypothesis and questions list

In [82]:
# H1: As vendas estão correlacionadas positivamente com o tamanho da loja.

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">4. Feature Engineering</p></div>


- Fillout remaining NAs 
- Derive new variables as needed

In [83]:
df_train_fillNA = df_train.sort_values(by=['loja', 'setor', 'semana_do_ano'], ascending=True)

In [84]:
df_train_fillNA

Unnamed: 0,id,loja,setor,data,vendas_semanais,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes,dia,mes,semana_do_ano,dia_da_semana,descontos,tem_desconto,dia_da_semana_sin,dia_da_semana_cos,mes_sin,mes_cos,dia_sin,dia_cos,semana_do_ano_sin,semana_do_ano_cos
2903,143582,1,1,2023-01-07,15984.24,0,0,150046.423077,9.038889,2.976,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,160,7,1,1,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,0.994522,0.104528,0.120537,0.992709
4292,144971,1,1,2023-01-14,17359.70,0,0,150046.423077,,2.983,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,262,14,1,2,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,0.207912,-0.978148,0.239316,0.970942
6223,146902,1,1,2023-01-21,17341.47,0,0,150046.423077,6.688889,3.016,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,535,21,1,3,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,-0.951057,-0.309017,0.354605,0.935016
11629,152308,1,1,2023-01-28,18461.18,0,0,150046.423077,6.572222,3.010,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,278,28,1,4,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,-0.406737,0.913545,0.464723,0.885456
14328,155007,1,1,2023-02-04,21665.76,0,0,150046.423077,5.705556,,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,977,4,2,5,5,0.00,0,-0.974928,-0.222521,0.866025,0.500000,0.743145,0.669131,0.568065,0.822984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118627,259306,45,98,2023-10-14,996.40,0,1,67256.666667,17.677778,3.541,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,966,14,10,41,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,0.207912,-0.978148,-0.970942,0.239316
120712,261391,45,98,2023-10-21,1058.84,0,1,67256.666667,15.333333,3.570,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,588,21,10,42,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,-0.951057,-0.309017,-0.935016,0.354605
126480,267159,45,98,2023-10-28,1167.90,0,1,67256.666667,10.988889,,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,427,28,10,43,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,-0.406737,0.913545,-0.885456,0.464723
126753,267432,45,98,2023-11-04,749.18,0,1,67256.666667,6.622222,3.551,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,387,4,11,44,5,0.00,0,-0.974928,-0.222521,-0.500000,0.866025,0.743145,0.669131,-0.822984,0.568065


## Preenchendo os valores faltantes

In [85]:
df_train_fillNA.set_index('data', inplace=True)
df_train_fillNA['combustivel'] = df_train_fillNA['combustivel'].interpolate(method='time')
df_train_fillNA['temperatura'] = df_train_fillNA['temperatura'].interpolate(method='time')

In [86]:
print(df_train_fillNA['combustivel'].isna().sum())
print(df_train_fillNA['temperatura'].isna().sum())

0
0


In [87]:
df_train_fillNA = df_train_fillNA.reset_index()

In [88]:
df_train_fillNA[(df_train_fillNA['loja']==2)&(df_train_fillNA['setor']==1)&(df_train_fillNA['semana_do_ano'].isin([1,2,3,4,5,6,7,8,9]))]

Unnamed: 0,data,id,loja,setor,vendas_semanais,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes,dia,mes,semana_do_ano,dia_da_semana,descontos,tem_desconto,dia_da_semana_sin,dia_da_semana_cos,mes_sin,mes_cos,dia_sin,dia_cos,semana_do_ano_sin,semana_do_ano_cos
2972,2023-01-07,141982,2,1,19092.94,0,0,150046.423077,7.05,2.976,0.0,0.0,0.0,0.0,0.0,0.08028,400000.0,910,7,1,1,5,0.0,0,-0.974928,-0.222521,0.5,0.8660254,0.994522,0.104528,0.120537,0.992709
2973,2023-01-14,146480,2,1,22159.74,0,0,150046.423077,0.566667,2.983,0.0,0.0,0.0,0.0,0.0,0.08028,400000.0,557,14,1,2,5,0.0,0,-0.974928,-0.222521,0.5,0.8660254,0.207912,-0.978148,0.239316,0.970942
2974,2023-01-28,151233,2,1,25461.57,0,0,150046.423077,13.677778,3.01,0.0,0.0,0.0,0.0,0.0,0.08028,400000.0,120,28,1,4,5,0.0,0,-0.974928,-0.222521,0.5,0.8660254,-0.406737,0.913545,0.464723,0.885456
2975,2023-02-11,156115,2,1,802070.519455,1,0,150046.423077,0.661111,3.022,0.0,0.0,0.0,0.0,0.0,0.08028,400000.0,309,11,2,6,5,0.0,0,-0.974928,-0.222521,0.866025,0.5,0.743145,-0.669131,0.663123,0.748511
2976,2023-02-25,162661,2,1,22631.87,0,0,150046.423077,16.0,3.398,0.0,0.0,0.0,0.0,0.0,0.08028,400000.0,242,25,2,8,5,0.0,0,-0.974928,-0.222521,0.866025,0.5,-0.866025,0.5,0.822984,0.568065
2977,2023-03-04,164414,2,1,25562.33,0,0,150046.423077,14.316667,3.288,0.0,0.0,0.0,0.0,0.0,0.08028,400000.0,7,4,3,9,5,0.0,0,-0.974928,-0.222521,1.0,6.123234000000001e-17,0.743145,0.669131,0.885456,0.464723


In [89]:
df_train_fillNA

Unnamed: 0,data,id,loja,setor,vendas_semanais,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes,dia,mes,semana_do_ano,dia_da_semana,descontos,tem_desconto,dia_da_semana_sin,dia_da_semana_cos,mes_sin,mes_cos,dia_sin,dia_cos,semana_do_ano_sin,semana_do_ano_cos
0,2023-01-07,143582,1,1,15984.24,0,0,150046.423077,9.038889,2.976,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,160,7,1,1,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,0.994522,0.104528,0.120537,0.992709
1,2023-01-14,144971,1,1,17359.70,0,0,150046.423077,1.188889,2.983,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,262,14,1,2,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,0.207912,-0.978148,0.239316,0.970942
2,2023-01-21,146902,1,1,17341.47,0,0,150046.423077,6.688889,3.016,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,535,21,1,3,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,-0.951057,-0.309017,0.354605,0.935016
3,2023-01-28,152308,1,1,18461.18,0,0,150046.423077,6.572222,3.010,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,278,28,1,4,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,-0.406737,0.913545,0.464723,0.885456
4,2023-02-04,155007,1,1,21665.76,0,0,150046.423077,5.705556,3.348,0.00,0.00,0.00,0.00,0.0,0.07742,400000.0,977,4,2,5,5,0.00,0,-0.974928,-0.222521,0.866025,0.500000,0.743145,0.669131,0.568065,0.822984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122137,2023-10-14,259306,45,98,996.40,0,1,67256.666667,17.677778,3.541,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,966,14,10,41,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,0.207912,-0.978148,-0.970942,0.239316
122138,2023-10-21,261391,45,98,1058.84,0,1,67256.666667,15.333333,3.570,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,588,21,10,42,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,-0.951057,-0.309017,-0.935016,0.354605
122139,2023-10-28,267159,45,98,1167.90,0,1,67256.666667,10.988889,3.604,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,427,28,10,43,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,-0.406737,0.913545,-0.885456,0.464723
122140,2023-11-04,267432,45,98,749.18,0,1,67256.666667,6.622222,3.551,0.00,0.00,0.00,0.00,0.0,0.08523,400000.0,387,4,11,44,5,0.00,0,-0.974928,-0.222521,-0.500000,0.866025,0.743145,0.669131,-0.822984,0.568065


# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">5. Data selection and filtering</p></div>



- Filter data rows
- Filter data columns
- Based on the questions and hypothesis, select columns
- Create a new filtered dataframe
- Create the widgets to filter the data

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">6. Exploratory Data Analysis (EDA)</p></div>


- Answer the hypothesis list
- Build data visualization solutions and plots

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">7. Data Preparation</p></div>


- Normalize, re-scale and transform (enconding) variables to suit model requirements
- It may be a good idea to normalize all of the features so they are comparable in magnitude

## 7.1 Encodings

In [90]:
# Transformação da variável-alvo
df_train_fillNA['vendas_semanais'] = np.log1p( df_train_fillNA['vendas_semanais'] )
df_train_fillNA = realizar_scalings(df_train_fillNA)

In [91]:
df_train_fillNA

Unnamed: 0,data,id,loja,setor,vendas_semanais,feriado,tipo,tamanho,temperatura,combustivel,desconto_1,desconto_2,desconto_3,desconto_4,desconto_5,desemprego,distancia_competidores,clientes,dia,mes,semana_do_ano,dia_da_semana,descontos,tem_desconto,dia_da_semana_sin,dia_da_semana_cos,mes_sin,mes_cos,dia_sin,dia_cos,semana_do_ano_sin,semana_do_ano_cos
0,2023-01-07,143582,1,1,9.679421,0,0.0,1.0,-0.600852,-1.834734,0.00,0.00,0.00,0.00,0.0,0.07742,0.0,-0.712575,7,1,1,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,0.994522,0.104528,0.120537,0.992709
1,2023-01-14,144971,1,1,9.761964,0,0.0,1.0,-1.160511,-1.815126,0.00,0.00,0.00,0.00,0.0,0.07742,0.0,-0.508982,14,1,2,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,0.207912,-0.978148,0.239316,0.970942
2,2023-01-21,146902,1,1,9.760914,0,0.0,1.0,-0.768393,-1.722689,0.00,0.00,0.00,0.00,0.0,0.07742,0.0,0.035928,21,1,3,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,-0.951057,-0.309017,0.354605,0.935016
3,2023-01-28,152308,1,1,9.823480,0,0.0,1.0,-0.776711,-1.739496,0.00,0.00,0.00,0.00,0.0,0.07742,0.0,-0.477046,28,1,4,5,0.00,0,-0.974928,-0.222521,0.500000,0.866025,-0.406737,0.913545,0.464723,0.885456
4,2023-02-04,155007,1,1,9.983535,0,0.0,1.0,-0.838499,-0.792717,0.00,0.00,0.00,0.00,0.0,0.07742,0.0,0.918164,4,2,5,5,0.00,0,-0.974928,-0.222521,0.866025,0.500000,0.743145,0.669131,0.568065,0.822984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122137,2023-10-14,259306,45,98,6.905152,0,0.5,0.0,0.015051,-0.252101,0.00,0.00,0.00,0.00,0.0,0.08523,0.0,0.896208,14,10,41,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,0.207912,-0.978148,-0.970942,0.239316
122138,2023-10-21,261391,45,98,6.965873,0,0.5,0.0,-0.152094,-0.170868,0.00,0.00,0.00,0.00,0.0,0.08523,0.0,0.141717,21,10,42,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,-0.951057,-0.309017,-0.935016,0.354605
122139,2023-10-28,267159,45,98,7.063818,0,0.5,0.0,-0.461828,-0.075630,0.00,0.00,0.00,0.00,0.0,0.08523,0.0,-0.179641,28,10,43,5,0.00,0,-0.974928,-0.222521,-0.866025,0.500000,-0.406737,0.913545,-0.885456,0.464723
122140,2023-11-04,267432,45,98,6.620313,0,0.5,0.0,-0.773146,-0.224090,0.00,0.00,0.00,0.00,0.0,0.08523,0.0,-0.259481,4,11,44,5,0.00,0,-0.974928,-0.222521,-0.500000,0.866025,0.743145,0.669131,-0.822984,0.568065


# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">8. Feature Selection through Boruta algorithm</p></div>


- Use Boruta algorithm to select best features to machine learning models

In [92]:
df_train.columns

Index(['id', 'loja', 'setor', 'data', 'vendas_semanais', 'feriado', 'tipo',
       'tamanho', 'temperatura', 'combustivel', 'desconto_1', 'desconto_2',
       'desconto_3', 'desconto_4', 'desconto_5', 'desemprego',
       'distancia_competidores', 'clientes', 'dia', 'mes', 'semana_do_ano',
       'dia_da_semana', 'descontos', 'tem_desconto', 'dia_da_semana_sin',
       'dia_da_semana_cos', 'mes_sin', 'mes_cos', 'dia_sin', 'dia_cos',
       'semana_do_ano_sin', 'semana_do_ano_cos'],
      dtype='object')

In [95]:
selected_columns = ['id', 'loja', 'setor', 'feriado', 'tipo',
       'tamanho', 'temperatura', 'combustivel', 'desconto_1', 'desconto_2',
       'desconto_3', 'desconto_4', 'desconto_5', 'desemprego',
       'distancia_competidores', 'clientes', 'dia', 'mes', 'semana_do_ano',
       'dia_da_semana', 'descontos', 'tem_desconto', 'dia_da_semana_sin',
       'dia_da_semana_cos', 'mes_sin', 'mes_cos', 'dia_sin', 'dia_cos',
       'semana_do_ano_sin', 'semana_do_ano_cos'] # 'data', 'vendas_semanais', |  

selected_columns_with_data = selected_columns.copy()
selected_columns_with_data.extend([ 'data', 'vendas_semanais' ])

In [96]:
X_train = df_train_fillNA[selected_columns]
y_train = df_train_fillNA['vendas_semanais']

X_training = df_train_fillNA[selected_columns_with_data]

# X_val = validation[selected_columns]
# y_val = validation['vendas_semanais']

X_test = df_test[selected_columns]

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">9. Model Implementation</p></div>


- Implement different machine learning models and algorithms
- Conduct cross-velidation computing
- Conduct single performance metrics computing

## 9.0 Cross-validation

## 9.1 XGBoostRegressor

In [97]:
# model
model_xgb = xgb.XGBRegressor( objective='reg:squarederror',
                              n_estimators=100, 
                              eta=0.01, 
                              max_depth=10, 
                              subsample=0.7,
                              colsample_bytree=0.9 ).fit( X_train, y_train )

# # prediction
# yhat_val_xgb = model_xgb.predict( X_val )

# # performance
# xgb_result = ml_error( 'XGBoost Regressor', y=y_val, yhat=yhat_val_xgb)
# xgb_result

In [98]:
# yhat_val_xgb

In [99]:
xgb_result_cv = cross_validation(x_training=X_training, kfold=5, model_name='XGBoost', model=model_xgb, verbose=False )
xgb_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBoost,55792.24 +/- 2616.77,15.75 +/- 5.64,205190.26 +/- 5558.2


In [100]:
print('55788.04 +/- 2621.57')

55788.04 +/- 2621.57


## 9.3 Retreinando os modelos com todos os dados de treino

In [None]:
X_train = df_train[selected_columns]
y_train = df_train['vendas_semanais']

X_test = df_test[selected_columns]

### XGBoost

In [None]:
# model
model_xgb = xgb.XGBRegressor( objective='reg:squarederror',
                              n_estimators=100, 
                              eta=0.01, 
                              max_depth=10, 
                              subsample=0.7,
                              colsample_bytree=0.9 ).fit( X_train, y_train )

# prediction
yhat_xgb = model_xgb.predict( X_test )

resultados = pd.DataFrame({'id': X_test['id'], 'vendas_semanais':yhat_xgb})

In [None]:
resultados

Unnamed: 0,id,vendas_semanais
0,276064,77207.703125
1,276065,60119.332031
2,276066,65750.421875
3,276067,125834.179688
4,276068,78532.484375
...,...,...
18063,294127,83662.359375
18064,294128,66211.898438
18065,294129,65177.359375
18066,294130,74148.414062


In [None]:
resultados.to_csv('./resultados/submission.csv', index=False)

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">10. Hyperparameter Fine-Tuning</p></div>


- Implement hyperparameter search (i.e. Bayes Search) to find best model hyperparameter values
- Re-train model using best values

In [None]:
# param_tuned = {
#     'n_estimators': 3000,
#     'eta': 0.03,
#     'max_depth': 5,
#     'subsample': 0.7,
#     'colsample_bytree': 0.7,
#     'min_child_weight': 3 
#         }

In [None]:
# # model
# model_xgb_tuned = xgb.XGBRegressor( objective='reg:squarederror',
#                                     n_estimators=param_tuned['n_estimators'], 
#                                     eta=param_tuned['eta'], 
#                                     max_depth=param_tuned['max_depth'], 
#                                     subsample=param_tuned['subsample'],
#                                     colsample_bytree=param_tuned['colsample_bytree'],
#                                     min_child_weight=param_tuned['min_child_weight'] ).fit( X_train, y_train )

# # prediction
# yhat_xgb_tuned = model_xgb_tuned.predict( X_test )

In [None]:
# resultados_tuned = pd.DataFrame({'id': X_test['id'], 'vendas_semanais':yhat_xgb_tuned})

In [None]:
# resultados_tuned.to_csv('./resultados/submission.csv', index=False)

# <div style="color:white;display:fill;border-radius:15px;background-color:#123752;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:110%">11. Model Error Estimation and Interpretation</p></div>


- Use model errors to interpret the goals 
- Model learning performance
- Model generalization performance
- What it means to business?