# 0.0. IMPORTS

In [None]:
import json
import math
# import pylab 
import random
import pickle
import requests
import datetime
import warnings
warnings.filterwarnings( 'ignore')
import inflection
import numpy as np
import pandas as pd 
import seaborn as sns
import xgboost as xgb


                

from scipy                 import stats  as ss
from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression
from sklearn.linear_model  import Lasso
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, RobustScaler



from boruta                        import BorutaPy
from matplotlib                    import pyplot as plt
from matplotlib                    import gridspec
from IPython.display               import Image
from IPython.core.display          import HTML
from IPython.core.interactiveshell import InteractiveShell

# IPython magics: `%pylab inline` star-imports numpy/matplotlib names into the notebook
# namespace; `%matplotlib inline` renders figures inside the notebook.
# NOTE(review): `%pylab` can shadow names already in scope; later cells use a bare
# `GridSpec` and re-import `random`, presumably because of it — confirm before removing.
%pylab inline
%matplotlib inline


# Default matplotlib style, figure size and font size.
plt.style.use( 'bmh' )
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 24

# Stretch the notebook container to the full browser width.
display( HTML( '<style>.container { width:100% !important; }</style>') )
# Show all DataFrame columns/rows and do not wrap wide frames.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option( 'display.expand_frame_repr', False )

# Apply seaborn's default theme.
# NOTE(review): this runs after the rcParams above and may override some of them — confirm.
sns.set();


## 0.1 Helper Functions

In [None]:
def cramer_v(x,y):
    """Return the bias-corrected Cramér's V between two categorical series.

    Parameters
    ----------
    x, y : array-like of category labels with the same length.

    Returns
    -------
    float in [0, 1]; NaN when the statistic is undefined (fewer than two
    observations, or one of the variables has a single category).
    """
    cm = pd.crosstab( x, y ).to_numpy()
    n = cm.sum()
    r, k = cm.shape

    # The corrected dimensions divide by (n - 1).
    if n < 2:
        return np.nan

    chi2 = ss.chi2_contingency( cm )[0]

    # The bias correction applies to phi^2 = chi2 / n, not to chi2 itself;
    # subtracting it from chi2 under-corrects and can yield V > 1.
    phi2 = chi2 / n
    phi2corr = max( 0, phi2 - (k - 1) * (r - 1) / (n - 1) )

    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)

    denominator = min( kcorr - 1, rcorr - 1 )
    if denominator <= 0:
        # A constant variable carries no association information.
        return np.nan

    return np.sqrt( phi2corr / denominator )

def jupyter_settings():
    """Configure inline plotting, matplotlib defaults and pandas/notebook display options.

    Takes no arguments and returns nothing; it only changes global notebook state.
    It uses IPython line magics and `display`, so it must run inside IPython/Jupyter.
    """
    # NOTE(review): `%pylab inline` star-imports numpy/matplotlib names and can shadow
    # names already in scope — confirm nothing relies on the shadowed originals.
    %matplotlib inline
    %pylab inline
    
    # Default matplotlib style, figure size and font size.
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    # Full-width notebook container; show every column/row without wrapping.
    display( HTML( '<style>.container { width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False)
    
    # NOTE(review): sns.set() runs last and may override some rcParams set above — confirm.
    sns.set()
    
def mean_absolute_percentage_error( y, yhat):
    """Return the mean absolute percentage error of `yhat` against `y`.

    Both inputs are converted to flat NumPy arrays and compared by position, so
    pandas objects with different indexes are not label-aligned into NaNs.

    Parameters
    ----------
    y : array-like of true values (must not contain zeros).
    yhat : array-like of predictions, same length as `y`.

    Returns
    -------
    float, the mean of |(y - yhat) / y|.
    """
    y = np.ravel( np.asarray( y, dtype=float ) )
    yhat = np.ravel( np.asarray( yhat, dtype=float ) )
    # Absolute value of the whole ratio keeps the error non-negative for negative targets.
    return np.mean( np.abs( (y - yhat) / y ) )


def ml_error( model_name, y, yhat ):
    """Summarise the prediction error of one model as a single-row DataFrame.

    Parameters
    ----------
    model_name : label stored in the 'Model Name' column.
    y : true values.
    yhat : predicted values.

    Returns
    -------
    pandas.DataFrame with columns 'Model Name', 'MAE', 'MAPE', 'RMSE' and index [0].
    """
    metrics = {
        'Model Name': model_name,
        'MAE': mean_absolute_error( y, yhat ),
        'MAPE': mean_absolute_percentage_error( y, yhat ),
        # RMSE is the square root of sklearn's mean squared error.
        'RMSE': np.sqrt( mean_squared_error( y, yhat ) ),
    }
    return pd.DataFrame( metrics, index=[0] )


def cross_validation(x_training, kfold, model_name, model, verbose=False ):
    """Evaluate `model` with an expanding-window time-series cross validation.

    Each fold validates on a six-week window counted back from the latest date in
    `x_training` and trains on every row strictly before that window. Targets and
    predictions are passed through `np.expm1` before the errors are computed.

    Parameters
    ----------
    x_training : DataFrame containing the features plus 'date' and 'sales' columns.
    kfold : number of six-week validation windows.
    model_name : label stored in the 'Model Name' column of the result.
    model : estimator exposing `fit` and `predict`; it is re-fitted in place on every fold.
    verbose : when True, print the fold number before each iteration.

    Returns
    -------
    One-row DataFrame with 'Model Name', 'MAE CV', 'MAPE CV', 'RMSE CV', each metric
    formatted as 'mean +/- std' over the folds.
    """
    # NOTE(review): both window bounds are inclusive, so consecutive folds share their
    # boundary date — confirm this overlap is acceptable.
    dates = x_training['date']
    non_features = ['date', 'sales']
    fold_scores = { 'MAE': [], 'MAPE': [], 'RMSE': [] }

    # Walk from the oldest window (largest k) to the most recent one (k == 1).
    for k in range( kfold, 0, -1 ):
        if verbose:
            print ( '\nKFold Number: {}'.format( k ))

        window_start = dates.max() - datetime.timedelta( days=k*6*7 )
        window_end = dates.max() - datetime.timedelta( days=(k-1)*6*7 )

        train_rows = x_training[ dates < window_start ]
        valid_rows = x_training[ (dates >= window_start) & (dates <= window_end) ]

        fitted = model.fit( train_rows.drop( non_features, axis=1 ), train_rows['sales'] )
        predicted = fitted.predict( valid_rows.drop( non_features, axis=1 ) )

        fold_result = ml_error( model_name, np.expm1( valid_rows['sales'] ), np.expm1( predicted ) )

        for metric, collected in fold_scores.items():
            collected.append( fold_result[metric] )

    def _mean_std( values ):
        # Format the fold scores as 'mean +/- std', both rounded to two decimals.
        return np.round( np.mean( values ), 2 ).astype( str ) + ' +/- ' + np.round( np.std( values ), 2 ).astype( str )

    return pd.DataFrame( {'Model Name': model_name,
                          'MAE CV': _mean_std( fold_scores['MAE'] ),
                          'MAPE CV': _mean_std( fold_scores['MAPE'] ),
                          'RMSE CV': _mean_std( fold_scores['RMSE'] )}, index=[0] )
 

In [None]:
# Apply the notebook-wide plotting and display configuration.
jupyter_settings()

## 0.2 Loading data

In [None]:
# Sales records and store attributes, read from paths relative to the notebook.
df_sales_raw = pd.read_csv( 'data/train.csv', low_memory=False )
df_store_raw = pd.read_csv( 'data/store.csv', low_memory=False )

# Left join on 'Store': every sales row is kept and gains the store attributes.
df_raw = df_sales_raw.merge( df_store_raw, how='left', on='Store' )

In [None]:
# Show one random row of the merged data as a sanity check.
df_raw.sample()

# 1.0. PASSO 01 - DESCRICAO DOS DADOS

In [None]:
# Work on a copy so this section can be re-run without reloading the raw data.
df1 = df_raw.copy()

## 1.1. Rename Columns

In [None]:
# Original CamelCase column names, in the order they appear in df1.
cols_old = [
    'Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
]

# CamelCase -> snake_case converter.
snakecase = lambda x: inflection.underscore( x )

cols_new = [snakecase( name ) for name in cols_old]

# Columns are replaced by position, so cols_old must match df1's column order.
df1.columns = cols_new



## 1.2. Data Dimensions

In [None]:
# Report the size of the dataset.
n_rows, n_cols = df1.shape
print( 'Number of Rows: {}'.format( n_rows ) )
print( 'Number of Cols: {}'.format( n_cols ) )

## 1.3. Data Types

In [None]:
# Parse the date column into datetimes so the `.dt` accessors used later are available.
df1['date'] = pd.to_datetime(df1['date'])

# Inspect the resulting dtypes.
df1.dtypes

## 1.4. Check NA

In [None]:
# Number of missing values per column.
df1.isna().sum()

## 1.5. Fillout NA

In [None]:
# Largest observed competition distance, for reference when choosing the fill value.
df1['competition_distance'].max()

In [None]:
# competition_distance: missing values are replaced by a large constant.
# NOTE(review): 200000.0 is presumably meant to exceed the observed maximum — confirm.
df1['competition_distance'] = df1['competition_distance'].fillna( 200000.0 )

# competition_open_since_month / year: fall back to the month / year of the sale date.
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna( df1['date'].dt.month )
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna( df1['date'].dt.year )

# promo2_since_week / year: fall back to the week / year of the sale date.
# `Timestamp.week` is used per element instead of the `Series.dt` week accessor.
df1['promo2_since_week'] = df1['promo2_since_week'].fillna( df1['date'].map( lambda d: d.week ) )
df1['promo2_since_year'] = df1['promo2_since_year'].fillna( df1['date'].dt.year )

# promo_interval: English three-letter month abbreviations ('Feb', not 'Fev').
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}

# Assign back instead of a chained in-place fillna, which newer pandas does not apply to df1.
df1['promo_interval'] = df1['promo_interval'].fillna( 0 )

df1['month_map'] = df1['date'].dt.month.map( month_map )


def _in_promo_month( promo_interval, month_abbr ):
    """Return 1 when `month_abbr` is one of the months listed in `promo_interval`, else 0."""
    if promo_interval == 0:
        return 0
    # Compare on the first three letters so a longer spelling such as 'Sept' still matches 'Sep'.
    listed = { name.strip()[:3] for name in promo_interval.split( ',' ) }
    return 1 if month_abbr in listed else 0


df1['is_promo'] = [ _in_promo_month( interval, month )
                    for interval, month in zip( df1['promo_interval'], df1['month_map'] ) ]


In [None]:
# Column dtypes after filling the missing values.
df1.dtypes

In [None]:
# Five random rows, transposed so each column reads as a line.
df1.sample(5).T

In [None]:
# Re-count missing values per column after the fill step.
df1.isna().sum()

## 1.6. Change Types

In [None]:
# These columns hold whole numbers stored as floats; cast them to int64.
for column in ( 'competition_open_since_month', 'competition_open_since_year',
                'promo2_since_week', 'promo2_since_year' ):
    df1[column] = df1[column].astype( 'int64' )

In [None]:
# Confirm the dtypes after the casts.
df1.dtypes

## 1.7. Descriptive Statistics

In [None]:
# Numeric columns on one side; everything that is neither numeric nor datetime on the other.
numeric_dtypes = ['int64', 'float64']
num_attributes = df1.select_dtypes( include=numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

### 1.7.1 Numerical Attributes

In [None]:
# Central tendency - mean, median
ct1 = pd.DataFrame( num_attributes.apply( np.mean ) ).T
ct2 = pd.DataFrame( num_attributes.apply( np.median ) ).T

# Dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply( np.std ) ).T
d2 = pd.DataFrame( num_attributes.apply( lambda x: x.min() ) ).T
d3 = pd.DataFrame( num_attributes.apply( lambda x: x.max() ) ).T
# range = max - min (the previous expression subtracted max from itself).
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() ) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew() ) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T

# One row per attribute, one column per statistic (same order as the list below).
m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m.columns = ( ['attributes','min','max','range','mean','median','std','skew','kurtosis'] )
m

In [None]:
# Distribution of competition distance.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the pinned version.
sns.distplot( df1['competition_distance'] );

### 1.7.2. Categorical Attributes

In [None]:
# Number of distinct values in each categorical column.
cat_attributes.apply( lambda x: x.unique().shape[0] )

In [None]:
# Keep only holiday rows with positive sales.
is_holiday = df1['state_holiday'] != '0'
sold_something = df1['sales'] > 0
aux1 = df1[is_holiday & sold_something]

# One sales boxplot per categorical attribute, side by side.
for position, attribute in enumerate( ['state_holiday', 'store_type', 'assortment'], start=1 ):
    plt.subplot( 1, 3, position )
    sns.boxplot( x=attribute, y='sales', data=aux1 )

# 2.0. PASSO 02 - FEATURE ENGINEERING

In [None]:
# Work on a copy so this section can be re-run without redoing the previous one.
df2 = df1.copy()

## 2.1. Mapa Mental de Hipótese

In [None]:
# Display the hypothesis mind map (image path is relative to the notebook).
Image('img/MindMapHypothesis.png')

## 2.2. Criação das Hipóteses

### 2.2.1. Hipótese Loja

**1.** Lojas com numero maior de funcionários deveriam vender mais.

**2.** Lojas com maior capacidade de estoque deveriam vender mais.

**3.** Lojas com maior porte deveriam vender mais.

**4.** Lojas com menor porte deveriam vender menos.

**5.** Lojas com maior sortimento deveriam vender mais.

**6.** Lojas com competidores mais próximos deveriam vender menos.

**7.** Lojas com competidores há mais tempo deveriam vender mais.

### 2.2.2. Hipóteses Produto

**1.** Lojas que investem mais em Marketing deveriam vender mais.

**2.** Lojas com maior exposição de produtos deveriam vender mais.

**3.** Lojas com produtos com preço menor deveriam vender mais.

**4.** Lojas com promoções mais agressivas (descontos maiores), deveriam vender mais.

**5.** Lojas com promoções ativas por mais tempo deveriam vender mais.

**6.** Lojas com mais dias de promoção deveriam vender mais.

**7.** Lojas com mais promoções consecutivas deveriam vender mais.

### 2.2.3. Hipóteses Tempo

**1.** Lojas abertas durante o feriado de Natal deveriam vender mais.

**2** Lojas deveriam vender mais ao longo dos anos.

**3** Lojas deveriam vender mais no segundo semestre do ano.

**4** Lojas deveriam vender mais depois do dia 10 de cada mês.

**5** Lojas deveriam vender menos aos finais de semana.

**6** Lojas deveriam vender menos durante os feriados escolares.

## 2.3. Lista Final de Hipótese

**1.** Lojas com maior sortimento deveriam vender mais.

**2.** Lojas com competidores mais próximos deveriam vender menos.

**3.** Lojas com competidores há mais tempo deveriam vender mais.

**4.** Lojas com promoções ativas por mais tempo deveriam vender mais.

**5.** Lojas com mais dias de promoção deveriam vender mais.

**7.** Lojas com mais promoções consecutivas deveriam vender mais.

**8** Lojas abertas durante o feriado de Natal deveriam vender mais.

**9** Lojas deveriam vender mais ao longo dos anos.

**10** Lojas deveriam vender mais no segundo semestre do ano.

**11** Lojas deveriam vender mais depois do dia 10 de cada mês.

**12** Lojas deveriam vender menos aos finais de semana.

**13** Lojas deveriam vender menos durante os feriados escolares.

## 2.4. Feature Engineering

In [None]:
# year
df2['year'] = df2['date'].dt.year

# month
df2['month'] = df2['date'].dt.month

# day
df2['day'] = df2['date'].dt.day

# week of year: `Series.dt.weekofyear` was removed in pandas 2.0, so use
# `dt.isocalendar()` when available and fall back to the old accessor otherwise.
if hasattr( df2['date'].dt, 'isocalendar' ):
    df2['week_of_year'] = df2['date'].dt.isocalendar().week.astype( 'int64' )
else:
    df2['week_of_year'] = df2['date'].dt.weekofyear

# year week
df2['year_week'] = df2['date'].dt.strftime( '%Y-%W' )

# competition since: first day of the month the competitor opened, assembled in one
# vectorized call instead of building a datetime row by row.
df2['competition_since'] = pd.to_datetime( pd.DataFrame( {
    'year': df2['competition_open_since_year'],
    'month': df2['competition_open_since_month'],
    'day': 1 } ) )
# Whole 30-day periods between the competitor opening and the sale date.
df2['competition_time_month'] = ( ( df2['date'] - df2['competition_since'] ) / 30 ).dt.days.astype( 'int64' )

# promo since: Monday of the '<year>-<week>' in which promo2 started, minus one week.
# Each distinct year-week string is parsed once and then mapped back onto the rows.
promo_key = df2['promo2_since_year'].astype( str ) + '-' + df2['promo2_since_week'].astype( str )
promo_start = { key: datetime.datetime.strptime( key + '-1', '%Y-%W-%w' ) - datetime.timedelta( days=7 )
                for key in promo_key.unique() }
df2['promo_since'] = promo_key.map( promo_start )
# Whole weeks between the promo start and the sale date (negative before the start).
df2['promo_time_week'] = ( ( df2['date'] - df2['promo_since'] ) / 7 ).dt.days.astype( 'int64' )

# assortment: a -> basic, b -> extra, anything else -> extended
df2['assortment'] = df2['assortment'].map( {'a': 'basic', 'b': 'extra'} ).fillna( 'extended' )

# state holiday: a/b/c -> named holiday, anything else -> regular_day
df2['state_holiday'] = df2['state_holiday'].map( {'a': 'public_holiday', 'b': 'easter_holiday', 'c': 'christmas'} ).fillna( 'regular_day' )

In [None]:
# First rows, transposed, to check the engineered columns.
df2.head().T

# 3.0. PASSO 03 - FILTRAGEM DE VARIAVEIS

In [None]:
# Work on a copy so this section can be re-run without redoing the previous one.
df3 = df2.copy()

In [None]:
# Preview the data before filtering.
df3.head()

## 3.1. Filtragem das Linhas

In [None]:
# Keep only days on which the store was open and actually sold something.
store_open = df3['open'] != 0
has_sales = df3['sales'] > 0
df3 = df3[store_open & has_sales]

## 3.2. Selecao das Colunas

In [None]:
# Columns removed from the working frame.
cols_drop = ['customers', 'open', 'promo_interval', 'month_map']
df3 = df3.drop( columns=cols_drop )

In [None]:
# Remaining columns after the drop.
df3.columns

# 4.0. PASSO 04 - ANALISE EXPLORATORIA DE DADOS (EDA)

In [None]:
# Work on a copy so this section can be re-run without redoing the previous one.
df4 = df3.copy()

## 4.1. Analise Univariada

### 4.1.1. Response Variable

In [None]:
# Histogram of the response variable (no density estimate).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the pinned version.
sns.distplot(df4['sales'], kde=False)

### 4.1.2. Numerical Variable

In [None]:
# Histogram of every numeric column.
# NOTE: num_attributes was built from df1, i.e. before the row/column filtering of step 3.
num_attributes.hist( bins=25);

### 4.1.3. Categorical Variable

In [None]:
# Distinct state_holiday values present after filtering.
df4['state_holiday'].drop_duplicates()

In [None]:
# Left column: number of rows per category. Right column: sales density per category.
# `countplot` receives the series as `x=` because passing it positionally is not
# interpreted as `x` by newer seaborn; `plt.legend()` shows the KDE labels explicitly.

# state_holiday (regular days are left out so the holidays are visible)
plt.subplot( 3, 2, 1 )
a = df4[df4['state_holiday'] != 'regular_day']
sns.countplot( x=a['state_holiday'] )

plt.subplot( 3, 2, 2 )
for holiday in ['public_holiday', 'easter_holiday', 'christmas']:
    sns.kdeplot( df4[df4['state_holiday'] == holiday]['sales'], label=holiday, shade=True )
plt.legend()

# store_type
plt.subplot( 3, 2, 3 )
sns.countplot( x=df4['store_type'] )

plt.subplot( 3, 2, 4 )
for store_type in ['a', 'b', 'c', 'd']:
    sns.kdeplot( df4[df4['store_type'] == store_type]['sales'], label=store_type, shade=True )
plt.legend()

# assortment
plt.subplot( 3, 2, 5 )
sns.countplot( x=df4['assortment'] )

plt.subplot( 3, 2, 6 )
for assortment in ['extended', 'basic', 'extra']:
    sns.kdeplot( df4[df4['assortment'] == assortment]['sales'], label=assortment, shade=True )
plt.legend();


## 4.2. Analise Bivariada

### **H1.** Lojas com maior sortimento deveriam vender mais.
**FALSA** Lojas com MAIOR SORTIMENTO vendem MENOS.

In [None]:
# Total sales per assortment level.
aux1 = df4.groupby( 'assortment' )['sales'].sum().reset_index()
sns.barplot( x='assortment', y='sales', data=aux1 );

# Weekly sales per assortment level, one line per level.
aux2 = df4.groupby( ['year_week', 'assortment'] )['sales'].sum().reset_index()
weekly_by_assortment = aux2.pivot( index='year_week', columns='assortment', values='sales' )
weekly_by_assortment.plot()

# Same weekly series for 'extra' alone, plotted on its own axes.
aux3 = aux2[aux2['assortment'] == 'extra']
weekly_extra = aux3.pivot( index='year_week', columns='assortment', values='sales' )
weekly_extra.plot();

In [None]:
# Preview the working frame.
df4.head()

### **H2.** Lojas com competidores mais próximos deveriam vender menos.
**FALSA** Lojas com COMPETIDORES MAIS PRÓXIMOS vendem MAIS.

In [None]:
# Total sales per distinct competition distance.
aux1 = df4[[ 'competition_distance', 'sales']].groupby('competition_distance').sum().reset_index()

plt.subplot(1, 3, 1)
sns.scatterplot (x ='competition_distance', y='sales', data=aux1);

# Same totals aggregated into 1000-unit distance bins.
plt.subplot(1, 3, 2)
bins = list(np.arange(0, 20000, 1000))
aux1['competition_distance_binned'] = pd.cut( aux1['competition_distance'], bins=bins )
aux2 = aux1[[ 'competition_distance_binned', 'sales']].groupby('competition_distance_binned').sum().reset_index()
sns.barplot( x='competition_distance_binned', y='sales', data=aux2);
plt.xticks( rotation=90)

# Correlate only the numeric columns: the binned column is an Interval category,
# which DataFrame.corr cannot convert on recent pandas versions.
plt.subplot(1, 3, 3)
x = sns.heatmap( aux1[['competition_distance', 'sales']].corr(method='pearson'), annot=True);
# Widen the y-limits by half a cell on each side so the first/last heatmap rows are not cut.
bottom, top = x.get_ylim()
x.set_ylim( bottom+0.5, top-0.5 );

In [None]:
# Inspect the aggregated frame used in the plots above.
aux1.head()

### **H3.** Lojas com competidores há mais tempo deveriam vender mais.
**FALSA** Lojas com COMPETIDORES HÁ MAIS TEMPO vendem MENOS.

In [None]:
# Total sales per number of months since the competitor opened.
aux1 = df4.groupby( 'competition_time_month' )['sales'].sum().reset_index()

# Restrict the plots to less than 120 months and drop month 0.
within_window = aux1['competition_time_month'] < 120
not_zero = aux1['competition_time_month'] != 0
aux2 = aux1[within_window & not_zero]

plt.subplot( 1, 3, 1 )
sns.barplot( x='competition_time_month', y='sales', data=aux2 )
plt.xticks( rotation=90 )

plt.subplot( 1, 3, 2 )
sns.regplot( x='competition_time_month', y='sales', data=aux2 )

# Correlation is computed on the unrestricted aggregate.
plt.subplot( 1, 3, 3 )
x = sns.heatmap( aux1.corr( method='pearson' ), annot=True )
# Widen the y-limits by half a cell on each side so the first/last heatmap rows are not cut.
bottom, top = x.get_ylim()
x.set_ylim( bottom + 0.5, top - 0.5 );

### **H4.** Lojas com promoções ativas por mais tempo deveriam vender mais.
**FALSA** Lojas com PROMOÇÕES ATIVAS POR MAIS TEMPO NÃO vendem MAIS.

In [None]:
# Total sales per number of weeks relative to the start of the extended promotion.
aux1 = df4[['promo_time_week', 'sales']].groupby( 'promo_time_week' ).sum().reset_index()

# Use the explicitly imported `gridspec` module: a bare `GridSpec` is never imported
# by this notebook and only exists when `%pylab` has injected it.
grid = gridspec.GridSpec( 2, 3 )

plt.subplot( grid[0,0] )
aux2 = aux1[aux1['promo_time_week'] > 0] # extended promo period
sns.barplot( x='promo_time_week', y='sales', data=aux2 );
plt.xticks ( rotation=90 );

plt.subplot( grid[0,1] )
sns.regplot( x='promo_time_week', y='sales', data=aux2 );

plt.subplot( grid[1,0] )
aux3 = aux1[aux1['promo_time_week'] < 0] # regular promo period
sns.barplot( x='promo_time_week', y='sales', data=aux3 );
plt.xticks ( rotation=90 );

plt.subplot(grid[1,1] )
sns.regplot( x='promo_time_week', y='sales', data=aux3 );

# Correlation over the whole range, spanning both rows of the last column.
plt.subplot( grid[:,2] )
sns.heatmap ( aux1.corr ( method='pearson'), annot=True);

### <s>**H5.** Lojas com mais dias de promoção deveriam vender mais.</s>

### **H7.** Lojas com mais promoções consecutivas deveriam vender mais.
**FALSA** Lojas com MAIS PROMOÇÕES CONSECUTIVAS vendem MENOS.

In [None]:
# Total sales for each (promo, promo2) combination, sorted from lowest to highest.
df4[['promo','promo2', 'sales']].groupby(['promo', 'promo2']).sum().reset_index().sort_values(['sales'])

In [None]:
# Weekly sales of stores running both the traditional (promo) and extended (promo2) promotions.
aux1 = df4[(df4['promo'] == 1) & (df4['promo2'] == 1)] [['year_week', 'sales']].groupby( 'year_week' ).sum().reset_index()
ax = aux1.plot()

# Weekly sales with promo == 1 and promo2 == 0, i.e. the traditional promotion only.
aux2 = df4[(df4['promo'] == 1) & (df4['promo2'] == 0)] [['year_week', 'sales']].groupby( 'year_week' ).sum().reset_index()
aux2.plot( ax=ax )

# The second series has promo2 == 0, so it is the traditional-only line (it was labelled 'Extendida').
ax.legend( labels=['Tradicional & Extendida', 'Tradicional']);

### H8. Lojas abertas durante o feriado de Natal deveriam vender mais.
**FALSA** Lojas abertas DURANTE O FERIADO DO NATAL vendem MENOS.

In [None]:
# Holiday rows only.
aux = df4[df4['state_holiday'] != 'regular_day']

# Left: total sales per holiday type.
plt.subplot( 1, 2, 1 )
aux1 = aux.groupby( 'state_holiday' )['sales'].sum().reset_index()
sns.barplot( x='state_holiday', y='sales', data=aux1 )

# Right: the same totals split by year.
plt.subplot( 1, 2, 2 )
aux2 = aux.groupby( ['year', 'state_holiday'] )['sales'].sum().reset_index()
sns.barplot( x='year', y='sales', hue='state_holiday', data=aux2 );

### H9. Lojas deveriam vender mais ao longo dos anos.
**FALSA** Lojas VENDEM MENOS ao LONGO DOS ANOS.

In [None]:
# Total sales per year.
aux1 = df4.groupby( 'year' )['sales'].sum().reset_index()

# Panels 1-2: totals and linear trend; panel 3: Pearson correlation.
for position, draw in enumerate( [sns.barplot, sns.regplot], start=1 ):
    plt.subplot( 1, 3, position )
    draw( x='year', y='sales', data=aux1 )

plt.subplot( 1, 3, 3 )
sns.heatmap( aux1.corr( method='pearson' ), annot=True );

### H10. Lojas deveriam vender mais no segundo semestre do ano.
**FALSA** Lojas VENDEM MENOS no SEGUNDO SEMESTRE DO ANO.

In [None]:
# Total sales per calendar month.
aux1 = df4.groupby( 'month' )['sales'].sum().reset_index()

# Panels 1-2: totals and linear trend; panel 3: Pearson correlation.
for position, draw in enumerate( [sns.barplot, sns.regplot], start=1 ):
    plt.subplot( 1, 3, position )
    draw( x='month', y='sales', data=aux1 )

plt.subplot( 1, 3, 3 )
sns.heatmap( aux1.corr( method='pearson' ), annot=True );

### H11. Lojas deveriam vender mais depois do dia 10 de cada mês.
**VERDADEIRA** Lojas VENDEM MAIS depois do DIA 10 DE CADA MÊS.

In [None]:
# Total sales per day of the month.
aux1 = df4.groupby( 'day' )['sales'].sum().reset_index()

# Panels 1-2: totals and linear trend.
for position, draw in enumerate( [sns.barplot, sns.regplot], start=1 ):
    plt.subplot( 2, 2, position )
    draw( x='day', y='sales', data=aux1 )

# Panel 3: Pearson correlation (computed before the label column is added).
plt.subplot( 2, 2, 3 )
sns.heatmap( aux1.corr( method='pearson' ), annot=True )

# Panel 4: totals for days 1-10 versus days 11 onwards.
aux1['before_after'] = np.where( aux1['day'] <= 10, 'before_10_days', 'after_10_days' )
aux2 = aux1.groupby( 'before_after' )['sales'].sum().reset_index()

plt.subplot( 2, 2, 4 )
sns.barplot( x='before_after', y='sales', data=aux2 );

### H12. Lojas deveriam vender menos aos finais de semana.
**VERDADEIRA** Lojas VENDEM MENOS aos FINAIS DE SEMANA.

In [None]:
# Total sales per day of the week.
aux1 = df4.groupby( 'day_of_week' )['sales'].sum().reset_index()

# Panels 1-2: totals and linear trend; panel 3: Pearson correlation.
for position, draw in enumerate( [sns.barplot, sns.regplot], start=1 ):
    plt.subplot( 1, 3, position )
    draw( x='day_of_week', y='sales', data=aux1 )

plt.subplot( 1, 3, 3 )
sns.heatmap( aux1.corr( method='pearson' ), annot=True );

### H13. Lojas deveriam vender menos durante os feriados escolares.
**VERDADEIRA** Lojas VENDEM MENOS durante os FERIADOS ESCOLARES, exceto o mês de AGOSTO.

In [None]:
# Top: total sales with and without school holidays.
aux1 = df4.groupby( 'school_holiday' )['sales'].sum().reset_index()

plt.subplot( 2, 1, 1 )
sns.barplot( x='school_holiday', y='sales', data=aux1 )

# Bottom: the same comparison broken down by month.
plt.subplot( 2, 1, 2 )
aux2 = df4.groupby( ['month', 'school_holiday'] )['sales'].sum().reset_index()
sns.barplot( x='month', y='sales', hue='school_holiday', data=aux2 );


### 4.2.1. Resumo das Hipóteses

In [None]:
from tabulate import tabulate

In [None]:
# Summary of the hypotheses: conclusion and estimated relevance for the model.
# The third row is H3 (it previously repeated the 'H2' label).
tab = [['hipoteses', 'Conclusão', 'Relevancia'],
      ['H1', 'Falsa', 'Baixa'],
      ['H2', 'Falsa', 'Media'],
      ['H3', 'Falsa', 'Baixa'],
      ['H4', 'Falsa', 'Baixa'],
      ['H5', '-', '-'],
      ['H7', 'Falsa', 'Baixa'],
      ['H8', 'Falsa', 'Media'],
      ['H9', 'Falsa', 'Alta'],
      ['H10', 'Falsa', 'Alta'],
      ['H11', 'Verdadeira', 'Alta'],
      ['H12', 'Verdadeira', 'Alta'],
      ['H13', 'Verdadeira', 'Baixa'],
      ]

# The first row of `tab` is used as the table header.
print( tabulate(tab, headers='firstrow' ))

## 4.3. Analise Multivariada

### 4.3.1. Numerical Attributes

In [None]:
# Pearson correlation between all numeric attributes, shown as an annotated heatmap.
# NOTE: num_attributes was built from df1, i.e. before the row/column filtering of step 3.
correlation = num_attributes.corr( method='pearson')
sns.heatmap( correlation, annot=True);

### 4.3.2. Categorical Attributes

In [None]:
# only categorical data
a = df4.select_dtypes( include='object')

# Calculate cramer V
a1 = cramer_v(a['state_holiday'], a['state_holiday'])
a2 = cramer_v(a['state_holiday'], a['store_type'])
a3 = cramer_v(a['state_holiday'], a['assortment'])

a4 = cramer_v(a['store_type'], a['state_holiday'])
a5 = cramer_v(a['store_type'], a['store_type'])
a6 = cramer_v(a['store_type'], a['assortment'])

a7 = cramer_v(a['assortment'], a['state_holiday'])
a8 = cramer_v(a['assortment'], a['store_type'])
a9 = cramer_v(a['assortment'], a['assortment'])

# Final dataset
d = pd.DataFrame( {'state_holiday': [a1, a2, a3],
               'store_type': [a4, a5, a6],
               'assortment': [a7, a8, a9] })

d = d.set_index( d.columns )

sns.heatmap(d, annot=True);

# 5.0. PASSO 05 - PREPARACAO DOS DADOS

In [None]:
# Work on a copy so this section can be re-run without redoing the previous one.
df5 = df4.copy()

## 5.1. Normalizacao

## 5.2. Rescaling

In [None]:
# Two scalers: RobustScaler for the competition features, MinMaxScaler for the others.
# NOTE(review): each scaler object is re-fitted for the next column, so only the last fit
# is retained, and fitting happens on the full frame before the train/test split —
# confirm both are acceptable.
rs = RobustScaler()
mms = MinMaxScaler()

# competition distance
df5['competition_distance'] = rs.fit_transform( df5[['competition_distance']].values )

# competition time month
df5['competition_time_month'] = rs.fit_transform( df5[['competition_time_month']].values )

# promo time week
df5['promo_time_week'] = mms.fit_transform( df5[['promo_time_week']].values )

# year
df5['year'] = mms.fit_transform( df5[['year']].values )


## 5.3. Transformacao

### 5.3.1. Encoding

In [None]:
# state_holiday - One Hot Encoding
df5 = pd.get_dummies( df5, prefix=['state_holiday'], columns=['state_holiday'])

# store_type = Label Encoding
le = LabelEncoder()
df5['store_type'] = le.fit_transform( df5['store_type'])

# assortment = Ordinal Encoding
assortment_dict = {'basic': 1, 'extra':2,'extended': 3}
df5['assortment'] = df5['assortment'].map( assortment_dict )

### 5.3.2. Response Variable Transformation

In [None]:
# Log-transform the target with log1p; later cells invert it with np.expm1.
df5['sales'] = np.log1p(df5['sales'])

In [None]:
# Distribution of the transformed target.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the pinned version.
sns.distplot(df5['sales'])

### 5.3.3. Nature Transformation

In [None]:
# month
df5['month_sin'] = df5['month'].apply( lambda x: np.sin(x * ( 2. * np.pi/12) ) )
df5['month_cos'] = df5['month'].apply( lambda x: np.cos(x * ( 2. * np.pi/12) ) )

# day
df5['day_sin'] = df5['day'].apply( lambda x: np.sin(x * ( 2. * np.pi/30) ) )
df5['day_cos'] = df5['day'].apply( lambda x: np.cos(x * ( 2. * np.pi/30) ) )

# week of year
df5['week_of_year_sin'] = df5['week_of_year'].apply( lambda x: np.sin(x * ( 2. * np.pi/52) ) )
df5['week_of_year_cos'] = df5['week_of_year'].apply( lambda x: np.cos(x * ( 2. * np.pi/52) ) )

# day of week
df5['day_of_week_sin'] = df5['day_of_week'].apply( lambda x: np.sin(x * ( 2. * np.pi/7) ) )
df5['day_of_week_cos'] = df5['day_of_week'].apply( lambda x: np.cos(x * ( 2. * np.pi/7) ) )






# 6.0. PASSO 06 - FEATURE SELECTION

In [None]:
# Work on a copy so this section can be re-run without redoing the previous one.
df6 = df5.copy()

## 6.1. Split dataframe into training and test dataset

In [None]:
# Preview the prepared frame.
df6.head()

In [None]:
# Raw calendar columns (now represented by their sin/cos pairs) and helper date columns.
cols_drop = [
    'week_of_year', 'day', 'month', 'day_of_week',
    'promo_since', 'competition_since', 'year_week',
]
df6 = df6.drop( columns=cols_drop )

In [None]:
# Latest date of the first store in the grouped result, minus six weeks.
df6[['store', 'date']].groupby( 'store' ).max().reset_index()['date'][0] - datetime.timedelta(days=6*7)

In [None]:
# The last six weeks of data form the test set. The cutoff is derived from the data
# instead of being hard-coded, so it stays correct if the dataset is refreshed.
split_date = df6['date'].max() - datetime.timedelta( days=6*7 )

# training dataset (X_train still carries 'date' and 'sales'; later cells drop them)
X_train = df6[df6['date'] < split_date]
y_train = X_train['sales']

# test dataset
X_test = df6[df6['date'] >= split_date]
y_test = X_test['sales']

print( 'Training Min Date: {}' .format( X_train['date'].min()))
print( 'Training Max Date: {}' .format( X_train['date'].max()))

print( '\nTest Min Date: {}' .format( X_test['date'].min()))
print( 'Test Max Date: {}' .format( X_test['date'].max()))

## 6.2. Boruta as Feature Selector

In [None]:
# Training data for Boruta as plain NumPy arrays ('date' and 'sales' removed from the features)
X_train_n = X_train.drop( ['date', 'sales'], axis=1 ).values
y_train_n = y_train.values.ravel()

# define RandomForestRegressor (n_jobs=-1 asks for all available cores)
rf = RandomForestRegressor( n_jobs=-1 )

# define Boruta and fit it; random_state is fixed on the Boruta wrapper only
boruta = BorutaPy( rf, n_estimators='auto', verbose=2, random_state=42 ).fit( X_train_n, y_train_n)

### 6.2.1. Best Features from Boruta

In [None]:
# Boolean mask over the feature columns: True where Boruta kept the feature.
cols_selected = boruta.support_.tolist()

# Names of the selected features.
X_train_fs = X_train.drop( ['date', 'sales'], axis=1 )
cols_selected_boruta = X_train_fs.loc[:, cols_selected].columns.to_list()

# Names of the features Boruta did not select.
cols_not_selected_boruta = list( np.setdiff1d( X_train_fs.columns, cols_selected_boruta ) )

## 6.3. Manual Feature Selection

In [None]:
# Final feature list, written out by hand (this overrides the list computed from Boruta).
cols_selected_boruta = [
    'store', 'promo', 'store_type', 'assortment',
    'competition_distance', 'competition_open_since_month', 'competition_open_since_year',
    'promo2', 'promo2_since_week', 'promo2_since_year',
    'competition_time_month', 'promo_time_week',
    'month_sin', 'month_cos',
    'day_sin', 'day_cos',
    'week_of_year_sin', 'week_of_year_cos',
    'day_of_week_sin', 'day_of_week_cos',
]

# Extra columns kept alongside the features in the "full" list.
feat_to_add = ['date', 'sales']

# Build the extended list without mutating cols_selected_boruta.
cols_selected_boruta_full = cols_selected_boruta + feat_to_add

In [None]:
# Selected features plus 'date' and 'sales'.
cols_selected_boruta_full

# 7.0. PASSO 07 - MACHINE LEARNING MODELLING

In [None]:
# Feature matrices restricted to the selected features.
x_train = X_train[ cols_selected_boruta]
x_test = X_test[ cols_selected_boruta]

# Time Series Data Preparation: training rows with 'date' and 'sales' kept,
# which cross_validation needs to build its folds.
x_training = X_train[ cols_selected_boruta_full]

## 7.1. Average Model

In [None]:
aux1 = x_test.copy()
aux1['sales'] = y_test.copy()

# prediction: each store's mean (log) sales over the test period
aux2 = aux1[['store', 'sales']].groupby( 'store' ).mean().reset_index().rename( columns={'sales': 'predictions'} )
# Map the store means back onto the rows instead of merging: a merge resets the index,
# which misaligns the predictions with y_test when the two Series are subtracted.
aux1['predictions'] = aux1['store'].map( aux2.set_index( 'store' )['predictions'] )
yhat_baseline = aux1['predictions']

# performance (targets and predictions are brought back from log scale with expm1)
baseline_result = ml_error( 'Average Model', np.expm1( y_test ), np.expm1( yhat_baseline) )
baseline_result

## 7.2. Linear Regression Model

In [None]:
# model
lr = LinearRegression().fit(x_train, y_train)

# prediction
yhat_lr = lr.predict( x_test)

# performance
lr_result = ml_error( 'Linear Regression', np.expm1( y_test ), np.expm1( yhat_lr))
lr_result

### 7.2.1. Linear Regression Model - Cross Validation

In [None]:
# Time-series cross validation of the linear model (1 fold; `lr` is re-fitted in place).
lr_result_cv = cross_validation(x_training, 1, 'Linear Regression', lr, verbose=False )
lr_result_cv

## 7.3. Linear Regression Regularized Model - Lasso

In [None]:
# model
lrr = Lasso( alpha=0.01 ).fit(x_train, y_train)

# prediction
yhat_lrr = lrr.predict( x_test)

# performance
lrr_result = ml_error( 'Linear Regression - Lasso', np.expm1( y_test ), np.expm1( yhat_lrr))
lrr_result

### 7.3.1. Lasso - Cross Validation

In [None]:
# Time-series cross validation of the Lasso model (1 fold; `lrr` is re-fitted in place).
lrr_result_cv = cross_validation(x_training, 1, 'Lasso', lrr, verbose=False )
lrr_result_cv

## 7.4. Random Forest Regressor

In [None]:
# model
rf = RandomForestRegressor( n_estimators=100, n_jobs=1, random_state=42 ).fit(x_train, y_train)

# prediction
yhat_rf = rf.predict( x_test)

# performance
rf_result = ml_error( 'Random Forest Regressor', np.expm1( y_test ), np.expm1( yhat_rf))
rf_result

### 7.4.1. Random Forest Regressor - Cross Validation

In [None]:
# Time-series cross validation of the random forest (1 fold; `rf` is re-fitted in place).
rf_result_cv = cross_validation(x_training,1, 'Random Forest Regressor', rf, verbose=True )
rf_result_cv

## 7.5. XGBoost Regressor

In [None]:
# model
# Baseline XGBoost configuration.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'eta': 0.01,
    'max_depth': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.9,
}
model_xgb = xgb.XGBRegressor( **xgb_params )
model_xgb.fit( x_train, y_train )

# Predict the (log-scale) test targets.
yhat_xgb = model_xgb.predict( x_test )

# Errors are measured on the original scale, hence the expm1.
xgb_result = ml_error( 'XGB Regressor', np.expm1( y_test ), np.expm1( yhat_xgb ) )
xgb_result

### 7.5.1. XGBoost Regressor - Cross Validation

In [None]:
# Time-series cross validation of XGBoost (1 fold; `model_xgb` is re-fitted in place).
xgb_result_cv = cross_validation(x_training, 1, 'XGBoost Regressor', model_xgb, verbose=True )
xgb_result_cv

## 7.6. Compare Model´s Performance

### 7.6.1. Single Performance

In [None]:
# Hold-out results of every model, best (lowest RMSE) first.
modelling_result = pd.concat( [baseline_result, lr_result, lrr_result, rf_result, xgb_result])
modelling_result.sort_values( 'RMSE' )

### 7.6.2. Real Performance - Cross Validation

In [None]:
# Cross-validated results of every model (the average baseline has no CV run).
modelling_result_cv = pd.concat( [lr_result_cv, lrr_result_cv, rf_result_cv, xgb_result_cv])
modelling_result_cv

# 8.0. PASSO 08 - HYPERPARAMETER FINE TUNING

## 8.1. Random Search

In [None]:
# Candidate values for each XGBoost hyperparameter explored by the random search.
param = {
    'n_estimators': [1500, 1700, 2500, 3000, 3500],
    'eta': [0.01, 0.03],
    'max_depth': [3, 5, 9],
    'subsample': [0.1, 0.5, 0.7],
    'colsample_bytree': [0.3, 0.7, 0.9],
    'min_child_weight': [3, 8, 15]
        }
    
# Number of random configurations to evaluate.
MAX_EVAL = 2

In [None]:
# Re-import the stdlib module here.
# NOTE(review): presumably needed because `%pylab` rebinds `random` — confirm.
import random

# One result frame per evaluated configuration; concatenated once after the loop.
search_results = []

for _ in range( MAX_EVAL ):
    # Draw one value per hyperparameter at random.
    hp = { name: random.choice( values ) for name, values in param.items() }
    print( hp )

    # model (unfitted; cross_validation fits it on every fold)
    model_xgb = xgb.XGBRegressor( objective='reg:squarederror', **hp )

    # performance: 2-fold time-series cross validation
    result = cross_validation( x_training, 2, 'XGBoost Regressor', model_xgb, verbose=True )

    # Keep the hyperparameters next to their scores so each row is self-describing.
    search_results.append( result.assign( **hp ) )

final_result = pd.concat( search_results, ignore_index=True ) if search_results else pd.DataFrame()

final_result

## 8.2. Final Model

In [None]:
# Hyperparameters used for the final model.
# NOTE(review): eta=0.3 is outside the searched values [0.01, 0.03] — confirm it is
# intentional and not a typo for 0.03.
param_tuned = {
    'n_estimators':1500,
    'eta':0.3,
    'max_depth':9,
    'subsample':0.7,
    'colsample_bytree':0.9,
    'min_child_weight':15 
        }


In [None]:
# model
# Train the final XGBoost model with the tuned hyperparameters.
model_xgb_tuned = xgb.XGBRegressor( objective='reg:squarederror', **param_tuned )
model_xgb_tuned.fit( x_train, y_train )

# Predict the (log-scale) test targets.
yhat_xgb_tuned = model_xgb_tuned.predict( x_test )

# Errors are measured on the original scale, hence the expm1.
xgb_result_tuned = ml_error( 'XGBoost Regressor', np.expm1( y_test ), np.expm1( yhat_xgb_tuned ) )
xgb_result_tuned