**Author:** Allyson de Lima Medeiros     
**Data:** 2019-04-05     
**Contato:** https://www.linkedin.com/in/allysonlm/     

In [1]:
#########################################################
# Imports
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 100)

import numpy as np
import random
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib as plt
%matplotlib inline 

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

import os
print(os.listdir("../input"))

['lojas.csv', 'dataset_teste.csv', 'sample_submission.csv', 'dataset_treino.csv']


In [2]:

#########################################################
# Load the raw data
train = pd.read_csv('../input/dataset_treino.csv')
test = pd.read_csv('../input/dataset_teste.csv')
store = pd.read_csv('../input/lojas.csv')


#########################################################
# Keep only training rows where the store was open and had positive sales
train = train[(train['Open'] == 1) & (train['Sales'] > 0)]


#########################################################
# Test rows with a missing Open flag are treated as open.
# Only the Open column is filled, so gaps in any other column are not silently turned into 1.
test['Open'] = test['Open'].fillna(1)


#########################################################
# Attach the per-store attributes to every row
train = train.merge(store, on = 'Store', how = 'left')
test = test.merge(store, on = 'Store', how = 'left')


#########################################################
# StateHoliday: make the column uniformly string-typed in BOTH frames
# (integer 0 and string '0' must end up as the same category).
for ds in [train, test]:
    ds['StateHoliday'] = ds['StateHoliday'].astype(str)


#########################################################
# Categorical columns: fit each encoder on train + test together so that a given
# category receives the same integer code in both frames.
le = LabelEncoder()
for col in ['StoreType', 'Assortment', 'StateHoliday']:
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])


#########################################################
# PromoInterval: integer code = position of the value in order of first appearance
# across train then test. Missing intervals are replaced by an explicit label first,
# because NaN never compares equal to itself and cannot be looked up reliably.
NO_PROMO_INTERVAL = 'none'
lst_promo_inter = list(
    pd.concat([train['PromoInterval'], test['PromoInterval']])
    .fillna(NO_PROMO_INTERVAL)
    .unique()
)
promo_codes = {label: code for code, label in enumerate(lst_promo_inter)}


for ds in [train, test]:
    #########################################################
    # Date parts
    ds['Date'] = pd.to_datetime(ds['Date'], errors='coerce')
    ds['Year'] = ds['Date'].dt.year
    ds['Month'] = ds['Date'].dt.month
    ds['Day'] = ds['Date'].dt.day
    # NOTE: `.dt.week` is deprecated in newer pandas; the replacement there is
    # `.dt.isocalendar().week`. Kept as-is for the pandas version this notebook targets.
    ds['Week'] = ds['Date'].dt.week
    ds['WeekOfYear'] = ds['Week']


    #########################################################
    # CompetitionMonth: months since the competitor opened, floored at 0.
    # Computed from `ds` itself so the test frame uses its own rows.
    competition_month = (12 * (ds['Year'] - ds['CompetitionOpenSinceYear'])
                         + (ds['Month'] - ds['CompetitionOpenSinceMonth']))
    # where() keeps strictly positive values; everything else (including NaN) becomes 0.
    ds['CompetitionMonth'] = competition_month.where(competition_month > 0, 0)


    #########################################################
    # PromoOpen: months since Promo2 started, floored at 0
    promo_open = (12 * (ds['Year'] - ds['Promo2SinceYear'])
                  + (ds['WeekOfYear'] - ds['Promo2SinceWeek']) / 4.0)
    ds['PromoOpen'] = promo_open.where(promo_open > 0, 0)


    #########################################################
    # PromoInterval -> integer code
    ds['PromoInterval'] = ds['PromoInterval'].fillna(NO_PROMO_INTERVAL).map(promo_codes)


    #########################################################
    # Fill remaining nulls with -1.
    # In place on purpose: `ds` is only an alias for train/test, rebinding it would be lost.
    ds.fillna(-1, inplace=True)

In [3]:
# Preview the first rows of the merged, feature-engineered training frame.
train.head(3)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,Week,WeekOfYear,CompetitionMonth,PromoOpen
0,1,5,2015-07-31,5263,555,1,1,0,1,2,0,1270.0,9.0,2008.0,0,-1.0,-1.0,0,2015,7,31,31,31,82.0,0.0
1,2,5,2015-07-31,6064,625,1,1,0,1,0,0,570.0,11.0,2007.0,1,13.0,2010.0,1,2015,7,31,31,31,92.0,64.5
2,3,5,2015-07-31,8314,821,1,1,0,1,0,0,14130.0,12.0,2006.0,1,14.0,2011.0,1,2015,7,31,31,31,103.0,52.25


In [4]:
#########################################################
# RMSPE
def rmspe(y, yhat):
    """Root mean squared percentage error of predictions `yhat` against actuals `y`.

    Evaluates sqrt(mean((yhat / y - 1) ** 2)); `y` is the denominator, so it
    must not contain zeros.
    """
    relative_error = yhat / y - 1
    return np.sqrt(np.mean(np.square(relative_error)))

def rmspe_xg(yhat, y):
    """Custom XGBoost evaluation callback reporting RMSPE.

    `yhat` holds the raw predictions and `y` is the object whose `get_label()`
    returns the matching targets. Both are passed through `expm1` before
    scoring. Returns the pair ("rmspe", value).
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score



#########################################################
# Features: columns fed to the model
features = [
    "Store",
    "DayOfWeek",
    "Promo",
    "StateHoliday",
    "SchoolHoliday",
    "StoreType",
    "Assortment",
    "CompetitionDistance",
    "CompetitionOpenSinceMonth",
    "CompetitionOpenSinceYear",
    "Promo2",
    "Promo2SinceWeek",
    "Promo2SinceYear",
    "PromoInterval",
    "Year",
    "Month",
    "Day",
    "WeekOfYear",
    # Currently disabled candidates: "Week", "CompetitionMonth", "PromoOpen"
]


#########################################################
# Model: XGBoost booster parameters
param = dict(
    objective="reg:linear",
    booster="gbtree",
    eta=0.03,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.7,
    silent=1,
)

# Hold out 50000 rows for validation; the target is log1p(Sales).
# Note that X_test / y_test are this validation split, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    np.log1p(train["Sales"]),
    test_size=50000,
    random_state=2019,
)

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

# Datasets whose metrics are reported during training.
watchlist = [(dtrain, "train"), (dvalid, "eval")]

In [5]:
#########################################################
# Training
# Up to 7000 boosting rounds with progress logged every 100 rounds; training stops
# early once the custom rmspe metric has not improved for 100 rounds.
gbm = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=7000,
    evals=watchlist,
    feval=rmspe_xg,
    early_stopping_rounds=100,
    verbose_eval=100,
)

[0]	train-rmse:8.02109	eval-rmse:8.02106	train-rmspe:0.999809	eval-rmspe:0.999809
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[100]	train-rmse:0.45711	eval-rmse:0.456233	train-rmspe:0.362725	eval-rmspe:0.349328
[200]	train-rmse:0.198313	eval-rmse:0.199207	train-rmspe:0.251356	eval-rmspe:0.213862
[300]	train-rmse:0.158528	eval-rmse:0.159778	train-rmspe:0.212345	eval-rmspe:0.175661
[400]	train-rmse:0.136373	eval-rmse:0.138189	train-rmspe:0.185433	eval-rmspe:0.152595
[500]	train-rmse:0.121977	eval-rmse:0.124436	train-rmspe:0.164514	eval-rmspe:0.137867
[600]	train-rmse:0.112118	eval-rmse:0.115163	train-rmspe:0.15208	eval-rmspe:0.127811
[700]	train-rmse:0.105828	eval-rmse:0.109435	train-rmspe:0.140827	eval-rmspe:0.121681
[800]	train-rmse:0.100877	eval-rmse:0.105051	train-rmspe:0.132488	eval-rmspe:0.116991
[900]	train-rmse:0.097279	eval-rmse:0.102039	train-rmspe:0.12795	eval-rmspe:0.113834
[

In [6]:
#########################################################
# Predict on the held-out validation split (X_test comes from train_test_split,
# not from the competition test file). Predictions are on the log1p scale.
# NOTE(review): training used early stopping; depending on the xgboost version,
# predict() may use all trained rounds rather than the best iteration — confirm.
yhat = gbm.predict(xgb.DMatrix(X_test))

In [7]:
# Validation RMSPE on the original sales scale (targets were log1p-transformed, hence expm1).
rmspe(np.expm1(y_test), np.expm1(yhat))

0.0954776882987182

In [8]:
#########################################################
# Per-row table of error, ratio and weight for the validation split.
# Sales and Prediction are both on the log1p scale here, so Ratio and Weight
# are ratios of log-scale values.
scored = y_test.to_frame().assign(Prediction=yhat)
res = pd.merge(X_test, scored, left_index=True, right_index=True)
res = res.assign(
    Ratio=lambda frame: frame.Prediction / frame.Sales,
    Error=lambda frame: (frame.Ratio - 1).abs(),
    Weight=lambda frame: frame.Sales / frame.Prediction,
)

res.head()

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,Sales,Prediction,Ratio,Error,Weight
388908,654,4,1,0,0,2,0,6930.0,9.0,2006.0,0,-1.0,-1.0,0,2014,5,8,19,8.913281,8.918775,1.000616,0.000616,0.999384
662106,1071,4,1,0,1,0,0,820.0,3.0,2012.0,1,35.0,2012.0,3,2013,7,18,29,8.683893,8.739903,1.00645,0.00645,0.993591
87947,114,1,1,0,0,2,0,4510.0,-1.0,-1.0,1,48.0,2011.0,3,2015,4,27,18,8.95519,8.90074,0.99392,0.00608,1.006118
812187,61,1,1,0,0,0,2,350.0,12.0,2007.0,1,1.0,2012.0,1,2013,2,4,6,8.405815,8.434639,1.003429,0.003429,0.996583
750646,900,5,1,0,0,0,0,3920.0,4.0,2005.0,1,40.0,2014.0,1,2013,4,12,15,8.821732,8.71506,0.987908,0.012092,1.01224


In [9]:
#########################################################
# Idea: train and validation scores are very close, so rescaling the (log-scale)
# predictions by a constant weight is likely to improve the validation score.
# Candidate weights are 0.990, 0.991, ..., 1.009.

W = [0.990 + i / 1000 for i in range(20)]
S = [rmspe(np.expm1(y_test), np.expm1(yhat * w)) for w in W]
Score = pd.Series(S, index=W)
# Optional visual check: Score.plot()
BS = Score[Score == Score.min()]
print('Melhor ajuste de score:{}'.format(BS))


Melhor ajuste de score:0.999    0.094718
dtype: float64


In [10]:
#########################################################
# Validation RMSPE after scaling the log-scale predictions by 0.999,
# the best weight reported by the search above.
rmspe(np.expm1(y_test), np.expm1(yhat*0.999))

0.09471813894691583

In [11]:
#########################################################
# Final prediction on the competition test frame.
# The model was trained on log1p(Sales), so these outputs are on the log1p scale.
test_probs = gbm.predict(xgb.DMatrix(test[features]))

In [12]:
#########################################################
# Final weight, tuned by trial and error against the public leaderboard score
peso_final = 0.9965

# Scale the log-scale predictions, then map them back to sales with expm1.
submission = pd.DataFrame({
    "Id": test["Id"],
    "Open": test["Open"],
    "Sales": np.expm1(test_probs * peso_final),
})

# Closed stores sell nothing, and negative forecasts are floored at zero.
zero_sales = (submission["Open"] == 0) | (submission["Sales"] < 0)
submission.loc[zero_sales, "Sales"] = 0

submission = submission[["Id", "Sales"]]
submission.to_csv("sub_final4.csv", index=False)