In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [16]:
train = pd.read_csv('data/train_merged.csv', parse_dates=['date'])
test = pd.read_csv('data/test_merged.csv', parse_dates=['date'])

In [17]:
train.earthquake.fillna('', inplace=True)
train.event_name.fillna('', inplace=True)
train.national_holiday.fillna('', inplace=True)
train.regional_holiday.fillna('', inplace=True)
train.local_holiday.fillna('', inplace=True)

test.earthquake.fillna('', inplace=True)
test.event_name.fillna('', inplace=True)
test.national_holiday.fillna('', inplace=True)
test.regional_holiday.fillna('', inplace=True)
test.local_holiday.fillna('', inplace=True)

In [18]:
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,...,earthquake,event_name,national_holiday,regional_holiday,local_holiday,year,month,day,day_of_week,payday
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,...,,,Primer dia del ano,,,2013,1,1,1,False
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,...,,,Primer dia del ano,,,2013,1,1,1,False
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,...,,,Primer dia del ano,,,2013,1,1,1,False
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,...,,,Primer dia del ano,,,2013,1,1,1,False
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,...,,,Primer dia del ano,,,2013,1,1,1,False


In [19]:
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,city,state,type,cluster,transactions,...,earthquake,event_name,national_holiday,regional_holiday,local_holiday,year,month,day,day_of_week,payday
0,3000888,2017-08-16,1,AUTOMOTIVE,0,Quito,Pichincha,D,13,,...,,,,,,2017,8,16,2,False
1,3000889,2017-08-16,1,BABY CARE,0,Quito,Pichincha,D,13,,...,,,,,,2017,8,16,2,False
2,3000890,2017-08-16,1,BEAUTY,2,Quito,Pichincha,D,13,,...,,,,,,2017,8,16,2,False
3,3000891,2017-08-16,1,BEVERAGES,20,Quito,Pichincha,D,13,,...,,,,,,2017,8,16,2,False
4,3000892,2017-08-16,1,BOOKS,0,Quito,Pichincha,D,13,,...,,,,,,2017,8,16,2,False


In [20]:
%%time
cat_features = [
    'family', 'store_nbr', 'city', 'state', 'cluster',
    'earthquake', 'event_name', 'national_holiday', 'regional_holiday', 'local_holiday', 
    'payday', 'type',
]
for name in cat_features:
    print(f'Encode {name}')
    le = LabelEncoder()
    le.fit(train[name].unique())
    train[name] = le.transform(train[name])
    test[name] = le.transform(test[name])

Encode family
Encode store_nbr
Encode city
Encode state
Encode cluster
Encode earthquake
Encode event_name
Encode national_holiday
Encode regional_holiday
Encode local_holiday
Encode payday
Encode type
Wall time: 4.31 s


In [21]:
dates = sorted(train.date.unique())
border_date = dates[-30]
train_data = train[train.date <= border_date]
valid_data = train[train.date > border_date]

print(train_data.shape)
print(valid_data.shape)

(2949210, 22)
(51678, 22)


In [22]:
train_dates = train_data.date.unique()
valid_dates = valid_data.date.unique()
print(f'Train from {train_dates[0]} to {train_dates[-1]}')
print(f'Valid from {valid_dates[0]} to {valid_dates[-1]}')

Train from 2013-01-01T00:00:00.000000000 to 2017-07-17T00:00:00.000000000
Valid from 2017-07-18T00:00:00.000000000 to 2017-08-15T00:00:00.000000000


In [23]:
y_train = np.log(train_data[['sales']].values + 1)
y_valid = np.log(valid_data[['sales']].values + 1)

X_train = train_data.drop(['id', 'sales', 'year', 'date', 'transactions'], axis=1)
X_valid = valid_data.drop(['id', 'sales', 'year', 'date', 'transactions'], axis=1)

X_test = test.drop(['id', 'date', 'year', 'transactions'], axis=1)

In [24]:
X_test.head()

Unnamed: 0,store_nbr,family,onpromotion,city,state,type,cluster,oil_price,earthquake,event_name,national_holiday,regional_holiday,local_holiday,month,day,day_of_week,payday
0,0,0,0,18,12,3,12,46.8,0,0,0,0,0,8,16,2,0
1,0,1,0,18,12,3,12,46.8,0,0,0,0,0,8,16,2,0
2,0,2,2,18,12,3,12,46.8,0,0,0,0,0,8,16,2,0
3,0,3,20,18,12,3,12,46.8,0,0,0,0,0,8,16,2,0
4,0,4,0,18,12,3,12,46.8,0,0,0,0,0,8,16,2,0


In [10]:
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 1,
    'learning_rate': 0.01,
    'subsample': 0.99,
    'colsample_bytree': 0.80,
    'reg_alpha': 10.0,
    'reg_lambda': 0.18,
    'min_child_weight': 47,
}

In [11]:
%%time
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_valid = xgb.DMatrix(X_valid, label=y_valid)
eval_list = [(xgb_train, 'train'), (xgb_valid, 'eval')]
eval_result = dict()

model = xgb.train(
    params=xgb_params,
    dtrain=xgb_train,
    evals=eval_list,
    evals_result=eval_result,
    num_boost_round=1000,
    early_stopping_rounds=100,
)

[0]	train-rmse:3.59063	eval-rmse:3.97498
[1]	train-rmse:3.56236	eval-rmse:3.93936
[2]	train-rmse:3.53338	eval-rmse:3.90762
[3]	train-rmse:3.50495	eval-rmse:3.87796
[4]	train-rmse:3.47660	eval-rmse:3.84689
[5]	train-rmse:3.44837	eval-rmse:3.81619
[6]	train-rmse:3.42166	eval-rmse:3.78217
[7]	train-rmse:3.39974	eval-rmse:3.74936
[8]	train-rmse:3.37861	eval-rmse:3.71704
[9]	train-rmse:3.35733	eval-rmse:3.68492
[10]	train-rmse:3.33677	eval-rmse:3.65312
[11]	train-rmse:3.31117	eval-rmse:3.62096
[12]	train-rmse:3.28488	eval-rmse:3.59204
[13]	train-rmse:3.26470	eval-rmse:3.56148
[14]	train-rmse:3.23888	eval-rmse:3.53309
[15]	train-rmse:3.21338	eval-rmse:3.50523
[16]	train-rmse:3.18801	eval-rmse:3.47768
[17]	train-rmse:3.16306	eval-rmse:3.45050
[18]	train-rmse:3.13963	eval-rmse:3.42036
[19]	train-rmse:3.11645	eval-rmse:3.39045
[20]	train-rmse:3.09254	eval-rmse:3.36408
[21]	train-rmse:3.06991	eval-rmse:3.33482
[22]	train-rmse:3.04774	eval-rmse:3.30600
[23]	train-rmse:3.02546	eval-rmse:3.27757
[2

In [25]:
xgb_test = xgb.DMatrix(X_test)
xgb_pred = np.exp(pd.Series(model.predict(xgb_test)).map(lambda x: max(x, 0)))

In [26]:
xgb_valid_pred = model.predict(xgb_valid)
mean_squared_error(y_valid, xgb_valid_pred, squared=False)

0.729255078701444

In [27]:
sorted(model.get_fscore().items(), key=lambda x: x[1], reverse=True)

[('family', 18371),
 ('store_nbr', 11280),
 ('oil_price', 5970),
 ('onpromotion', 4140),
 ('type', 4024),
 ('cluster', 3892),
 ('month', 2977),
 ('city', 2916),
 ('day_of_week', 2592),
 ('state', 2511),
 ('national_holiday', 1245),
 ('day', 876),
 ('earthquake', 248),
 ('event_name', 190),
 ('payday', 3)]

In [28]:
sub = test[['id']].copy()
sub['sales'] = xgb_pred
sub.head()

Unnamed: 0,id,sales
0,3000888,4.733689
1,3000889,1.154945
2,3000890,5.642949
3,3000891,2605.564439
4,3000892,1.159933


In [29]:
sub.to_csv('data/submission.csv', index=False)