In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LSTM
from keras import callbacks
from keras.callbacks import ModelCheckpoint

In [2]:
%%time
input_dir = '../inputs/favorita-grocery-sales-forecasting'

# Load the pre-processed frames from local pickles.
# NOTE(review): unpickling can execute arbitrary code — only load files produced
# by a trusted run of the preprocessing step.
df_train = pd.read_pickle('df_train_favorita.pkl')
df_test = pd.read_pickle('df_test_favorita.pkl')
items = pd.read_pickle('items_favorita.pkl')
stores = pd.read_pickle('stores_favorita.pkl')

# Keep only rows dated 2017-01-01 or later.
# pd.Timestamp is used instead of the `pd.datetime` alias, which was deprecated
# in pandas 1.0 and removed in pandas 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
del df_train

# Promotion flags for the training period, pivoted to one row per
# (store_nbr, item_nbr) and one column per date; missing cells become False.
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
# Drop the outer "onpromotion" column level so the columns are just the dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

# Same pivot for the test period.
# NOTE(review): this unstacks the last index level of df_test directly, so it
# presumably arrives already indexed by (store_nbr, item_nbr, date) — confirm.
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align test rows to the training rows; pairs absent from the test set get False.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

# Train and test dates side by side in a single promotion table.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# Unit sales pivoted the same way; days with no record become 0.
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

# One item-metadata row per row of df_2017, in the same order.
items = items.reindex(df_2017.index.get_level_values(1))



CPU times: total: 28.1 s
Wall time: 28.7 s


In [5]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of `df` for a window that starts before `dt`.

    The window begins `minus` days before `dt` and contains `periods` dates
    spaced `freq` apart. `df` must have those dates as column labels.
    """
    window_start = dt - timedelta(days=minus)
    wanted_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_dates]

def prepare_dataset(t2017, is_train=True):
    """Build the feature table (and optionally targets) anchored at `t2017`.

    Reads the module-level `df_2017` (sales) and `promo_2017` (promotion)
    tables; every output row corresponds to one row of those tables.

    Features, in column order:
      * previous-day sales, then trailing sales means over 3/7/14/30/60/140 days;
      * promotion counts over the trailing 14/60/140 days;
      * for each of 7 weekly offsets, the mean of 4 and of 20 same-weekday values;
      * the promotion flag for each of the 16 days starting at `t2017`.

    Returns `(X, y)` when `is_train` is true, where `y` holds the sales columns
    for the 16 days starting at `t2017`; otherwise returns `X` alone.
    """
    columns = {"day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel()}

    # Trailing windows all end on the day before t2017.
    for span in (3, 7, 14, 30, 60, 140):
        recent_sales = get_timespan(df_2017, t2017, span, span)
        columns["mean_{}_2017".format(span)] = recent_sales.mean(axis=1).values

    for span in (14, 60, 140):
        recent_promos = get_timespan(promo_2017, t2017, span, span)
        columns["promo_{}_2017".format(span)] = recent_promos.sum(axis=1).values

    # Same-weekday averages: step back in 7-day strides, shifted by `dow` days.
    for dow in range(7):
        last_4_weeks = get_timespan(df_2017, t2017, 28 - dow, 4, freq='7D')
        last_20_weeks = get_timespan(df_2017, t2017, 140 - dow, 20, freq='7D')
        columns['mean_4_dow{}_2017'.format(dow)] = last_4_weeks.mean(axis=1).values
        columns['mean_20_dow{}_2017'.format(dow)] = last_20_weeks.mean(axis=1).values

    # Promotion flags over the forecast horizon, looked up by date string.
    for ahead in range(16):
        day_key = f'{t2017 + timedelta(days=ahead)}'
        columns["promo_{}".format(ahead)] = promo_2017[day_key].values.astype(np.uint8)

    X = pd.DataFrame(columns)
    if not is_train:
        return X

    horizon = pd.date_range(t2017, periods=16)
    y = df_2017[horizon].values
    return X, y

In [6]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)

# Six training windows, each anchored one week after the previous one.
weekly_windows = [
    prepare_dataset(t2017 + timedelta(days=7 * week)) for week in range(6)
]
X_train = pd.concat([features for features, _ in weekly_windows], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_windows], axis=0)
del weekly_windows

# Validation window (with targets) and test window (features only).
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

# Empty frame that only carries the row index of the sales table.
stores_items = pd.DataFrame(index=df_2017.index)
test_ids = df_test[['id']]

# Item metadata lined up with the item level of that row index.
items = items.reindex(stores_items.index.get_level_values(1))

Preparing dataset...


In [8]:
def _as_single_timestep(features):
    """Return `features` as a 3-D array shaped (rows, 1, n_features).

    Accepts either the 2-D feature table built by the dataset cell or an array
    that this cell has already reshaped, so re-running the cell is harmless
    (the previous version raised AttributeError on a second run because an
    ndarray has no `.values`).
    """
    matrix = features.values if hasattr(features, "values") else features
    if matrix.ndim == 3:
        # Already reshaped by an earlier run of this cell.
        return matrix
    return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))


# The LSTM layer expects (samples, timesteps, features); each row is fed as a
# sequence of length 1.
X_train = _as_single_timestep(X_train)
X_test = _as_single_timestep(X_test)
X_val = _as_single_timestep(X_val)

In [9]:
# Single-output regressor: LSTM(32) -> Dropout -> Dense(32) -> Dropout -> Dense(1).
# The input shape is taken from the reshaped training array (timesteps, features).
model = Sequential([
    LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(.1),
    Dense(32),
    Dropout(.2),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [10]:
N_EPOCHS = 5

val_pred = []
test_pred = []
# wtpath = 'weights.hdf5'  # To save best epoch. But need Keras bug to be fixed first.

# Per-row training weight: 1 + 0.25 * perishable flag.
# The weights are repeated once per training window. The repeat count is derived
# from X_train instead of being hard-coded, so it cannot drift from the number
# of windows concatenated when the dataset was assembled.
item_weights = items["perishable"].values * 0.25 + 1
n_windows, leftover = divmod(X_train.shape[0], len(item_weights))
assert leftover == 0, "X_train rows must be a whole number of per-item windows"
sample_weights = np.tile(item_weights, n_windows)

# One fit per forecast day (one column of y_train).
# NOTE(review): the same `model` object is fit on every step, so each step
# starts from the weights left by the previous one — confirm this is intended.
for i in range(y_train.shape[1]):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    y = y_train[:, i]
    yv = y_val[:, i]
    model.fit(X_train, y, batch_size=512, epochs=N_EPOCHS, verbose=2,
              sample_weight=sample_weights, validation_data=(X_val, yv))
    val_pred.append(model.predict(X_val))
    test_pred.append(model.predict(X_test))

Step 1
Epoch 1/5
1964/1964 - 20s - loss: 0.3893 - mse: 0.3671 - val_loss: 0.3069 - val_mse: 0.3069 - 20s/epoch - 10ms/step
Epoch 2/5
1964/1964 - 8s - loss: 0.3516 - mse: 0.3328 - val_loss: 0.2992 - val_mse: 0.2992 - 8s/epoch - 4ms/step
Epoch 3/5
1964/1964 - 8s - loss: 0.3439 - mse: 0.3258 - val_loss: 0.2999 - val_mse: 0.2999 - 8s/epoch - 4ms/step
Epoch 4/5
1964/1964 - 8s - loss: 0.3408 - mse: 0.3229 - val_loss: 0.2969 - val_mse: 0.2969 - 8s/epoch - 4ms/step
Epoch 5/5
1964/1964 - 8s - loss: 0.3385 - mse: 0.3209 - val_loss: 0.3001 - val_mse: 0.3001 - 8s/epoch - 4ms/step
Step 2
Epoch 1/5
1964/1964 - 8s - loss: 0.3663 - mse: 0.3455 - val_loss: 0.3281 - val_mse: 0.3281 - 8s/epoch - 4ms/step
Epoch 2/5
1964/1964 - 8s - loss: 0.3624 - mse: 0.3420 - val_loss: 0.3286 - val_mse: 0.3286 - 8s/epoch - 4ms/step
Epoch 3/5
1964/1964 - 8s - loss: 0.3613 - mse: 0.3409 - val_loss: 0.3287 - val_mse: 0.3287 - 8s/epoch - 4ms/step
Epoch 4/5
1964/1964 - 8s - loss: 0.3605 - mse: 0.3402 - val_loss: 0.3270 - val_

Epoch 1/5
1964/1964 - 9s - loss: 0.4143 - mse: 0.3911 - val_loss: 0.3798 - val_mse: 0.3798 - 9s/epoch - 4ms/step
Epoch 2/5
1964/1964 - 8s - loss: 0.4067 - mse: 0.3840 - val_loss: 0.3785 - val_mse: 0.3785 - 8s/epoch - 4ms/step
Epoch 3/5
1964/1964 - 8s - loss: 0.4052 - mse: 0.3827 - val_loss: 0.3793 - val_mse: 0.3793 - 8s/epoch - 4ms/step
Epoch 4/5
1964/1964 - 8s - loss: 0.4043 - mse: 0.3818 - val_loss: 0.3772 - val_mse: 0.3772 - 8s/epoch - 4ms/step
Epoch 5/5
1964/1964 - 8s - loss: 0.4041 - mse: 0.3816 - val_loss: 0.3780 - val_mse: 0.3780 - 8s/epoch - 4ms/step
Step 14
Epoch 1/5
1964/1964 - 9s - loss: 0.3998 - mse: 0.3780 - val_loss: 0.3664 - val_mse: 0.3664 - 9s/epoch - 4ms/step
Epoch 2/5
1964/1964 - 8s - loss: 0.3937 - mse: 0.3725 - val_loss: 0.3716 - val_mse: 0.3716 - 8s/epoch - 4ms/step
Epoch 3/5
1964/1964 - 8s - loss: 0.3930 - mse: 0.3719 - val_loss: 0.3651 - val_mse: 0.3651 - 8s/epoch - 4ms/step
Epoch 4/5
1964/1964 - 8s - loss: 0.3921 - mse: 0.3710 - val_loss: 0.3641 - val_mse: 0.36

In [11]:
n_public = 5 # Number of days in public test set
weights = items["perishable"] * 0.25 + 1

# Stack the per-step predictions, drop the trailing singleton axis and
# transpose so rows line up with y_val (one column per forecast step).
val_matrix = np.array(val_pred).squeeze(axis=2).transpose()

unweighted_mse = mean_squared_error(y_val, val_matrix)
print("Unweighted validation mse: ", unweighted_mse)

full_mse = mean_squared_error(y_val, val_matrix, sample_weight=weights)
print("Full validation mse:       ", full_mse)

# First `n_public` forecast days versus the remaining ones.
public_mse = mean_squared_error(
    y_val[:, :n_public], val_matrix[:, :n_public], sample_weight=weights)
print("'Public' validation mse:   ", public_mse)

private_mse = mean_squared_error(
    y_val[:, n_public:], val_matrix[:, n_public:], sample_weight=weights)
print("'Private' validation mse:  ", private_mse)

# Test predictions: wide (row x date) table, then stacked into long form.
y_test = np.array(test_pred).squeeze(axis=2).transpose()
forecast_dates = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=stores_items.index, columns=forecast_dates)
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Attach predictions to the test ids; rows without a prediction get 0.
submission = test_ids.join(df_preds, how="left").fillna(0)
# Apply expm1 to the predictions and clamp the result to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lstm.csv', float_format='%.4f', index=None)

Unweighted validation mse:  0.36650037801433366
Full validation mse:        0.36597097080926233
'Public' validation mse:    0.3370075289959559
'Private' validation mse:   0.37913617163349256


In [25]:
# Inspect the dimensions of the training feature array.
# NOTE(review): the recorded output for this cell is `40`, which is not a shape
# tuple, and its execution count (25) is out of order — the saved output looks
# stale; re-run after Restart & Run All.
X_train.shape

40

In [23]:
# Row count of the training targets, measured on the second target column.
len(y_train[:, 1])

1005090

In [18]:
# Dimensions of the test feature array after the reshape cell.
X_test.shape

(167515, 1, 40)

In [20]:
# Dimensions of the validation feature array after the reshape cell.
X_val.shape

(167515, 1, 40)

In [21]:
# Dimensions of the validation targets (rows x forecast days).
y_val.shape

(167515, 16)