In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# from keras.utils import to_categorical

In [3]:
# df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
#                        converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
#                        parse_dates=["date"], skiprows=range(1, 66458909))

In [4]:
#df_train = df_train_raw.loc[df_train_raw['date']>=pd.datetime(2016,1,1)]

In [5]:
# train on 2017 data
df_train = pd.read_csv('train.csv', dtype={'onpromotion': bool},
                       converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
                       parse_dates=["date"], skiprows = range(1,101688780))

In [6]:
# Continuous daily calendar spanning the first to the last date present in df_train;
# used below to reindex the wide tables so that no calendar day is missing.
date_index = pd.date_range(
    start=df_train['date'].min(),
    end=df_train['date'].max(),
)

In [7]:
# Test rows, keyed by (store_nbr, item_nbr, date) so they can be unstacked by date
# and joined against predictions later.
df_test = (
    pd.read_csv("test.csv", parse_dates=["date"], dtype={'onpromotion': bool})
    .set_index(['store_nbr', 'item_nbr', 'date'])
)

In [8]:
# Item metadata table, indexed by item_nbr.
items = pd.read_csv("items.csv").set_index(keys="item_nbr")

In [9]:
# Pivot the promotion flag into wide tables: one row per (store_nbr, item_nbr),
# one column per date. Missing combinations are treated as "not on promotion".
promo_train = (
    df_train.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Drop the outer "onpromotion" column level and keep only the dates.
promo_train.columns = promo_train.columns.get_level_values(1)
# Make sure every calendar day of the training span has a column.
promo_train = promo_train.reindex(date_index.values, axis=1).fillna(False)

promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_test.columns = promo_test.columns.get_level_values(1)
# Align test rows to the training rows: series that only exist in the test set are
# dropped, series without test rows are filled with False.
promo_test = promo_test.reindex(promo_train.index).fillna(False)
# promo = pd.concat([promo_train, promo_test], axis=1)
# del promo_test, promo_train

In [10]:
# Reshape sales from long to wide: rows are (store_nbr, item_nbr) series, columns are
# dates. Days with no record, and calendar days absent from the data, count as 0 sales.
df_train = (
    df_train.set_index(["store_nbr", "item_nbr", "date"])["unit_sales"]
    .unstack(level=-1)
    .fillna(0)
    .reindex(date_index.values, axis=1)
    .fillna(0)
)

In [11]:
# Re-align the item metadata to df_train's rows: one metadata row per entry of the
# item_nbr level of df_train's index (so items not present in train are dropped and
# rows repeat for items sold in several stores).
items = items.reindex(index=df_train.index.get_level_values(1))

In [12]:
# Drop the first three calendar days of the span (the original note says the data
# then runs from 2017-01-04 to 2017-08-15).
# The cut is made by label (first kept day = date_index[3]) instead of by position so
# that re-running this cell is a no-op rather than silently dropping three more days.
df_train = df_train.loc[:, date_index[3]:]

In [13]:
# Scale every sales value with ONE global min/max: the scaler is fitted on all values
# flattened into a single column, so it has exactly one feature.
sale_scaler = MinMaxScaler()
# Transform in the same single-column layout the scaler was fitted on and restore the
# wide (series x day) shape afterwards. Handing the wide frame to a 1-feature scaler
# is rejected by scikit-learn versions that validate the number of input features.
ds_train = sale_scaler.fit_transform(df_train.values.reshape(-1, 1)).reshape(df_train.shape)

In [14]:
# Trim the promo table to the same days as the sales table (first kept day is
# date_index[3]) and turn the boolean flags into 0/1 integers.
# Slicing by label rather than by position keeps this cell idempotent: a second run
# cannot drop further days and misalign promo_train with df_train.
promo_train = promo_train.loc[:, date_index[3]:].astype('int')

In [15]:
# def length_of_month(d):
#     return (d.replace(month=d.month+1, day=1) - d.replace(day=1)).days
# def last_day_of_month(d):
#     return d.replace(month=d.month+1, day=1) - timedelta(days=1)

In [16]:
# Progress through the week for every day column: dayofweek / 6, i.e. 0.0 for Monday
# up to 1.0 for Sunday. The same row is repeated once per series in df_train.
ds_dow_train = np.tile(
    df_train.columns.dayofweek.values / 6,
    reps=(df_train.shape[0], 1),
)

In [17]:
# Month lengths for January-August 2017. February has 28 days (2017 is not a leap
# year); the previous value of 30 skewed the February feature. Kept as a lookup for
# any later cell that references it.
len_month = {1:31, 2:28, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31}
# Progress through the month for every day column: day-of-month divided by the real
# length of that month, taken from the calendar itself so it stays correct for any
# month or year. The same row is repeated once per series in df_train.
ds_dom_train = np.tile(
    df_train.columns.day.values / df_train.columns.days_in_month.values,
    (df_train.shape[0], 1),
)

In [18]:
# concat all timestep data into [nsamples, ntimesteps, nfeatures]

In [19]:
# Build the model input tensor [series, timestep, feature].
# Sales are shifted against the other features: timestep j carries the sale of day j
# together with the promo / day-of-week / day-of-month of day j + 1.
ds_train = ds_train[:, :-1].reshape(df_train.shape[0], df_train.shape[1] - 1, 1)  # previous day's sale
promo_train = promo_train.values[:, 1:].reshape(df_train.shape[0], df_train.shape[1] - 1, 1)  # current day's promo
ds_dow_train = ds_dow_train[:, 1:].reshape(df_train.shape[0], df_train.shape[1] - 1, 1)
ds_dom_train = ds_dom_train[:, 1:].reshape(df_train.shape[0], df_train.shape[1] - 1, 1)
# Feature order along the last axis: sale, promo, day-of-week, day-of-month.
dtrain = np.concatenate([ds_train, promo_train, ds_dow_train, ds_dom_train], axis=-1)

In [20]:
def prep_data(data, ydata, anchor, colindex, lookback = 40, train = True):
    """Cut the lookback window of features (and optionally the target) for one day.

    Parameters
    ----------
    data : array indexed as [sample, timestep, feature]
        Expected to be shifted by one against ``ydata``: timestep ``j`` holds the
        inputs for predicting column ``j + 1`` (previous day's sale plus that day's
        promo / calendar features).
    ydata : 2-D array indexed as [sample, day]
        Target values; its columns line up with ``colindex``.
    anchor : label
        The day to predict. It is compared against ``colindex`` with ``==``.
    colindex : array-like of day labels
        Labels of the columns of ``ydata``.
    lookback : int
        Number of timesteps in the returned window.
    train : bool
        When true the target column is returned together with the window.

    Returns
    -------
    ``(xout, yout)`` when ``train`` is true, otherwise ``xout`` alone. ``xout`` has
    ``lookback`` timesteps and ends on the timestep that feeds the anchor day;
    ``yout`` is ``ydata[:, ianchor]``.

    Raises
    ------
    IndexError
        If ``anchor`` does not occur in ``colindex``.
    ValueError
        If fewer than ``lookback`` timesteps precede the anchor day.
    """
    ianchor = np.argwhere(colindex == anchor)[0][0]
    if ianchor < lookback:
        # A negative slice start would silently wrap around to the end of the array.
        raise ValueError(
            "anchor %r has only %d timesteps of history, need lookback=%d"
            % (anchor, ianchor, lookback)
        )
    # Timestep ianchor - 1 is the one that carries "yesterday's sale, today's promo"
    # for the anchor day, so the window must include it: the exclusive stop is
    # ianchor. (Stopping at ianchor - 1 left that most recent step out.)
    xout = data[:, ianchor - lookback:ianchor, :]
    if train:
        yout = ydata[:, ianchor]
        return xout, yout
    return xout

In [21]:
# Range of anchor days used to build training samples.
train_start, train_end = date(2017, 8, 1), date(2017, 8, 15)
# Whole days from train_start up to, but not counting, train_end.
ndays = (train_end - train_start) // timedelta(days=1)

In [22]:
# One (window, target) pair per anchor day train_start, train_start + 1, ...,
# for ndays consecutive days; every pair covers all series.
window_pairs = [
    prep_data(
        dtrain,
        df_train.values,
        (train_start + timedelta(days=offset)).strftime('%Y-%m-%d'),
        df_train.columns,
    )
    for offset in range(ndays)
]
# Stack the per-day blocks along the sample axis.
X_train = np.concatenate([window for window, _ in window_pairs], axis=0)
y_train = np.concatenate([target for _, target in window_pairs], axis=0)
del window_pairs
# The day after the last training anchor is used for validation.
X_val, y_val = prep_data(dtrain, df_train.values, '2017-08-15', df_train.columns)

In [48]:
# Three stacked 32-unit LSTM layers followed by a single linear output unit.
model = Sequential([
    # Input is (timesteps, features), taken from the training tensor.
    LSTM(32, return_sequences=True, input_shape=X_train.shape[1:]),
    LSTM(32, return_sequences=True),  # passes the full sequence of 32-d vectors on
    LSTM(32),                         # collapses the sequence to one 32-d vector
    Dense(1),
])
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [49]:
# Per-sample weight is 1 + 0.25 * perishable. The items table is aligned with
# df_train's rows, so it is repeated once per anchor day to match the order in which
# X_train was stacked.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    sample_weight=(1 + 0.25 * pd.concat([items["perishable"]] * ndays)).values,
    epochs=4,
    batch_size=1024,
    verbose=1,
)

Train on 2345210 samples, validate on 167515 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fde96496d30>

In [None]:
# model2 = Sequential()
# model2.add(LSTM(32, return_sequences=True,
#                input_shape=(X_train.shape[1], X_train.shape[2])))
# model2.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
# model2.add(LSTM(32))  # return a single vector of dimension 32
# model2.add(Dense(1))
# model2.compile(loss='mse',
#               optimizer='adam',
#               metrics=['mse'])

In [None]:
# model2.fit(X_train, y_train,
#           batch_size=df_train.shape[0], epochs=8, verbose=1, shuffle=False,
#           validation_data=(X_val, y_val))

In [50]:
# Collects one prediction array per forecast day (filled by the forecast loop below).
y_l = list()

In [51]:
# First day to forecast.
test_start = date(year=2017, month=8, day=16)

In [52]:
# Sales of the last day in df_train as a column vector (one row per series); this
# seeds the day-by-day forecast.
y_last = df_train.iloc[:, [-1]].values

In [53]:
def prep_test(dtrain, promo_test, y_last, test_day, lookback=40):
    """Append the timestep for ``test_day`` to the history and return the model input.

    Parameters
    ----------
    dtrain : array indexed as [series, timestep, feature]
        History so far, with 4 features per timestep in the order
        (scaled previous-day sale, promo flag, day-of-week, day-of-month).
    promo_test : DataFrame
        Promotion flags with one column per test day; the column for ``test_day`` is
        looked up by its ``'%Y-%m-%d'`` string.
    y_last : 2-D array, one row per series
        Previous day's sales in the unscaled space; scaled here with the
        module-level ``sale_scaler``.
    test_day : datetime.date
        The day being forecast.
    lookback : int
        Number of most recent timesteps returned as model input.

    Returns
    -------
    ``(dtrain, window)``: the history extended by one timestep, and its last
    ``lookback`` timesteps.
    """
    n_series = dtrain.shape[0]
    scaled_last = sale_scaler.transform(y_last)
    # Same calendar encodings as the training features: weekday / 6 and
    # day / length of the month. The month length is taken from the date itself
    # rather than assumed to be 31, so months other than 31-day ones are encoded
    # the way the training features encode them.
    dow = np.full((n_series, 1), test_day.weekday() / 6)
    dom = np.full((n_series, 1), test_day.day / pd.Timestamp(test_day).days_in_month)
    promo_day = promo_test[test_day.strftime('%Y-%m-%d')].astype('int').values.reshape(-1, 1)
    # One new timestep per series, features in the same order as dtrain's last axis.
    add = np.concatenate((scaled_last, promo_day, dow, dom), axis=1).reshape(n_series, 1, 4)
    dtrain = np.append(dtrain, add, axis=1)
    return dtrain, dtrain[:, -lookback:]

In [54]:
# Roll the forecast forward over 16 consecutive days starting at test_start: each
# day's prediction becomes the "previous day's sale" input of the following day.
# NOTE(review): this cell extends dtrain and y_l on every execution, so running it
# twice without re-running the cells that create them accumulates extra steps.
for test_day in (test_start + timedelta(days=offset) for offset in range(16)):
    dtrain, X_test = prep_test(dtrain, promo_test, y_last, test_day)
    y_last = model.predict(X_test)
    y_l.append(y_last)

In [None]:
# print("Validation mse:", mean_squared_error(
#     y_val, model.predict(X_val))**0.5)

In [55]:
print("Making submission...")
# Stack the per-day prediction arrays, drop the singleton axis and put series on
# rows and forecast days on columns.
y_test = np.array(y_l).squeeze().T
# Wide -> long: one row per (series, date) with a single unit_sales column.
df_preds = (
    pd.DataFrame(
        y_test,
        index=df_train.index,
        columns=pd.date_range("2017-08-16", periods=16),
    )
    .stack()
    .to_frame("unit_sales")
)
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

Making submission...


In [56]:
# Attach predictions to the test ids; test rows without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Invert the log1p applied when loading unit_sales, then bound the result to [0, 1000].
submission["unit_sales"] = np.expm1(submission["unit_sales"]).clip(lower=0, upper=1000)
submission.to_csv('lstm4.csv', index=None, float_format='%.4f')

In [None]:
# predict unknown items by class average

In [None]:
# # add class
# df_preds2 = df_preds.unstack(-1)
# item_idx = df_preds2.index.get_level_values(1)
# df_preds2['class'] = items.reindex(item_idx)['class'].values
# df_preds2['family'] = items.reindex(item_idx)['family'].values

In [None]:
# class_pred = df_preds2.reset_index().drop(['item_nbr','family'], axis = 1).groupby(['store_nbr', 'class']).agg('mean')
# class_pred = class_pred.stack()

In [None]:
# # add family
# family_pred = df_preds2.reset_index().drop(['item_nbr','class'], axis = 1).groupby(['store_nbr', 'family']).agg('mean')
# family_pred = family_pred.stack()

In [None]:
# submission = df_test[["id"]].join(df_preds, how="left")

In [None]:
# item_idx = submission.index.get_level_values(1)

In [None]:
# items = pd.read_csv("items.csv").set_index("item_nbr")

In [None]:
# submission['class'] = items.reindex(item_idx)['class'].values
# submission['family'] = items.reindex(item_idx)['family'].values

In [None]:
# class_pred.rename(columns = {'unit_sales': 'class_sales'}, inplace = True)
# submission = submission.reset_index().set_index(['store_nbr','class','date']).merge(class_pred, left_index = True,
#                                                                       right_index = True, how = 'left')
# submission['unit_sales'].fillna(submission['class_sales'], inplace = True)
# submission.drop('class_sales', axis = 1, inplace = True)

In [None]:
# family_pred.rename(columns = {'unit_sales': 'family_sales'}, inplace = True)
# submission = submission.reset_index().set_index(['store_nbr','family','date']).merge(family_pred, left_index = True,
#                                                                       right_index = True, how = 'left')
# submission['unit_sales'].fillna(submission['family_sales'], inplace = True)
# submission.drop('family_sales', axis = 1, inplace = True)

In [None]:
# submission = submission.reset_index().drop(['family','class','store_nbr','item_nbr','date'], axis = 1).set_index(['id']).sort_index().fillna(0)

In [None]:
# submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# submission.to_csv('default+meanclass.csv', float_format='%.4f')