In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Model, Sequential
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM
from keras import callbacks
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers.merge import concatenate
from keras.layers import Input
from keras.layers import Merge
from keras.layers.embeddings import Embedding
import h5py
import gc

Using TensorFlow backend.


In [2]:
# Load the training data, keeping only columns 1-5 of train.csv.
df_train = pd.read_csv(
    'train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    # unit_sales is stored as log1p(value); non-positive values become 0.
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    # Skip file rows 1..66458908 so the header (row 0) is kept; the author's
    # note says the remaining rows start at 2016-01-01.
    skiprows=range(1, 66458909)  # 2016-01-01
)

In [3]:
# Load columns 0-4 of test.csv and index the rows by (store_nbr, item_nbr, date)
# so they can later be unstacked by date and joined against predictions.
df_test = pd.read_csv(
    "test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [4]:
# Item metadata, keyed by item_nbr.
items = pd.read_csv("items.csv").set_index("item_nbr")

In [5]:
# Store metadata, keyed by store_nbr.
stores = pd.read_csv("stores.csv").set_index("store_nbr")

In [6]:
# Replace the item family labels with integer codes.
# NOTE: `le` is a shared, reused encoder; after each fit_transform it only
# remembers the classes of the column it was fitted on last.
le = LabelEncoder()
items['family'] = le.fit_transform(items['family'].values)

In [7]:
# Integer-encode the store attributes one column at a time, refitting the
# shared encoder for each column (city, then state, then type).
for store_col in ('city', 'state', 'type'):
    stores[store_col] = le.fit_transform(stores[store_col].values)

In [8]:
# Keep only the rows dated 2017-01-01 or later, then release the larger frame.
# pd.Timestamp is used instead of the pd.datetime alias, which is deprecated
# and absent from newer pandas releases; the comparison result is the same.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
del df_train

In [9]:
# Pivot the promotion flag to wide form: one row per (store_nbr, item_nbr),
# one column per date; combinations with no record count as "not on promotion".
promo_2017_train = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)

In [10]:
# Drop the outer 'onpromotion' column level so the columns are plain dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
# Same wide layout for the test period.
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align test rows to the training (store, item) index; pairs missing from the
# test file are treated as not on promotion.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
# Train and test dates side by side, so promotion columns extend past the
# last training date.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [11]:
# Pivot unit_sales to wide form (rows: store_nbr/item_nbr, columns: dates);
# days without a record count as zero sales.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the column MultiIndex.
df_2017.columns = df_2017.columns.get_level_values(1)

In [12]:
# Repeat the item / store metadata so that row k describes row k of df_2017
# (level 1 of its index is item_nbr, level 0 is store_nbr). Later cells rely
# on this positional alignment when they use `.values` or `reset_index()`.
items = items.reindex(df_2017.index.get_level_values(1))
stores = stores.reindex(df_2017.index.get_level_values(0))

In [13]:
# Per-item totals: sum every date column over all rows sharing an item_nbr
# (i.e. across stores), for both sales and promotion flags.
df_2017_item = df_2017.groupby('item_nbr')[df_2017.columns].sum()
promo_2017_item = promo_2017.groupby('item_nbr')[promo_2017.columns].sum()

In [14]:
# Sales totals per (item class, store).
df_2017_store_class = df_2017.reset_index()
# Positional assignment: `items` was reindexed to df_2017's row order above.
df_2017_store_class['class'] = items['class'].values
# Remember each original row's (class, store_nbr) pair; used later to
# broadcast the aggregated features back onto the (store, item) rows.
df_2017_store_class_index = df_2017_store_class[['class', 'store_nbr']]
df_2017_store_class = df_2017_store_class.groupby(['class', 'store_nbr'])[df_2017.columns].sum()

In [15]:
# Promotion counts per (item class, store), built the same way as the sales
# aggregate.
df_2017_promo_store_class = promo_2017.reset_index()
# Positional assignment — assumes promo_2017 rows are in the same order as
# df_2017 (both are pivoted from the same frame); TODO confirm.
df_2017_promo_store_class['class'] = items['class'].values
df_2017_promo_store_class_index = df_2017_promo_store_class[['class', 'store_nbr']]
df_2017_promo_store_class = df_2017_promo_store_class.groupby(['class', 'store_nbr'])[promo_2017.columns].sum()

In [16]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window defined relative to ``dt``.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart. Every date must exist as a column of ``df``.
    """
    window_start = dt - timedelta(days=minus)
    window_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_dates]

In [17]:
def prepare_dataset(df, promo_df, t2017, is_train=True, name_prefix=None):
    """Build the feature frame (and optionally the targets) for one anchor date.

    Parameters
    ----------
    df : DataFrame
        Wide frame with one row per series and one column per calendar day;
        its values are the quantity being summarised (sales in this notebook).
    promo_df : DataFrame
        Same layout holding promotion flags/counts. It must contain columns
        from 140 days before ``t2017`` through 15 days after it.
    t2017 : datetime.date
        Anchor date. History windows end the day before it; the targets are
        the 16 days starting on it.
    is_train : bool
        When True, return ``(X, y)`` where ``y`` is ``df`` over the 16 days
        from ``t2017``. When False, return only ``X``.
    name_prefix : str or None
        Prefix added to every column name. Only applied when ``is_train`` is
        False (the training branch returns before the rename).

    Notes
    -----
    The key insertion order of ``X`` defines the column order of the result,
    and later cells slice these frames by position — do not reorder the keys.
    """
    # get_timespan(promo_df, future_anchor, 15, n) covers n days starting at t2017 + 1.
    future_anchor = t2017 + timedelta(days=16)

    X = {
        "promo_14_2017": get_timespan(promo_df, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_df, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_df, t2017, 140, 140).sum(axis=1).values,
        "promo_3_2017_aft": get_timespan(promo_df, future_anchor, 15, 3).sum(axis=1).values,
        "promo_7_2017_aft": get_timespan(promo_df, future_anchor, 15, 7).sum(axis=1).values,
        "promo_14_2017_aft": get_timespan(promo_df, future_anchor, 15, 14).sum(axis=1).values,
    }

    # Rolling statistics over windows ending at the anchor (no suffix) and at
    # one week before the anchor (suffix "_2").
    for shift, suffix in ((0, ''), (7, '_2')):
        window_end = t2017 - timedelta(days=shift)
        for i in [3, 7, 14, 30, 60, 140]:
            tmp = get_timespan(df, window_end, i, i)
            X['diff_%s_mean%s' % (i, suffix)] = tmp.diff(axis=1).mean(axis=1).values
            # Weight 0.9**k for the day k steps before the end of the window.
            X['mean_%s_decay%s' % (i, suffix)] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
            X['mean_%s%s' % (i, suffix)] = tmp.mean(axis=1).values
            X['median_%s%s' % (i, suffix)] = tmp.median(axis=1).values
            X['min_%s%s' % (i, suffix)] = tmp.min(axis=1).values
            X['max_%s%s' % (i, suffix)] = tmp.max(axis=1).values
            X['std_%s%s' % (i, suffix)] = tmp.std(axis=1).values

    # Counts and positions of days with sales / promotions in each window.
    for i in [7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017, i, i)
        X['has_sales_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_sales_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_sales_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

        tmp = get_timespan(promo_df, t2017, i, i)
        X['has_promo_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_promo_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_promo_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

    # Same promotion summary for the 15 days after the anchor.
    future_len = 15
    tmp = get_timespan(promo_df, future_anchor, 15, future_len)
    X['has_promo_days_in_after_15_days'] = (tmp > 0).sum(axis=1).values
    # Fix: subtract from the window length (15), not from the loop variable
    # left over from the block above (which was 140 at this point).
    X['last_has_promo_day_in_after_15_days'] = future_len - ((tmp > 0) * np.arange(future_len)).max(axis=1).values
    X['first_has_promo_day_in_after_15_days'] = ((tmp > 0) * np.arange(future_len, 0, -1)).max(axis=1).values

    # Raw values of each of the 15 days preceding the anchor.
    for i in range(1, 16):
        X['day_%s_2017' % i] = get_timespan(df, t2017, i, 1).values.ravel()

    # Means over the same weekday: last 4 and last 20 occurrences.
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df, t2017, 140-i, 20, freq='7D').mean(axis=1).values

    # Per-day promotion flag from 16 days before to 15 days after the anchor.
    for i in range(-16, 16):
        # pd.Timestamp: a plain datetime.date key is not accepted for a
        # DatetimeIndex column lookup by newer pandas releases.
        day = pd.Timestamp(t2017 + timedelta(days=i))
        X["promo_{}".format(i)] = promo_df[day].values.astype(np.uint8)

    X = pd.DataFrame(X)

    if is_train:
        y = df[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
    return X

In [18]:
print("Preparing dataset...")
num_days = 31
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
# One block of training samples per anchor day: t2017 plus 0..num_days-1 days.
for day_offset in range(num_days):
    delta = timedelta(days=1 * day_offset)
    anchor = t2017 + delta
    X_tmp, y_tmp = prepare_dataset(df_2017, promo_2017, anchor)

    # Per-item aggregate features, broadcast back to the (store, item) rows.
    X_tmp2 = prepare_dataset(df_2017_item, promo_2017_item, anchor, is_train=False, name_prefix='item')
    X_tmp2.index = df_2017_item.index
    X_tmp2 = X_tmp2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    # Per-(class, store) aggregate features, broadcast the same way.
    X_tmp3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, anchor, is_train=False, name_prefix='store_class')
    X_tmp3.index = df_2017_store_class.index
    X_tmp3 = X_tmp3.reindex(df_2017_store_class_index).reset_index(drop=True)

    X_tmp = pd.concat([X_tmp, X_tmp2, X_tmp3, items.reset_index(), stores.reset_index()], axis=1)

    # Calendar position of each of the 16 forecast days (constant per block).
    n_rows = X_tmp.shape[0]
    for horizon in range(16):
        target_day = anchor + timedelta(days=horizon)
        X_tmp['dayofmonth_%s' % horizon] = np.repeat(target_day.day, n_rows)
        X_tmp['dayofweek_%s' % horizon] = np.repeat(target_day.weekday(), n_rows)

    X_l.append(X_tmp)
    y_l.append(y_tmp)

Preparing dataset...


In [19]:
# Stack the per-anchor blocks into one training set. The row index repeats
# for every block because each block was built with its own 0..n-1 index.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

In [20]:
# Relabel day-of-month 31 as 30 in dayofmonth_0 (only the first anchor,
# 2017-05-31, produces a 31 in this column).
X_train.loc[X_train['dayofmonth_0'] == 31, ['dayofmonth_0']] = 30

In [21]:
# The per-anchor lists are no longer needed once concatenated.
del X_l, y_l

In [22]:
# Validation set: a single anchor at 2017-07-26.
X_val, y_val = prepare_dataset(df_2017, promo_2017, date(2017, 7, 26))
# Per-item aggregate features, broadcast back onto the (store, item) rows by
# looking up each row's item_nbr.
X_val2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 7, 26), is_train=False, name_prefix='item')
X_val2.index = df_2017_item.index
X_val2 = X_val2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

In [23]:
# Per-(class, store) aggregate features for the validation anchor, broadcast
# back to the original rows via the saved (class, store_nbr) pairs.
X_val3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 7, 26), is_train=False, name_prefix='store_class')
X_val3.index = df_2017_store_class.index
X_val3 = X_val3.reindex(df_2017_store_class_index).reset_index(drop=True)

In [24]:
# Same column layout as the training blocks: row features, item features,
# store-class features, then item and store metadata.
X_val = pd.concat([X_val, X_val2, X_val3, items.reset_index(), stores.reset_index()], axis=1)

In [25]:
# Calendar position of each of the 16 validation forecast days.
for i in range(16):
    target_day = date(2017, 7, 26) + timedelta(days=i)
    X_val['dayofmonth_%s' % i] = np.repeat(target_day.day, X_val.shape[0])
    X_val['dayofweek_%s' % i] = np.repeat(target_day.weekday(), X_val.shape[0])

In [26]:
# Relabel day-of-month 31 as 30 in the column that actually contains it.
# For the 2017-07-26 anchor that is dayofmonth_5 (July 31). The previous
# version tested dayofmonth_5 but wrote to dayofmonth_0, which replaced the
# correct day (26) with 30 on every row and left the 31 untouched.
X_val.loc[X_val['dayofmonth_5'] == 31, ['dayofmonth_5']] = 30

In [27]:
# Test set: a single anchor at 2017-08-16; no targets exist, so only features
# are built (is_train=False).
X_test = prepare_dataset(df_2017, promo_2017, date(2017, 8, 16), is_train=False)
# Per-item aggregate features, broadcast back onto the (store, item) rows.
X_test2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 8, 16), is_train=False, name_prefix='item')
X_test2.index = df_2017_item.index
X_test2 = X_test2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

In [28]:
# Per-(class, store) aggregate features for the test anchor, broadcast back
# to the original rows via the saved (class, store_nbr) pairs.
X_test3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 8, 16), is_train=False, name_prefix='store_class')
X_test3.index = df_2017_store_class.index
X_test3 = X_test3.reindex(df_2017_store_class_index).reset_index(drop=True)

In [29]:
# Same column layout as the training and validation frames.
X_test = pd.concat([X_test, X_test2, X_test3, items.reset_index(), stores.reset_index()], axis=1)

In [30]:
# Calendar position of each of the 16 test forecast days.
for i in range(16):
    target_day = date(2017, 8, 16) + timedelta(days=i)
    X_test['dayofmonth_%s' % i] = np.repeat(target_day.day, X_test.shape[0])
    X_test['dayofweek_%s' % i] = np.repeat(target_day.weekday(), X_test.shape[0])

In [31]:
# Relabel day-of-month 31 as 30 in the column that actually contains it.
# For the 2017-08-16 anchor that is dayofmonth_15 (August 31). The previous
# version tested dayofmonth_15 but wrote to dayofmonth_0, which replaced the
# correct day (16) with 30 on every row and left the 31 untouched.
X_test.loc[X_test['dayofmonth_15'] == 31, ['dayofmonth_15']] = 30

In [32]:
# Drop the aggregate frames now that all three feature sets are built, and
# ask the garbage collector to reclaim them.
del df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
gc.collect()

2317

## Categorical Embedding

In [33]:
# Per-row training weight: 1.25 where perishable == 1, else 1.0; the item
# column is repeated once per training anchor day to match X_train's rows.
perishable_all_days = pd.concat([items["perishable"]] * num_days)
sample_weights = np.array(perishable_all_days * 0.25 + 1)

In [34]:
import copy

In [35]:
# Pull the ten categorical columns out by position as a plain array.
# split_features treats them, in order, as item_nbr, family, class, store_nbr,
# city, state, type, cluster, dayofmonth_0, dayofweek_0 (position 555 is
# skipped) — the positions depend on the exact feature layout built above.
X_train_dnn = copy.copy(X_train.iloc[:,[552,553,554,556,557,558,559,560,561,562]]).values

In [36]:
# Same ten categorical columns, by position, for the validation frame.
X_val_dnn = copy.copy(X_val.iloc[:,[552,553,554,556,557,558,559,560,561,562]]).values

In [37]:
# One LabelEncoder per categorical column, fitted on the training values only.
# `les[k]` is kept so embedding sizes and the original labels can be recovered
# later; both arrays are overwritten in place with the integer codes.
les = []
for i in range(X_train_dnn.shape[1]):
    le = LabelEncoder()
    le.fit(X_train.iloc[:,[552,553,554,556,557,558,559,560,561,562]].iloc[:, i])
    les.append(le)
    X_train_dnn[:, i] = le.transform(X_train_dnn[:, i])
    X_val_dnn[:, i] = le.transform(X_val_dnn[:, i])

In [38]:
def split_features(X):
    """Split a categorical matrix into ten single-column arrays.

    Column k of the last axis becomes element k of the returned list, keeping
    a trailing axis of length 1. The expected column order is: item_nbr,
    family, class, store_nbr, city, state, type, cluster, dayofmonth_0,
    dayofweek_0.
    """
    n_categorical = 10
    return [X[..., [col]] for col in range(n_categorical)]

In [39]:
class NN_with_EntityEmbedding(object):
    """Small network that learns one embedding per categorical column.

    Constructing an instance builds the Keras model and immediately fits it;
    the trained model is available as ``self.model``.

    Relies on names defined elsewhere in the notebook:
    ``les`` (one fitted encoder per categorical column; ``classes_`` gives the
    vocabulary size), ``split_features`` and ``sample_weights``.
    """

    # Embedding width per categorical input, in split_features order:
    # item_nbr, family, class, store_nbr, city, state, type, cluster,
    # dayofmonth_0, dayofweek_0.
    EMBEDDING_DIMS = (50, 16, 50, 27, 11, 8, 5, 8, 15, 4)

    def __init__(self, X_train, y_train, X_val, y_val):
        self.nb_epoch = 2
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        """Turn the categorical matrix into the list of per-input arrays."""
        return split_features(X)

    def __build_keras_model(self):
        """Create and compile the model: ten embeddings -> 256 -> 128 -> 1."""
        inputs = []
        embedded = []
        # Layers are created in the same order as the inputs so that the
        # embedding matrices come out of the model in column order, which the
        # cells reading model.get_weights() depend on.
        for position, width in enumerate(self.EMBEDDING_DIMS):
            vocab_size = len(les[position].classes_)
            cat_in = Input(shape=(1,))
            cat_out = Embedding(vocab_size, width, input_length=1)(cat_in)
            cat_out = Reshape(target_shape=(width,))(cat_out)
            inputs.append(cat_in)
            embedded.append(cat_out)

        concatenated = concatenate(embedded)
        out = Dense(256, activation='relu', kernel_initializer='uniform')(concatenated)
        out1 = Dense(128, activation='relu', kernel_initializer='uniform')(out)
        out2 = Dense(1, activation='relu', kernel_initializer='uniform')(out1)

        self.model = Model(inputs, out2)

        # This is a regression on a continuous target, so report MSE rather
        # than classification accuracy (matches the main model's compile call).
        self.model.compile(loss='mse', optimizer='adam',
                           metrics=['mse'])

    def fit(self, X_train, y_train, X_val, y_val):
        """Train for ``self.nb_epoch`` epochs using the global sample_weights."""
        self.model.fit(self.preprocessing(X_train), y_train,
                       validation_data=(self.preprocessing(X_val), y_val),
                       epochs=self.nb_epoch, batch_size=512, sample_weight=sample_weights
                       )

In [40]:
# Train the embedding network against the first forecast day only (target
# column 0); constructing the object runs the fit.
dnn = NN_with_EntityEmbedding(X_train_dnn, y_train[:,[0]], X_val_dnn, y_val[:,[0]])

Train on 5192965 samples, validate on 167515 samples
Epoch 1/2
Epoch 2/2


In [41]:
# All trained weight arrays; the following cells treat entries 0-9 as the ten
# embedding matrices in input order.
weights = dnn.model.get_weights()

In [42]:
# weights[0] is treated as the item_nbr embedding matrix: name its columns
# item_nbr0..item_nbrN and index its rows by the original item numbers.
item_nbr_embeddings = pd.DataFrame(weights[0])
item_nbr_embeddings.columns = ['item_nbr%d' % col for col in range(item_nbr_embeddings.shape[1])]
item_nbr_embeddings['item_nbr'] = les[0].inverse_transform(np.unique(X_train_dnn[..., [0]]))
item_nbr_embeddings = item_nbr_embeddings.set_index('item_nbr')

In [43]:
# weights[1] is treated as the family embedding matrix, indexed by the
# original family codes.
item_family_embeddings = pd.DataFrame(weights[1])
item_family_embeddings.columns = ['family%d' % col for col in range(item_family_embeddings.shape[1])]
item_family_embeddings['family'] = les[1].inverse_transform(np.unique(X_train_dnn[..., [1]]))
item_family_embeddings = item_family_embeddings.set_index('family')

In [44]:
# weights[2] is treated as the item class embedding matrix, indexed by the
# original class values.
item_class_embeddings = pd.DataFrame(weights[2])
item_class_embeddings.columns = ['class%d' % col for col in range(item_class_embeddings.shape[1])]
item_class_embeddings['class'] = les[2].inverse_transform(np.unique(X_train_dnn[..., [2]]))
item_class_embeddings = item_class_embeddings.set_index('class')

In [45]:
# weights[3] is treated as the store_nbr embedding matrix, indexed by the
# original store numbers.
store_nbr_embeddings = pd.DataFrame(weights[3])
store_nbr_embeddings.columns = ['store_nbr%d' % col for col in range(store_nbr_embeddings.shape[1])]
store_nbr_embeddings['store_nbr'] = les[3].inverse_transform(np.unique(X_train_dnn[..., [3]]))
store_nbr_embeddings = store_nbr_embeddings.set_index('store_nbr')

In [46]:
# weights[4] is treated as the city embedding matrix, indexed by the original
# city codes.
store_city_embeddings = pd.DataFrame(weights[4])
store_city_embeddings.columns = ['city%d' % col for col in range(store_city_embeddings.shape[1])]
store_city_embeddings['city'] = les[4].inverse_transform(np.unique(X_train_dnn[..., [4]]))
store_city_embeddings = store_city_embeddings.set_index('city')

In [47]:
# weights[5] is treated as the state embedding matrix, indexed by the original
# state codes.
store_state_embeddings = pd.DataFrame(weights[5])
store_state_embeddings.columns = ['state%d' % col for col in range(store_state_embeddings.shape[1])]
store_state_embeddings['state'] = les[5].inverse_transform(np.unique(X_train_dnn[..., [5]]))
store_state_embeddings = store_state_embeddings.set_index('state')

In [48]:
# weights[6] is treated as the store type embedding matrix, indexed by the
# original type codes.
store_type_embeddings = pd.DataFrame(weights[6])
store_type_embeddings.columns = ['type%d' % col for col in range(store_type_embeddings.shape[1])]
store_type_embeddings['type'] = les[6].inverse_transform(np.unique(X_train_dnn[..., [6]]))
store_type_embeddings = store_type_embeddings.set_index('type')

In [49]:
# weights[7] is treated as the store cluster embedding matrix, indexed by the
# original cluster values.
store_cluster_embeddings = pd.DataFrame(weights[7])
store_cluster_embeddings.columns = ['cluster%d' % col for col in range(store_cluster_embeddings.shape[1])]
store_cluster_embeddings['cluster'] = les[7].inverse_transform(np.unique(X_train_dnn[..., [7]]))
store_cluster_embeddings = store_cluster_embeddings.set_index('cluster')

In [50]:
# weights[8] is treated as the day-of-month embedding matrix, indexed by the
# day-of-month values seen in the training column.
dom_embeddings = pd.DataFrame(weights[8])
dom_embeddings.columns = ['dom%d' % col for col in range(dom_embeddings.shape[1])]
dom_embeddings['dom'] = les[8].inverse_transform(np.unique(X_train_dnn[..., [8]]))
dom_embeddings = dom_embeddings.set_index('dom')

In [51]:
# weights[9] is treated as the day-of-week embedding matrix, indexed by the
# weekday values seen in the training column.
dow_embeddings = pd.DataFrame(weights[9])
dow_embeddings.columns = ['dow%d' % col for col in range(dow_embeddings.shape[1])]
dow_embeddings['dow'] = les[9].inverse_transform(np.unique(X_train_dnn[..., [9]]))
dow_embeddings = dow_embeddings.set_index('dow')

In [52]:
# Remember the base column names now: the loops below overwrite the frames'
# columns with a per-horizon suffix on every iteration and rebuild each name
# from these originals.
dom_cols = dom_embeddings.columns.values
dow_cols = dow_embeddings.columns.values

In [53]:
# Append the learned embedding columns to the training frame, one categorical
# key at a time (left joins keep every training row).
X_train_1 = X_train
for emb_frame, key_col in [
    (item_nbr_embeddings, 'item_nbr'),
    (item_family_embeddings, 'family'),
    (item_class_embeddings, 'class'),
    (store_nbr_embeddings, 'store_nbr'),
    (store_city_embeddings, 'city'),
    (store_state_embeddings, 'state'),
    (store_type_embeddings, 'type'),
    (store_cluster_embeddings, 'cluster'),
]:
    X_train_1 = X_train_1.join(emb_frame, on=key_col, how="left")

In [54]:
# For each forecast horizon j, attach the day-of-month embedding of that day;
# the embedding columns get a "_j" suffix so the 16 copies stay distinct.
for j in range(16):
    dom_embeddings.columns = ['%s_%s' % (base, j) for base in dom_cols]
    X_train_1 = X_train_1.join(dom_embeddings, on='dayofmonth_%s' % j, how="left")

In [55]:
# For each forecast horizon j, attach the day-of-week embedding of that day;
# the embedding columns get a "_j" suffix so the 16 copies stay distinct.
for j in range(16):
    dow_embeddings.columns = ['%s_%s' % (base, j) for base in dow_cols]
    X_train_1 = X_train_1.join(dow_embeddings, on='dayofweek_%s' % j, how="left")

In [56]:
# Positional split of the joined frame:
#   columns 0-551   -> X_train         (engineered numeric features)
#   columns 593-767 -> X_train_itmstr  (175 columns = 50+16+50+27+11+8+5+8,
#                                       the item/store embedding widths)
#   columns 768+    -> X_train_date    (per-horizon date embeddings)
# Columns 552-592 are not carried forward — presumably the raw categorical
# ids and calendar columns; verify if the feature layout changes.
X_train = X_train_1.iloc[:,0:552]
X_train_itmstr = X_train_1.iloc[:,593:768]
X_train_date = X_train_1.iloc[:,768:]

In [57]:
# Positions of the date-embedding columns whose name ends in "_15".
# NOTE(review): no later use of idx15 is visible in this notebook.
idx15 = [i for i, item in enumerate(X_train_date.columns.values) if item.endswith('_15')]

In [58]:
# Append the learned embedding columns to the validation frame, one
# categorical key at a time (left joins keep every validation row).
X_val_1 = X_val
for emb_frame, key_col in [
    (item_nbr_embeddings, 'item_nbr'),
    (item_family_embeddings, 'family'),
    (item_class_embeddings, 'class'),
    (store_nbr_embeddings, 'store_nbr'),
    (store_city_embeddings, 'city'),
    (store_state_embeddings, 'state'),
    (store_type_embeddings, 'type'),
    (store_cluster_embeddings, 'cluster'),
]:
    X_val_1 = X_val_1.join(emb_frame, on=key_col, how="left")

In [59]:
# Attach the day-of-month embedding for each forecast horizon j, renaming the
# embedding columns with a "_j" suffix first.
for j in range(16):
    dom_embeddings.columns = [i+("_%s" % j) for i in dom_cols]
    X_val_1 = X_val_1.join(dom_embeddings, on = ('dayofmonth_%s' % j), how = "left")

# Same for the day-of-week embedding.
for j in range(16):
    dow_embeddings.columns = [i+("_%s" % j) for i in dow_cols]
    X_val_1 = X_val_1.join(dow_embeddings, on = ('dayofweek_%s' % j), how = "left")

# Positional split, using the same column positions as the training frame.
X_val = X_val_1.iloc[:,0:552]
X_val_itmstr = X_val_1.iloc[:,593:768]
X_val_date = X_val_1.iloc[:,768:]

In [60]:
# Append the learned embedding columns to the test frame, one categorical key
# at a time (left joins keep every test row).
X_test_1 = X_test
for emb_frame, key_col in [
    (item_nbr_embeddings, 'item_nbr'),
    (item_family_embeddings, 'family'),
    (item_class_embeddings, 'class'),
    (store_nbr_embeddings, 'store_nbr'),
    (store_city_embeddings, 'city'),
    (store_state_embeddings, 'state'),
    (store_type_embeddings, 'type'),
    (store_cluster_embeddings, 'cluster'),
]:
    X_test_1 = X_test_1.join(emb_frame, on=key_col, how="left")

In [61]:
# Attach the day-of-month embedding for each forecast horizon j, renaming the
# embedding columns with a "_j" suffix first.
for j in range(16):
    dom_embeddings.columns = [i+("_%s" % j) for i in dom_cols]
    X_test_1 = X_test_1.join(dom_embeddings, on = ('dayofmonth_%s' % j), how = "left")

# Same for the day-of-week embedding.
for j in range(16):
    dow_embeddings.columns = [i+("_%s" % j) for i in dow_cols]
    X_test_1 = X_test_1.join(dow_embeddings, on = ('dayofweek_%s' % j), how = "left")

# Positional split, using the same column positions as the training frame.
X_test = X_test_1.iloc[:,0:552]
X_test_itmstr = X_test_1.iloc[:,593:768]
X_test_date = X_test_1.iloc[:,768:]

In [62]:
# The wide joined frames are no longer needed once they have been split.
del X_train_1, X_val_1, X_test_1
gc.collect()

11693

## Model

In [63]:
# Extra per-row inputs read from CSV files that this notebook does not create.
# NOTE(review): rows are later concatenated positionally with X_train / X_val /
# X_test, so the files must have matching row order and counts.
df_meta_tr = pd.read_csv("seq_meta_tr.csv")
df_meta_val = pd.read_csv("seq_meta_val.csv")
df_meta_test = pd.read_csv("seq_meta_test.csv")

In [64]:
# Rename the columns to y_0..y_15 (the file must have exactly 16 columns).
df_meta_tr.columns = [("y_%s" % i) for  i in range(16)]

In [65]:
# Rename the columns to y_0..y_15 (the file must have exactly 16 columns).
df_meta_val.columns = [("y_%s" % i) for  i in range(16)]

In [66]:
# Rename the columns to y_0..y_15 (the file must have exactly 16 columns).
df_meta_test.columns = [("y_%s" % i) for  i in range(16)]

In [67]:
# Append the y_0..y_15 columns positionally; the index is reset first because
# X_train's index repeats across anchor days.
X_train = pd.concat([X_train.reset_index(drop=True), df_meta_tr], axis=1)

In [68]:
# Append the y_0..y_15 columns positionally to the validation features.
X_val = pd.concat([X_val.reset_index(drop=True), df_meta_val], axis=1)

In [69]:
# Append the y_0..y_15 columns positionally to the test features.
X_test = pd.concat([X_test.reset_index(drop=True), df_meta_test], axis=1)

In [70]:
# Fit one scaler on the stacked train, validation and test features, so the
# scaling statistics include the validation and test rows.
scaler = StandardScaler()
scaler.fit(pd.concat([X_train, X_val, X_test]))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [71]:
# Standardise in place: `[:]` writes the scaled values back into the existing
# frames, keeping their column names and index.
X_train[:] = scaler.transform(X_train)
X_val[:] = scaler.transform(X_val)
X_test[:] = scaler.transform(X_test)

In [76]:
# Add the item/store embedding columns after scaling, so they are left
# unscaled.
X_train = pd.concat([X_train, X_train_itmstr.reset_index(drop=True)], axis=1)

In [77]:
# Add the (unscaled) item/store embedding columns to the validation features.
X_val = pd.concat([X_val, X_val_itmstr.reset_index(drop=True)], axis=1)

In [78]:
# Add the (unscaled) item/store embedding columns to the test features.
X_test = pd.concat([X_test, X_test_itmstr.reset_index(drop=True)], axis=1)

In [93]:
# Sanity check of the per-step input width: shared features plus the date
# embeddings whose column name ends in "_1". `.values` replaces the deprecated
# DataFrame.as_matrix() and returns the same array.
j = 1
idx = [i for i, item in enumerate(X_train_date.columns.values) if item.endswith('_%s' % j)]
np.concatenate([X_train.values, X_train_date.values[:, idx]], axis=1).shape

(5192965, 762)

In [95]:
def build_model():
    """Return an uncompiled Sequential regressor for one forecast step.

    Architecture: LSTM(768) over inputs of shape (1, 762), followed by six
    Dense -> PReLU -> BatchNormalization -> Dropout stages of decreasing
    width, and a single linear output unit.
    """
    model = Sequential()
    model.add(LSTM(768, input_shape=(1,762)))
    model.add(BatchNormalization())
    model.add(Dropout(.2))

    # (units, dropout rate) for each fully connected stage, in order.
    dense_stages = [(512, .1), (256, .1), (128, .05), (64, .05), (32, .05), (16, .05)]
    for units, drop_rate in dense_stages:
        model.add(Dense(units))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    model.add(Dense(1))

    return model

In [None]:
N_EPOCHS = 2

val_pred = []
test_pred = []
# wtpath = 'weights.hdf5'  # To save best epoch. But need Keras bug to be fixed first.
# Per-row training weight: 1.25 where perishable == 1, else 1.0.
sample_weights=np.array( pd.concat([items["perishable"]] * num_days) * 0.25 + 1 )
# Train one independent model per forecast day (16 in total).
for j in range(16):
    print("=" * 50)
    print("Step %d" % (j+1))
    print("=" * 50)

    # Date-embedding columns belonging to forecast day j (names end in "_j").
    idx = [i for i, item in enumerate(X_train_date.columns.values) if item.endswith('_%s' % j)]

    # `.values` replaces the deprecated DataFrame.as_matrix(); same array.
    X_train_model = np.concatenate([X_train.values, X_train_date.values[:, idx]], axis=1)
    X_val_model = np.concatenate([X_val.values, X_val_date.values[:, idx]], axis=1)
    X_test_model = np.concatenate([X_test.values, X_test_date.values[:, idx]], axis=1)

    # The LSTM expects (samples, timesteps, features); use a single timestep.
    X_train_model = X_train_model.reshape((X_train_model.shape[0], 1, X_train_model.shape[1]))
    X_test_model = X_test_model.reshape((X_test_model.shape[0], 1, X_test_model.shape[1]))
    X_val_model = X_val_model.reshape((X_val_model.shape[0], 1, X_val_model.shape[1]))

    y = y_train[:, j]
    # The model is trained on the mean-centred target; the mean is added back
    # to every prediction below.
    y_mean = y.mean()
    xv = X_val_model
    yv = y_val[:, j]
    model = build_model()
    opt = optimizers.Adam(lr=0.001)
    model.compile(loss='mse', optimizer=opt, metrics=['mse'])

    # Named fit_callbacks so the imported `keras.callbacks` module is not
    # replaced by a list in the notebook namespace.
    fit_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min')
    ]
    model.fit(X_train_model, y - y_mean, batch_size = 65536, epochs = N_EPOCHS, verbose=2,
               sample_weight=sample_weights, validation_data=(xv,yv-y_mean), callbacks=fit_callbacks )
    val_pred.append(model.predict(X_val_model)+y_mean)
    test_pred.append(model.predict(X_test_model)+y_mean)

Step 1
Train on 5192965 samples, validate on 167515 samples
Epoch 1/2
 - 712s - loss: 0.5512 - mean_squared_error: 0.5210 - val_loss: 0.4460 - val_mean_squared_error: 0.4460
Epoch 2/2
 - 679s - loss: 0.4188 - mean_squared_error: 0.3967 - val_loss: 0.4020 - val_mean_squared_error: 0.4020
Step 2
Train on 5192965 samples, validate on 167515 samples
Epoch 1/2
 - 763s - loss: 0.6165 - mean_squared_error: 0.5830 - val_loss: 0.5519 - val_mean_squared_error: 0.5519
Epoch 2/2
 - 753s - loss: 0.4444 - mean_squared_error: 0.4206 - val_loss: 0.4595 - val_mean_squared_error: 0.4595
Step 3


In [None]:
# Validation predictions as an (n_rows, 16) matrix, computed once.
# They get their own name instead of overwriting y_val, so re-running this
# cell still scores against the true validation targets.
val_pred_matrix = np.array(val_pred).squeeze(axis=2).transpose()

# Weighted RMSE over all 16 days (targets are log1p of unit sales);
# rows with perishable == 1 weigh 1.25, the rest 1.0.
weight = items["perishable"] * 0.25 + 1
err = (y_val - val_pred_matrix)**2
err = err.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err))

# Save the validation predictions in long form, converted back from log1p and
# clipped to [0, 1000].
df_preds = pd.DataFrame(
    val_pred_matrix, index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
df_preds.reset_index().to_csv('nn_cv.csv', index=False)

print("Making submission...")
y_test = np.array(test_pred).squeeze(axis=2).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Test rows with no prediction get 0 before the inverse transform, which maps
# them to 0 unit sales.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('nn_sub.csv', float_format='%.4f', index=None)