## Load required libraries and data

In [174]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Model, Sequential
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM
from keras import callbacks
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers.merge import concatenate
from keras.layers import Input
from keras.layers import Merge
from keras.layers.embeddings import Embedding
import h5py
import gc

In [175]:
## Applying a log transform
def _log1p_if_positive(raw_sales):
    """Return log1p of a raw ``unit_sales`` field, or 0 when the value is not positive."""
    sales = float(raw_sales)
    return np.log1p(sales) if sales > 0 else 0


df_train = pd.read_csv(
    'train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_if_positive},
    parse_dates=["date"],
    # Row 0 (the header) is kept; the skipped data rows end where 2016-01-01
    # begins, per the original note on this argument.
    skiprows=range(1, 66458909),
)

In [176]:
# Test rows, keyed by (store_nbr, item_nbr, date) so they can be unstacked by date later.
df_test = pd.read_csv(
    "test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [177]:
# Item metadata, indexed by item_nbr so it can be reindexed against the sales frame.
items = pd.read_csv(
    "items.csv",
).set_index("item_nbr")

In [178]:
# Store metadata, indexed by store_nbr so it can be reindexed against the sales frame.
stores = pd.read_csv(
    "stores.csv",
).set_index("store_nbr")

In [None]:
### Encoding the categorical variables

In [179]:
# Replace the item family values with integer codes.
le = LabelEncoder()
items['family'] = le.fit_transform(items['family'].values)

In [327]:
# Display the encoded family codes as a sanity check.
items['family'].values

array([12, 12, 12, ...,  3,  7, 12])

In [180]:
# Integer-encode each categorical store attribute; `le` is re-fitted per column,
# so afterwards it only remembers the mapping of the last one ('type').
for store_column in ('city', 'state', 'type'):
    stores[store_column] = le.fit_transform(stores[store_column].values)

In [181]:
## filtering for 2017 data only
# `pd.datetime` was only an alias of the stdlib class and has been removed from
# newer pandas releases; `pd.Timestamp` compares identically and exists everywhere.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
# The full training frame is no longer needed; drop the reference to free memory.
del df_train

In [182]:
## Unstacking promos into wide format
# One row per (store_nbr, item_nbr), one column per date; missing days count as "no promo".
promo_2017_train = df_2017.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
promo_2017_train = promo_2017_train.unstack(level=-1)
promo_2017_train = promo_2017_train.fillna(False)

In [183]:
# Flatten the two-level column index left by unstack down to the date level.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
# Same wide layout for the test-period promotions.
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align test rows to the training (store, item) pairs; pairs absent from test get False.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
# Train dates followed by test dates, side by side.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [184]:
## Unstacking train sales into wide format
# NOTE: this rebinds df_2017 from the long frame to the wide one, so the cell is
# not re-runnable without re-running the filtering cell first.
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
# Flatten the two-level column index down to the date level.
df_2017.columns = df_2017.columns.get_level_values(1)

In [185]:
## reindexing to have order and size similar to df_2017
# After this, row k of `items` / `stores` describes row k of df_2017
# (index level 1 is item_nbr, level 0 is store_nbr).
items = items.reindex(df_2017.index.get_level_values(1))
stores = stores.reindex(df_2017.index.get_level_values(0))

In [186]:
## Aggregating at item nbr level
# Sum sales and promo flags over all stores for each item (one row per item_nbr).
df_2017_item = df_2017.groupby('item_nbr')[df_2017.columns].sum()
promo_2017_item = promo_2017.groupby('item_nbr')[promo_2017.columns].sum()

In [187]:
## Aggregating at store class level
df_2017_store_class = df_2017.reset_index()
# Positional assignment: relies on `items` having been reindexed to df_2017's row order.
df_2017_store_class['class'] = items['class'].values
# Per-row (class, store_nbr) keys, kept to map the aggregates back onto df_2017 rows.
df_2017_store_class_index = df_2017_store_class[['class', 'store_nbr']]
df_2017_store_class = df_2017_store_class.groupby(['class', 'store_nbr'])[df_2017.columns].sum()

In [188]:
# Same (class, store_nbr) aggregation for the promotion flags.
df_2017_promo_store_class = promo_2017.reset_index()
# Positional assignment: assumes promo_2017 rows are in the same order as `items`
# (both derive from the same store/item index) - TODO confirm.
df_2017_promo_store_class['class'] = items['class'].values
df_2017_promo_store_class_index = df_2017_promo_store_class[['class', 'store_nbr']]
df_2017_promo_store_class = df_2017_promo_store_class.groupby(['class', 'store_nbr'])[promo_2017.columns].sum()

### Preparing datasets for model

In [189]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window that starts ``minus`` days before ``dt``.

    Args:
        df: frame whose columns are dates.
        dt: reference date.
        minus: how many days before ``dt`` the window starts.
        periods: number of columns to select.
        freq: spacing between selected dates (pandas frequency string).

    Returns:
        The sub-frame of ``df`` restricted to the selected date columns.
    """
    window_start = dt - timedelta(days=minus)
    wanted_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_dates]

In [190]:
def _add_window_stats(X, tmp, i, suffix=''):
    """Add summary statistics of the ``i``-day window ``tmp`` to ``X``; keys end in ``suffix``."""
    X['diff_%s_mean%s' % (i, suffix)] = tmp.diff(axis=1).mean(axis=1).values
    # Decayed sum: the newest day has weight 0.9**0, the oldest 0.9**(i-1).
    X['mean_%s_decay%s' % (i, suffix)] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
    X['mean_%s%s' % (i, suffix)] = tmp.mean(axis=1).values
    X['median_%s%s' % (i, suffix)] = tmp.median(axis=1).values
    X['min_%s%s' % (i, suffix)] = tmp.min(axis=1).values
    X['max_%s%s' % (i, suffix)] = tmp.max(axis=1).values
    X['std_%s%s' % (i, suffix)] = tmp.std(axis=1).values


def _add_activity_stats(X, tmp, i, kind):
    """Add count / recency features of positive days in the ``i``-day window ``tmp``."""
    active = tmp > 0
    X['has_%s_days_in_last_%s' % (kind, i)] = active.sum(axis=1).values
    X['last_has_%s_day_in_last_%s' % (kind, i)] = i - (active * np.arange(i)).max(axis=1).values
    X['first_has_%s_day_in_last_%s' % (kind, i)] = (active * np.arange(i, 0, -1)).max(axis=1).values


def prepare_dataset(df, promo_df, t2017, is_train=True, name_prefix=None):
    """Build the feature frame (and optionally the targets) for the window starting at ``t2017``.

    Args:
        df: wide sales frame, one column per date.
        promo_df: wide promotion frame with the same rows, one column per date.
        t2017: first day of the 16-day prediction window.
        is_train: when True, also return the 16 target columns starting at ``t2017``.
        name_prefix: optional prefix for the feature names. It is only applied when
            ``is_train`` is False, because the training branch returns first.

    Returns:
        ``(X, y)`` when ``is_train`` is True, otherwise ``X`` alone. ``X`` is a
        DataFrame with one row per row of ``df``; ``y`` is an array with 16 columns.

    Note:
        Keys are inserted in a fixed order on purpose: downstream cells address
        the resulting columns by position.
    """
    X = {
        "promo_14_2017": get_timespan(promo_df, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_df, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_df, t2017, 140, 140).sum(axis=1).values,
        "promo_3_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 3).sum(axis=1).values,
        "promo_7_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 7).sum(axis=1).values,
        "promo_14_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 14).sum(axis=1).values,
    }

    # Sales statistics over the windows that end the day before t2017 ...
    for i in [3, 7, 14, 30, 60, 140]:
        _add_window_stats(X, get_timespan(df, t2017, i, i), i)

    # ... and over the same windows shifted one week further back.
    for i in [3, 7, 14, 30, 60, 140]:
        _add_window_stats(X, get_timespan(df, t2017 + timedelta(days=-7), i, i), i, suffix='_2')

    for i in [7, 14, 30, 60, 140]:
        _add_activity_stats(X, get_timespan(df, t2017, i, i), i, 'sales')
        _add_activity_stats(X, get_timespan(promo_df, t2017, i, i), i, 'promo')

    # Promotion activity over the 15 days that follow t2017.
    tmp = get_timespan(promo_df, t2017 + timedelta(days=16), 15, 15)
    X['has_promo_days_in_after_15_days'] = (tmp > 0).sum(axis=1).values
    # The window is 15 days long, so recency is measured from 15. The original
    # subtracted from the leftover loop variable `i` (140 after the loop above).
    X['last_has_promo_day_in_after_15_days'] = 15 - ((tmp > 0) * np.arange(15)).max(axis=1).values
    X['first_has_promo_day_in_after_15_days'] = ((tmp > 0) * np.arange(15, 0, -1)).max(axis=1).values

    # Raw sales of each of the last 15 days.
    for i in range(1, 16):
        X['day_%s_2017' % i] = get_timespan(df, t2017, i, 1).values.ravel()

    # Mean sales at 7-day spacing over the last 4 and 20 weeks, one feature per offset.
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df, t2017, 140-i, 20, freq='7D').mean(axis=1).values

    # Per-day promo flag from 16 days before to 15 days after t2017. The key is
    # normalised to a Timestamp so the lookup does not depend on the column index
    # accepting plain datetime.date objects.
    for i in range(-16, 16):
        X["promo_{}".format(i)] = promo_df[pd.Timestamp(t2017 + timedelta(days=i))].values.astype(np.uint8)

    X = pd.DataFrame(X)

    if is_train:
        y = df[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
    return X

In [191]:
print("Preparing dataset...")
# 31 consecutive daily anchors starting 2017-05-31, used to train the embedding network.
num_days = 31
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for day_offset in range(num_days):
    delta = timedelta(days=1 * day_offset)
    anchor = t2017 + delta
    X_tmp, y_tmp = prepare_dataset(df_2017, promo_2017, anchor)

    # Item-level features, broadcast back to one row per (store, item).
    X_tmp2 = prepare_dataset(df_2017_item, promo_2017_item, anchor, is_train=False, name_prefix='item')
    X_tmp2.index = df_2017_item.index
    X_tmp2 = X_tmp2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    # (class, store)-level features, broadcast back the same way.
    X_tmp3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, anchor, is_train=False, name_prefix='store_class')
    X_tmp3.index = df_2017_store_class.index
    X_tmp3 = X_tmp3.reindex(df_2017_store_class_index).reset_index(drop=True)

    X_tmp = pd.concat([X_tmp, X_tmp2, X_tmp3, items.reset_index(), stores.reset_index()], axis=1)

    # Calendar features for each of the 16 forecast days. The inner index has its
    # own name so that it no longer shadows the outer loop variable.
    for step in range(16):
        target_day = anchor + timedelta(days=step)
        X_tmp['dayofmonth_%s' % step] = np.repeat(target_day.day, X_tmp.shape[0])
        X_tmp['dayofweek_%s' % step] = np.repeat(target_day.weekday(), X_tmp.shape[0])

    X_l.append(X_tmp)
    y_l.append(y_tmp)

Preparing dataset...


In [192]:
# Stack the per-anchor feature frames and targets row-wise.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

In [193]:
# Fold day-of-month 31 into 30. With daily anchors 2017-05-31..2017-06-30 only
# the first anchor yields 31 in dayofmonth_0. Presumably this keeps the
# day-of-month vocabulary to 1..30 for the embedding - TODO confirm intent.
X_train.loc[X_train['dayofmonth_0'] == 31, ['dayofmonth_0']] = 30

In [194]:
# The per-anchor lists are now duplicated inside X_train / y_train; drop them.
del X_l, y_l

In [195]:
# Validation window starts 2017-07-26; built the same way as one training anchor.
X_val, y_val = prepare_dataset(df_2017, promo_2017, date(2017, 7, 26))
X_val2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 7, 26), is_train=False, name_prefix='item')
X_val2.index = df_2017_item.index
# Broadcast item-level features back to one row per (store, item).
X_val2 = X_val2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

In [196]:
# (class, store)-level validation features, broadcast back to one row per (store, item).
X_val3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 7, 26), is_train=False, name_prefix='store_class')
X_val3.index = df_2017_store_class.index
X_val3 = X_val3.reindex(df_2017_store_class_index).reset_index(drop=True)

In [197]:
# Same column layout as the training frame: base, item, store-class, item meta, store meta.
X_val = pd.concat([X_val, X_val2, X_val3, items.reset_index(), stores.reset_index()], axis=1)

In [198]:
# Calendar features for each of the 16 validation days.
for i in range(16):
    forecast_day = date(2017, 7, 26) + timedelta(days=i)
    X_val['dayofmonth_%s' % i] = np.repeat(forecast_day.day, X_val.shape[0])
    X_val['dayofweek_%s' % i] = np.repeat(forecast_day.weekday(), X_val.shape[0])

In [199]:
# 2017-07-26 + 5 days is July 31, the only day-31 in the validation window; fold it into 30.
X_val.loc[X_val['dayofmonth_5'] == 31, ['dayofmonth_5']] = 30

In [200]:
# Test window starts 2017-08-16; there are no targets, so only features are returned.
X_test = prepare_dataset(df_2017, promo_2017, date(2017, 8, 16), is_train=False)
X_test2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 8, 16), is_train=False, name_prefix='item')
X_test2.index = df_2017_item.index
# Broadcast item-level features back to one row per (store, item).
X_test2 = X_test2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

In [201]:
# (class, store)-level test features, broadcast back to one row per (store, item).
X_test3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 8, 16), is_train=False, name_prefix='store_class')
X_test3.index = df_2017_store_class.index
X_test3 = X_test3.reindex(df_2017_store_class_index).reset_index(drop=True)

In [202]:
# Same column layout as the training frame: base, item, store-class, item meta, store meta.
X_test = pd.concat([X_test, X_test2, X_test3, items.reset_index(), stores.reset_index()], axis=1)

In [203]:
# Calendar features for each of the 16 test days.
for i in range(16):
    forecast_day = date(2017, 8, 16) + timedelta(days=i)
    X_test['dayofmonth_%s' % i] = np.repeat(forecast_day.day, X_test.shape[0])
    X_test['dayofweek_%s' % i] = np.repeat(forecast_day.weekday(), X_test.shape[0])

In [204]:
# 2017-08-16 + 15 days is August 31, the only day-31 in the test window; fold it into 30.
X_test.loc[X_test['dayofmonth_15'] == 31, ['dayofmonth_15']] = 30

## Categorical Embedding

In [205]:
# Weight 1.25 where perishable == 1 and 1.0 where it is 0, repeated once per
# stacked training anchor so the length matches X_train.
sample_weights=np.array( pd.concat([items["perishable"]] * num_days) * 0.25 + 1 )

In [206]:
import copy

In [207]:
# Ten categorical columns selected by position. Presumably item_nbr, family, class,
# store_nbr, city, state, type, cluster, dayofmonth_0, dayofweek_0 (the order
# split_features expects), with 555 skipped - TODO confirm against X_train.columns.
X_train_dnn = copy.copy(X_train.iloc[:,[552,553,554,556,557,558,559,560,561,562]]).values

In [208]:
# Same ten positional columns for the validation frame (must match X_train_dnn's selection).
X_val_dnn = copy.copy(X_val.iloc[:,[552,553,554,556,557,558,559,560,561,562]]).values

In [209]:
# One LabelEncoder per categorical column; `les[k]` later decodes embedding row k.
les = []
# Slice the raw categorical columns once instead of re-slicing the whole
# training frame on every iteration.
categorical_source = X_train.iloc[:,[552,553,554,556,557,558,559,560,561,562]]
for i in range(X_train_dnn.shape[1]):
    le = LabelEncoder()
    le.fit(categorical_source.iloc[:, i])
    les.append(le)
    # Encode in place; validation labels must already be known to the encoder,
    # otherwise transform raises.
    X_train_dnn[:, i] = le.transform(X_train_dnn[:, i])
    X_val_dnn[:, i] = le.transform(X_val_dnn[:, i])
del categorical_source

In [210]:
def split_features(X):
    """Split a 2-D categorical matrix into ten single-column arrays.

    The columns are taken in order: item_nbr, family, class, store_nbr, city,
    state, type, cluster, dayofmonth_0, dayofweek_0. Each returned array keeps
    the trailing axis (one column per array), which is the shape the model's
    ten ``Input(shape=(1,))`` layers are fed with.
    """
    n_categorical_inputs = 10
    return [X[..., [column]] for column in range(n_categorical_inputs)]

In [211]:
class NN_with_EntityEmbedding(object):
    """Small network that learns one embedding per categorical feature.

    Constructing an instance builds the Keras model and immediately trains it.
    The model has ten single-integer inputs (in the order produced by
    ``split_features``); each is embedded, the embeddings are concatenated and
    passed through two dense layers to a single non-negative output.

    Relies on the notebook globals ``les`` (fitted label encoders, used for the
    vocabulary sizes), ``split_features`` and ``sample_weights``.
    """

    # Embedding width per categorical input, in split_features order:
    # item_nbr, family, class, store_nbr, city, state, type, cluster,
    # dayofmonth_0, dayofweek_0.
    _EMBEDDING_WIDTHS = (50, 16, 50, 27, 11, 8, 5, 8, 15, 4)

    def __init__(self, X_train, y_train, X_val, y_val):
        self.nb_epoch = 2
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        """Turn the categorical matrix into the list of per-input arrays Keras expects."""
        X_list = split_features(X)
        return X_list

    def __build_keras_model(self):
        """Create and compile ``self.model``."""
        inputs = []
        branches = []
        # Branches are created in feature order, so the embedding matrices come
        # first, in the same order, in model.get_weights().
        for position, width in enumerate(self._EMBEDDING_WIDTHS):
            branch_in = Input(shape=(1,))
            branch_out = Embedding(len(les[position].classes_), width, input_length=1)(branch_in)
            # Drop the length-1 sequence axis so the branches can be concatenated.
            branch_out = Reshape(target_shape=(width,))(branch_out)
            inputs.append(branch_in)
            branches.append(branch_out)

        concatenated = concatenate(branches)
        out = Dense(256, activation='relu', kernel_initializer='uniform')(concatenated)
        out1 = Dense(128, activation='relu', kernel_initializer='uniform')(out)
        out2 = Dense(1, activation='relu', kernel_initializer='uniform')(out1)

        self.model = Model(inputs, out2)

        # This is a regression on log-sales, so classification accuracy carries
        # no information; report MSE instead.
        self.model.compile(loss='mse', optimizer='adam',
                           metrics=['mse'])

    def fit(self, X_train, y_train, X_val, y_val):
        """Train for ``self.nb_epoch`` epochs, weighting rows by the global ``sample_weights``."""
        self.model.fit(self.preprocessing(X_train), y_train,
                       validation_data=(self.preprocessing(X_val), y_val),
                       epochs=self.nb_epoch, batch_size=512, sample_weight=sample_weights
                       )

In [212]:
# Train the embedding network against the first forecast day only (target column 0).
dnn = NN_with_EntityEmbedding(X_train_dnn, y_train[:,[0]], X_val_dnn, y_val[:,[0]])

Train on 5192965 samples, validate on 167515 samples
Epoch 1/2
Epoch 2/2


## Create entity embedding data frames

In [213]:
# All trained weight arrays; the cells below read entries 0..9 as the ten embedding matrices.
weights = dnn.model.get_weights()

In [214]:
# Learned item_nbr embedding: one row per original item_nbr, columns item_nbr0..item_nbrN.
# Rows follow the encoded label order, so the sorted unique codes are decoded for the index.
item_nbr_embeddings = pd.DataFrame(
    weights[0],
    columns=['item_nbr%d' % k for k in range(weights[0].shape[1])],
    index=pd.Index(les[0].inverse_transform(np.unique(X_train_dnn[..., [0]])), name='item_nbr'),
)

In [215]:
# Learned family embedding: one row per (already integer-coded) family value.
item_family_embeddings = pd.DataFrame(
    weights[1],
    columns=['family%d' % k for k in range(weights[1].shape[1])],
    index=pd.Index(les[1].inverse_transform(np.unique(X_train_dnn[..., [1]])), name='family'),
)

In [216]:
# Learned item class embedding: one row per original class value.
item_class_embeddings = pd.DataFrame(
    weights[2],
    columns=['class%d' % k for k in range(weights[2].shape[1])],
    index=pd.Index(les[2].inverse_transform(np.unique(X_train_dnn[..., [2]])), name='class'),
)

In [217]:
# Learned store_nbr embedding: one row per original store_nbr.
store_nbr_embeddings = pd.DataFrame(
    weights[3],
    columns=['store_nbr%d' % k for k in range(weights[3].shape[1])],
    index=pd.Index(les[3].inverse_transform(np.unique(X_train_dnn[..., [3]])), name='store_nbr'),
)

In [218]:
# Learned city embedding: one row per (integer-coded) city value.
store_city_embeddings = pd.DataFrame(
    weights[4],
    columns=['city%d' % k for k in range(weights[4].shape[1])],
    index=pd.Index(les[4].inverse_transform(np.unique(X_train_dnn[..., [4]])), name='city'),
)

In [219]:
# Learned state embedding: one row per (integer-coded) state value.
store_state_embeddings = pd.DataFrame(
    weights[5],
    columns=['state%d' % k for k in range(weights[5].shape[1])],
    index=pd.Index(les[5].inverse_transform(np.unique(X_train_dnn[..., [5]])), name='state'),
)

In [220]:
# Learned store type embedding: one row per (integer-coded) type value.
store_type_embeddings = pd.DataFrame(
    weights[6],
    columns=['type%d' % k for k in range(weights[6].shape[1])],
    index=pd.Index(les[6].inverse_transform(np.unique(X_train_dnn[..., [6]])), name='type'),
)

In [221]:
# Learned store cluster embedding: one row per original cluster value.
store_cluster_embeddings = pd.DataFrame(
    weights[7],
    columns=['cluster%d' % k for k in range(weights[7].shape[1])],
    index=pd.Index(les[7].inverse_transform(np.unique(X_train_dnn[..., [7]])), name='cluster'),
)

In [222]:
# Learned day-of-month embedding: one row per day-of-month value seen in training.
dom_embeddings = pd.DataFrame(
    weights[8],
    columns=['dom%d' % k for k in range(weights[8].shape[1])],
    index=pd.Index(les[8].inverse_transform(np.unique(X_train_dnn[..., [8]])), name='dom'),
)

In [223]:
# Learned day-of-week embedding: one row per weekday value seen in training.
dow_embeddings = pd.DataFrame(
    weights[9],
    columns=['dow%d' % k for k in range(weights[9].shape[1])],
    index=pd.Index(les[9].inverse_transform(np.unique(X_train_dnn[..., [9]])), name='dow'),
)

In [224]:
# Remember the un-suffixed column names; the join loops below rename the
# embedding columns per forecast day and rebuild the names from these.
dom_cols = dom_embeddings.columns.values
dow_cols = dow_embeddings.columns.values

## Recreate train with only 8 days

In [225]:
print("Preparing dataset...")
# 8 weekly anchors starting 2017-05-31 (a 7-day step instead of the daily step used earlier).
num_days = 8
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for week_offset in range(num_days):
    delta = timedelta(days=7 * week_offset)
    anchor = t2017 + delta
    X_tmp, y_tmp = prepare_dataset(df_2017, promo_2017, anchor)

    # Item-level features, broadcast back to one row per (store, item).
    X_tmp2 = prepare_dataset(df_2017_item, promo_2017_item, anchor, is_train=False, name_prefix='item')
    X_tmp2.index = df_2017_item.index
    X_tmp2 = X_tmp2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    # (class, store)-level features, broadcast back the same way.
    X_tmp3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, anchor, is_train=False, name_prefix='store_class')
    X_tmp3.index = df_2017_store_class.index
    X_tmp3 = X_tmp3.reindex(df_2017_store_class_index).reset_index(drop=True)

    X_tmp = pd.concat([X_tmp, X_tmp2, X_tmp3, items.reset_index(), stores.reset_index()], axis=1)

    # Only day-of-month is added here (no day-of-week columns, unlike the
    # validation/test frames); later positional slices rely on that. The inner
    # index has its own name so that it no longer shadows the outer loop variable.
    for step in range(16):
        X_tmp['dayofmonth_%s' % step] = np.repeat((anchor + timedelta(days=step)).day, X_tmp.shape[0])

    X_l.append(X_tmp)
    y_l.append(y_tmp)

Preparing dataset...


In [226]:
# Stack the weekly feature frames and targets row-wise (replaces the earlier X_train / y_train).
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

In [227]:
# Fold day-of-month 31 into 30. With the weekly anchors, day 31 occurs as
# May 31 (offset 0 of the first anchor) and July 31 (offset 12 of the 2017-07-19 anchor).
X_train.loc[X_train['dayofmonth_0'] == 31, ['dayofmonth_0']] = 30
X_train.loc[X_train['dayofmonth_12'] == 31, ['dayofmonth_12']] = 30

In [228]:
# The per-anchor lists are now duplicated inside X_train / y_train; drop them.
del X_l, y_l

In [229]:
# All feature frames are built; the aggregate frames are no longer needed.
del df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
gc.collect()

83959

In [230]:
# Attach each learned embedding table to the training rows via its key column.
# The join order fixes the resulting column order, which later slices depend on.
X_train_1 = X_train
for embedding_table, key_column in (
    (item_nbr_embeddings, 'item_nbr'),
    (item_family_embeddings, 'family'),
    (item_class_embeddings, 'class'),
    (store_nbr_embeddings, 'store_nbr'),
    (store_city_embeddings, 'city'),
    (store_state_embeddings, 'state'),
    (store_type_embeddings, 'type'),
    (store_cluster_embeddings, 'cluster'),
):
    X_train_1 = X_train_1.join(embedding_table, on=key_column, how="left")

In [231]:
# Join the day-of-month embedding once per forecast day; the columns are renamed
# to '<base>_<j>' first so that the 16 joins do not collide.
for j in range(16):
    dom_embeddings.columns = ['%s_%s' % (base_name, j) for base_name in dom_cols]
    day_key = 'dayofmonth_%s' % j
    X_train_1 = X_train_1.join(dom_embeddings, on=day_key, how="left")

In [232]:
# Columns 0..551 are kept as the numeric features that get scaled later.
X_train = X_train_1.iloc[:,0:552]
# Presumably 577 = 552 + 9 raw item/store columns + 16 dayofmonth columns, so this
# keeps only the joined embedding columns - TODO confirm against the column list.
X_train_catemb = X_train_1.iloc[:,577:]

In [233]:
# Attach each learned embedding table to the validation rows, in the same order as for training.
X_val_1 = X_val
for embedding_table, key_column in (
    (item_nbr_embeddings, 'item_nbr'),
    (item_family_embeddings, 'family'),
    (item_class_embeddings, 'class'),
    (store_nbr_embeddings, 'store_nbr'),
    (store_city_embeddings, 'city'),
    (store_state_embeddings, 'state'),
    (store_type_embeddings, 'type'),
    (store_cluster_embeddings, 'cluster'),
):
    X_val_1 = X_val_1.join(embedding_table, on=key_column, how="left")

In [234]:
# Join the day-of-month embedding once per forecast day, with per-day column names.
for j in range(16):
    dom_embeddings.columns = ['%s_%s' % (base_name, j) for base_name in dom_cols]
    day_key = 'dayofmonth_%s' % j
    X_val_1 = X_val_1.join(dom_embeddings, on=day_key, how="left")

In [235]:
# Columns 0..551 are kept as the numeric features that get scaled later.
X_val = X_val_1.iloc[:,0:552]
# The offset is 593 rather than 577 because this frame also carries 16 dayofweek
# columns; presumably 552 + 9 raw item/store columns + 32 calendar columns - TODO confirm.
X_val_catemb = X_val_1.iloc[:,593:]

In [236]:
# Attach each learned embedding table to the test rows, in the same order as for training.
X_test_1 = X_test
for embedding_table, key_column in (
    (item_nbr_embeddings, 'item_nbr'),
    (item_family_embeddings, 'family'),
    (item_class_embeddings, 'class'),
    (store_nbr_embeddings, 'store_nbr'),
    (store_city_embeddings, 'city'),
    (store_state_embeddings, 'state'),
    (store_type_embeddings, 'type'),
    (store_cluster_embeddings, 'cluster'),
):
    X_test_1 = X_test_1.join(embedding_table, on=key_column, how="left")

In [237]:
# Join the day-of-month embedding once per forecast day, with per-day column names.
for j in range(16):
    dom_embeddings.columns = ['%s_%s' % (base_name, j) for base_name in dom_cols]
    day_key = 'dayofmonth_%s' % j
    X_test_1 = X_test_1.join(dom_embeddings, on=day_key, how="left")

# Numeric features first; the embedding columns start at the same offset as in
# the validation frame.
X_test = X_test_1.iloc[:, 0:552]
X_test_catemb = X_test_1.iloc[:, 593:]

In [238]:
# The joined frames have been split into numeric and embedding parts; free them.
del X_train_1, X_val_1, X_test_1
gc.collect()

807

## Final Meta Model

### read in the seq2seq predictions for train, validation and prediction

In [239]:
# Predictions of the separately trained seq2seq model, used as extra features.
# The row order is assumed to match X_train / X_val / X_test - TODO confirm
# against the notebook that wrote these files.
df_meta_tr = pd.read_csv(
    "seq_meta_tr_8.csv",
)

df_meta_val = pd.read_csv(
    "seq_meta_val_8.csv",
)

df_meta_test = pd.read_csv(
    "seq_meta_test_8.csv",
)

In [240]:
# Name the 16 prediction columns y_0..y_15 (one per forecast day).
df_meta_tr.columns = [("y_%s" % i) for  i in range(16)]

In [241]:
# Name the 16 prediction columns y_0..y_15 (one per forecast day).
df_meta_val.columns = [("y_%s" % i) for  i in range(16)]

In [242]:
# Name the 16 prediction columns y_0..y_15 (one per forecast day).
df_meta_test.columns = [("y_%s" % i) for  i in range(16)]

In [None]:
## Join seq2seq predictions to features of train, validation, & test

In [245]:
# Positional concat: the index is reset first because the stacked X_train repeats row labels.
X_train = pd.concat([X_train.reset_index(drop=True), df_meta_tr], axis=1)

In [246]:
# Positional concat of the seq2seq predictions onto the validation features.
X_val = pd.concat([X_val.reset_index(drop=True), df_meta_val], axis=1)

In [247]:
# Positional concat of the seq2seq predictions onto the test features.
X_test = pd.concat([X_test.reset_index(drop=True), df_meta_test], axis=1)

In [None]:
## Scaling the features

In [249]:
# NOTE: the scaler statistics are computed over train, validation and test features together.
scaler = StandardScaler()
scaler.fit(pd.concat([X_train, X_val, X_test]))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [250]:
# Assign through [:] so the frames keep their column names and index.
X_train[:] = scaler.transform(X_train)
X_val[:] = scaler.transform(X_val)
X_test[:] = scaler.transform(X_test)

In [None]:
## Join categorical embeddings

In [251]:
# Embedding columns are appended after scaling, so they stay unscaled.
X_train = pd.concat([X_train, X_train_catemb.reset_index(drop=True)], axis=1)

In [252]:
# Embedding columns are appended after scaling, so they stay unscaled.
X_val = pd.concat([X_val, X_val_catemb.reset_index(drop=True)], axis=1)

In [253]:
# Embedding columns are appended after scaling, so they stay unscaled.
X_test = pd.concat([X_test, X_test_catemb.reset_index(drop=True)], axis=1)

In [None]:
## define model architecture

In [261]:
def build_model(n_features=758):
    """Build the (uncompiled) per-horizon regression network.

    Args:
        n_features: width of each input row. Defaults to 758, the width the
            training loop feeds (743 shared columns plus 15 day-of-month
            embedding columns for the horizon being trained).

    Returns:
        A Keras ``Sequential`` model taking input of shape ``(1, n_features)``,
        i.e. a length-1 sequence, and producing a single value.
    """
    model = Sequential()
    model.add(LSTM(768, input_shape=(1, n_features)))
    model.add(BatchNormalization())
    model.add(Dropout(.2))

    # Funnel of Dense -> PReLU -> BatchNorm -> Dropout blocks, halving the width each time.
    for units, drop_rate in ((512, .1), (256, .1), (128, .05),
                             (64, .05), (32, .05), (16, .05)):
        model.add(Dense(units))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    # Linear output: the caller trains on mean-centred targets, which can be negative.
    model.add(Dense(1))

    return model

In [None]:
## Training and scoring each of the 16 models

In [262]:
N_EPOCHS = 25

val_pred = []
test_pred = []
# wtpath = 'weights.hdf5'  # To save best epoch. But need Keras bug to be fixed first.
# Weight 1.25 for perishable rows and 1.0 otherwise, tiled once per stacked training anchor.
sample_weights = np.array(pd.concat([items["perishable"]] * num_days) * 0.25 + 1)
for j in range(16):
    print("=" * 50)
    print("Step %d" % (j+1))
    print("=" * 50)

    # The first 743 columns are shared by every horizon; add only the day-of-month
    # embedding columns belonging to horizon j (named 'dom<k>_<j>').
    idx = list(range(743)) + [
        position for position, column in enumerate(X_train.columns.values)
        if (column.endswith('_%s' % j) and column.startswith('dom'))
    ]

    # `.values` replaces the deprecated `.as_matrix()`; both return the same ndarray.
    X_train_model = X_train.values[:, idx]
    X_val_model = X_val.values[:, idx]
    X_test_model = X_test.values[:, idx]

    # The LSTM input layer expects (samples, timesteps, features); use one timestep.
    X_train_model = X_train_model.reshape((X_train_model.shape[0], 1, X_train_model.shape[1]))
    X_test_model = X_test_model.reshape((X_test_model.shape[0], 1, X_test_model.shape[1]))
    X_val_model = X_val_model.reshape((X_val_model.shape[0], 1, X_val_model.shape[1]))

    # Train on mean-centred targets; the mean is added back to the predictions below.
    y = y_train[:, j]
    y_mean = y.mean()
    xv = X_val_model
    yv = y_val[:, j]
    model = build_model()
    opt = optimizers.Adam(lr=0.001)
    model.compile(loss='mse', optimizer=opt, metrics=['mse'])

    # Named `lr_callbacks` so the imported `keras.callbacks` module is not rebound to a list.
    lr_callbacks = [
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min')
        ]
    model.fit(X_train_model, y - y_mean, batch_size = 65536, epochs = N_EPOCHS, verbose=2,
               sample_weight=sample_weights, validation_data=(xv,yv-y_mean), callbacks=lr_callbacks )
    val_pred.append(model.predict(X_val_model)+y_mean)
    test_pred.append(model.predict(X_test_model)+y_mean)

Step 1
Train on 1340120 samples, validate on 167515 samples
Epoch 1/25
 - 173s - loss: 1.0363 - mean_squared_error: 0.9781 - val_loss: 0.6735 - val_mean_squared_error: 0.6735
Epoch 2/25
 - 167s - loss: 0.5464 - mean_squared_error: 0.5175 - val_loss: 0.5019 - val_mean_squared_error: 0.5019
Epoch 3/25
 - 167s - loss: 0.5070 - mean_squared_error: 0.4808 - val_loss: 0.4386 - val_mean_squared_error: 0.4386
Epoch 4/25
 - 168s - loss: 0.4840 - mean_squared_error: 0.4593 - val_loss: 0.4417 - val_mean_squared_error: 0.4417
Epoch 5/25
 - 168s - loss: 0.4672 - mean_squared_error: 0.4438 - val_loss: 0.4551 - val_mean_squared_error: 0.4551
Epoch 6/25
 - 167s - loss: 0.4496 - mean_squared_error: 0.4275 - val_loss: 0.4505 - val_mean_squared_error: 0.4505
Epoch 7/25
 - 173s - loss: 0.4322 - mean_squared_error: 0.4115 - val_loss: 0.4427 - val_mean_squared_error: 0.4427
Epoch 8/25
 - 168s - loss: 0.4226 - mean_squared_error: 0.4028 - val_loss: 0.4253 - val_mean_squared_error: 0.4253
Epoch 9/25
 - 168s -

Epoch 18/25
 - 168s - loss: 0.3481 - mean_squared_error: 0.3303 - val_loss: 0.3503 - val_mean_squared_error: 0.3503
Epoch 19/25
 - 169s - loss: 0.3468 - mean_squared_error: 0.3291 - val_loss: 0.3417 - val_mean_squared_error: 0.3417
Epoch 20/25
 - 169s - loss: 0.3447 - mean_squared_error: 0.3271 - val_loss: 0.3411 - val_mean_squared_error: 0.3411
Epoch 21/25
 - 169s - loss: 0.3429 - mean_squared_error: 0.3254 - val_loss: 0.3387 - val_mean_squared_error: 0.3387
Epoch 22/25
 - 169s - loss: 0.3414 - mean_squared_error: 0.3240 - val_loss: 0.3342 - val_mean_squared_error: 0.3342
Epoch 23/25
 - 169s - loss: 0.3396 - mean_squared_error: 0.3222 - val_loss: 0.3325 - val_mean_squared_error: 0.3325
Epoch 24/25
 - 168s - loss: 0.3381 - mean_squared_error: 0.3208 - val_loss: 0.3305 - val_mean_squared_error: 0.3305
Epoch 25/25
 - 168s - loss: 0.3373 - mean_squared_error: 0.3201 - val_loss: 0.3331 - val_mean_squared_error: 0.3331
Step 4
Train on 1340120 samples, validate on 167515 samples
Epoch 1/25
 

Epoch 10/25
 - 167s - loss: 0.4275 - mean_squared_error: 0.4035 - val_loss: 0.4421 - val_mean_squared_error: 0.4421
Epoch 11/25
 - 167s - loss: 0.4222 - mean_squared_error: 0.3986 - val_loss: 0.4262 - val_mean_squared_error: 0.4262
Epoch 12/25
 - 167s - loss: 0.4171 - mean_squared_error: 0.3937 - val_loss: 0.4144 - val_mean_squared_error: 0.4144
Epoch 13/25
 - 167s - loss: 0.4118 - mean_squared_error: 0.3888 - val_loss: 0.4060 - val_mean_squared_error: 0.4060
Epoch 14/25
 - 167s - loss: 0.4077 - mean_squared_error: 0.3849 - val_loss: 0.3955 - val_mean_squared_error: 0.3955
Epoch 15/25
 - 167s - loss: 0.4042 - mean_squared_error: 0.3817 - val_loss: 0.3825 - val_mean_squared_error: 0.3825
Epoch 16/25
 - 167s - loss: 0.4006 - mean_squared_error: 0.3783 - val_loss: 0.3744 - val_mean_squared_error: 0.3744
Epoch 17/25
 - 167s - loss: 0.3967 - mean_squared_error: 0.3745 - val_loss: 0.3647 - val_mean_squared_error: 0.3647
Epoch 18/25
 - 167s - loss: 0.3934 - mean_squared_error: 0.3714 - val_lo

Epoch 2/25
 - 167s - loss: 0.5469 - mean_squared_error: 0.5162 - val_loss: 0.5460 - val_mean_squared_error: 0.5460
Epoch 3/25
 - 168s - loss: 0.5159 - mean_squared_error: 0.4870 - val_loss: 0.5371 - val_mean_squared_error: 0.5371
Epoch 4/25
 - 167s - loss: 0.4950 - mean_squared_error: 0.4676 - val_loss: 0.5355 - val_mean_squared_error: 0.5355
Epoch 5/25
 - 167s - loss: 0.4782 - mean_squared_error: 0.4518 - val_loss: 0.5215 - val_mean_squared_error: 0.5215
Epoch 6/25
 - 168s - loss: 0.4656 - mean_squared_error: 0.4400 - val_loss: 0.5003 - val_mean_squared_error: 0.5003
Epoch 7/25
 - 168s - loss: 0.4542 - mean_squared_error: 0.4294 - val_loss: 0.4892 - val_mean_squared_error: 0.4892
Epoch 8/25
 - 167s - loss: 0.4447 - mean_squared_error: 0.4205 - val_loss: 0.4696 - val_mean_squared_error: 0.4696
Epoch 9/25
 - 167s - loss: 0.4369 - mean_squared_error: 0.4132 - val_loss: 0.4562 - val_mean_squared_error: 0.4562
Epoch 10/25
 - 168s - loss: 0.4292 - mean_squared_error: 0.4061 - val_loss: 0.43

 - 167s - loss: 0.3795 - mean_squared_error: 0.3598 - val_loss: 0.3499 - val_mean_squared_error: 0.3499
Epoch 21/25
 - 168s - loss: 0.3772 - mean_squared_error: 0.3576 - val_loss: 0.3501 - val_mean_squared_error: 0.3501
Epoch 22/25
 - 167s - loss: 0.3754 - mean_squared_error: 0.3558 - val_loss: 0.3446 - val_mean_squared_error: 0.3446
Epoch 23/25
 - 168s - loss: 0.3720 - mean_squared_error: 0.3526 - val_loss: 0.3439 - val_mean_squared_error: 0.3439
Epoch 24/25
 - 167s - loss: 0.3699 - mean_squared_error: 0.3506 - val_loss: 0.3432 - val_mean_squared_error: 0.3432
Epoch 25/25
 - 167s - loss: 0.3675 - mean_squared_error: 0.3483 - val_loss: 0.3432 - val_mean_squared_error: 0.3432
Step 12
Train on 1340120 samples, validate on 167515 samples
Epoch 1/25
 - 177s - loss: 0.9167 - mean_squared_error: 0.8699 - val_loss: 0.6431 - val_mean_squared_error: 0.6431
Epoch 2/25
 - 167s - loss: 0.5761 - mean_squared_error: 0.5460 - val_loss: 0.5305 - val_mean_squared_error: 0.5305
Epoch 3/25
 - 167s - loss

Epoch 12/25
 - 168s - loss: 0.4387 - mean_squared_error: 0.4162 - val_loss: 0.3923 - val_mean_squared_error: 0.3923
Epoch 13/25
 - 167s - loss: 0.4336 - mean_squared_error: 0.4113 - val_loss: 0.3797 - val_mean_squared_error: 0.3797
Epoch 14/25
 - 168s - loss: 0.4276 - mean_squared_error: 0.4055 - val_loss: 0.3750 - val_mean_squared_error: 0.3750
Epoch 15/25
 - 168s - loss: 0.4233 - mean_squared_error: 0.4014 - val_loss: 0.3661 - val_mean_squared_error: 0.3661
Epoch 16/25
 - 168s - loss: 0.4188 - mean_squared_error: 0.3971 - val_loss: 0.3570 - val_mean_squared_error: 0.3570
Epoch 17/25
 - 168s - loss: 0.4150 - mean_squared_error: 0.3935 - val_loss: 0.3494 - val_mean_squared_error: 0.3494
Epoch 18/25
 - 167s - loss: 0.4106 - mean_squared_error: 0.3893 - val_loss: 0.3424 - val_mean_squared_error: 0.3424
Epoch 19/25
 - 168s - loss: 0.4076 - mean_squared_error: 0.3864 - val_loss: 0.3442 - val_mean_squared_error: 0.3442
Epoch 20/25
 - 169s - loss: 0.4038 - mean_squared_error: 0.3828 - val_lo

In [None]:
## calculate the overall error for the validation data 

In [263]:
# Weighted RMSE on the log1p scale: perishable rows count 1.25, the others 1.0.
weight = items["perishable"] * 0.25 + 1
# val_pred is a list of 16 (n_rows, 1) arrays; reshape it to (n_rows, 16) to match y_val.
squared_error = (y_val - np.array(val_pred).squeeze(axis=2).transpose())**2
err = squared_error.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err))

nwrmsle = 0.5799981848527513


In [274]:
## Write out validation predictions for plotting
# Use a dedicated name: the original reassigned `y_val`, which replaced the
# ground-truth targets with predictions, so re-running the error cell afterwards
# would have compared the predictions with themselves.
val_pred_matrix = np.array(val_pred).squeeze(axis=2).transpose()
df_preds = pd.DataFrame(
    val_pred_matrix, index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
# Undo the log1p transform and clip to the range [0, 1000].
df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
df_preds.reset_index().to_csv('nn_cv.csv', index=False)

In [265]:
print("Making submission...")
# test_pred is a list of 16 (n_rows, 1) arrays; reshape it to (n_rows, 16).
y_test = np.array(test_pred).squeeze(axis=2).transpose()
# Long format: one row per (store_nbr, item_nbr, date) for the 16 test days.
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [266]:
# Left join onto the test ids; rows without a prediction get 0 (which expm1 keeps at 0).
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform and clip to the range [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('nn_sub.csv', float_format='%.4f', index=None)