### Loading libraries

In [None]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers import LSTM
from keras import callbacks
from keras.callbacks import ModelCheckpoint

from keras.layers.core import Dense, Dropout, Activation
from keras.layers.advanced_activations import PReLU

from keras.layers import Merge
from keras.layers.embeddings import Embedding
import h5py

from sklearn.preprocessing import LabelEncoder
from keras.layers.merge import concatenate
from keras.layers import Input
from keras.models import Model, Sequential
import copy

from keras.layers.normalization import BatchNormalization

### Reading data and preprocessing

In [7]:
# Directory that holds the competition CSV files.
DATA_DIR = 'D:/data mining/infor project'


def _log1p_sales(raw):
    """Parse a raw unit_sales field; positive values are log1p-scaled, others become 0."""
    sales = float(raw)
    return np.log1p(sales) if sales > 0 else 0


# Training rows. Data rows 1..66458908 are skipped so loading starts at the
# position the original note labels "2016-01-01".
df_train = pd.read_csv(
    DATA_DIR + '/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_sales},
    parse_dates=["date"],
    skiprows=range(1, 66458909),  # 2016-01-01
)

# Test rows, indexed by (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    DATA_DIR + "/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],  # , date_parser=parser
).set_index(['store_nbr', 'item_nbr', 'date'])

# Item metadata keyed by item_nbr.
items = pd.read_csv(DATA_DIR + "/items.csv").set_index("item_nbr")

# Store metadata keyed by store_nbr.
stores = pd.read_csv(DATA_DIR + "/stores.csv").set_index("store_nbr")

In [9]:
# Integer-encode the item metadata columns. A single LabelEncoder instance is
# reused and refit for each column in turn.
le = LabelEncoder()
for item_col in ('family', 'class', 'perishable'):
    items[item_col] = le.fit_transform(items[item_col].values)

In [10]:
# Integer-encode the store metadata columns, refitting the shared encoder `le`
# for each column in turn.
for store_col in ('city', 'state', 'type', 'cluster'):
    stores[store_col] = le.fit_transform(stores[store_col].values)

In [11]:
# Keep only rows dated 2017-01-01 or later. pd.Timestamp is used instead of
# the pd.datetime alias, which was deprecated in pandas 1.0 and is absent
# from current pandas releases.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-01-01")]
# The full training frame is no longer needed; release it.
del df_train

In [12]:
# Pivot the promotion flag into a wide table: one row per (store_nbr, item_nbr),
# one column per date (the last index level is unstacked). Combinations with no
# record for a date are filled with False.
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)

In [13]:
# Flatten the two-level column index left by unstack(), keeping only level 1
# (the unstacked date level).
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [14]:
# Same wide pivot for the test period: df_test is already indexed by
# (store_nbr, item_nbr, date), so the last level (date) becomes the columns.
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)

In [15]:
# Keep only level 1 of the column index (the unstacked date level).
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [16]:
# Align test-period rows with the training row index; rows that exist only in
# the training index are filled with False.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [17]:
# Place training-period and test-period promotion columns side by side in a
# single table, then release the two halves.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [18]:
# Pivot unit_sales into a wide table: one row per (store_nbr, item_nbr), one
# column per date. Missing (store, item, date) combinations are filled with 0.
# NOTE: this rebinds df_2017 from the long-format frame to the wide one, so the
# cell cannot be re-run without re-running the cells that build df_2017.
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)

In [19]:
# Keep only level 1 of the column index (the unstacked date level).
df_2017.columns = df_2017.columns.get_level_values(1)

In [21]:
# Expand the metadata tables so they have one row per row of df_2017:
# index level 1 of df_2017 is item_nbr, level 0 is store_nbr.
items = items.reindex(df_2017.index.get_level_values(1))
stores = stores.reindex(df_2017.index.get_level_values(0))

### Creating the datasets for training and testing the model

In [22]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select a block of date-labelled columns from ``df``.

    The block starts ``minus`` days before ``dt`` and contains ``periods``
    columns spaced ``freq`` apart (daily by default).
    """
    window_start = dt - timedelta(days=minus)
    window_columns = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_columns]

In [23]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally targets) anchored at ``t2017``.

    Features are computed from the module-level wide tables ``df_2017``
    (unit sales) and ``promo_2017`` (promotion flags):

    * the value of the day before ``t2017``;
    * row means over the 3/7/14/30/60/140 days ending the day before ``t2017``;
    * promotion counts over the preceding 14/60/140 days;
    * for each offset 0..6, means over 4 and 20 columns spaced 7 days apart;
    * the promotion flag for each of the 16 days starting at ``t2017``.

    Returns ``(X, y)`` when ``is_train`` is true, where ``y`` holds the 16
    daily columns of ``df_2017`` starting at ``t2017``; otherwise only ``X``.
    """
    features = {
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    # Insertion order is kept identical to the original literal so that the
    # resulting column positions do not change.
    for window in (3, 7, 14, 30, 60, 140):
        features["mean_{}_2017".format(window)] = get_timespan(
            df_2017, t2017, window, window).mean(axis=1).values
    for window in (14, 60, 140):
        features["promo_{}_2017".format(window)] = get_timespan(
            promo_2017, t2017, window, window).sum(axis=1).values
    X = pd.DataFrame(features)

    # Weekly-spaced averages, one pair of columns per offset 0..6.
    for offset in range(7):
        short_span = get_timespan(df_2017, t2017, 28 - offset, 4, freq='7D')
        long_span = get_timespan(df_2017, t2017, 140 - offset, 20, freq='7D')
        X['mean_4_dow{}_2017'.format(offset)] = short_span.mean(axis=1).values
        X['mean_20_dow{}_2017'.format(offset)] = long_span.mean(axis=1).values

    # Promotion flags for the 16 days starting at the anchor date.
    for ahead in range(16):
        day = t2017 + timedelta(days=ahead)
        X["promo_{}".format(ahead)] = promo_2017[day].values.astype(np.uint8)

    if not is_train:
        return X
    y = df_2017[pd.date_range(t2017, periods=16)].values
    return X, y

In [24]:
# Anchor date of the first training window, plus accumulators for the
# per-window feature frames and target arrays.
t2017 = date(2017, 5, 31)
X_l = []
y_l = []
print("Preparing dataset...")

Preparing dataset...


In [25]:
# Build six training windows, each anchored 7 days after the previous one.
# The count (6) must match the repetition factor used when sample weights are
# built for training.
for week in range(6):
    anchor = t2017 + timedelta(days=7 * week)
    X_week, y_week = prepare_dataset(anchor)

    # Append item and store attributes column-wise to the numeric features.
    X_week = pd.concat(
        [X_week, items.reset_index(), stores.reset_index()], axis=1)

    X_l.append(X_week)
    y_l.append(y_week)

In [26]:
# Stack the per-window frames / target arrays row-wise into the final training
# set, then release the accumulators.
# NOTE: pd.concat keeps each window's own row labels (no ignore_index), so the
# index of X_train is presumably not unique — TODO confirm if label-based row
# access is ever needed.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

In [27]:
# Validation set: a single window anchored at 2017-07-26, with the same item
# and store attribute columns appended as for the training windows.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_val = pd.concat([X_val, items.reset_index(), stores.reset_index()], axis=1)

## Building Categorical Embedding features

In [28]:
# Extract the six categorical columns (by position) as a NumPy array for the
# embedding network. Later cells treat them, in order, as family, class, city,
# state, type and cluster.
X_train_dnn = copy.copy(X_train.iloc[:,[41,42,45,46,47,48]]).values

In [29]:
# Same six categorical columns (by position) for the validation set.
X_val_dnn = copy.copy(X_val.iloc[:,[41,42,45,46,47,48]]).values

In [30]:
# Fit one LabelEncoder per categorical column on the training values and
# re-encode both the training and validation arrays in place. The fitted
# encoders are kept in `les`, in column order.
cat_columns = X_train.iloc[:, [41, 42, 45, 46, 47, 48]]
les = []
for col in range(X_train_dnn.shape[1]):
    le = LabelEncoder()
    le.fit(cat_columns.iloc[:, col])
    les.append(le)
    X_train_dnn[:, col] = le.transform(X_train_dnn[:, col])
    X_val_dnn[:, col] = le.transform(X_val_dnn[:, col])

In [31]:
def split_features(X):
    """Split ``X`` into a list of six single-column arrays.

    Element ``k`` of the result is ``X[..., [k]]`` for k = 0..5, i.e. the
    trailing axis is kept with length 1. The columns are, in order: family,
    class, city, state, type, cluster.
    """
    return [X[..., [column]] for column in range(6)]

In [32]:
# Entity embeddings are learned by training a small DNN on the categorical features against the target variable.
# The NN_with_EntityEmbedding class below does exactly that.

class NN_with_EntityEmbedding(object):
    """Feed-forward network that learns one embedding per categorical feature.

    Constructing an instance builds the Keras model and immediately trains it
    for ``nb_epoch`` epochs; the trained model is available as ``self.model``.

    The model takes six single-column integer inputs in the order produced by
    ``split_features``. Vocabulary sizes are read from the module-level list
    ``les`` of fitted label encoders (``les[k].classes_``).
    """

    # Embedding width for each categorical input, in split_features order
    # (family, class, city, state, type, cluster).
    _EMBEDDING_DIMS = (16, 50, 11, 8, 5, 8)

    def __init__(self, X_train, y_train, X_val, y_val):
        """Build the model and fit it on the given training/validation data."""
        self.nb_epoch = 10
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        """Return ``X`` split into the list of per-feature input arrays."""
        return split_features(X)

    def __build_keras_model(self):
        """Create and compile the embedding network, storing it on ``self.model``."""
        inputs = []
        embedded = []
        # One Input -> Embedding -> Reshape branch per categorical feature,
        # created in feature order so the layer (and weight) ordering matches
        # the order of the inputs.
        for position, width in enumerate(self._EMBEDDING_DIMS):
            vocabulary_size = len(les[position].classes_)
            branch_in = Input(shape=(1,))
            branch_out = Embedding(vocabulary_size, width, input_length=1)(branch_in)
            # Drop the length-1 sequence axis: (1, width) -> (width,).
            branch_out = Reshape(target_shape=(width,))(branch_out)
            inputs.append(branch_in)
            embedded.append(branch_out)

        concatenated = concatenate(embedded)
        out = Dense(150, activation='relu', kernel_initializer='uniform')(concatenated)
        out1 = Dense(250, activation='relu', kernel_initializer='uniform')(out)
        out2 = Dense(1, activation='relu', kernel_initializer='uniform')(out1)

        self.model = Model(inputs, out2)

        # This is a regression trained with MSE loss, so classification
        # accuracy is not a meaningful metric; report MSE instead.
        self.model.compile(loss='mse', optimizer='adam', metrics=['mse'])

    def fit(self, X_train, y_train, X_val, y_val):
        """Train ``self.model`` for ``self.nb_epoch`` epochs with batch size 128."""
        self.model.fit(self.preprocessing(X_train), y_train,
                       validation_data=(self.preprocessing(X_val), y_val),
                       epochs=self.nb_epoch, batch_size=128,
                       )



In [33]:
# Build and train the embedding network. Only target column 15 (the last of the
# 16 target columns) is used as the regression target; the [15] list index
# keeps the target two-dimensional.
dnn = NN_with_EntityEmbedding(X_train_dnn, y_train[:,[15]], X_val_dnn, y_val[:,[15]])  

Train on 1005090 samples, validate on 167515 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# List of the trained model's weight arrays. The following cells read entries
# 0-5 as the six embedding matrices (family, class, city, state, type, cluster).
weights = dnn.model.get_weights()

In [35]:
# Embedding weights for 'item family': one row per encoded family value,
# indexed by the original (pre-encoding) family label.
item_family_embeddings = pd.DataFrame(weights[0])
n_dims = len(item_family_embeddings.columns.values)
item_family_embeddings.columns = ['family{}'.format(k) for k in range(n_dims)]
family_labels = les[0].inverse_transform(np.unique(X_train_dnn[..., [0]]))
item_family_embeddings['family'] = family_labels
item_family_embeddings = item_family_embeddings.set_index('family')

In [36]:
# Embedding weights for 'item class': one row per encoded class value,
# indexed by the original (pre-encoding) class label.
item_class_embeddings = pd.DataFrame(weights[1])
n_dims = len(item_class_embeddings.columns.values)
item_class_embeddings.columns = ['class{}'.format(k) for k in range(n_dims)]
class_labels = les[1].inverse_transform(np.unique(X_train_dnn[..., [1]]))
item_class_embeddings['class'] = class_labels
item_class_embeddings = item_class_embeddings.set_index('class')

In [37]:
# Embedding weights for 'store city': one row per encoded city value,
# indexed by the original (pre-encoding) city label.
store_city_embeddings = pd.DataFrame(weights[2])
n_dims = len(store_city_embeddings.columns.values)
store_city_embeddings.columns = ['city{}'.format(k) for k in range(n_dims)]
city_labels = les[2].inverse_transform(np.unique(X_train_dnn[..., [2]]))
store_city_embeddings['city'] = city_labels
store_city_embeddings = store_city_embeddings.set_index('city')

In [38]:
# Embedding weights for 'store state': one row per encoded state value,
# indexed by the original (pre-encoding) state label.
store_state_embeddings = pd.DataFrame(weights[3])
n_dims = len(store_state_embeddings.columns.values)
store_state_embeddings.columns = ['state{}'.format(k) for k in range(n_dims)]
state_labels = les[3].inverse_transform(np.unique(X_train_dnn[..., [3]]))
store_state_embeddings['state'] = state_labels
store_state_embeddings = store_state_embeddings.set_index('state')

In [39]:
# Embedding weights for 'store type': one row per encoded type value,
# indexed by the original (pre-encoding) type label.
store_type_embeddings = pd.DataFrame(weights[4])
n_dims = len(store_type_embeddings.columns.values)
store_type_embeddings.columns = ['type{}'.format(k) for k in range(n_dims)]
type_labels = les[4].inverse_transform(np.unique(X_train_dnn[..., [4]]))
store_type_embeddings['type'] = type_labels
store_type_embeddings = store_type_embeddings.set_index('type')

In [40]:
# Embedding weights for 'store cluster': one row per encoded cluster value,
# indexed by the original (pre-encoding) cluster label.
store_cluster_embeddings = pd.DataFrame(weights[5])
n_dims = len(store_cluster_embeddings.columns.values)
store_cluster_embeddings.columns = ['cluster{}'.format(k) for k in range(n_dims)]
cluster_labels = les[5].inverse_transform(np.unique(X_train_dnn[..., [5]]))
store_cluster_embeddings['cluster'] = cluster_labels
store_cluster_embeddings = store_cluster_embeddings.set_index('cluster')

# Training neural net model with Categorical Embedding Features

In [41]:
# Append the learned embedding columns to the training features: each
# embedding table is left-joined on the categorical column it was built from.
embedding_joins = (
    ('family', item_family_embeddings),
    ('class', item_class_embeddings),
    ('city', store_city_embeddings),
    ('state', store_state_embeddings),
    ('type', store_type_embeddings),
    ('cluster', store_cluster_embeddings),
)
X_train_1 = X_train
for join_key, embedding_table in embedding_joins:
    X_train_1 = X_train_1.join(embedding_table, on=join_key, how="left")

In [42]:
# Append the learned embedding columns to the validation features: each
# embedding table is left-joined on the categorical column it was built from.
embedding_joins = (
    ('family', item_family_embeddings),
    ('class', item_class_embeddings),
    ('city', store_city_embeddings),
    ('state', store_state_embeddings),
    ('type', store_type_embeddings),
    ('cluster', store_cluster_embeddings),
)
X_val_1 = X_val
for join_key, embedding_table in embedding_joins:
    X_val_1 = X_val_1.join(embedding_table, on=join_key, how="left")

In [43]:
# Test features: a single window anchored at 2017-08-16 (no targets are
# returned with is_train=False), with the same item and store attribute
# columns appended as for training and validation.
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)
X_test = pd.concat([X_test, items.reset_index(), stores.reset_index()], axis=1)

In [44]:
# Append the learned embedding columns to the test features: each embedding
# table is left-joined on the categorical column it was built from.
embedding_joins = (
    ('family', item_family_embeddings),
    ('class', item_class_embeddings),
    ('city', store_city_embeddings),
    ('state', store_state_embeddings),
    ('type', store_type_embeddings),
    ('cluster', store_cluster_embeddings),
)
X_test_1 = X_test
for join_key, embedding_table in embedding_joins:
    X_test_1 = X_test_1.join(embedding_table, on=join_key, how="left")

In [48]:
# Sanity check: display the (rows, columns) of the training frame before the
# positional column drop below.
X_train.shape

(1005090, 49)

In [49]:
# Remove the columns at positions 40-48 (the item/store id and categorical
# columns appended after the 40 numeric features); the joined embedding
# columns that follow them are kept.
cols = list(range(40, 49))
for frame in (X_train_1, X_val_1, X_test_1):
    frame.drop(frame.columns[cols], axis=1, inplace=True)

In [50]:
# Empty frame carrying only the (store_nbr, item_nbr) row index of df_2017;
# used later to label the prediction rows.
stores_items = pd.DataFrame(index=df_2017.index)
# Submission ids from the test file, still indexed by (store_nbr, item_nbr, date).
test_ids = df_test[['id']]

In [51]:
# Align item metadata with the item_nbr level of the row index.
# NOTE(review): stores_items shares df_2017's index and items was already
# reindexed against that same level earlier, so this looks redundant — confirm.
items = items.reindex( stores_items.index.get_level_values(1) )

In [52]:
# Convert the feature frames to plain NumPy arrays. DataFrame.as_matrix() was
# removed in pandas 1.0; .values is the equivalent accessor and is already
# used elsewhere in this notebook.
X_train = X_train.values
X_test = X_test.values
X_val = X_val.values

In [53]:
# Convert the embedding-augmented feature frames to plain NumPy arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and is already used elsewhere in this notebook.
X_train_1 = X_train_1.values
X_test_1 = X_test_1.values
X_val_1 = X_val_1.values

In [54]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
# NOTE(review): these three reshaped arrays are not referenced by the later
# cells visible in this notebook (the model is fed the *_1 variants) — confirm
# whether this cell is still needed.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))

In [55]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
# The last two dimensions are what the LSTM layer below receives as its
# input_shape.
X_train_1 = X_train_1.reshape((X_train_1.shape[0], 1, X_train_1.shape[1]))
X_test_1 = X_test_1.reshape((X_test_1.shape[0], 1, X_test_1.shape[1]))
X_val_1 = X_val_1.reshape((X_val_1.shape[0], 1, X_val_1.shape[1]))

In [60]:
# Keras model definition: an LSTM front end followed by six
# Dense -> PReLU -> BatchNormalization -> Dropout stages of decreasing width
# and a single linear output unit.
model = Sequential()
model.add(LSTM(512, input_shape=(X_train_1.shape[1], X_train_1.shape[2])))
model.add(BatchNormalization())
model.add(Dropout(.2))

# (units, dropout rate) for each dense stage, in order.
dense_stages = ((256, .1), (256, .1), (128, .05), (64, .05), (32, .05), (16, .05))
for units, drop_rate in dense_stages:
    model.add(Dense(units))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(drop_rate))

model.add(Dense(1))

model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [61]:
N_EPOCHS = 5

val_pred = []
test_pred = []
# wtpath = 'weights.hdf5'  # To save best epoch. But need Keras bug to be fixed first.

# Per-row training weight: 1 + 0.25 * encoded perishable flag. The perishable
# column is repeated 6 times to line up with the 6 stacked training windows.
sample_weights = np.array(pd.concat([items["perishable"]] * 6) * 0.25 + 1)

separator = "=" * 50
# One pass per target column (16 in total). The same `model` object is fitted
# each time, so every pass continues from the weights left by the previous one.
for horizon in range(16):
    print(separator)
    print("Step %d" % (horizon + 1))
    print(separator)
    model.fit(
        X_train_1, y_train[:, horizon],
        batch_size=512, epochs=N_EPOCHS, verbose=2,
        sample_weight=sample_weights,
        validation_data=(X_val_1, y_val[:, horizon]),
    )
    # Record predictions for this target column right after it was trained.
    val_pred.append(model.predict(X_val_1))
    test_pred.append(model.predict(X_test_1))
    

Step 1
Train on 1005090 samples, validate on 167515 samples
Epoch 1/5
63s - loss: 0.4585 - mean_squared_error: 0.4333 - val_loss: 0.2993 - val_mean_squared_error: 0.2993
Epoch 2/5
60s - loss: 0.3439 - mean_squared_error: 0.3260 - val_loss: 0.3066 - val_mean_squared_error: 0.3066
Epoch 3/5
60s - loss: 0.3355 - mean_squared_error: 0.3183 - val_loss: 0.2935 - val_mean_squared_error: 0.2935
Epoch 4/5
60s - loss: 0.3321 - mean_squared_error: 0.3152 - val_loss: 0.3013 - val_mean_squared_error: 0.3013
Epoch 5/5
61s - loss: 0.3290 - mean_squared_error: 0.3124 - val_loss: 0.2925 - val_mean_squared_error: 0.2925
Step 2
Train on 1005090 samples, validate on 167515 samples
Epoch 1/5
60s - loss: 0.3598 - mean_squared_error: 0.3395 - val_loss: 0.3322 - val_mean_squared_error: 0.3322
Epoch 2/5
61s - loss: 0.3542 - mean_squared_error: 0.3343 - val_loss: 0.3266 - val_mean_squared_error: 0.3266
Epoch 3/5
63s - loss: 0.3521 - mean_squared_error: 0.3324 - val_loss: 0.3241 - val_mean_squared_error: 0.3241


60s - loss: 0.3815 - mean_squared_error: 0.3619 - val_loss: 0.3960 - val_mean_squared_error: 0.3960
Epoch 4/5
61s - loss: 0.3791 - mean_squared_error: 0.3596 - val_loss: 0.3939 - val_mean_squared_error: 0.3939
Epoch 5/5
60s - loss: 0.3774 - mean_squared_error: 0.3580 - val_loss: 0.3971 - val_mean_squared_error: 0.3971
Step 13
Train on 1005090 samples, validate on 167515 samples
Epoch 1/5
61s - loss: 0.3890 - mean_squared_error: 0.3672 - val_loss: 0.3924 - val_mean_squared_error: 0.3924
Epoch 2/5
60s - loss: 0.3769 - mean_squared_error: 0.3560 - val_loss: 0.3900 - val_mean_squared_error: 0.3900
Epoch 3/5
60s - loss: 0.3740 - mean_squared_error: 0.3533 - val_loss: 0.3918 - val_mean_squared_error: 0.3918
Epoch 4/5
61s - loss: 0.3722 - mean_squared_error: 0.3516 - val_loss: 0.3925 - val_mean_squared_error: 0.3925
Epoch 5/5
61s - loss: 0.3706 - mean_squared_error: 0.3501 - val_loss: 0.3920 - val_mean_squared_error: 0.3920
Step 14
Train on 1005090 samples, validate on 167515 samples
Epoch 1/

## Check error (MSE) on validation set

In [62]:
n_public = 5 # Number of days in public test set
# Per-row weight: 1 + 0.25 * encoded perishable flag.
# NOTE(review): this rebinds `weights`, which earlier held the embedding
# model's weight list, so the embedding-extraction cells cannot be re-run
# after this one.
weights = pd.concat([items["perishable"]]) * 0.25 + 1

# Stack the 16 per-column prediction arrays into a (rows, 16) matrix once.
val_matrix = np.array(val_pred).squeeze(axis=2).transpose()

unweighted_mse = mean_squared_error(y_val, val_matrix)
full_mse = mean_squared_error(y_val, val_matrix, sample_weight=weights)
public_mse = mean_squared_error(
    y_val[:, :n_public], val_matrix[:, :n_public], sample_weight=weights)
private_mse = mean_squared_error(
    y_val[:, n_public:], val_matrix[:, n_public:], sample_weight=weights)

print("Unweighted validation mse: ", unweighted_mse)
print("Full validation mse:       ", full_mse)
print("'Public' validation mse:   ", public_mse)
print("'Private' validation mse:  ", private_mse)

Unweighted validation mse:  0.371372460586
Full validation mse:        0.370393038575
'Public' validation mse:    0.332081564279
'Private' validation mse:   0.387807345073


## Predictions

In [None]:
# Stack the 16 per-column test predictions into a (rows, 16) matrix.
y_test = np.array(test_pred).squeeze(axis=2).transpose()

# Label rows by (store_nbr, item_nbr) and columns by forecast date, then
# reshape to long format with one row per (store, item, date).
forecast_dates = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=stores_items.index, columns=forecast_dates)
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Attach predictions to the submission ids; ids without a prediction get 0.
submission = test_ids.join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied at load time and clip to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lstm_w_cat_em.csv', float_format='%.4f', index=None)