In [2]:
import pandas as pd
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
submission_df = pd.read_csv('../input/sample_submission.csv')

In [2]:
train_df

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [14]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [4]:
def SMAPE (forecast, actual):
    """Returns the Symmetric Mean Absolute Percentage Error between two Series"""
    masked_arr = ~((forecast==0)&(actual==0))
    diff = abs(forecast[masked_arr] - actual[masked_arr])
    avg = (abs(forecast[masked_arr]) + abs(actual[masked_arr]))/2
    
    print('SMAPE Error Score: ' + str(round(sum(diff/avg)/len(forecast) * 100, 2)) + ' %')

In [10]:
def series_to_supervised(data, window=1, lag=1, dropnan=True, exogenous = ['item', 'store'], value = ['sales']):
    cols= list()
    names = exogenous.copy()
    cols.append(data[exogenous])
    
    for i in range(window, 0, -1):
        cols.append(data[value].shift(i))
        names += [('sales(t-%d)' % (i))]

    cols.append(data[value])
    names += ['sales(t)']

    cols.append(data[value].shift(-lag))
    names += [('sales(t+%d)' % ( lag))]
    
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Drop rows with NaN values if needed
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [8]:
sample = train_df[(train_df.item == 1)&(train_df.store==1)]
train_df.sample(5)

Unnamed: 0,date,store,item,sales
256132,2014-05-08,1,15,83
243695,2015-04-18,4,14,80
85642,2017-07-05,7,5,16
97263,2014-05-01,4,6,67
690474,2013-09-04,9,38,58


In [11]:
window = 355
lag_size = 90
series = series_to_supervised(sample.drop('date', axis=1), window=window, lag=lag_size)
series

Unnamed: 0,item,store,sales(t-355),sales(t-354),sales(t-353),sales(t-352),sales(t-351),sales(t-350),sales(t-349),sales(t-348),...,sales(t-8),sales(t-7),sales(t-6),sales(t-5),sales(t-4),sales(t-3),sales(t-2),sales(t-1),sales(t),sales(t+90)
355,1,1,13.0,11.0,14.0,13.0,10.0,12.0,10.0,9.0,...,16.0,15.0,15.0,8.0,18.0,7.0,13.0,11.0,9,20.0
356,1,1,11.0,14.0,13.0,10.0,12.0,10.0,9.0,12.0,...,15.0,15.0,8.0,18.0,7.0,13.0,11.0,9.0,8,25.0
357,1,1,14.0,13.0,10.0,12.0,10.0,9.0,12.0,9.0,...,15.0,8.0,18.0,7.0,13.0,11.0,9.0,8.0,17,4.0
358,1,1,13.0,10.0,12.0,10.0,9.0,12.0,9.0,9.0,...,8.0,18.0,7.0,13.0,11.0,9.0,8.0,17.0,6,15.0
359,1,1,10.0,12.0,10.0,9.0,12.0,9.0,9.0,7.0,...,18.0,7.0,13.0,11.0,9.0,8.0,17.0,6.0,16,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,1,1,24.0,29.0,18.0,18.0,16.0,16.0,32.0,32.0,...,20.0,20.0,28.0,37.0,24.0,14.0,18.0,27.0,23,14.0
1732,1,1,29.0,18.0,18.0,16.0,16.0,32.0,32.0,26.0,...,20.0,28.0,37.0,24.0,14.0,18.0,27.0,23.0,28,19.0
1733,1,1,18.0,18.0,16.0,16.0,32.0,32.0,26.0,20.0,...,28.0,37.0,24.0,14.0,18.0,27.0,23.0,28.0,30,15.0
1734,1,1,18.0,16.0,16.0,32.0,32.0,26.0,20.0,21.0,...,37.0,24.0,14.0,18.0,27.0,23.0,28.0,30.0,21,27.0


In [12]:
labels = series.iloc[:, -1]
series = series.iloc[:, :-1]

In [16]:
X_train, X_valid, Y_train, Y_valid = train_test_split(series, labels.values, test_size=0.2, random_state=0)
print('Train set shape', X_train.shape)
print('Validation set shape', X_valid.shape)
X_train.head()


Train set shape (1104, 358)
Validation set shape (277, 358)


Unnamed: 0,item,store,sales(t-355),sales(t-354),sales(t-353),sales(t-352),sales(t-351),sales(t-350),sales(t-349),sales(t-348),...,sales(t-9),sales(t-8),sales(t-7),sales(t-6),sales(t-5),sales(t-4),sales(t-3),sales(t-2),sales(t-1),sales(t)
1037,1,1,25.0,24.0,18.0,17.0,18.0,19.0,17.0,28.0,...,18.0,18.0,8.0,17.0,19.0,30.0,26.0,16.0,25.0,26
1273,1,1,21.0,33.0,26.0,30.0,19.0,22.0,19.0,20.0,...,27.0,28.0,15.0,25.0,25.0,23.0,31.0,31.0,32.0,26
1458,1,1,16.0,26.0,7.0,13.0,9.0,18.0,13.0,8.0,...,18.0,12.0,20.0,21.0,13.0,20.0,16.0,10.0,16.0,21
1299,1,1,20.0,26.0,20.0,25.0,22.0,19.0,33.0,14.0,...,22.0,33.0,34.0,37.0,24.0,25.0,22.0,34.0,26.0,34
1292,1,1,13.0,27.0,26.0,22.0,29.0,28.0,19.0,20.0,...,33.0,34.0,35.0,28.0,18.0,27.0,30.0,22.0,33.0,34


In [17]:
X_valid

Unnamed: 0,item,store,sales(t-355),sales(t-354),sales(t-353),sales(t-352),sales(t-351),sales(t-350),sales(t-349),sales(t-348),...,sales(t-9),sales(t-8),sales(t-7),sales(t-6),sales(t-5),sales(t-4),sales(t-3),sales(t-2),sales(t-1),sales(t)
1009,1,1,18.0,22.0,13.0,8.0,19.0,14.0,18.0,17.0,...,15.0,19.0,15.0,25.0,12.0,20.0,22.0,13.0,14.0,19
1329,1,1,22.0,16.0,16.0,34.0,25.0,17.0,25.0,13.0,...,27.0,36.0,24.0,15.0,25.0,26.0,29.0,30.0,22.0,15
800,1,1,20.0,25.0,4.0,15.0,17.0,17.0,16.0,18.0,...,14.0,19.0,10.0,17.0,16.0,13.0,13.0,23.0,21.0,24
964,1,1,15.0,22.0,26.0,27.0,21.0,25.0,12.0,22.0,...,23.0,26.0,27.0,14.0,18.0,24.0,17.0,29.0,22.0,19
814,1,1,21.0,26.0,19.0,19.0,20.0,18.0,23.0,19.0,...,15.0,16.0,17.0,31.0,25.0,23.0,16.0,18.0,13.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1584,1,1,30.0,39.0,13.0,25.0,33.0,26.0,28.0,35.0,...,26.0,22.0,26.0,25.0,32.0,33.0,15.0,21.0,29.0,19
1570,1,1,30.0,30.0,18.0,29.0,24.0,17.0,23.0,23.0,...,21.0,17.0,25.0,15.0,20.0,32.0,13.0,21.0,19.0,26
1503,1,1,16.0,10.0,12.0,9.0,21.0,12.0,14.0,16.0,...,24.0,25.0,25.0,16.0,12.0,20.0,16.0,16.0,18.0,19
1027,1,1,16.0,15.0,22.0,22.0,12.0,28.0,16.0,12.0,...,23.0,22.0,25.0,19.0,21.0,24.0,17.0,28.0,24.0,33


In [10]:
batch = 256
epochs = 40
lr = 0.0001
adam = optimizers.Adam(lr)

model_mlp = Sequential()
model_mlp.add(Dense(100, activation='relu', input_dim=X_train.shape[1]))
model_mlp.add(Dense(60, activation='relu'))
model_mlp.add(Dense(1))
model_mlp.compile(loss='mse', optimizer=adam)
model_mlp.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               35900     
_________________________________________________________________
dense_2 (Dense)              (None, 60)                6060      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 61        
Total params: 42,021
Trainable params: 42,021
Non-trainable params: 0
_________________________________________________________________


In [11]:
%%time
mlp_history = model_mlp.fit(X_train.values, Y_train, validation_data=(X_valid.values, Y_valid), epochs=epochs, verbose=2)

Train on 1104 samples, validate on 277 samples
Epoch 1/40
 - 1s - loss: 106.2588 - val_loss: 48.5598
Epoch 2/40
 - 0s - loss: 48.3589 - val_loss: 41.1086
Epoch 3/40
 - 0s - loss: 42.0016 - val_loss: 36.6391
Epoch 4/40
 - 0s - loss: 38.5556 - val_loss: 35.3069
Epoch 5/40
 - 0s - loss: 36.3200 - val_loss: 35.5669
Epoch 6/40
 - 0s - loss: 34.5340 - val_loss: 36.7984
Epoch 7/40
 - 0s - loss: 33.6350 - val_loss: 33.6636
Epoch 8/40
 - 0s - loss: 32.1709 - val_loss: 33.6415
Epoch 9/40
 - 0s - loss: 31.3157 - val_loss: 32.7105
Epoch 10/40
 - 0s - loss: 30.5336 - val_loss: 32.2267
Epoch 11/40
 - 0s - loss: 29.6259 - val_loss: 32.7445
Epoch 12/40
 - 0s - loss: 29.1207 - val_loss: 31.8342
Epoch 13/40
 - 0s - loss: 28.3749 - val_loss: 32.7086
Epoch 14/40
 - 0s - loss: 29.0227 - val_loss: 30.4495
Epoch 15/40
 - 0s - loss: 27.7549 - val_loss: 31.6482
Epoch 16/40
 - 0s - loss: 26.5300 - val_loss: 31.4437
Epoch 17/40
 - 0s - loss: 26.0333 - val_loss: 30.2984
Epoch 18/40
 - 0s - loss: 26.1300 - val_los

In [12]:
MPL_prediction = model_mlp.predict(series[-90:]).flatten()
SMAPE(MPL_prediction, labels[-90:])

SMAPE Error Score: 19.11 %


In [13]:
# %%time
# series = []
# for s in range(1,11):
#     for i in range(1,51):
#         series += [series_to_supervised(train_df[(train_df.item == i)&(train_df.store==s)].drop('date', axis=1), 
#                                         window=window, lag=lag)]
# series = pd.concat(series, axis=0)
# series.to_csv('sales_for_MLP.csv') 

In [14]:
series = pd.read_csv('sales_for_MLP.csv', index_col=0)
series

Unnamed: 0,item,store,sales(t-355),sales(t-354),sales(t-353),sales(t-352),sales(t-351),sales(t-350),sales(t-349),sales(t-348),...,sales(t-8),sales(t-7),sales(t-6),sales(t-5),sales(t-4),sales(t-3),sales(t-2),sales(t-1),sales(t),sales(t+90)
355,1,1,13.0,11.0,14.0,13.0,10.0,12.0,10.0,9.0,...,16.0,15.0,15.0,8.0,18.0,7.0,13.0,11.0,9,20.0
356,1,1,11.0,14.0,13.0,10.0,12.0,10.0,9.0,12.0,...,15.0,15.0,8.0,18.0,7.0,13.0,11.0,9.0,8,25.0
357,1,1,14.0,13.0,10.0,12.0,10.0,9.0,12.0,9.0,...,15.0,8.0,18.0,7.0,13.0,11.0,9.0,8.0,17,4.0
358,1,1,13.0,10.0,12.0,10.0,9.0,12.0,9.0,9.0,...,8.0,18.0,7.0,13.0,11.0,9.0,8.0,17.0,6,15.0
359,1,1,10.0,12.0,10.0,9.0,12.0,9.0,9.0,7.0,...,18.0,7.0,13.0,11.0,9.0,8.0,17.0,6.0,16,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912905,50,10,84.0,84.0,69.0,73.0,70.0,67.0,78.0,73.0,...,72.0,89.0,97.0,97.0,99.0,63.0,79.0,80.0,82,63.0
912906,50,10,84.0,69.0,73.0,70.0,67.0,78.0,73.0,97.0,...,89.0,97.0,97.0,99.0,63.0,79.0,80.0,82.0,90,59.0
912907,50,10,69.0,73.0,70.0,67.0,78.0,73.0,97.0,69.0,...,97.0,97.0,99.0,63.0,79.0,80.0,82.0,90.0,103,74.0
912908,50,10,73.0,70.0,67.0,78.0,73.0,97.0,69.0,85.0,...,97.0,99.0,63.0,79.0,80.0,82.0,90.0,103.0,99,62.0


In [15]:
labels = series.iloc[:, -1]
series = series.iloc[:, :-1]

In [5]:
X_train, X_valid, Y_train, Y_valid = train_test_split(series, labels.values, test_size=0.4, random_state=0)
print('Train set shape', X_train.shape)
print('Validation set shape', X_valid.shape)
X_train.head()


NameError: name 'train_test_split' is not defined

In [6]:
X_valid

NameError: name 'X_valid' is not defined

In [20]:
batch = 256
epochs = 40
lr = 0.0005
adam = optimizers.Adam(lr)

model_mlp = Sequential()
model_mlp.add(Dense(300, activation='relu', input_dim=X_train.shape[1]))
model_mlp.add(Dense(150, activation='relu', input_dim=X_train.shape[1]))
model_mlp.add(Dense(60, activation='relu'))
model_mlp.add(Dense(1))
model_mlp.compile(loss='mse', optimizer=adam)
model_mlp.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 300)               107700    
_________________________________________________________________
dense_8 (Dense)              (None, 150)               45150     
_________________________________________________________________
dense_9 (Dense)              (None, 60)                9060      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 61        
Total params: 161,971
Trainable params: 161,971
Non-trainable params: 0
_________________________________________________________________


In [21]:
# %%time
# mlp_history = model_mlp.fit(X_train.values, Y_train, validation_data=(X_valid.values, Y_valid), epochs=epochs, verbose=2)
# model_mlp.save('MLP.h5')

Train on 414300 samples, validate on 276200 samples
Epoch 1/40
 - 36s - loss: 73.2973 - val_loss: 68.5117
Epoch 2/40
 - 35s - loss: 65.9017 - val_loss: 65.4066
Epoch 3/40
 - 35s - loss: 64.4687 - val_loss: 63.2410
Epoch 4/40
 - 35s - loss: 63.4915 - val_loss: 66.2859
Epoch 5/40
 - 36s - loss: 63.0254 - val_loss: 62.8616
Epoch 6/40
 - 37s - loss: 62.5065 - val_loss: 60.5659
Epoch 7/40
 - 36s - loss: 62.0737 - val_loss: 60.5984
Epoch 8/40
 - 36s - loss: 61.8282 - val_loss: 60.9844
Epoch 9/40
 - 38s - loss: 61.5666 - val_loss: 64.2546
Epoch 10/40
 - 37s - loss: 61.4301 - val_loss: 63.1295
Epoch 11/40
 - 36s - loss: 61.1545 - val_loss: 62.3244
Epoch 12/40
 - 36s - loss: 60.9929 - val_loss: 62.4715
Epoch 13/40
 - 36s - loss: 60.7813 - val_loss: 60.6934
Epoch 14/40
 - 36s - loss: 60.7162 - val_loss: 61.2657
Epoch 15/40
 - 36s - loss: 60.5344 - val_loss: 59.7160
Epoch 16/40
 - 35s - loss: 60.4295 - val_loss: 59.3888
Epoch 17/40
 - 36s - loss: 60.2459 - val_loss: 59.8885
Epoch 18/40
 - 36s - l

In [29]:
model_mlp = keras.models.load_model('MLP.h5')

In [31]:
MPL_prediction = model_mlp.predict(series[-90:]).flatten()
SMAPE(MPL_prediction, labels[-90:])

SMAPE Error Score: 9.69 %


In [48]:
display(test_df, submission_df)

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1
...,...,...,...,...
44995,44995,2018-03-27,10,50
44996,44996,2018-03-28,10,50
44997,44997,2018-03-29,10,50
44998,44998,2018-03-30,10,50


Unnamed: 0,id,sales
0,0,52
1,1,52
2,2,52
3,3,52
4,4,52
...,...,...
44995,44995,52
44996,44996,52
44997,44997,52
44998,44998,52


In [65]:
result = []
for i in range(1,51):
    for s in range(1,11):
        result += [model_mlp.predict(series [(series.item == i)&(series.store==s)][-90:]).flatten()]

In [81]:
result = np.array(result)
submission_df.sales = result.reshape(-1)
submission_df

Unnamed: 0,id,sales
0,0,22.022301
1,1,22.233278
2,2,22.842680
3,3,25.115673
4,4,26.080570
...,...,...
44995,44995,56.212875
44996,44996,59.555695
44997,44997,63.752804
44998,44998,66.937950


In [106]:
series [(series.item == 50)&(series.store==10)][-90:]

Unnamed: 0,item,store,sales(t-355),sales(t-354),sales(t-353),sales(t-352),sales(t-351),sales(t-350),sales(t-349),sales(t-348),...,sales(t-9),sales(t-8),sales(t-7),sales(t-6),sales(t-5),sales(t-4),sales(t-3),sales(t-2),sales(t-1),sales(t)
912820,50,10,97.0,90.0,116.0,71.0,90.0,82.0,108.0,93.0,...,82.0,83.0,91.0,122.0,112.0,119.0,120.0,99.0,98.0,103
912821,50,10,90.0,116.0,71.0,90.0,82.0,108.0,93.0,98.0,...,83.0,91.0,122.0,112.0,119.0,120.0,99.0,98.0,103.0,93
912822,50,10,116.0,71.0,90.0,82.0,108.0,93.0,98.0,149.0,...,91.0,122.0,112.0,119.0,120.0,99.0,98.0,103.0,93.0,108
912823,50,10,71.0,90.0,82.0,108.0,93.0,98.0,149.0,72.0,...,122.0,112.0,119.0,120.0,99.0,98.0,103.0,93.0,108.0,120
912824,50,10,90.0,82.0,108.0,93.0,98.0,149.0,72.0,96.0,...,112.0,119.0,120.0,99.0,98.0,103.0,93.0,108.0,120.0,135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912905,50,10,84.0,84.0,69.0,73.0,70.0,67.0,78.0,73.0,...,84.0,72.0,89.0,97.0,97.0,99.0,63.0,79.0,80.0,82
912906,50,10,84.0,69.0,73.0,70.0,67.0,78.0,73.0,97.0,...,72.0,89.0,97.0,97.0,99.0,63.0,79.0,80.0,82.0,90
912907,50,10,69.0,73.0,70.0,67.0,78.0,73.0,97.0,69.0,...,89.0,97.0,97.0,99.0,63.0,79.0,80.0,82.0,90.0,103
912908,50,10,73.0,70.0,67.0,78.0,73.0,97.0,69.0,85.0,...,97.0,97.0,99.0,63.0,79.0,80.0,82.0,90.0,103.0,99


In [95]:
labels.loc[(1646+1826*j):(1735+1826*j)]

3472    28.0
3473    23.0
3474    36.0
3475    47.0
3476    30.0
        ... 
3557    19.0
3558    21.0
3559    18.0
3560    24.0
3561    31.0
Name: sales(t+90), Length: 90, dtype: float64

In [98]:
result = []
check = []
j=0
for i in range(1,51):
    for s in range(1,11):
        result += [model_mlp.predict(series [(series.item == i)&(series.store==s)][-180:-90]).flatten()]
        check +=[labels.loc[(1646+1826*j):(1735+1826*j)]]
        j+=1

In [108]:
SMAPE(check, result)

SMAPE Error Score: 5.2 %
