In [1]:
import numpy as np
import pandas as pd 
import os
from datetime import datetime, timedelta,date
%matplotlib inline
import matplotlib.pyplot as plt
from __future__ import division

import warnings
warnings.filterwarnings("ignore")

#import Keras
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [15]:
## Reading the dataset
path = '../DSfinalProject/data'
os.listdir(path)
train = pd.read_csv(path+'/train.csv')
test = pd.read_csv(path+'/test.csv')

#### reference: https://towardsdatascience.com/predicting-sales-611cb5a252de

### Preprocess

In [16]:
drop_col = ['Unnamed: 0','item_price','date_block_num','item_price',
            'sales_income','shop_name','item_name','item_category_id',
            'item_category_name']
train = train.drop(columns=drop_col)

In [17]:
train['date'] = pd.to_datetime(train['date'],format = '%d.%m.%Y')
train

Unnamed: 0,date,shop_id,item_id,item_cnt_day
0,2013-01-02,59,22154,1.0
1,2013-01-03,25,2552,1.0
2,2013-01-05,25,2552,-1.0
3,2013-01-06,25,2554,1.0
4,2013-01-15,25,2555,1.0
...,...,...,...,...
2935844,2015-10-10,25,7409,1.0
2935845,2015-10-09,25,7460,1.0
2935846,2015-10-14,25,7459,1.0
2935847,2015-10-22,25,7440,1.0


In [18]:
#represent month in date field as its first day
train['date'] = train['date'].dt.year.astype('str') + '-' + train['date'].dt.month.astype('str') + '-01'
train['date'] = pd.to_datetime(train['date'])
#groupby date and sum the sales
train = train.groupby('date').item_cnt_day.sum().reset_index()

In [19]:
train

Unnamed: 0,date,item_cnt_day
0,2013-01-01,131479.0
1,2013-02-01,128090.0
2,2013-03-01,147142.0
3,2013-04-01,107190.0
4,2013-05-01,106970.0
5,2013-06-01,125381.0
6,2013-07-01,116966.0
7,2013-08-01,125291.0
8,2013-09-01,133332.0
9,2013-10-01,127541.0


In [20]:
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
#plot monthly sales
plot_data = [
    go.Scatter(
        x=train['date'],
        y=train['item_cnt_day'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [21]:
#create a new dataframe to model the difference
diff = train.copy()
#add previous sales to the next row
diff['prev_sales'] = diff['item_cnt_day'].shift(1)
#drop the null values and calculate the difference
diff = diff.dropna()
diff['diff'] = (diff['item_cnt_day'] - diff['prev_sales'])
diff.head(10)

Unnamed: 0,date,item_cnt_day,prev_sales,diff
1,2013-02-01,128090.0,131479.0,-3389.0
2,2013-03-01,147142.0,128090.0,19052.0
3,2013-04-01,107190.0,147142.0,-39952.0
4,2013-05-01,106970.0,107190.0,-220.0
5,2013-06-01,125381.0,106970.0,18411.0
6,2013-07-01,116966.0,125381.0,-8415.0
7,2013-08-01,125291.0,116966.0,8325.0
8,2013-09-01,133332.0,125291.0,8041.0
9,2013-10-01,127541.0,133332.0,-5791.0
10,2013-11-01,130009.0,127541.0,2468.0


In [22]:
#plot sales diff
plot_data = [
    go.Scatter(
        x=diff['date'],
        y=diff['diff'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales Diff'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [23]:
#create dataframe for transformation from time series to supervised
df_supervised = diff.drop(['prev_sales'],axis=1)
#adding lags
for inc in range(1,13):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)

In [24]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf
# Define the regression formula
model = smf.ols(formula='diff ~ lag_1 + lag_2 + lag_3+lag_4+lag_5+lag_6+lag_7+lag_8+lag_9+lag_10+lag_11+lag_12', data=df_supervised)
# Fit the regression
model_fit = model.fit()
# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.8348273896061983


In [27]:
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df_supervised.drop(['item_cnt_day','date'],axis=1)
#split train and test set
train_set, test_set = df_model[0:-6].values, df_model[-6:].values

In [28]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)
# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

### LSTM model

In [29]:
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [32]:
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1, shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 100/100


<keras.callbacks.History at 0x27b0c2c9fa0>

In [34]:
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1, shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 100/100


<keras.callbacks.History at 0x27b0be30790>

In [37]:
y_pred = model.predict(X_test,batch_size=1)
#for multistep prediction, you need to replace X_test values with the predictions coming from t-1



In [38]:
#reshape y_pred
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])
#rebuild test set for inverse transform
pred_test_set = []
for index in range(0,len(y_pred)):
    print (np.concatenate([y_pred[index],X_test[index]],axis=1))
    pred_test_set.append(np.concatenate([y_pred[index],X_test[index]],axis=1))
#reshape pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
#inverse transform
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

[[ 0.13158534  0.06100398  0.07298624 -0.33759685 -0.85541344  0.95954114
   0.28349586  0.24661034  0.05079482  0.30049426  0.00677932  0.1034264
   0.13005944]]
[[-0.04601443  0.03808169  0.03954048  0.07298624 -0.33759685 -0.85541344
   0.95954114  0.28349586  0.24661034  0.05079482  0.30049426  0.00677932
   0.1034264 ]]
[[ 0.00252932 -0.00706416  0.01708189  0.03954048  0.07298624 -0.33759685
  -0.85541344  0.95954114  0.28349586  0.24661034  0.05079482  0.30049426
   0.00677932]]
[[ 0.34410107  0.11656285 -0.02715068  0.01708189  0.03954048  0.07298624
  -0.33759685 -0.85541344  0.95954114  0.28349586  0.24661034  0.05079482
   0.30049426]]
[[-0.10313332  0.18164853  0.09397542 -0.02715068  0.01708189  0.03954048
   0.07298624 -0.33759685 -0.85541344  0.95954114  0.28349586  0.24661034
   0.05079482]]
[[ 0.09450826  0.24763747  0.15774446  0.09397542 -0.02715068  0.01708189
   0.03954048  0.07298624 -0.33759685 -0.85541344  0.95954114  0.28349586
   0.24661034]]


In [41]:
#create dataframe that shows the predicted sales
result_list = []
sales_dates = list(train[-7:].date)
act_sales = list(train[-7:].item_cnt_day)
for index in range(0,len(pred_test_set_inverted)):
    result_dict = {}
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_sales[index])
    result_dict['date'] = sales_dates[index+1]
    result_list.append(result_dict)
df_result = pd.DataFrame(result_list)
#for multistep prediction, replace act_sales with the predicted sales

In [42]:
#merge with actual sales dataframe
df_sales_pred = pd.merge(train,df_result,on='date',how='left')
#plot actual and predicted
plot_data = [
    go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['item_cnt_day'],
        name='actual'
    ),
        go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['pred_value'],
        name='predicted'
    )
    
]
plot_layout = go.Layout(
        title='Sales Prediction'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)