In [158]:
import os

import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
#from jupyterthemes import jtplot
#jtplot.style(theme='chesterish')
import pickle
from scipy.spatial.distance import euclidean #used for fdt
import fastdtw as fdt #fast dynamic time warping
from statsmodels.tsa.seasonal import seasonal_decompose #decompose seasonality
from statsmodels.tsa.stattools import adfuller #test if series is stationary (then can perform ARIMA)

from pmdarima.arima import auto_arima
import xgboost as xgb #xgboost model
import tensorflow as tf #DNN estimator model
%matplotlib inline
# Relative input directory (Kaggle-style layout).
# NOTE(review): no cell visible here reads this variable; paths are built from DATA_DIR instead.
# Confirm it is still needed before relying on it.
path = '../input/'

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils.vis_utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from keras import optimizers, models
from datetime import datetime

In [159]:
# Default size applied to every matplotlib figure created after this cell.
plt.rcParams.update({"figure.figsize": [16,9]})

In [160]:
# --- Paths -------------------------------------------------------------------
# Project root. Defaults to the original author's checkout; set the
# DEMAND_FORECASTING_ROOT environment variable to run the notebook from another
# machine without editing this cell.
PROJECT_ROOT = os.environ.get(
    'DEMAND_FORECASTING_ROOT',
    r'C:\Users\yashc\DataspellProjects\DemandForecastingGCP',
)
_ENSEMBLE_DIR = os.path.join(PROJECT_ROOT, 'forecasting_ensemble')

# The trailing '' keeps a path separator at the end of DATA_DIR, so code that
# builds file names with `DATA_DIR + name` keeps working.
DATA_DIR = os.path.join(_ENSEMBLE_DIR, 'data', 'raw', 'store_item', '')
LSTM_MODEL_DIR = os.path.join(_ENSEMBLE_DIR, 'models')
TRAIN_PATH = os.path.join(DATA_DIR, 'train.csv')
TEST_PATH = os.path.join(DATA_DIR, 'test.csv')

# --- Column roles --------------------------------------------------------------
GROUPED_COLS = ['item','store','date']  # keys the sales are averaged over
LABEL_COL = 'sales'                     # value being forecast
INDEX_COL = 'date'                      # date column name used for sorting

# --- LSTM hyper-parameters -----------------------------------------------------
LSTM_PARAMS = {
    'LSTM_WINDOW_LENGTH':29,   # passed as `window` to series_to_supervised
    'LSTM_PREDICTION_LAG':90,  # passed as `lag` to series_to_supervised
    'LSTM_NODES':30,           # units of the LSTM layer built in lstm_model
    'LSTM_EPOCHS':10,          # intended number of training epochs
    'LSTM_BATCH':256,          # intended mini-batch size
    'LSTM_LR':0.001            # intended optimizer learning rate
}

# --- XGBoost hyper-parameters (mapped onto booster parameters in xboost) --------
XGB_PARAMS = {
    'XGB_MAX_DEPTH':3,
    'XGB_ETA':0.2,
    'XGB_SILENT':1,
    'XGB_SUBSAMPLE':1,
    'XGB_NUM_ROUNDS':1000      # number of boosting rounds
}

In [161]:
def SMAPE(forecast, actual):
    """Print and return the Symmetric Mean Absolute Percentage Error, in percent.

    Parameters
    ----------
    forecast, actual : array-like
        Predicted and observed values (Series, ndarray or list). Both are
        flattened and compared position by position.

    Returns
    -------
    float
        ``100 / n * sum(|f - a| / ((|f| + |a|) / 2))`` where ``n`` is the total
        number of observations. Pairs where both values are zero contribute
        nothing to the sum but still count towards ``n``. NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of values.
    """
    forecast_arr = np.asarray(forecast, dtype=float).ravel()
    actual_arr = np.asarray(actual, dtype=float).ravel()
    if forecast_arr.size != actual_arr.size:
        raise ValueError(
            'forecast and actual must have the same number of values, got %d and %d'
            % (forecast_arr.size, actual_arr.size)
        )

    if forecast_arr.size == 0:
        # The metric is undefined without observations; avoid dividing by zero.
        score = float('nan')
    else:
        # Skip 0/0 pairs, which would otherwise produce NaN terms.
        defined = ~((forecast_arr == 0) & (actual_arr == 0))
        diff = np.abs(forecast_arr[defined] - actual_arr[defined])
        avg = (np.abs(forecast_arr[defined]) + np.abs(actual_arr[defined])) / 2
        score = float(np.sum(diff / avg) / forecast_arr.size * 100)

    print('SMAPE Error Score: ' + str(round(score, 2)) + ' %')
    return score

In [162]:
def Fuller(TimeSeries):
    """Run the Augmented Dickey-Fuller test on TimeSeries and print a report.

    The report lists the test statistic, the p-value and every critical value
    returned by ``adfuller``. Nothing is returned.
    """
    # adfuller's result is positional: statistic, p-value, two fields unused
    # here, then the critical-value mapping.
    adf_stat, p_value, _, _, critical_values, *_ = adfuller(TimeSeries)

    report = [
        'ADF Statistic: %f' % adf_stat,
        'p-value: %f' % p_value,
        'Critical Values:',
    ]
    report.extend('\t%s: %.3f' % (level, threshold) for level, threshold in critical_values.items())

    for line in report:
        print(line)

In [163]:
def xboost(x_train, y_train, x_test, xgb_params):
    """Train an XGBoost booster on the training data and return it.

    Parameters
    ----------
    x_train : DataFrame
        Training features; its column names become the booster's feature names.
    y_train : array-like
        Training labels.
    x_test : DataFrame
        Not used for training. Kept in the signature so existing callers
        continue to work.
    xgb_params : dict
        Must provide 'XGB_MAX_DEPTH', 'XGB_ETA', 'XGB_SILENT',
        'XGB_SUBSAMPLE' and 'XGB_NUM_ROUNDS'.

    Returns
    -------
    xgboost.Booster
        The trained model.
    """
    dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=list(x_train.columns))

    booster_params = {
        'max_depth': xgb_params['XGB_MAX_DEPTH'],
        'eta': xgb_params['XGB_ETA'],
        # XGBoost replaced the boolean `silent` flag with `verbosity`
        # (0 = silent, 1 = warnings); translate so the setting is still honoured.
        'verbosity': 0 if xgb_params['XGB_SILENT'] else 1,
        'subsample': xgb_params['XGB_SUBSAMPLE'],
    }

    return xgb.train(booster_params, dtrain, num_boost_round=xgb_params['XGB_NUM_ROUNDS'])

In [164]:
def lstm_model(params,x_train, y_train, x_valid, y_valid, lstm_model_dir,is_trained=0):
    """Return an LSTM regressor, either reloaded from disk or freshly trained.

    Parameters
    ----------
    params : dict
        Needs 'LSTM_NODES', 'LSTM_LR', 'LSTM_EPOCHS' and 'LSTM_BATCH' when training.
    x_train, x_valid : ndarray
        3-D inputs; axes 1 and 2 of ``x_train`` define the LSTM input shape.
    y_train, y_valid : array-like
        Training and validation targets.
    lstm_model_dir : str
        Directory that saved models are written to and reloaded from.
    is_trained : bool or int, default 0
        When truthy, skip training and load the most recently modified saved
        LSTM model from ``lstm_model_dir``.

    Returns
    -------
    keras model
        The loaded or newly trained model.

    Raises
    ------
    FileNotFoundError
        If ``is_trained`` is truthy and no saved LSTM model exists.
    """
    if is_trained:
        # Saved LSTM models are the entries named like 'model-...' that are not
        # XGBoost artefacts.
        candidates = [
            os.path.join(lstm_model_dir, entry)
            for entry in os.listdir(lstm_model_dir)
            if 'model' in entry and 'xgb' not in entry
        ]
        if not candidates:
            raise FileNotFoundError(f'No saved LSTM model found in {lstm_model_dir!r}')
        # Pick the newest by modification time: the month-first stamp in the
        # name does not sort chronologically across years.
        return models.load_model(max(candidates, key=os.path.getmtime))

    model_lstm = Sequential()
    model_lstm.add(LSTM(params['LSTM_NODES'], activation='relu', input_shape=(x_train.shape[1], x_train.shape[2])))
    model_lstm.add(Dense(1))
    # The learning rate is passed positionally because its keyword name differs
    # between Keras releases (`lr` vs `learning_rate`).
    adam = optimizers.Adam(params['LSTM_LR'])
    model_lstm.compile(loss='mse', optimizer=adam)
    model_lstm.summary()
    model_lstm.fit(
        x_train, y_train,
        validation_data=(x_valid, y_valid),
        epochs=params['LSTM_EPOCHS'],
        batch_size=params['LSTM_BATCH'],
        verbose=True,
    )

    # Save where the `is_trained` branch looks, so a later call can find it.
    os.makedirs(lstm_model_dir, exist_ok=True)
    folder_name = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
    model_lstm.save(os.path.join(lstm_model_dir, f'model-{folder_name}'))

    return model_lstm



In [165]:
def read_data(path, date_index_col):
    """Load a CSV file and index it by a parsed datetime column.

    Parameters
    ----------
    path : str
        Location of the CSV file. A falsy value (``''``/``None``) returns ``None``.
    date_index_col : int or str
        Position or name of the column holding the dates; it becomes the index.

    Returns
    -------
    DataFrame or None
        The frame with a DatetimeIndex, or ``None`` when no path was given.
    """
    if not path:
        # Long-standing contract of this helper: no path, no frame.
        return None

    # `infer_datetime_format` is omitted on purpose: it has no effect without
    # `parse_dates` and is deprecated in pandas 2.x. Dates are parsed below.
    df = pd.read_csv(path, index_col=date_index_col)
    df.index = pd.to_datetime(df.index)
    return df

In [166]:
# Load both CSVs; the column at position 0 (train) / 1 (test) becomes the datetime index.
train = read_data(path=TRAIN_PATH, date_index_col=0)
test = read_data(path=TEST_PATH, date_index_col=1)

In [167]:
# Prepare data for LSTM
def lstm_data_preprocessing(data, grouped_cols, label_col, date_col='date'):
    """Average the label over each combination of the grouping columns.

    The index is moved back into a regular column, rows are ordered by
    ``date_col``, and the mean of ``label_col`` is taken per ``grouped_cols``
    group. The result has exactly the columns ``grouped_cols + [label_col]``.
    """
    flat = data.reset_index().sort_values(date_col)
    averaged = flat.groupby(grouped_cols, as_index=False)[label_col].mean()
    averaged.columns = grouped_cols + [label_col]
    return averaged


In [168]:
def xgboost_data_preprocessing(data,date_col):
    """Add calendar features derived from ``date_col`` to ``data``.

    ``data`` is modified in place: ``date_col`` is converted to datetime and
    the columns 'year', 'quarter', 'month', 'weekofyear', 'weekday' and
    'dayofweek' are added. The same frame is also returned for convenience.

    Parameters
    ----------
    data : DataFrame
        Frame containing ``date_col``.
    date_col : str
        Name of the column holding the dates.

    Returns
    -------
    DataFrame
        ``data`` itself, with the extra columns.
    """
    # Adding date based features
    data[date_col] = pd.to_datetime(data[date_col])
    dates = data[date_col].dt

    data['year'] = dates.year
    data['quarter'] = dates.quarter
    data['month'] = dates.month

    if hasattr(dates, 'isocalendar'):
        # `.dt.weekofyear` no longer exists in pandas 2.x; the ISO week is the
        # same quantity. Cast away the nullable UInt32 dtype so the column
        # stays a plain numeric one (float only if missing dates are present).
        iso_week = dates.isocalendar().week
        data['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # Older pandas without isocalendar().
        data['weekofyear'] = dates.weekofyear

    # 'weekday' and 'dayofweek' carry the same values; both are kept because
    # downstream code may reference either name.
    data['weekday'] = dates.weekday
    data['dayofweek'] = dates.dayofweek

    return data

In [169]:
# Transform the data into a time series problem

def series_to_supervised(data, window=1, lag=1, dropnan=True):
    """Turn stacked per-(item, store) series into supervised LSTM samples.

    For every row ``t`` the sample holds the values at ``t-window .. t`` as
    features and ``sales`` at ``t+lag`` as the label. A sample is kept only
    when the rows at ``t-window``, ``t`` and ``t+lag`` all carry the same
    'item' and 'store', so neither the features nor the label cross into a
    different series. The caller's frame is not modified.

    Parameters
    ----------
    data : DataFrame
        Must contain 'date', 'item', 'store' and 'sales' columns, with each
        (item, store) series stored as one contiguous, date-ordered run.
    window : int, default 1
        Number of past steps included as features.
    lag : int, default 1
        How many steps ahead the label lies.
    dropnan : bool, default True
        Drop samples that have missing values after shifting.

    Returns
    -------
    tuple
        ``(X_train_series, Y_train, X_valid_series, Y_valid)`` from a 60/40
        random split (``random_state=0``); the X arrays are reshaped to
        ``(samples, columns, 1)``.
    """
    # Drop the date column on a copy so the caller's frame keeps it and the
    # calling cell can be re-run.
    data = data.drop(columns='date')

    cols, names = [], []

    # Input sequence (t-n, ... t-1)
    for i in range(window, 0, -1):
        cols.append(data.shift(i))
        names += ['%s(t-%d)' % (col, i) for col in data.columns]

    # Current timestep (t=0)
    cols.append(data)
    names += ['%s(t)' % col for col in data.columns]

    # Target timestep (t=lag)
    cols.append(data.shift(-lag))
    names += ['%s(t+%d)' % (col, lag) for col in data.columns]

    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # Drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    print(agg.columns)

    # Keep only samples that stay inside one (item, store) series. Because the
    # series are contiguous, matching the oldest step and the label step
    # against the current step covers everything in between.
    series_keys = ['item', 'store']
    same_series = pd.Series(True, index=agg.index)
    for key in series_keys:
        current = agg['%s(t)' % key]
        if window > 0:
            same_series &= current == agg['%s(t-%d)' % (key, window)]
        same_series &= current == agg['%s(t+%d)' % (key, lag)]
    agg = agg[same_series]

    # The identifier columns were only needed for the filter above.
    columns_to_drop = ['%s(t)' % key for key in series_keys]
    columns_to_drop += ['%s(t+%d)' % (key, lag) for key in series_keys]
    for i in range(window, 0, -1):
        columns_to_drop += ['%s(t-%d)' % (key, i) for key in series_keys]
    agg = agg.drop(columns=columns_to_drop)

    # Label
    labels_col = 'sales(t+%d)' % lag
    labels = agg[labels_col]
    features = agg.drop(columns=[labels_col])

    X_train, X_valid, Y_train, Y_valid = train_test_split(features, labels.values, test_size=0.4, random_state=0)

    # Add a trailing axis of size 1 (samples, columns, 1) for the LSTM input.
    X_train_series = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_valid_series = X_valid.values.reshape((X_valid.shape[0], X_valid.shape[1], 1))
    print('Train set shape', X_train_series.shape)
    print('Validation set shape', X_valid_series.shape)

    return X_train_series, Y_train, X_valid_series, Y_valid

In [170]:
# Mean of LABEL_COL for every GROUPED_COLS combination, as input for the LSTM windowing step.
lstm_processed_data = lstm_data_preprocessing(
    data=train,
    grouped_cols=GROUPED_COLS,
    label_col=LABEL_COL,
    date_col=INDEX_COL,
)

In [171]:

# Hand over a copy so lstm_processed_data keeps its 'date' column whatever the
# helper does to its input; this keeps the cell safe to run more than once.
X_train_series, Y_train, X_valid_series, Y_valid = series_to_supervised(
    lstm_processed_data.copy(),
    window=LSTM_PARAMS['LSTM_WINDOW_LENGTH'],
    lag=LSTM_PARAMS['LSTM_PREDICTION_LAG'],
)

Index(['item(t-29)', 'store(t-29)', 'sales(t-29)', 'item(t-28)', 'store(t-28)',
       'sales(t-28)', 'item(t-27)', 'store(t-27)', 'sales(t-27)', 'item(t-26)',
       'store(t-26)', 'sales(t-26)', 'item(t-25)', 'store(t-25)',
       'sales(t-25)', 'item(t-24)', 'store(t-24)', 'sales(t-24)', 'item(t-23)',
       'store(t-23)', 'sales(t-23)', 'item(t-22)', 'store(t-22)',
       'sales(t-22)', 'item(t-21)', 'store(t-21)', 'sales(t-21)', 'item(t-20)',
       'store(t-20)', 'sales(t-20)', 'item(t-19)', 'store(t-19)',
       'sales(t-19)', 'item(t-18)', 'store(t-18)', 'sales(t-18)', 'item(t-17)',
       'store(t-17)', 'sales(t-17)', 'item(t-16)', 'store(t-16)',
       'sales(t-16)', 'item(t-15)', 'store(t-15)', 'sales(t-15)', 'item(t-14)',
       'store(t-14)', 'sales(t-14)', 'item(t-13)', 'store(t-13)',
       'sales(t-13)', 'item(t-12)', 'store(t-12)', 'sales(t-12)', 'item(t-11)',
       'store(t-11)', 'sales(t-11)', 'item(t-10)', 'store(t-10)',
       'sales(t-10)', 'item(t-9)', 'store(t-