<a href="https://colab.research.google.com/github/Yabin10/M5-Forecasting-Accuracy-Uncertainty/blob/master/keras_lstm2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
import os, random

Using TensorFlow backend.


In [2]:
# from psutil import virtual_memory
# ram_gb = virtual_memory().total / 1e9
# print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# if ram_gb < 20:
#   print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
#   print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
#   print('re-execute this cell.')
# else:
#   print('You are using a high-RAM runtime!')

## Load data

In [2]:
# Directory that holds the M5 competition CSV files.
# Set the M5_DATA_DIR environment variable to run on a different machine instead of
# editing this cell; the author's local folder is only used as the fallback.
# FilePath = "/Users/yabindong/Program_Dataset/M5-Forcasting/m5-forecasting-accuracy/"
FilePath = os.environ.get("M5_DATA_DIR") or "C:\\Users\\dyabin\\Documents\\Github_data\\m5-forecasting-accuracy\\"
# Keep a trailing separator so that `FilePath + 'name.csv'` style concatenation works
# even when the override was given without one.
if not FilePath.endswith(("/", "\\")):
    FilePath += os.sep

In [3]:
# define a function to reduce the memory
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast to the
    smallest of int8/int16/int32/int64 whose limits contain the column's min and max
    (limits included). Float columns are cast to float16 or float32 when their range
    fits, otherwise to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the memory usage before/after and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2  # bytes to MB

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[0:3] == 'int':
            # Inclusive bounds: a value sitting exactly on a dtype limit still fits, so
            # e.g. an int16 column containing 32767 must not be widened to int32.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (min/max is NaN, e.g. an empty column): keep the dtype as is.
        else:
            # Only the value range is checked here; narrowing a float can still round
            # individual values.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max is NaN / infinite): use full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, reduction))

    return df

In [4]:
# read data and reduce memory usage
def ReadData(Path):
    """Read the four M5 CSV tables from the directory ``Path`` and shrink their dtypes.

    Each file is loaded with ``pd.read_csv``, passed through ``reduce_mem_usage`` and
    its shape is printed.

    Parameters
    ----------
    Path : str
        Directory containing ``calendar.csv``, ``sales_train_validation.csv``,
        ``sell_prices.csv`` and ``sample_submission.csv``. A trailing separator is
        accepted but not required.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, train, SellPrice, SampleSub)`` in that order.
    """
    def _load(file_name, label):
        # Read from the directory passed by the caller (not the notebook-level global).
        frame = reduce_mem_usage(pd.read_csv(os.path.join(Path, file_name)))
        print("{} df has {} rows and {} columns".format(label, frame.shape[0], frame.shape[1]))
        return frame

    print("Reading files...")
    calendar = _load('calendar.csv', 'calendar')
    train = _load('sales_train_validation.csv', 'train')
    # These two were previously reported as "train df" as well.
    SellPrice = _load('sell_prices.csv', 'sell_prices')
    SampleSub = _load('sample_submission.csv', 'sample_submission')

    return calendar, train, SellPrice, SampleSub

In [5]:
# Load the calendar, sales, price and sample-submission tables in one call
(
    df_calendar0,
    df_train0,
    df_SellPrice0,
    df_Sample_Submission,
) = ReadData(FilePath)

Reading files...
Memory usage decreased from  0.21 Mb to  0.12 Mb (41.9% reduction)
calendar df has 1969 rows and 14 columns
Memory usage decreased from 446.40 Mb to 95.00 Mb (78.7% reduction)
train df has 30490 rows and 1919 columns
Memory usage decreased from 208.77 Mb to 130.48 Mb (37.5% reduction)
train df has 6841121 rows and 4 columns
Memory usage decreased from 13.49 Mb to  2.09 Mb (84.5% reduction)
train df has 60980 rows and 29 columns


In [6]:
# Index the sample submission by its "id" column so its index can be reused later.
# Guarded so the cell can be re-run: once "id" is the index it is no longer a column
# and a second set_index("id") would raise a KeyError.
if "id" in df_Sample_Submission.columns:
    df_Sample_Submission = df_Sample_Submission.set_index("id")

**Create date index**

In [7]:
# Daily calendar for the whole span, then the three consecutive periods cut from it
idx_tot = pd.date_range(start='2011-01-29', periods=1969, freq='D')
idx_train = idx_tot[:1913]      # training days
idx_val = idx_tot[1913:1941]    # validation time period (starts 2016-04-25)
idx_eval = idx_tot[1941:]       # evaluation time period (starts 2016-05-23)

In [8]:
# del df_SellPrice0

In [9]:
# Drop the first 6 (non-sales) columns and transpose so that rows are days and columns
# are item series; every later cell relies on this (days, items) layout.
# The guard keeps the cell idempotent: once transposed, the frame has one row per
# training day and must not be transposed again on a re-run.
if df_train0.shape[0] != len(idx_train):
    df_train0 = df_train0.iloc[:, 6:].transpose()
print('The shape of df_train0 is {}'.format(df_train0.shape))

The shape of df_train0 is (1913, 30490)


## Create features

In [10]:
# Study settings: length of each input window (in days) and the first day kept
timesteps, startDay = 14, 350

**Lag and rolling features**

In [11]:
# Sales shifted back by one week and by four weeks
df_lag7, df_lag28 = (df_train0.shift(n_days) for n_days in (7, 28))
print('The shape of lags are {}'.format(df_lag7.shape))

# Trailing means over the same two horizons
df_rolling7, df_rolling28 = (
    df_train0.rolling(window=n_days).mean() for n_days in (7, 28)
)
print('The shape of rollings are {}'.format(df_rolling7.shape))

The shape of lags are (1913, 30490)
The shape of rollings are (1913, 30490)


In [12]:
# Side-by-side feature matrix: raw sales first, then lags, then rolling means.
# ignore_index renumbers the columns 0..N-1 across all five blocks.
feature_blocks = [df_train0, df_lag7, df_lag28, df_rolling7, df_rolling28]
df_train1 = pd.concat(feature_blocks, axis=1, ignore_index=True)

In [13]:
# Label the rows of the feature matrix with the training dates
df_train1.index = idx_train

In [14]:
# Report the size of the combined feature matrix
train1_shape = df_train1.shape
print('The shape of df_train1 is {}'.format(train1_shape))

The shape of df_train1 is (1913, 152450)


**Event features**

In [15]:
# One all-zero flag per calendar day; the flags are filled in by the next cell
daysBeforeEvent = pd.DataFrame(
    np.zeros((1969, 1)),
    columns=["oneDayBeforeEvent"],
)

In [16]:
# Flag every day that is immediately followed by a day with an entry in "event_name_1".
event_rows = np.flatnonzero(df_calendar0["event_name_1"].notna().to_numpy())
# Row 0 has no preceding day; skipping it also avoids index -1 wrapping around and
# flagging the last calendar day by mistake.
day_before_rows = event_rows[event_rows > 0] - 1
daysBeforeEvent.iloc[day_before_rows, 0] = 1
daysBeforeEvent = daysBeforeEvent.astype('int32')

In [17]:
#"daysBeforeEventTest" will be used as input for predicting (We will forecast the days 1913-1941)
# daysBeforeEventTest = daysBeforeEvent[1913:]
#"daysBeforeEvent" will be used for training as a feature.
# daysBeforeEvent = daysBeforeEvent[:1913]
# Label the event flags with the full calendar (training + forecast days)
daysBeforeEvent = daysBeforeEvent.set_index(idx_tot)

In [18]:
# Append the event flag of the 1913 training days as the last feature column
event_flags_train = daysBeforeEvent.iloc[:1913]
df_train1 = pd.concat([df_train1, event_flags_train], axis=1)
print('The shape of df_train1 is {}'.format(df_train1.shape))

The shape of df_train1 is (1913, 152451)


**Select data from the start date**

In [20]:
# Keep only the rows from `startDay` onwards. Besides shortening the history, this
# removes the leading rows where the 28-day lag and rolling features are still NaN.
# Slicing by date label (instead of position) makes the cell idempotent: re-running
# it does not cut off another `startDay` rows.
df_train1 = df_train1.loc[idx_train[startDay]:]
df_train1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152441,152442,152443,152444,152445,152446,152447,152448,152449,oneDayBeforeEvent
2012-01-14,0,0,0,2,0,0,0,24,3,2,...,6.535714,1.928571,0.0,5.607143,0.0,0.785714,0.928571,0.0,0.0,0
2012-01-15,0,0,0,0,0,0,0,9,0,2,...,6.678571,1.892857,0.0,5.75,0.0,0.785714,0.964286,0.0,0.0,1
2012-01-16,0,0,0,4,2,0,0,2,1,1,...,7.107143,1.928571,0.0,5.678571,0.0,0.821429,1.035714,0.0,0.0,0
2012-01-17,0,1,0,2,0,0,0,7,1,0,...,7.0,1.857143,0.0,5.535714,0.0,0.821429,0.964286,0.0,0.0,0
2012-01-18,0,0,0,1,2,0,0,0,0,0,...,7.142857,1.857143,0.0,5.535714,0.0,0.857143,1.0,0.0,0.0,0


## Scale the training data

In [21]:
# Min-max scale every feature column to [0, 1]: learn the ranges, then apply them
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(df_train1)
train_scaled = sc.transform(df_train1)

In [22]:
# Report the size of the scaled training matrix
scaled_shape = train_scaled.shape
print("The shape of scaled data is {}".format(scaled_shape))

The shape of scaled data is (1563, 152451)


## Convert to RNN input format

In [23]:
def sliding_windows(data, seq_length, n_targets=30490):
    """Cut a 2-D array into overlapping input windows and next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` (all columns); its target is the row
    that immediately follows the window, restricted to the first ``n_targets`` columns.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array indexed as ``data[row, column]``.
    seq_length : int
        Number of consecutive rows in each input window.
    n_targets : int, default 30490
        Number of leading columns that form the prediction target. The default keeps
        the behaviour of the previously hard-coded value.

    Returns
    -------
    (list, list)
        ``X`` with ``len(data) - seq_length`` windows and ``Y`` with the matching
        targets. Both are empty when ``data`` has no more than ``seq_length`` rows.
    """
    n_windows = len(data) - seq_length
    X = [data[start:start + seq_length] for start in range(n_windows)]
    Y = [data[start + seq_length, 0:n_targets] for start in range(n_windows)]
    return X, Y

In [None]:
# Build the window/target lists and turn each of them into a numpy array
DataX, DataY = map(np.array, sliding_windows(train_scaled, timesteps))
print("The shape of DataX is {}".format(DataX.shape))
print("The shape of DataY is {}".format(DataY.shape))

## Model

In [34]:
# Three stacked LSTM layers, each followed by 20% dropout, and a dense output layer
# with one unit per target series. The input is one window: (time steps, features).
window_shape = (DataX.shape[1], DataX.shape[2])

LSTM_model = Sequential([
    # first LSTM layer passes the full sequence on to the next layer
    LSTM(units=50, return_sequences=True, input_shape=window_shape),
    Dropout(0.2),
    # second LSTM layer, again returning the full sequence
    LSTM(units=400, return_sequences=True),
    Dropout(0.2),
    # third LSTM layer returns only its last output
    LSTM(units=400),
    Dropout(0.2),
    # output layer
    Dense(units=30490),
])

In [45]:
# Configure training: mean squared error loss minimised with the Adam optimizer
LSTM_model.compile(loss='mean_squared_error', optimizer='adam')

In [63]:
# Train the network on the windowed training set
epoch, batch_size = 30, 44

LSTM_model.fit(x=DataX, y=DataY, batch_size=batch_size, epochs=epoch)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x144a96dd0>

## Predict

In [67]:
# Recursive multi-step forecast: predict one day, append it to the sales history,
# rebuild the lag / rolling / event features for the shifted window, and repeat.
horizon = 56                        # number of days to forecast
n_items = df_train0.shape[1]        # sales series = first n_items columns of each feature row
n_train_days = df_train0.shape[0]   # days with observed sales
longest_window = 28                 # largest lag / rolling window among the features

test_predictions = []
# The scaled training matrix is `train_scaled`; `df_scaled` is not defined anywhere
# in this notebook and raised a NameError on a fresh kernel.
first_eval_batch = train_scaled[-timesteps:]
current_batch = first_eval_batch.reshape((1, DataX.shape[1], DataX.shape[2]))

# Re-fit `sc` on the sales columns only, so it can scale the raw history here and
# inverse-transform the n_items-wide predictions afterwards. It is fitted on the same
# rows that produced `train_scaled`, so the sales columns get exactly the min/max the
# model saw during training.
sc.fit(df_train0.iloc[-train_scaled.shape[0]:])
# Only the most recent rows are needed to derive lags / rolling means for one window.
recent_sales = sc.transform(df_train0)[-(timesteps + longest_window - 1):]

# NOTE(review): in training the lag / rolling columns were min-max scaled as separate
# columns of df_train1, whereas below they are derived from already-scaled sales; the
# two scalings may differ slightly -- confirm whether that matters.

event_flags = daysBeforeEvent.to_numpy()

for i in range(horizon):
    # get prediction 1 time stamp ahead ([0] drops the batch dimension)
    current_pred = LSTM_model.predict(current_batch)[0]
    # store prediction
    test_predictions.append(current_pred)

    # extend the history with the prediction, keeping just enough rows for a 28-day
    # lag / rolling mean over the last `timesteps` rows
    recent_sales = np.vstack([recent_sales, current_pred])[-(timesteps + longest_window):]
    df_recent = pd.DataFrame(recent_sales)
    current_lag7 = df_recent.shift(7).to_numpy()[-timesteps:]
    current_lag28 = df_recent.shift(28).to_numpy()[-timesteps:]
    current_rolling7 = df_recent.rolling(window=7).mean().to_numpy()[-timesteps:]
    current_rolling28 = df_recent.rolling(window=28).mean().to_numpy()[-timesteps:]

    # The shifted window ends on the day just predicted (row n_train_days + i), so the
    # event flags must cover the same `timesteps` days. Absolute positions are used
    # because the previous negative-offset slice lagged one day behind the window.
    window_end = n_train_days + i + 1
    current_events = event_flags[window_end - timesteps:window_end]

    # update batch to now include prediction and drop first value
    current_sales = np.append(current_batch[:, 1:, 0:n_items], [[current_pred]], axis=1)
    current_batch = np.concatenate(
        (current_sales[0], current_lag7, current_lag28, current_rolling7, current_rolling28, current_events),
        axis=1,
    ).reshape(1, DataX.shape[1], DataX.shape[2])

In [68]:
# Number of forecast steps and number of series predicted at each step
n_steps = len(test_predictions)
n_series = len(test_predictions[0])
print("The length of the data is {}".format(n_steps))
print("The row of the data is {}".format(n_series))

The length of the data is 56
The row of the data is 30490


**Scale the predictions back to the original units**

In [69]:
# Undo the min-max scaling on the stacked predictions (one row per forecast day)
stacked_predictions = np.asarray(test_predictions)
test_predictions = sc.inverse_transform(stacked_predictions)

## Create submission file

In [70]:
# One row per forecast day; the first 28 rows and the remaining rows are split apart
df_submission0 = pd.DataFrame(test_predictions)
df_submission_val, df_submission_eval = df_submission0.iloc[:28], df_submission0.iloc[28:]

In [71]:
# Renumber the second block from 0 so its rows line up with the first block
df_submission_eval.index = pd.RangeIndex(28)

In [72]:
# Put both blocks side by side (days as rows), then flip so each row is one series
submission_by_day = pd.concat([df_submission_val, df_submission_eval], axis=1)
df_submission = submission_by_day.T

In [73]:
# Reuse the row and column labels of the sample submission
submission_template = df_Sample_Submission
df_submission.columns = submission_template.columns
df_submission.index = submission_template.index

In [74]:
# Replace negative forecasts with zero
df_submission = df_submission.mask(df_submission < 0, 0)

In [75]:
# Preview the first five rows of the submission table
df_submission.head(n=5)

Unnamed: 0_level_0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_validation,0.735254,0.005742,0.005746,0.005743,0.005744,0.005705,0.005666,0.005607,0.005569,0.005488,...,0.00507,0.005055,0.005011,0.004915,0.004001,0.002327,0.000331,0.0,0.0,0.0
HOBBIES_1_002_CA_1_validation,0.294894,0.211994,0.211993,0.211995,0.212007,0.212017,0.212027,0.212029,0.212039,0.212043,...,0.212069,0.212075,0.212076,0.212065,0.211964,0.211714,0.21137,0.211006,0.210656,0.210336
HOBBIES_1_003_CA_1_validation,0.484005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HOBBIES_1_004_CA_1_validation,2.122116,1.078057,1.078035,1.078046,1.078118,1.078128,1.078095,1.077961,1.077887,1.077778,...,1.077058,1.077065,1.077045,1.076976,1.076895,1.076312,1.075432,1.074728,1.074327,1.074258
HOBBIES_1_005_CA_1_validation,1.193682,0.99657,0.996563,0.996562,0.996573,0.996555,0.996517,0.996444,0.99639,0.996319,...,0.995967,0.995965,0.995952,0.995915,0.995792,0.99528,0.994493,0.993634,0.992775,0.991952


In [76]:
# df_submission.to_csv(r'/Users/yabindong/Program_Dataset/M5-Forcasting/Submission.csv')