# RNN, LSTM for M5 demand forecasting
- This notebook improves on the previous basic RNN notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Global matplotlib defaults: larger font and a 12x8 inch figure.
plt.rcParams.update({
    'font.size': 16,
    'figure.figsize': (12, 8),
})

In [2]:
import warnings 
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
from numba import cuda #use gpu to accelerate
import os, random
from fastprogress import master_bar, progress_bar

In [4]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

Using TensorFlow backend.


## Load data

In [5]:
# change the file path if run on different machines
# Data directory. Set the M5_DATA_DIR environment variable to override the
# default instead of editing this cell on every machine.
FilePath = os.environ.get(
    "M5_DATA_DIR",
    "C:\\Users\\dyabin\\Documents\\Github_data\\m5-forecasting-accuracy\\",
)
# File names are appended by plain string concatenation, so the directory
# must end with a path separator.
if not FilePath.endswith(("\\", "/")):
    FilePath += os.sep

In [6]:
# define a function to reduce the memory
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the smallest of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to
    float16 or float32 when the value range fits, otherwise to float64.

    Note that the float check only looks at the value *range*: casting to
    float16/float32 can still round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory footprint before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # bytes to MB

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[0:3] == 'int':
            # Try the integer types from narrowest to widest; the bounds are
            # inclusive so a column spanning exactly -128..127 becomes int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, reduction))

    return df

In [7]:
# read data and reduce memory usage
def ReadData(Path):
    """Load the four M5 CSV files from ``Path`` and shrink their dtypes.

    Parameters
    ----------
    Path : str
        Directory containing the CSV files. File names are appended by plain
        string concatenation, so it must end with a path separator.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, train, SellPrice, SampleSub)`` read from
        ``calendar.csv``, ``sales_train_validation.csv``, ``sell_prices.csv``
        and ``sample_submission.csv``, each passed through
        ``reduce_mem_usage``.
    """
    print("Reading files...")

    def _load(file_name, label):
        # Read one CSV from the directory given by the caller (not the
        # notebook-level global), downcast it and report its shape.
        frame = reduce_mem_usage(pd.read_csv(Path + file_name))
        print("{} df has {} rows and {} columns".format(label, frame.shape[0], frame.shape[1]))
        return frame

    calendar = _load('calendar.csv', 'calendar')
    train = _load('sales_train_validation.csv', 'train')
    SellPrice = _load('sell_prices.csv', 'sell_prices')
    SampleSub = _load('sample_submission.csv', 'sample_submission')

    return calendar, train, SellPrice, SampleSub

In [8]:
# Load calendar, sales, price and sample-submission frames from the data directory.
df_calendar0, df_train0, df_SellPrice0, df_Sample_Submission = ReadData(FilePath)

Reading files...
Memory usage decreased from  0.21 Mb to  0.12 Mb (41.9% reduction)
calendar df has 1969 rows and 14 columns
Memory usage decreased from 446.40 Mb to 95.00 Mb (78.7% reduction)
train df has 30490 rows and 1919 columns
Memory usage decreased from 208.77 Mb to 130.48 Mb (37.5% reduction)
train df has 6841121 rows and 4 columns
Memory usage decreased from 13.49 Mb to  2.09 Mb (84.5% reduction)
train df has 60980 rows and 29 columns


In [9]:
# Key the sample submission by its "id" column; its index and columns are
# reused later to label the forecast table.
df_Sample_Submission = df_Sample_Submission.set_index("id")

In [10]:
# Preview the wide sales table: identifier columns followed by one column per day (d_1, d_2, ...).
df_train0.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [11]:
# Drop the six leading identifier columns (id, item_id, dept_id, cat_id,
# store_id, state_id) and keep only the daily sales columns d_1 ... d_1913.
df_train1 = df_train0.iloc[:,6:]
df_train1.head()

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,0,0,0,0,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,0,0,0,0,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


**GPU use**

In [13]:
#device = 'cpu'
#device = 'cuda'

**Set all random seeds**

In [12]:
SEED = 1234


def seed_everything(seed):
    """Seed the RNGs this notebook controls directly.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both NumPy's global generator and Python's ``random`` module.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)

## Preprocessing data: datetime index
- The timeline is 2011-01-29 to 2016-06-19: This is the whole 1969 days 
- The given training data is from 2011-01-29 to 2016-04-24
- The validation period is 2016-04-25 to 2016-05-22 and the evaluation period is 2016-05-23 to 2016-06-19 (28 days each)

In [13]:
# create time series index
# Daily calendars, given by their first and last day:
# 1913 training days, then two consecutive 28-day forecast windows.
idx_train = pd.date_range('2011-01-29', '2016-04-24', freq='D')
idx_val = pd.date_range('2016-04-25', '2016-05-22', freq='D')   # validation period
idx_eval = pd.date_range('2016-05-23', '2016-06-19', freq='D')  # evaluation period

In [14]:
# Rebuild from df_train0 rather than transposing df_train1 in place, so that
# re-running this cell cannot flip the frame back to one row per item.
# Result: one row per day, one column per item.
df_train1 = df_train0.iloc[:, 6:].transpose()
df_train1.index = idx_train

In [15]:
# Preview the transposed table: dates as rows, item positions as columns.
df_train1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
2011-01-29,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
2011-01-30,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
2011-01-31,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
2011-02-01,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0
2011-02-02,0,0,0,0,0,0,0,0,0,0,...,0,5,1,0,2,0,0,2,0,0


## Preprocessing data: scale

In [16]:
# Fit one min-max scaling per column (i.e. per item), then apply it.
# The fitted scaler is kept so forecasts can be mapped back later.
scaler = MinMaxScaler()
scaler.fit(df_train1)
train_scale = scaler.transform(df_train1)

print("The shape after scale is {}".format(train_scale.shape))
print("The data type after scale is {}".format(type(train_scale)))

The shape after scale is (1913, 30490)
The data type after scale is <class 'numpy.ndarray'>


In [17]:
# Pick a random row (day) of the scaled matrix and check its width.
# randrange excludes the upper bound; randint(0, n) could return n itself,
# which is one past the last valid row index.
id_rand = random.randrange(df_train1.shape[0])
train_scale[id_rand].shape

(30490,)

In [19]:
# # check the shape before and after scale
# print("The column # {}".format(id_rand))

# fig, axs = plt.subplots(2) 
# item=id_rand
# df_train1.iloc[:,item].plot(kind='hist', title="An item before scale", ax=axs[0], legend=False)
# pd.DataFrame(train_scale[:,item]).plot(kind='hist', title="An item after scale", ax=axs[1], legend=False)

## Preprocessing data: create sequence 

In [20]:
def sliding_windows(data, seq_length):
    """Cut ``data`` into overlapping input windows and their next-step targets.

    For every start position ``s`` with ``s + seq_length < len(data)``, the
    input is ``data[s:s + seq_length]`` and the target is
    ``data[s + seq_length]``.

    Returns
    -------
    tuple of list
        ``(X, Y)`` with one entry per window; both are empty when ``data``
        has no more than ``seq_length`` elements.
    """
    starts = range(len(data) - seq_length)
    X = [data[start:start + seq_length] for start in starts]
    Y = [data[start + seq_length] for start in starts]
    return X, Y

In [21]:
seq_length = 28  # days of history fed to the model for each prediction
DataX, DataY = sliding_windows(train_scale, seq_length)

# Stack the windows as float32. The window tensor has
# (n_windows, seq_length, n_items) elements, so a narrower float halves its
# footprint compared with 64-bit floats.
# NOTE(review): assumes Keras is left at its default float32 precision — confirm.
DataX = np.array(DataX, dtype=np.float32)
DataY = np.array(DataY, dtype=np.float32)

In [23]:
# Shapes of the model inputs and targets, one per line.
print(DataX.shape, DataY.shape, sep="\n")

(1885, 28, 30490)
(1885, 30490)


## LSTM model: create

In [24]:
# attention to the dimension of the model
# Axis 1 of DataX is the number of time steps per window (28 here);
# axis 2 is the number of parallel series (30490 here).
n_steps, n_features = DataX.shape[1:3]

In [25]:
#Initiate the model
LSTM_model = Sequential()

# First LSTM layer: consumes (n_steps, n_features) windows and passes the
# full sequence on to the next recurrent layer.
LSTM_model.add(LSTM(units=512, return_sequences=True, input_shape=(n_steps, n_features)))
LSTM_model.add(Dropout(0.2))

# Second LSTM layer, still emitting one output per time step.
LSTM_model.add(LSTM(units=512, return_sequences=True))
LSTM_model.add(Dropout(0.2))

# Third LSTM layer: return_sequences is left at its default, so only the
# final step is passed to the output layer.
LSTM_model.add(LSTM(units=216))
LSTM_model.add(Dropout(0.2))

# Output layer: one value per series. Sized from the data instead of a
# hard-coded 30490 so the model stays consistent with DataY.
LSTM_model.add(Dense(units=n_features))

In [26]:
# Compile the model with the Adam optimizer and a mean-squared-error loss
# (computed on the scaled values the model is trained on).
LSTM_model.compile(optimizer='adam', loss='mean_squared_error')

## LSTM model: Input parameters and train

In [27]:
# Training schedule: number of passes over the windows and windows per gradient step.
epoch, batch_size = 30, 65

LSTM_model.fit(x=DataX, y=DataY, batch_size=batch_size, epochs=epoch)


Epoch 1/30

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\dyabin\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-27-e8b861420467>", line 4, in <module>
    LSTM_model.fit(DataX, DataY, epochs = epoch, batch_size = batch_size)
  File "C:\Users\dyabin\Anaconda3\lib\site-packages\keras\engine\training.py", line 1239, in fit
    validation_freq=validation_freq)
  File "C:\Users\dyabin\Anaconda3\lib\site-packages\keras\engine\training_arrays.py", line 196, in fit_loop
    outs = fit_function(ins_batch)
  File "C:\Users\dyabin\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py", line 3292, in __call__
    run_metadata=self.run_metadata)
  File "C:\Users\dyabin\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1458, in __call__
    run_metadata_ptr)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most r

KeyboardInterrupt: 

In [None]:
# Plot the training loss recorded by the model trained above. The original
# referenced an undefined name `model`; the trained network is `LSTM_model`.
loss_per_epoch = LSTM_model.history.history['loss']
plt.plot(range(len(loss_per_epoch)), loss_per_epoch)
plt.xlabel('epoch')
plt.ylabel('training loss (MSE)');

## Predict using the LSTM

In [None]:
# Number of days to forecast: 28 validation days followed by 28 evaluation
# days. Kept separate from n_steps so changing the window length does not
# silently change the forecast horizon.
forecast_horizon = 2 * 28

test_predictions = []

# Start from the last n_steps scaled observations, shaped as a batch of one window.
first_eval_batch = train_scale[-n_steps:]
current_batch = first_eval_batch.reshape((1, n_steps, n_features))

for _ in range(forecast_horizon):
    # Predict one step ahead; [0] drops the batch axis, leaving one value per series.
    current_pred = LSTM_model.predict(current_batch)[0]

    test_predictions.append(current_pred)

    # Slide the window: drop the oldest step and append the new prediction,
    # so later forecasts are conditioned on earlier ones.
    current_batch = np.append(current_batch[:, 1:, :], [[current_pred]], axis=1)

**Scale back**

In [None]:
# Undo the min-max scaling so the forecasts are back in the original sales units.
# NOTE(review): this rebinds test_predictions, so running the cell twice would
# apply the inverse transform twice; re-run the prediction cell first.
test_predictions = scaler.inverse_transform(test_predictions)

## Create submission file

In [None]:
# One row per forecast day, one column per series.
df_submission0 = pd.DataFrame(test_predictions)

# The first 28 days are the validation forecasts, the rest the evaluation forecasts.
df_submission_val, df_submission_eval = df_submission0.iloc[:28], df_submission0.iloc[28:]

In [None]:
# Relabel the evaluation rows 0..27 so they line up with the validation rows
# when the two blocks are concatenated side by side.
df_submission_eval.index=range(0,28)

In [None]:
# Put the validation and evaluation forecasts side by side (28 rows), then
# flip so each row is one series and each column one forecast day.
df_submission = pd.concat([df_submission_val, df_submission_eval], axis=1).T

In [None]:
# Preview the forecast table before the submission labels are attached.
df_submission.head()

In [None]:
# Take the row ids and column names from the sample submission.
# NOTE(review): labels are assigned by position — assumes the forecast rows are
# in the same order as the sample submission ids; confirm.
df_submission.index = df_Sample_Submission.index
df_submission.columns = df_Sample_Submission.columns

In [None]:
# Replace negative forecasts with zero.
df_submission = df_submission.clip(lower=0)

In [None]:
# Preview the labelled, non-negative submission table.
df_submission.head()

In [None]:
#df_submission.to_csv(r'C:\\Users\\dyabin\\Documents\\Github_data\\m5-forecasting-accuracy-submission\\SecondSubmission.csv')