In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras_self_attention import SeqSelfAttention

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Directory holding the M5 CSV files read below (absolute, machine-specific path).
dataPath = '/Users/yanzeliu/Downloads/m5-forecasting-accuracy'
# Number of consecutive rows in each input window built for the LSTM.
timesteps = 14
# First row index kept for training; rows before it are dropped before scaling.
startDay = 350

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left as is.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.

    Notes
    -----
    Only the value *range* is checked. A float column that fits the float16
    range is stored as float16 even though that dtype keeps far fewer
    significant digits than float32/float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    # memory_usage() reports bytes per column; the sum divided by 1024**2 is MB.
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        # The first three characters of the dtype name separate 'int..' from 'flo..'.
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extremes sit exactly on the
                # limits (e.g. max == 127) still fits the narrower type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame that reports zero bytes does not raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Load the sales table (one row per item/store series, one d_N column per day) and preview it.
dt = pd.read_csv(dataPath + "/sales_train_validation.csv")
dt.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1


In [6]:
# Downcast numeric columns. reduce_mem_usage modifies dt in place and returns it,
# so t and dt name the same DataFrame object.
t = reduce_mem_usage(dt)

Mem. usage decreased to 95.00 Mb (78.7% reduction)


In [7]:
# Alias only: no copy is made, dt_r refers to the same DataFrame as t.
dt_r = t

In [8]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
t.info()
# Transpose so that rows are days and columns are series. Reading from `t`
# rather than from dt_r keeps the cell idempotent: re-running it no longer
# flips the frame back to its original orientation.
dt_r = t.T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int16(1306), int8(607), object(6)
memory usage: 95.0+ MB
None


In [9]:
# Keep only the daily sales rows (labels "d_1", "d_2", ...), dropping the six
# id/category rows that the transpose placed at the top. Selecting by label
# instead of the positional slice [6:] makes the cell safe to re-run: a second
# execution no longer discards six more days.
is_day_row = dt_r.index.str.startswith('d_')
# Transposing a frame that mixes text and integer columns yields object dtype.
# infer_objects() restores a numeric dtype so the later shift/rolling/scaling
# steps operate on real numbers instead of boxed Python objects.
dt_r = dt_r.loc[is_day_row].infer_objects()
dt_r.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_1,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
d_2,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
d_3,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
d_4,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0
d_5,0,0,0,0,0,0,0,0,0,0,...,0,5,1,0,2,0,0,2,0,0
d_6,0,0,0,0,0,0,0,4,2,0,...,0,13,2,0,7,3,5,4,0,0
d_7,0,0,0,0,0,0,0,6,3,0,...,0,18,1,0,8,1,0,1,0,0
d_8,0,0,0,0,0,0,0,5,9,0,...,0,13,3,0,4,4,1,8,0,0
d_9,0,0,0,0,0,0,0,7,0,0,...,0,18,4,0,4,1,1,5,0,0
d_10,0,0,0,0,0,0,0,0,0,0,...,0,19,1,0,1,0,3,2,0,0


In [10]:
# Two lagged copies of the sales matrix: shifted down by 7 rows and by 28 rows.
# The first 7 (respectively 28) rows of each copy are NaN.
dt_lag7, dt_lag28 = (dt_r.shift(lag) for lag in (7, 28))

In [11]:
# Trailing rolling means of every series over 7-row and 28-row windows;
# the first (window - 1) rows of each result are NaN.
df_rolling7, df_rolling28 = (dt_r.rolling(window=span).mean() for span in (7, 28))
# Rolling standard deviations are left disabled:
#df_rollstd7 = dt.rolling(window=7).std()
#df_rollstd28 = dt.rolling(window=28).std()

In [12]:
# Place raw sales, both lags and both rolling means side by side.
# ignore_index=True renumbers the resulting columns 0..N-1.
dt_new = pd.concat(
    objs=[dt_r, dt_lag7, dt_lag28, df_rolling7, df_rolling28],
    axis="columns",
    ignore_index=True,
)

In [13]:
# Load the calendar table (one row per day; column "d" holds the d_N day label).
calendar = pd.read_csv(dataPath + "/calendar.csv")

In [14]:
# Preview the first rows of the calendar table.
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [15]:
# One indicator column per weekday name.
one_hot_weekday = pd.get_dummies(calendar["weekday"])

In [16]:
# Everything except the last 56 calendar rows is relabelled with the day labels
# of dt_new so it can be concatenated column-wise with the sales features.
train_ohw = one_hot_weekday.iloc[:-56].set_index(dt_new.index)
# The final 56 calendar rows are kept apart.
test_ohw = one_hot_weekday.iloc[-56:]

In [17]:
# Append the weekday indicators to the feature matrix; columns are renumbered.
dt_n = pd.concat(objs=[dt_new, train_ohw], axis="columns", ignore_index=True)

In [18]:
# SNAP flags for the three states, split the same way as the weekday one-hot:
# all but the last 56 calendar rows for training, the last 56 kept apart.
snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
snap_train = calendar[snap_cols][: -56].set_index(dt_new.index)
# Previously this slice was stored in `test_ohw`, which overwrote the weekday
# hold-out created earlier. It now has its own name.
# NOTE(review): any later cell that read the SNAP hold-out through `test_ohw`
# must switch to `snap_test`.
snap_test = calendar[snap_cols][-56: ]

In [19]:
# Append the SNAP flags to the feature matrix; columns are renumbered.
dt_nn = pd.concat(objs=[dt_n, snap_train], axis="columns", ignore_index=True)

In [20]:
# One zero-initialised flag per calendar row. The row count is taken from the
# calendar itself instead of the hard-coded 1969.
daysBeforeEvent = pd.DataFrame(np.zeros((len(calendar), 1)))
daysBeforeEvent.shape

(1969, 1)

In [21]:
# Flag each calendar row whose following day has a value in "event_name_1".
# This replaces a row-by-row loop that wrote through chained indexing
# (daysBeforeEvent[0][x-1] = 1): that form updates a temporary object when
# pandas Copy-on-Write is active, and for an event on the very first calendar
# row it targeted position -1. Shifting the event mask up by one row expresses
# the same rule; an event on the first row has no preceding day and is ignored.
event_next_day = calendar["event_name_1"].notna().shift(-1, fill_value=False)
daysBeforeEvent[0] = event_next_day.to_numpy(dtype=float)

In [22]:
#"daysBeforeEventTest" will be used as input for predicting (We will forecast the days 1913-1941)
# Rows at positions 1913..1940 (28 rows) are set aside for prediction input.
daysBeforeEventTest = daysBeforeEvent.iloc[1913:1941]
# The first 1913 rows become the training feature.
daysBeforeEvent = daysBeforeEvent.iloc[:1913]

In [23]:
# Give the single flag column a readable name.
daysBeforeEvent.columns = ["oneDayBeforeEvent"]
# Reuse the row labels of dt_nn so the column-wise concat that follows aligns row for row.
daysBeforeEvent.index = dt_nn.index

In [24]:
# Append the day-before-event flag as the last feature column
# (column labels are kept here, not renumbered).
dt_n2 = pd.concat(objs=[dt_nn, daysBeforeEvent], axis="columns")

In [25]:
# Drop the first `startDay` rows. Among them are the rows where the 28-row lag
# and rolling-mean features are still NaN. The configured constant is used
# instead of a second hard-coded 350 so the two cannot drift apart.
dt_fi = dt_n2.iloc[startDay:]

In [26]:
from sklearn.preprocessing import MinMaxScaler

In [27]:
# Rescale every feature column to the [0, 1] range; the result is a NumPy array.
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(dt_fi)
dt_scaled = sc.transform(dt_fi)

In [28]:
# Build supervised samples: each input is the `timesteps` scaled rows preceding
# row i, each target is row i restricted to the raw-sales columns.
# Bounds are derived from the data instead of the literals 1913 - 350 - 28 and 30490:
#   - len(dt_scaled) is the number of rows left after the leading rows were dropped;
#   - the last 28 rows are excluded (presumably held out for validation; confirm);
#   - the raw sales occupy the first dt_r.shape[1] columns because dt_r was the
#     first frame passed to the feature concat.
n_series = dt_r.shape[1]
last_target_row = len(dt_scaled) - 28
X_train = [dt_scaled[i - timesteps:i] for i in range(timesteps, last_target_row)]
y_train = [dt_scaled[i][0:n_series] for i in range(timesteps, last_target_row)]

In [29]:
# Stack the per-sample windows into dense arrays. float32 is requested
# explicitly: at the shape printed below, (1521, 14, 152461), a float64 input
# tensor needs about 26 GB and float32 needs about 13 GB. Keras computes in its
# default float type (float32 unless reconfigured), so the extra precision
# would not reach the model anyway.
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32)
print(X_train.shape)
print(y_train.shape)

(1521, 14, 152461)
(1521, 30490)


In [32]:
#np.save('/Users/yanzeliu/Desktop/X_train.npy', X_train)
#np.save('/Users/yanzeliu/Desktop/y_train.npy', y_train)

In [None]:
# Stacked-LSTM regressor: three LSTM layers, each followed by 20% dropout,
# and a dense layer that emits one value per target column.
regressor = Sequential()

# First LSTM layer. It returns the full sequence so the next LSTM receives one
# vector per timestep. The input shape is (timesteps, features), read from X_train.
layer_1_units=100
regressor.add(LSTM(units = layer_1_units, return_sequences = True, input_shape = X_train.shape[1:]))
regressor.add(Dropout(0.2))

# Second LSTM layer, again returning the full sequence.
layer_2_units=500
regressor.add(LSTM(units = layer_2_units, return_sequences = True))
regressor.add(Dropout(0.2))

# Third LSTM layer; only its final output is passed on.
layer_3_units=400
regressor.add(LSTM(units = layer_3_units))
regressor.add(Dropout(0.2))

# Output layer. Its width is taken from y_train rather than the literal 30490,
# so it always matches the number of target columns.
regressor.add(Dense(units = y_train.shape[1]))

# Mean squared error on the scaled targets, optimised with Adam.
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Fit on the training windows.
epoch_no=32
batch_size_RNN=128
regressor.fit(X_train, y_train, epochs = epoch_no, batch_size = batch_size_RNN)

Epoch 1/32
 128/1521 [=>............................] - ETA: 5:49 - loss: 0.0240