In [2]:
import numpy as np
import pandas as pd

In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

Using TensorFlow backend.


In [2]:
from keras_self_attention import SeqSelfAttention

In [4]:
dataPath = '/Users/yanzeliu/Downloads/m5-forecasting-accuracy'
timesteps = 14
startDay = 350

In [5]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    #iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    #df.memory_usage()返回每一列的内存，通过sum()算出总内存，除完后结果是xxMB
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            #[:3]是把'int1XX'后面的数字去掉
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
dt = pd.read_csv(dataPath + "/sales_train_validation.csv")
dt.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1


In [7]:
dt = reduce_mem_usage(dt)

Mem. usage decreased to 95.00 Mb (78.7% reduction)


In [8]:
dt_r = dt

In [9]:
print(dt_r.info())
dt_r = dt_r.T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int16(1306), int8(607), object(6)
memory usage: 95.0+ MB
None


In [10]:
dt_r = dt_r[6: ]
dt_r.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_1,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
d_2,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
d_3,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
d_4,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0
d_5,0,0,0,0,0,0,0,0,0,0,...,0,5,1,0,2,0,0,2,0,0
d_6,0,0,0,0,0,0,0,4,2,0,...,0,13,2,0,7,3,5,4,0,0
d_7,0,0,0,0,0,0,0,6,3,0,...,0,18,1,0,8,1,0,1,0,0
d_8,0,0,0,0,0,0,0,5,9,0,...,0,13,3,0,4,4,1,8,0,0
d_9,0,0,0,0,0,0,0,7,0,0,...,0,18,4,0,4,1,1,5,0,0
d_10,0,0,0,0,0,0,0,0,0,0,...,0,19,1,0,1,0,3,2,0,0


In [11]:
#two lags, 7 and 14
dt_lag7 = dt_r.shift(7)
dt_lag28 = dt_r.shift(28)

In [12]:
# add a rolling mean
df_rolling7 = dt_r.rolling(window=7).mean()
df_rolling28 = dt_r.rolling(window=28).mean()
#df_rollstd7 = dt.rolling(window=7).std()
#df_rollstd28 = dt.rolling(window=28).std()

In [13]:
dt_new = pd.concat([dt_r, dt_lag7, dt_lag28, df_rolling7, df_rolling28], axis=1, ignore_index=True)

In [14]:
dt_new.shape

(1913, 152450)

In [94]:
#dt_new.to_csv('/Users/yanzeliu/Desktop/dt_new.csv')

In [15]:
calendar = pd.read_csv(dataPath + "/calendar.csv")

In [16]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [17]:
one_hot_weekday = pd.get_dummies(calendar.weekday)

In [18]:
train_ohw = one_hot_weekday[: -56].set_index(dt_new.index)
test_ohw = one_hot_weekday[-56: ]

In [19]:
dt_n = pd.concat([dt_new, train_ohw], axis=1, ignore_index=True)

In [20]:
snap_train = calendar[['snap_CA', 'snap_TX', 'snap_WI']][: -56].set_index(dt_new.index)
test_ohw = calendar[['snap_CA', 'snap_TX', 'snap_WI']][-56: ]

In [21]:
dt_nn = pd.concat([dt_n, snap_train], axis=1, ignore_index=True)

In [22]:
dt_nn.shape

(1913, 152460)

In [23]:
dt_n.shape

(1913, 152457)

In [24]:
daysBeforeEvent = pd.DataFrame(np.zeros((1969,1)))
daysBeforeEvent.shape

(1969, 1)

In [25]:
for x,y in calendar.iterrows():
    if((pd.isnull(calendar["event_name_1"][x])) == False):
           daysBeforeEvent[0][x-1] = 1 

In [26]:
#"daysBeforeEventTest" will be used as input for predicting (We will forecast the days 1913-1941)
daysBeforeEventTest = daysBeforeEvent[1913:1941]
#"daysBeforeEvent" will be used for training as a feature.
daysBeforeEvent = daysBeforeEvent[:1913]

In [27]:
daysBeforeEvent.columns = ["oneDayBeforeEvent"]
daysBeforeEvent.index = dt_nn.index

In [28]:
dt_n2 = pd.concat([dt_nn, daysBeforeEvent], axis = 1)

In [29]:
dt_n2.columns

Index([                  0,                   1,                   2,
                         3,                   4,                   5,
                         6,                   7,                   8,
                         9,
       ...
                    152451,              152452,              152453,
                    152454,              152455,              152456,
                    152457,              152458,              152459,
       'oneDayBeforeEvent'],
      dtype='object', length=152461)

In [30]:
dt_n2.shape

(1913, 152461)

In [31]:
dt_fi = dt_n2[350: ]

In [32]:
dt_fi.shape

(1563, 152461)

In [137]:
#dt_fi.to_csv('/Users/yanzeliu/Desktop/dt_final.csv')

In [33]:
dt_ftest = dt_fi[-28: ]

In [34]:
dt_ftrain = dt_fi[: -28]

In [35]:
dt_ftrain.shape

(1535, 152461)

In [36]:
from sklearn.preprocessing import MinMaxScaler

In [37]:
sc = MinMaxScaler(feature_range = (0, 1))
dt_scaled = sc.fit_transform(dt_ftrain)

In [38]:
dt_scaled.shape

(1535, 152461)

In [39]:
X_train = []
y_train = []
for i in range(timesteps, 1913 - 350 - 28):
    X_train.append(dt_scaled[i-timesteps:i])
    y_train.append(dt_scaled[i][0:152461])

In [40]:
X_train = np.array(X_train)
y_train = np.array(y_train)
print(X_train.shape)
print(y_train.shape)

(1521, 14, 152461)
(1521, 152461)


In [None]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm


class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

usage example:

train_df = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
train_fold_df = train_df.iloc[:, :-28]
valid_fold_df = train_df.iloc[:, -28:]
valid_preds = valid_fold_df.copy() + np.random.randint(100, size=valid_fold_df.shape)

evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

In [None]:
# Initialising the RNN
regressor = Sequential()

# Adding the first LSTM layer and some Dropout regularisation
layer_1_units=100000
regressor.add(LSTM(units = layer_1_units, return_sequences = True, input_shape = (X_train.shape[1], X_train.shape[2])))
regressor.add(Dropout(0.2))

# Adding a second LSTM layer and some Dropout regularisation
layer_2_units=5000
regressor.add(LSTM(units = layer_2_units, return_sequences = True))
regressor.add(Dropout(0.2))

# Adding a third LSTM layer and some Dropout regularisation
layer_3_units=400
regressor.add(LSTM(units = layer_3_units))
regressor.add(Dropout(0.2))

# Adding the output layer
regressor.add(Dense(units = 30490))

# Compiling the RNN
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Fitting the RNN to the Training set
epoch_no=32
batch_size_RNN=128
regressor.fit(X_train, y_train, epochs = epoch_no, batch_size = batch_size_RNN)