In [18]:
import numpy as np
import pandas as pd
# !pip install pytorch_lightning
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from torchsummary import summary

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
import numpy as np

from pathlib import Path
from argparse import ArgumentParser
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pickle


data_dir = Path.home()/'data/kaggle/m5-forecasting-accuracy'

x_cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
        'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI']
x_cont_cols = ['sell_price']

#### TODO
 - normalize y
 - sales price is 0. fix it. 

In [None]:
!ls $data_dir

#### Sales

In [27]:
%%time
sales = pd.read_csv(data_dir/'sales_train_validation.csv')
print(f'sales.shape: {sales.shape}')
cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# encode cat cols
encoders = {}
for col in cat_cols:
    encoder =  OrdinalEncoder()
    sales[[col]] = encoder.fit_transform(sales[[col]])
    sales[col] = sales[col].astype(np.long)
    encoders[col] = encoder
    
# change day column names to just day number
day_cols = {col: col.split('_')[1] for col in sales.columns if col.startswith('d_')}
sales.rename(columns=day_cols, inplace=True)

sales.shape: (30490, 1919)
CPU times: user 7.42 s, sys: 4.79 s, total: 12.2 s
Wall time: 12.2 s


In [28]:
num_days = len(day_cols)
num_stores = sales['store_id'].nunique()
num_items = sales['item_id'].nunique()
print('num store_items - ', num_stores * num_items)

num store_items -  30490


In [None]:
sales['item_id'].nunique()

#### Calendar

In [None]:
calendar = pd.read_csv(data_dir/'calendar.csv')\
            .rename(columns={'d':'day'})

cat_cal_cols = ['wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI']
# ignore_cal_cols = ['wm_yr_wk']

for col in cat_cal_cols:
    
    # impute
    if str(calendar[col].dtype)[:3] == 'obj':
        fill_value = 'abcxyz' 
    elif str(calendar[col].dtype)[:3] == 'int':
        fill_value = -1
    calendar[[col]] = SimpleImputer(strategy='constant', fill_value=fill_value).fit_transform(calendar[[col]])
    
    # encode
    if col not in encoders:
        encoders[col] = OrdinalEncoder().fit(calendar[[col]])
    calendar[[col]] = encoders[col].transform(calendar[[col]])
    calendar[col] = calendar[col].astype(np.long)
    
# change day column names to just day number
calendar['day'] = calendar['day'].apply(lambda x: x.split('_')[1])
calendar['day'] = calendar['day'].astype(np.long)

calendar.head(2)

#### Prices

In [None]:
%%time
prices = pd.read_csv(data_dir/'sell_prices.csv')
for col in ['store_id', 'item_id', 'wm_yr_wk']:
    prices[[col]] = encoders[col].transform(prices[[col]])
    prices[col] = prices[col].astype(np.long)

### Merge

In [None]:
%%time
sales2 = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], 
                                       var_name='day', value_name='demand')
sales2['day'] = sales2['day'].astype(np.long)

sales2.sort_values('day', inplace=True)
calendar.sort_values('day', inplace=True)

sales2 = sales2.merge(calendar, on='day', how='left')
sales2 = sales2.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
sales2['sell_price'] = sales2['sell_price'].astype(np.float32)
sales2['sell_price'] = sales2['sell_price'].fillna(0.0)

sales2.sort_values(['item_id', 'store_id','day'], inplace=True)

# scale continuous columns
scalers = {}
for col in ['sell_price','demand']:
    scaler = MinMaxScaler()
    sales2[[col]] = scaler.fit_transform(sales2[[col]])
    scalers[col] = scaler

In [None]:
sales2.to_parquet('combined.pq')
with open('encoders.pkl','wb') as f:
    pickle.dump(encoders,f)
    
with open('scalers.pkl','wb') as f:
    pickle.dump(scalers, f)

### Creating tensors

In [19]:
%%time
sales2 = pd.read_parquet('combined.pq')
print(sales2.shape)
sales2.columns

(58327370, 22)
CPU times: user 24.9 s, sys: 45.7 s, total: 1min 10s
Wall time: 14.6 s


Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day',
       'demand', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price'],
      dtype='object')

In [20]:
%%time
x = torch.tensor(sales2[x_cat_cols + x_cont_cols].values)
y = torch.tensor(sales2['demand'].values)

CPU times: user 11.9 s, sys: 26.9 s, total: 38.7 s
Wall time: 15.9 s


In [21]:
sales2[x_cat_cols + x_cont_cols].dtypes

item_id           int64
dept_id           int64
cat_id            int64
store_id          int64
state_id          int64
weekday           int64
wday              int64
month             int64
year              int64
event_name_1      int64
event_type_1      int64
event_name_2      int64
event_type_2      int64
snap_CA           int64
snap_TX           int64
snap_WI           int64
sell_price      float64
dtype: object

In [22]:
%%time

# from fastai v2
def get_emb_size(nunique):
    return min(600, round(1.6 * nunique**0.56))

emb_sizes = [(sales2[col].nunique(), get_emb_size(sales2[col].nunique())) for col in x_cat_cols]

CPU times: user 10.8 s, sys: 373 ms, total: 11.1 s
Wall time: 11.1 s


In [23]:
with open('emb_sz.pkl','wb') as f:
    pickle.dump(emb_sizes,f )

In [24]:
# group_size = num_items * num_stores
# group_size

In [29]:
%%time
num_features = x.size(1)
x1 = x.view(-1, num_days, num_features).refine_names('item_store', 'day','features')\
        .align_to('day','item_store','features').contiguous()

y1 = y.view(-1, num_days).refine_names('item_store', 'day')\
    .align_to('day', 'item_store').contiguous()

print(f'x1.shape - {x1.shape} y1.shape - {y1.shape}')

x1.shape - torch.Size([1913, 30490, 17]) y1.shape - torch.Size([1913, 30490])
CPU times: user 43.7 s, sys: 10.4 s, total: 54.1 s
Wall time: 2.37 s


In [30]:
%%time
torch.save(x1.rename(None), 'x.pt')
torch.save(y1.rename(None), 'y.pt')


CPU times: user 82 µs, sys: 10.1 s, total: 10.1 s
Wall time: 12.3 s


### Training

In [31]:
class M5DataSet(Dataset):
    def __init__(self,x, y, src_len, tgt_len, bsz, dstype='train'):
        assert dstype in ['train', 'test','val']
        self.x = x
        self.y = y
        self.src_len = src_len
        self.tgt_len = tgt_len
        self.bsz = bsz
        self.dstype = dstype
        
    def __len__(self):
        if self.dstype == 'train':
            l = (self.x.size(0) - (self.src_len + self.tgt_len)) 
            return l
        
        if self.dstype == 'test':
            return 1
        
        return l
    
    def __getitem__(self, idx):
        if self.dstype == 'train':
            # we have 30490 item_stores. We may not be able to load them all. So randomly pick bsz items. 
            item_store_mask = list(np.random.randint(0, self.x.size(1),(self.bsz,)))
        elif self.dstype == 'test':
            item_store_mask = list(np.arange(self.x.size(1)))
            idx = self.x.size(0) - self.src_len
        
        x_src = self.x.rename(None)[idx:idx+self.src_len, item_store_mask, :]
        x_tgt = self.x.rename(None)[idx+self.src_len:idx+self.src_len+self.tgt_len, item_store_mask, :]
        y_src = self.y.rename(None)[idx:idx+self.src_len, item_store_mask]
        y_tgt = self.y.rename(None)[idx+self.src_len:idx+self.src_len+self.tgt_len, item_store_mask]
#         print(f'x.shape - {self.x.shape} y.shape - {self.y.shape} idx - {idx}. x_item.shape - {x_item.shape} y_item.shape - {y_item.shape}')
        return x_src, x_tgt, y_src, y_tgt, item_store_mask

# train_ds = M5DataSet(x1, y1, src_len, tgt_len, 200)
# train_dl = DataLoader(train_ds, batch_size=1, shuffle=True, pin_memory=True)

In [32]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
# def insert_embedding(inp, dim, index, emb):
#     """
#     Replace columns with their embeddings. Works only with 2-d tensors.
#     TODO - make it work for multi-dim tensors

#     :param inp: tensor of two or more dimensions
#     :param dim: dimension along which tensor should be expanded by inserting the embedding
#     :param i: index of tensor along dim which is to be embedded
#     :param emb: Embedding of shape [v,d], where v vocab_size and d is embedding dimension
#     :return: 
#     """
#     # create a slice of the data to be replaced with embedding. 
#     s = inp.index_select(dim, torch.tensor([index])).squeeze(dim)
#     embedded = emb(s.type(torch.long))
    
#     first_indices = torch.arange(0,index)
#     last_indices = torch.arange(index+1,inp.size(dim))

#     return torch.cat([inp.index_select(dim, first_indices), embedded.type(inp.dtype), inp.index_select(dim, last_indices)], axis=dim)

In [33]:
gx = torch.load('x.pt')
gy = torch.load('y.pt')

In [36]:
class SalesModel(LightningModule):
    def __init__(self, hparams):
        super(SalesModel, self).__init__()
        self.hparams = hparams
        self.x_cat_cols = x_cat_cols
        self.x_cont_cols = x_cont_cols
        self.pos_encoder = PositionalEncoding(hparams.ninp, hparams.dropout)
#         encoder_layers = nn.TransformerEncoderLayer(hparams.ninp, hparams.nhead, hparams.nhid, hparams.dropout)
#         decoder_layers = nn.TransformerDecoderLayer(hparams.ninp, hparams.nhead, hparams.nhid, hparams.dropout)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layers, hparams.nlayers)
#         self.transformer_decoder = nn.TransformerDecoder(decoder_layers, hparams.nlayers)
#         self.lin = nn.Linear()
        self.transformer = nn.Transformer(d_model=hparams.ninp, nhead=hparams.nhead, 
                                          num_encoder_layers=hparams.nlayers,
                                          num_decoder_layers=hparams.nlayers,
                                          dim_feedforward=hparams.nhid)
        self.criterion = nn.MSELoss()
        self.lin = nn.Linear(hparams.ninp, 1)
        self.sigmoid = nn.Sigmoid()
        
        print('reading data', flush=True)
        self.x = gx
        self.y = gy

        with open('emb_sz.pkl','rb') as f:
            emb_szs = pickle.load(f)
        print(f'emb_szs - {emb_szs}')
                    
        self.embs = nn.ModuleList([nn.Embedding(e[0],e[1]) for e in emb_szs])
        
#     def init_weights(self):
#         initrange = 0.1
#         self.src_embedding.weight.data.uniform_(-initrange, initrange)
#         self.decoder.bias.data.zero_()
#         self.decoder.weight.data.uniform_(-initrange, initrange)
        
    @staticmethod
    def add_model_specifi_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--bsz', default=20, type=int, help='batch_size', )
        parser.add_argument('--src-len', default=90, type=int, help='source length')
        parser.add_argument('--tgt-len', default=28, type=int, help='target length')
        parser.add_argument('--ninp', default=320, type=int, help='expected features in the input')
        parser.add_argument('--nhead', default=4, type=int, help='number of attention heads')
        parser.add_argument('--nhid', default=256, type=int, help='dimesion of feed-forward network model')
        parser.add_argument('--nlayers', default=2, type=int, help='number of encoder layers')
        parser.add_argument('--dropout', default=0.2, type=float, help='dropout')
        
        # they are not hyper params, but adding them as pytorch lightening can save them
        parser.add_argument('--num-cat-cols', default=len(x_cat_cols), type=int, help='number of categorical columns')
        parser.add_argument('--num-cont-cols', default=len(x_cont_cols), type=int, help='number of numeric columns')
        return parser
    
#     def _generate_square_subsequent_mask(self, sz):
#         # populate the lower triangle with True and rest with False
#         return torch.tril(torch.ones(sz, sz)) == 1.0
    
    def prepare_data(self):
        pass
        
    def train_dataloader(self):
        train_ds = M5DataSet(self.x, self.y, self.hparams.src_len, self.hparams.tgt_len, self.hparams.bsz)  
        print(f'train_ds.length - {len(train_ds)}')
        train_dl = DataLoader(train_ds, batch_size=1, shuffle=True, pin_memory=True)
        return train_dl
    
    def test_dataloader(self):
        test_ds = M5DataSet(self.x, self.y, self.hparams.src_len, self.hparams.tgt_len, self.hparams.bsz, dstype='test')  
        test_dl = DataLoader(test_ds, batch_size=1, shuffle=False)
        return test_dl

    
    def emb_lookups(self, xb, yb=None):
        embs_t = []
        for idx in range(self.hparams.num_cat_cols):
#             print('looking up for ', idx)
            embs_t.append(self.embs[idx](xb[:,:,idx].type(torch.long)))
        xb_cat = torch.cat(embs_t, dim=2)
        xb_cont = xb[:,:,self.hparams.num_cat_cols:]
        
        if yb is not None:
            xb = torch.cat([xb_cat, xb_cont.type(xb_cat.dtype), yb.unsqueeze(2).type(xb_cat.dtype)], dim=2)
        else:
            xb = torch.cat([xb_cat, xb_cont.type(xb_cat.dtype)], dim=2)
            
        #pad to adjust the feature dimension
        dim3_shortfall = self.hparams.ninp - xb.size(2)
        assert dim3_shortfall >= 0
        pad = nn.ConstantPad1d(padding=(0,dim3_shortfall),value=0)
        xb = pad(xb) 

        return xb

    def forward(self, x_src, y_src, x_tgt):
        offset = 0
        
        x_src = self.emb_lookups(x_src, y_src)
        x_tgt = self.emb_lookups(x_tgt)
            
        x_src = self.pos_encoder(x_src)
#         print('shape after pos encoder - ', x_src.size())
        out = self.transformer(x_src, x_tgt)
#         print('shape after transformer - ', out.size())
        out = self.sigmoid(self.lin(out))
        
        return out
    
    def training_step(self, batch, batch_idx):
        x_src, x_tgt, y_src, y_tgt, item_store_mask = batch
        x_src = x_src.squeeze(0)
        x_tgt = x_tgt.squeeze(0)
        y_src = y_src.squeeze(0)
        y_tgt = y_tgt.squeeze(0)
        
#         print(f'x_src.shape - {tuple(x_src.shape)} \t x_tgt.shape - {tuple(x_tgt.shape)} \t y_src.shape - {y_src.shape} \t y_tgt.shape - {y_tgt.shape}')
        yhat_tgt = self(x_src, y_src, x_tgt)

        # apply the mask (due to random selection of item_stores) to output
#         idxs = list(np.arange(0,x_src.size(1)))
#         idxs = [1 if o in item_store_mask else 0 for o in idxs]
#         mask = torch.tensor(idxs) * torch.ones(y_tgt.size(0), y_tgt.size(1))
#         print(f'mask.shape: {mask.shape}')
#         
        loss = self.criterion((yhat_tgt).reshape(-1).type(torch.float32), (y_tgt).reshape(-1).type(torch.float32))
        if batch_idx%10 == 0:
            print(f'{batch_idx} loss: {loss}  yhat_tgt.sum: {yhat_tgt.sum().item()}  y_tgt.sum: {y_tgt.sum().item()}')
            
        return {'loss': loss}
    
    def test(self):
        dl = self.test_dataloader()
        batch = next(iter(dl))
        return batch
        

In [37]:
src_len = 90
tgt_len = 28
# bsz = 200
# model = SalesModel(hparams)

parser = ArgumentParser()
parser = SalesModel.add_model_specifi_args(parser)
hparams = parser.parse_args('--bsz 2 --ninp 320 --nhid 128 --nlayers 1'.split())

checkpoint_callback = ModelCheckpoint(
    filepath='models/weights.ckpt',
    verbose=True
)

# hparams.__setattr__('x_cat_cols', x_cat_cols)
# hparams.__setattr__('x_cont_cols', x_cont_cols)
model = SalesModel(hparams)
trainer = Trainer(gpus=1,max_epochs=1)
trainer.fit(model)
trainer.save_checkpoint('models/weights.ckpt')

reading data
emb_szs - [(3049, 143), (7, 5), (3, 3), (10, 6), (3, 3), (7, 5), (7, 5), (12, 6), (6, 4), (31, 11), (5, 4), (5, 4), (3, 3), (2, 2), (2, 2), (2, 2)]


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …

train_ds.length - 1795
0 loss: 0.329624742269516  yhat_tgt.sum: 31.57564926147461  y_tgt.sum: 0.002621231979030144
10 loss: 7.812944318175141e-07  yhat_tgt.sum: 0.03121904283761978  y_tgt.sum: 0.03407601572739188
20 loss: 2.3384134692605585e-05  yhat_tgt.sum: 0.022272393107414246  y_tgt.sum: 0.19397116644823068
30 loss: 1.105664068745682e-05  yhat_tgt.sum: 0.019277438521385193  y_tgt.sum: 0.10353866317169069
40 loss: 9.619941465643933e-07  yhat_tgt.sum: 0.016730470582842827  y_tgt.sum: 0.028833551769331587
50 loss: 2.3461093405785505e-06  yhat_tgt.sum: 0.014959679916501045  y_tgt.sum: 0.05242463958060288
60 loss: 1.4739691323484294e-05  yhat_tgt.sum: 0.022178534418344498  y_tgt.sum: 0.163826998689384
70 loss: 2.9955435820738785e-06  yhat_tgt.sum: 0.01608908176422119  y_tgt.sum: 0.06422018348623854
80 loss: 1.4129650480754208e-06  yhat_tgt.sum: 0.015672950074076653  y_tgt.sum: 0.039318479685452164
90 loss: 0.0011536155361682177  yhat_tgt.sum: 0.016710707917809486  y_tgt.sum: 1.306684141

### testing

In [40]:
%%time
model = SalesModel.load_from_checkpoint('models/weights.ckpt')
x_src, x_tgt, y_src, y_tgt, item_store_mask = model.test()
x_src = x_src.squeeze(0)
x_tgt = x_tgt.squeeze(0)
y_src = y_src.squeeze(0)
y_tgt = y_tgt.squeeze(0)
print(f'x_src.shape - {x_src.shape}')

model.eval()
print('starting inference...')
yhat_tgt = model(x_src, y_src, x_tgt)
yhat_tgt.shape

reading data
emb_szs - [(3049, 143), (7, 5), (3, 3), (10, 6), (3, 3), (7, 5), (7, 5), (12, 6), (6, 4), (31, 11), (5, 4), (5, 4), (3, 3), (2, 2), (2, 2), (2, 2)]
x_src.shape - torch.Size([90, 30490, 17])
starting inference...
CPU times: user 4min 33s, sys: 2min 56s, total: 7min 30s
Wall time: 24.5 s


torch.Size([0, 30490, 1])

In [41]:
yhat_tgt

tensor([], size=(0, 30490, 1), grad_fn=<SigmoidBackward>)

## Playground

In [None]:
x_src.shape

In [None]:
item_store_mask = list(np.random.randint(0, 10,3))
item_store_mask

In [None]:
torch.randn(10).sum().item()

In [None]:
dir(hparams)