In [2]:
import numpy as np
import pandas as pd
# !pip install pytorch_lightning
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import Trainer

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
import numpy as np

from pathlib import Path
from argparse import ArgumentParser
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pickle

data_dir = Path.home()/'data/kaggle/m5-forecasting-accuracy'

#### TODO
 - normalize y
 - sales price is 0. fix it. 

In [None]:
!ls $data_dir

#### Sales

In [11]:
%%time
sales = pd.read_csv(data_dir/'sales_train_validation.csv')
print(f'sales.shape: {sales.shape}')
cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# encode cat cols
encoders = {}
for col in cat_cols:
    encoder =  OrdinalEncoder()
    sales[[col]] = encoder.fit_transform(sales[[col]])
    sales[col] = sales[col].astype(np.long)
    encoders[col] = encoder
    
# change day column names to just day number
day_cols = {col: col.split('_')[1] for col in sales.columns if col.startswith('d_')}
sales.rename(columns=day_cols, inplace=True)

sales.shape: (30490, 1919)
CPU times: user 7.4 s, sys: 4.19 s, total: 11.6 s
Wall time: 11.7 s


In [12]:
num_days = len(day_cols)
num_stores = sales['store_id'].nunique()
num_items = sales['item_id'].nunique()
print('num store_items - ', num_stores * num_items)

num store_items -  30490


In [None]:
sales['item_id'].nunique()

#### Calendar

In [None]:
calendar = pd.read_csv(data_dir/'calendar.csv')\
            .rename(columns={'d':'day'})

cat_cal_cols = ['wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI']
# ignore_cal_cols = ['wm_yr_wk']

for col in cat_cal_cols:
    
    # impute
    if str(calendar[col].dtype)[:3] == 'obj':
        fill_value = 'abcxyz' 
    elif str(calendar[col].dtype)[:3] == 'int':
        fill_value = -1
    calendar[[col]] = SimpleImputer(strategy='constant', fill_value=fill_value).fit_transform(calendar[[col]])
    
    # encode
    if col not in encoders:
        encoders[col] = OrdinalEncoder().fit(calendar[[col]])
    calendar[[col]] = encoders[col].transform(calendar[[col]])
    calendar[col] = calendar[col].astype(np.long)
    
# change day column names to just day number
calendar['day'] = calendar['day'].apply(lambda x: x.split('_')[1])
calendar['day'] = calendar['day'].astype(np.long)

calendar.head(2)

#### Prices

In [None]:
%%time
prices = pd.read_csv(data_dir/'sell_prices.csv')
for col in ['store_id', 'item_id', 'wm_yr_wk']:
    prices[[col]] = encoders[col].transform(prices[[col]])
    prices[col] = prices[col].astype(np.long)

### Merge

In [None]:
%%time
sales2 = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], 
                                       var_name='day', value_name='demand')
sales2['day'] = sales2['day'].astype(np.long)

sales2.sort_values('day', inplace=True)
calendar.sort_values('day', inplace=True)

sales2 = sales2.merge(calendar, on='day', how='left')
sales2 = sales2.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
sales2['sell_price'] = sales2['sell_price'].astype(np.float32)
sales2['sell_price'] = sales2['sell_price'].fillna(0.0)

sales2.sort_values(['item_id', 'store_id','day'], inplace=True)

sales2.to_parquet('combined.pq')

### Creating tensors

In [None]:
%%time
sales2 = pd.read_parquet('combined.pq')
print(sales2.shape)
sales2.columns

In [7]:
%%time
x_cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day',
        'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI']
x_cont_cols = ['sell_price']

CPU times: user 4 µs, sys: 9 µs, total: 13 µs
Wall time: 26.5 µs


In [None]:
%%time
# sales2.sort_values(['item_id', 'store_id','day'], inplace=True)
sales2['sell_price'] = sales2['sell_price'].fillna(0.0)
sales2['sell_price'] = sales2['sell_price'].astype(np.float32)

In [None]:
%%time
x = torch.tensor(sales2[x_cat_cols + x_cont_cols].values)
y = torch.tensor(sales2['demand'].values)

In [None]:
sales2[x_cat_cols + x_cont_cols].dtypes

In [None]:
%%time

# from fastai v2
def get_emb_size(nunique):
    return min(600, round(1.6 * nunique**0.56))

emb_sizes = [(sales2[col].nunique(), get_emb_size(sales2[col].nunique())) for col in x_cat_cols]

In [None]:
with open('emb_sz.pkl','wb') as f:
    pickle.dump(emb_sizes,f )

In [None]:
group_size = num_items * num_stores
group_size

In [None]:
%%time
num_features = x.size(1)
x1 = x.view(-1, num_days, num_features).refine_names('item_store', 'day','features')\
        .align_to('day','item_store','features').contiguous()

y1 = y.view(-1, num_days).refine_names('item_store', 'day')\
    .align_to('day', 'item_store').contiguous()

print(f'x1.shape - {x1.shape} y1.shape - {y1.shape}')

In [None]:
%%time
torch.save(x1.rename(None), 'x.pt')
torch.save(y1.rename(None), 'y.pt')


In [None]:
# torch.unique(x1.rename(None))

In [20]:
class M5DataSet(Dataset):
    def __init__(self,x, y, src_len, tgt_len, bsz):
        self.x = x
        self.y = y
        self.src_len = src_len
        self.tgt_len = tgt_len
        self.bsz = bsz
        
    def __len__(self):
        l = (self.x.size(0) - (self.src_len + self.tgt_len)) 
        return l
    
    def __getitem__(self, idx):
        # we have 30490 item_stores. We may not be able to load them all. So randomly pick bsz items. 
        item_store_ids = list(np.random.randint(0, self.x.size(1),(self.bsz,)))
        
        x_src = self.x.rename(None)[idx:idx+self.src_len, item_store_ids, :]
        x_tgt = self.x.rename(None)[idx+self.src_len:idx+self.src_len+self.tgt_len, item_store_ids, :]
        y_src = self.y.rename(None)[idx:idx+self.src_len, item_store_ids]
        y_tgt = self.y.rename(None)[idx+self.src_len:idx+self.src_len+self.tgt_len, item_store_ids]
#         print(f'x.shape - {self.x.shape} y.shape - {self.y.shape} idx - {idx}. x_item.shape - {x_item.shape} y_item.shape - {y_item.shape}')
        return x_src, x_tgt, y_src, y_tgt

# train_ds = M5DataSet(x1, y1, src_len, tgt_len, 200)
# train_dl = DataLoader(train_ds, batch_size=1, shuffle=True, pin_memory=True)

In [16]:
%%time
x_b, y_b = next(iter(train_dl))
bsz = 100
print(f'x_b.shape - {x_b.shape}, y_b.shape - {y_b.shape}')

NameError: name 'train_dl' is not defined

### Training using lightening

In [17]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
def insert_embedding(inp, dim, index, emb):
    """
    Replace columns with their embeddings. Works only with 2-d tensors.
    TODO - make it work for multi-dim tensors

    :param inp: tensor of two or more dimensions
    :param dim: dimension along which tensor should be expanded by inserting the embedding
    :param i: index of tensor along dim which is to be embedded
    :param emb: Embedding of shape [v,d], where v vocab_size and d is embedding dimension
    :return: 
    """
    # create a slice of the data to be replaced with embedding. 
    s = inp.index_select(dim, torch.tensor([index])).squeeze(dim)
    embedded = emb(s.type(torch.long))
    
    first_indices = torch.arange(0,index)
    last_indices = torch.arange(index+1,inp.size(dim))

    return torch.cat([inp.index_select(dim, first_indices), embedded.type(inp.dtype), inp.index_select(dim, last_indices)], axis=dim)

In [29]:
gx = torch.load('x.pt')
gy = torch.load('y.pt')

In [52]:
class SalesModel(LightningModule):
    def __init__(self, hparams, x_cat_cols, x_cont_cols):
        super(SalesModel, self).__init__()
        self.hparams = hparams
        self.x_cat_cols = x_cat_cols
        self.x_cont_cols = x_cont_cols
        self.pos_encoder = PositionalEncoding(hparams.ninp, hparams.dropout)
#         encoder_layers = nn.TransformerEncoderLayer(hparams.ninp, hparams.nhead, hparams.nhid, hparams.dropout)
#         decoder_layers = nn.TransformerDecoderLayer(hparams.ninp, hparams.nhead, hparams.nhid, hparams.dropout)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layers, hparams.nlayers)
#         self.transformer_decoder = nn.TransformerDecoder(decoder_layers, hparams.nlayers)
#         self.lin = nn.Linear()
        self.transformer = nn.Transformer(nhead=hparams.nhead, num_encoder_layers=hparams.nlayers)
        self.criterion = nn.MSELoss()
        
#     def init_weights(self):
#         initrange = 0.1
#         self.src_embedding.weight.data.uniform_(-initrange, initrange)
#         self.decoder.bias.data.zero_()
#         self.decoder.weight.data.uniform_(-initrange, initrange)
        
    @staticmethod
    def add_model_specifi_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--bsz', default=20, type=int, help='batch_size', )
        parser.add_argument('--src-len', default=90, type=int, help='source length')
        parser.add_argument('--tgt-len', default=28, type=int, help='target length')
        parser.add_argument('--ninp', default=320, type=int, help='expected features in the input')
        parser.add_argument('--nhead', default=4, type=int, help='number of attention heads')
        parser.add_argument('--nhid', default=1024, type=int, help='dimesion of feed-forward network model')
        parser.add_argument('--nlayers', default=2, type=int, help='number of encoder layers')
        parser.add_argument('--dropout', default=0.2, type=float, help='dropout')
        return parser
    
#     def _generate_square_subsequent_mask(self, sz):
#         # populate the lower triangle with True and rest with False
#         return torch.tril(torch.ones(sz, sz)) == 1.0
    
    def prepare_data(self):
        print('reading data', flush=True)
        self.x = gx
        self.y = gy

        
        with open('emb_sz.pkl','rb') as f:
            emb_szs = pickle.load(f)
        print(f'emb_szs - {emb_szs}')
                    
        self.embs = nn.ModuleList([nn.Embedding(e[0],e[1]) for e in emb_szs])

        
    def train_dataloader(self):
        train_ds = M5DataSet(self.x, self.y, self.hparams.src_len, self.hparams.tgt_len, self.hparams.bsz)        
        train_dl = DataLoader(train_ds, batch_size=1, shuffle=True, pin_memory=True)
        return train_dl

    def forward(self, x_src, y_src, x_tgt):
        offset = 0
        embs_t = []
        for idx, col in enumerate(self.x_cat_cols):
             embs_t.append(self.embs[idx](x_src[:,:,idx].type(torch.long)))
        x_src_cat = torch.cat(embs_t, dim=2)
        x_src_cont = x_src[:,:,len(x_cat_cols):]
        x_src = torch.cat([x_src_cat, x_src_cont.type(x_src_cat.dtype), y_src.unsqueeze(2)], dim=2)
        print(f'x_src_cat - {x_src_cat.shape}')
        print(f'x_src_cnt - {x_src_cont.shape}')
        print(f'x_src - {x_src.shape}')
        print(f'y.shape - {y_src.shape}')
#         for x in [x_src, x_tgt]:
#             for idx, col in enumerate(self.x_cat_cols):
#     #             print(f'emb for col - {idx}, {col}, {self.embs[idx]} inserting at {offset}')
#                 x = insert_embedding(x, dim=2, index=offset, emb=self.embs[idx])
#                 offset += self.embs[idx].weight.size(1)
            
        print(f'shape after x_src: {x_src.shape}, x_tgt.shape: {x_tgt.shape}')
            
        #pad to adjust the feature dimension
        dim3_shortfall = self.hparams.ninp - x.size(2)
        assert dim3_shortfall >= 0
        pad = nn.ConstantPad1d(padding=(0,1),value=0)
        x = pad(x)
        
        x = self.pos_encoder(x)
        print('shape after pos encoder - ', x.size())
        x = self.transformer(x)
        print('shape after transformer - ', x.size())
        
#         if self.src_mask is None or self.src_mask.size(0) != len(src):
#             device = src.device
#             mask = self._generate_square_subsequent_mask(len(src)).to(device)
#             self.src_mask = mask
            
#         src = self.src_embedding(src) * math.sqrt(self.hparams.ninp)
#         src = self.pos_encoder(src)
#         output = self.transformer_decoder(self.transformer_encoder(src))

        return x
    
    def training_step(self, batch, batch_idx):
        x_src, x_tgt, y_src, y_tgt = batch
        x_src = x_src.squeeze(0)
        x_tgt = x_tgt.squeeze(0)
        y_src = y_src.squeeze(0)
        y_tgt = y_tgt.squeeze(0)
        
        print(f'x_src.shape - {tuple(x_src.shape)} \t x_tgt.shape - {tuple(x_tgt.shape)} \t y_src.shape - {y_src.shape} \t y_tgt.shape - {y_tgt.shape}')
        yhat = self(x_src, y_src, x_tgt)
        print(f'yhat.shape - {yhat.shape}')
        loss = self.criterion(yhat.reshape(-1, self.ntoken), y)
        return {'loss': loss}

In [53]:
src_len = 90
tgt_len = 28
bsz = 200
# model = SalesModel(hparams)

parser = ArgumentParser()
parser = SalesModel.add_model_specifi_args(parser)
hparams = parser.parse_args('--bsz 200 --ninp 320'.split())

model = SalesModel(hparams, x_cat_cols, x_cont_cols)
trainer = Trainer()
trainer.fit(model)

reading data
emb_szs - [(3049, 143), (7, 5), (3, 3), (10, 6), (3, 3), (1913, 110), (7, 5), (7, 5), (12, 6), (6, 4), (31, 11), (5, 4), (5, 4), (3, 3), (2, 2), (2, 2), (2, 2)]


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …

x_src.shape - torch.Size([90, 200, 18]) 	 x_tgt.shape - torch.Size([28, 200, 18]) 	 y_src.shape - torch.Size([90, 200]) 	 y_tgt.shape - torch.Size([28, 200])


RuntimeError: Expected object of scalar type float but got scalar type long int for sequence element 2.

In [None]:
sales2[sales2.sell_price > 0]['sell_price']

In [33]:
m = nn.ConstantPad1d(padding=(0,1),value=0)
t = torch.ones((1,3,3))
m(t)

tensor([[[1., 1., 1., 0.],
         [1., 1., 1., 0.],
         [1., 1., 1., 0.]]])

In [64]:
tuple(gx.size())

(1913, 30490, 18)

In [59]:
dir(gx.size())

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'count',
 'index',
 'numel']