#Import libraries

In [11]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import warnings

#Import Datasets

In [12]:
#Get the data (session_id, item_id, date, Datetime, Timestamp)
url01 = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dataset_filtered/train_session01_seq.csv'

In [105]:
dataset01 = pd.read_csv(url01, index_col = 0, parse_dates=["date"])
dataset01 = dataset01.dropna()
dataset01 = dataset01.reset_index()
#fraction
dataset = dataset01.sample(frac=1)

In [106]:
dataset.head(5)

Unnamed: 0,session_id,item_id,date,timestamp,month,weekYear,season,duration
437007,3753251.0,10983.0,2020-10-29 13:44:58.748,1603979000.0,10.0,44.0,3.0,1.0
93107,800032.0,20845.0,2020-11-20 11:55:55.768,1605873000.0,11.0,47.0,3.0,1.0
323143,2768965.0,13599.0,2020-11-27 20:57:02.528,1606511000.0,11.0,48.0,3.0,1.0
138176,1190203.0,25173.0,2020-11-28 18:09:21.591,1606587000.0,11.0,48.0,3.0,1.0
397519,3413813.0,4483.0,2021-01-07 14:10:05.707,1610029000.0,1.0,1.0,4.0,1.0


In [107]:
dataset.shape

(516944, 8)

In [108]:
# Filter item less than 5 interactions
df_item_count = dataset[['item_id', 'session_id']].groupby('item_id').count().sort_values(by = 'session_id', ascending = False)
df_item_count.columns = ['CountItemId']
df_item_count_5 = df_item_count[df_item_count['CountItemId'] < 5]
# remove item_id less than 5 interactions
dataset = dataset[~dataset['item_id'].isin(list(df_item_count_5.index))]

In [109]:
dataset.shape

(507539, 8)

In [110]:
# Filter session less than 2 iteractions
df_session_count = dataset[['item_id', 'session_id']].groupby('session_id').count().sort_values(by = 'item_id', ascending = False)
df_session_count.columns = ['items_in_session']
df_session_count_2 = df_session_count[df_session_count['items_in_session'] < 2]
# remove session_id less than 2 interactions
dataset = dataset[~dataset['session_id'].isin(list(df_session_count_2.index))]

In [111]:
dataset.shape

(253678, 8)

In [112]:
#Train split data test
random_selection = np.random.rand(len(dataset.index)) <= 0.85
train_data = dataset[random_selection]
test_data = dataset[~random_selection]

In [113]:
#Unique item_id in dataset
unique_item_id = dataset['item_id'].unique()

#Config

In [202]:
config = {
    'data_path' : '...',
    'num_items': len(unique_item_id),
    'max_len' : 20,
    'hidden_units' : 256, # Embedding size
    'num_heads' : 2, # Multi-head layer
    'num_layers': 3, # block Transformers
    'dropout_rate' : 0.1,
    'lr' : 0.001,
    'batch_size' : 128,
    'num_epochs' : 5,
    'num_workers' : 2,
    'mask_prob' : 0.3, # for cloze task
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


#Preprocessing data

- Output: session_train: [0:-1] in the session. session_valid: [-1] in the session for valid last item


In [115]:
class MakeSequenceDataSet():
    """
    SequenceData
    """
    def __init__(self, config):
        #self.df = pd.read_csv(os.path.join(config['data_path'], 'rating.csv'))
        self.df = dataset

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item_id')
        self.session_encoder, self.session_decoder = self.generate_encoder_decoder('session_id')
        self.num_item, self.num_session = len(self.item_encoder), len(self.session_encoder)

        self.df['item_idx'] = self.df['item_id'].apply(lambda x : self.item_encoder[x] + 1)
        self.df['session_idx'] = self.df['session_id'].apply(lambda x : self.session_encoder[x])
        self.df = self.df.sort_values(['session_idx', 'timestamp']) # Sortbytime
        self.session_train, self.session_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        create encoder, decoder

        Args:
            col (str): named of columns to be created
        Returns:
            dict: user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder

    def generate_sequence_data(self) -> dict:
        """
        sequence_data create

        Returns:
            dict: train session sequence / valid user sequence
        """
        session_dict = defaultdict(list)
        session_train = {}
        session_valid = {}
        group_df = self.df.groupby('session_idx')
        print(group_df)
        for session, item in group_df:
            session_dict[session].extend(item['item_idx'].tolist())

        for session in session_dict:
            session_train[session] = session_dict[session][:-1]
            session_valid[session] = [session_dict[session][-1]] # Predict the last item

        return session_train, session_valid

    def get_train_valid_data(self):
        return self.session_train, self.session_valid

recheck MakeSequenceDataset

In [116]:
group_dataset = dataset.groupby('session_id')
group_dataset
MakeSequenceDataSet(config)
session_train, session_valid = MakeSequenceDataSet(config).get_train_valid_data()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d5a91adf160>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d5a6d1afb50>


In [117]:
#session_valid

In [118]:
#session_train

##BERT dataset input for model function:


In [119]:
class BERTRecDataSet(Dataset):
    def __init__(self, session_train, max_len, num_session, num_item, mask_prob):
        self.session_train = session_train
        self.max_len = max_len
        self.num_session = num_session
        self.num_item = num_item
        self.mask_prob = mask_prob
        self._all_items = set([i for i in range(1, self.num_item + 1)])

    def __len__(self):
        # number of session
        return self.num_session

    def __getitem__(self, user):

        session_seq = self.session_train[user]
        tokens = []
        labels = []
        for s in session_seq[-self.max_len:]:
            prob = np.random.random()
            if prob < self.mask_prob:
                prob /= self.mask_prob
                if prob < 0.8:
                    # masking
                    tokens.append(self.num_item + 1)  # mask_index: num_item + 1, 0: pad, 1~num_item: item index
                elif prob < 0.9:
                    # noise
                    tokens.extend(self.random_neg_sampling(rated_item = session_seq, num_item_sample = 1))  # item random sampling
                else:
                    tokens.append(s)
                labels.append(s) # used for learning O
            else:
                tokens.append(s)
                labels.append(0) # for learning X

        mask_len = self.max_len - len(tokens)
        tokens = [0] * mask_len + tokens
        labels = [0] * mask_len + labels

        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def random_neg_sampling(self, rated_item : list, num_item_sample : int):
        nge_samples = random.sample(list(self._all_items - set(rated_item)), num_item_sample)
        return nge_samples

In [120]:
num_session1 = MakeSequenceDataSet(config).num_session

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d5b8f295750>


In [121]:
print(num_session1)

99351


In [122]:
num_item1 = MakeSequenceDataSet(config).num_item

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d5a821105e0>


In [123]:
print(num_item1)

14575


In [124]:
BERTRecDataSet(session_train=session_train, max_len=config['max_len'], num_session=num_session1, num_item=num_item1, mask_prob=config['mask_prob'])

<__main__.BERTRecDataSet at 0x7d5a6d113460>

#Data Loader

1. Dataset: pre-loaded datasets own data -> store samples and corresponding labels.
2. DataLoader: input dataset split corresponding batch size

#Model Architecture

BERT model includes:
1. **BERT embedding:**
* positional encoding + item embedding
2. **Transfomer Encoder Layer (Trm)**
* LN(x + Dropout(Sublayer(x)).
where LN - Layer Normalization
3. **Stacking of Transformers**
* H(l) = Trm(H(l-1));
where H is a Hidden representation
* Trm(H(l-1)) = LN(A(l-1) + Dropout(PFFN(A(l-1)))
where A: Attention, PFFN: Point Wise Feed Forward
* A(l-1) = LN(H(l-1)+Dropout(MH(H(l-1)))
where MH: Multi-head Attention

* Input: sequence of session
* Output: softmax - prediction of [mask] item



In [125]:
class PositionalEmbedding(nn.Module):
  """
  Input: batch_size, seq_len/max_len
  Output: batch_size, seq_len/max_len, d_model
  """
  def __init__(self, max_len, d_model):
        super().__init__()

        # Compute the positional encodings once in log space.
        self.pe = nn.Embedding(max_len, d_model)

  def forward(self, x):
        batch_size = x.size(0)
        return self.pe.weight.unsqueeze(0).repeat(batch_size, 1, 1)

class TokenEmbedding(nn.Embedding):
  """
  Input: batch_size, seq_len
  Output: batch_size, seq_len, embed_size
  """
  def __init__(self, vocab_size, embed_size=512):
        super().__init__(vocab_size, embed_size, padding_idx=0)

In [126]:
class BERTEmbedding(nn.Module):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)

        sum of all these features are output of BERTEmbedding
        input: batch_size, seq_len/max_len
        output: batch_size, seq_len/max_len, embed_size
    """

    def __init__(self, vocab_size, embed_size, max_len, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(max_len=max_len, d_model=embed_size)
        # self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence):
        x = self.token(sequence) + self.position(sequence)
        return self.dropout(x)

In [127]:
class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) \
                 / math.sqrt(query.size(-1))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

In [128]:
class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.
    Input: batch_size, seq_len, d_model
    Output: batch_size, seq_len, d_model
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.output_linear(x)

In [129]:
class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU
    """

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

In [130]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))

In [131]:
class LayerNorm(nn.Module):
    "Construct a layernorm module"

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [132]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [133]:
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: head sizes of multi-head attention
        :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)

In [134]:
class BERT(nn.Module):
    """
    Input: batch_size, seq_len
    Output: batch_size, num_items+1
    """
    def __init__(self, bert_max_len, num_items, bert_num_blocks, bert_num_heads,
                 bert_hidden_units, bert_dropout):
        super().__init__()

        # fix_random_seed_as(args.model_init_seed)
        # self.init_weights()

        max_len = bert_max_len
        num_items = num_items
        n_layers = bert_num_blocks
        heads = bert_num_heads
        vocab_size = num_items + 2
        hidden = bert_hidden_units
        self.hidden = hidden
        dropout = bert_dropout

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=self.hidden, max_len=max_len, dropout=dropout)

        # multi-layers transformer blocks, deep network
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, heads, hidden * 4, dropout) for _ in range(n_layers)])
        self.out = nn.Linear(hidden, num_items + 1)

    def forward(self, x):
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x)

        # running over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)

        x = self.out(x)
        return x

    def init_weights(self):
        pass

In [135]:
a = torch.tensor([[1, 2, 3], [4, 5, 6]])

In [136]:
net = BERT(bert_max_len = 3, num_items = 10, bert_num_blocks= 6, bert_num_heads = 8, bert_hidden_units = 8, bert_dropout = 0.4)

In [137]:
print(a.shape)

torch.Size([2, 3])


In [138]:
net(a)

tensor([[[-1.2090e+01,  5.9765e+00,  2.0671e+01, -1.8261e+01, -1.0214e+01,
          -1.0174e+01,  2.2575e+01,  1.7352e+01, -1.7450e+01, -1.9071e+01,
           1.8457e+01],
         [-8.5412e-01, -1.9900e+00, -1.7280e+00, -6.8813e-01, -1.9128e-01,
          -3.7145e-01, -2.1609e+00, -1.2267e+00, -2.7255e+00,  3.8858e-01,
           2.1636e+00],
         [ 1.2305e-01, -5.6109e-01, -2.3036e-01,  9.4978e-02, -3.7796e-01,
          -4.5657e-01, -1.5152e-01, -6.1546e-01, -2.8316e-01,  2.3488e-01,
          -1.8949e-01]],

        [[ 5.0190e-02, -5.5742e-01, -6.7704e-01,  8.3933e-01,  4.7182e-01,
           2.1683e-01, -8.8674e-01, -5.7699e-01, -3.0267e-01,  9.8549e-01,
          -1.9499e-02],
         [ 7.8952e-01, -2.0125e+00, -4.8079e-01,  7.9537e-01,  7.6216e-01,
           4.9253e-01,  4.0633e-01, -1.4233e-01, -2.1257e+00,  2.1292e+00,
           8.4430e-01],
         [-7.1577e-01, -1.4338e+00,  6.1999e-02, -1.0986e+00, -4.8529e-01,
          -1.3410e+00, -2.7364e-01,  6.1781e-01, -2.5

In [139]:
BERT_out = net(a)

In [140]:
print(BERT_out.shape)

torch.Size([2, 3, 11])


In [141]:
print(net)

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(12, 8, padding_idx=0)
    (position): PositionalEmbedding(
      (pe): Embedding(3, 8)
    )
    (dropout): Dropout(p=0.4, inplace=False)
  )
  (transformer_blocks): ModuleList(
    (0-5): 6 x TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0-2): 3 x Linear(in_features=8, out_features=8, bias=True)
        )
        (output_linear): Linear(in_features=8, out_features=8, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0.4, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=8, out_features=32, bias=True)
        (w_2): Linear(in_features=32, out_features=8, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
        (activation): GELU()
      )
      (input_sublayer): SublayerConnection(
        (norm): LayerNorm()
        (dropout): Dropout(p=0.4, inplace=False)
      )
      (outp

#Training

* Train data: [MASK] with portion (with random) in the session -> Validation
* Loss function: Negative log-likelihood of the masked targets.

Training:
1. Preprocess: Make Sequence Data -> BERT4REC data input -> Data Loader
2. 1 epoch: model.train() and log the total loss
3. Loop n epoch:
- Train 1 epoch
- scheduler?
- Metrics to validate
- Saved the best model

In [203]:
#Train 1 epoch
def train(model, criterion, optimizer, data_loader):
    model.train()
    loss_val = 0
    for seq, labels in tqdm(data_loader):
        seq, labels = seq.to(device), labels.to(device)
        logits = model(seq) # (bs, t, vocab)
        logits = logits.view(-1, logits.size(-1)) # (bs * t, vocab)
        labels = labels.view(-1) # (bs * t)

        #1. Zero the gradients for every batch
        optimizer.zero_grad()

        #2.Compute the loss
        loss = criterion(logits, labels)

        loss_val += loss.item()

        #3.Use loss to produce gradients
        loss.backward()

        #4.Use optimizer to take gradient step
        optimizer.step()

    loss_val /= len(data_loader)

    return loss_val

In [204]:
model = BERT(
    num_items = config['num_items'],
    bert_hidden_units = config['hidden_units'],
    bert_num_heads = config['num_heads'],
    bert_num_blocks = config['num_layers'],
    bert_max_len = config['max_len'],
    bert_dropout = config['dropout_rate'],
    ).to(device)

#criterion = nn.CrossEntropyLoss(ignore_index=0)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

In [205]:
bert4rec_dataset = BERTRecDataSet(session_train=session_train,
                                  max_len=config['max_len'],
                                  num_session= MakeSequenceDataSet(config).num_session,
                                  num_item=config['num_items'],
                                  mask_prob=config['mask_prob'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7d5a6cf373d0>


In [206]:
data_loader = DataLoader(
    bert4rec_dataset,
    batch_size = config['batch_size'],
    shuffle = True,
    pin_memory = True,
    num_workers = config['num_workers'],
    )

In [None]:
#Train over epoch
loss_list = []
recall_list = []
mrr_list = []
for epoch in tqdm(range(1, config['num_epochs'] + 1)):
    train_loss = train(
        model = model,
        criterion = criterion,
        optimizer = optimizer,
        data_loader = data_loader)

    print(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}')

  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/777 [00:00<?, ?it/s][A
  0%|          | 1/777 [00:00<01:57,  6.58it/s][A
  0%|          | 3/777 [00:00<01:01, 12.49it/s][A
  1%|          | 6/777 [00:00<00:44, 17.17it/s][A
  1%|          | 9/777 [00:00<00:37, 20.26it/s][A
  2%|▏         | 12/777 [00:00<00:34, 22.02it/s][A
  2%|▏         | 15/777 [00:00<00:33, 23.08it/s][A
  2%|▏         | 18/777 [00:00<00:31, 23.80it/s][A
  3%|▎         | 21/777 [00:00<00:31, 24.26it/s][A
  3%|▎         | 24/777 [00:01<00:30, 24.51it/s][A
  3%|▎         | 27/777 [00:01<00:30, 24.47it/s][A
  4%|▍         | 30/777 [00:01<00:30, 24.35it/s][A
  4%|▍         | 33/777 [00:01<00:30, 24.41it/s][A
  5%|▍         | 36/777 [00:01<00:30, 24.65it/s][A
  5%|▌         | 39/777 [00:01<00:29, 24.78it/s][A
  5%|▌         | 42/777 [00:01<00:29, 24.79it/s][A
  6%|▌         | 45/777 [00:01<00:29, 24.97it/s][A
  6%|▌         | 48/777 [00:02<00:29, 25.11it/s][A
  7%|▋         | 51/777 [00:02<00:28, 2

Epoch:   1| Train loss: -820942361.38505



  0%|          | 0/777 [00:00<?, ?it/s][A
  0%|          | 1/777 [00:00<01:52,  6.91it/s][A
  1%|          | 4/777 [00:00<00:44, 17.39it/s][A
  1%|          | 7/777 [00:00<00:36, 20.84it/s][A
  1%|▏         | 10/777 [00:00<00:33, 22.66it/s][A
  2%|▏         | 13/777 [00:00<00:32, 23.62it/s][A
  2%|▏         | 16/777 [00:00<00:31, 24.36it/s][A
  2%|▏         | 19/777 [00:00<00:30, 24.83it/s][A
  3%|▎         | 22/777 [00:00<00:30, 25.04it/s][A
  3%|▎         | 25/777 [00:01<00:31, 24.16it/s][A
  4%|▎         | 28/777 [00:01<00:30, 24.57it/s][A
  4%|▍         | 31/777 [00:01<00:29, 24.93it/s][A
  4%|▍         | 34/777 [00:01<00:29, 25.17it/s][A
  5%|▍         | 37/777 [00:01<00:29, 25.26it/s][A
  5%|▌         | 40/777 [00:01<00:29, 25.38it/s][A
  6%|▌         | 43/777 [00:01<00:28, 25.46it/s][A
  6%|▌         | 46/777 [00:01<00:28, 25.47it/s][A
  6%|▋         | 49/777 [00:02<00:28, 25.18it/s][A
  7%|▋         | 52/777 [00:02<00:28, 25.13it/s][A
  7%|▋         | 55/777

Epoch:   2| Train loss: -21869574967.02188



  0%|          | 0/777 [00:00<?, ?it/s][A
  0%|          | 1/777 [00:00<01:55,  6.70it/s][A
  1%|          | 4/777 [00:00<00:45, 17.12it/s][A
  1%|          | 7/777 [00:00<00:36, 20.83it/s][A
  1%|▏         | 10/777 [00:00<00:33, 22.84it/s][A
  2%|▏         | 13/777 [00:00<00:31, 24.04it/s][A
  2%|▏         | 16/777 [00:00<00:30, 24.80it/s][A
  2%|▏         | 19/777 [00:00<00:30, 25.08it/s][A
  3%|▎         | 22/777 [00:00<00:29, 25.37it/s][A
  3%|▎         | 25/777 [00:01<00:29, 25.13it/s][A
  4%|▎         | 28/777 [00:01<00:29, 25.46it/s][A
  4%|▍         | 31/777 [00:01<00:29, 25.63it/s][A
  4%|▍         | 34/777 [00:01<00:28, 25.71it/s][A
  5%|▍         | 37/777 [00:01<00:28, 25.88it/s][A
  5%|▌         | 40/777 [00:01<00:28, 25.81it/s][A
  6%|▌         | 43/777 [00:01<00:28, 25.80it/s][A
  6%|▌         | 46/777 [00:01<00:28, 25.79it/s][A
  6%|▋         | 49/777 [00:02<00:28, 25.78it/s][A
  7%|▋         | 52/777 [00:02<00:28, 25.64it/s][A
  7%|▋         | 55/777

Epoch:   3| Train loss: -120864222499.25354



  0%|          | 0/777 [00:00<?, ?it/s][A
  0%|          | 1/777 [00:00<01:49,  7.10it/s][A
  1%|          | 4/777 [00:00<00:43, 17.73it/s][A
  1%|          | 7/777 [00:00<00:36, 21.19it/s][A
  1%|▏         | 10/777 [00:00<00:33, 22.86it/s][A
  2%|▏         | 13/777 [00:00<00:31, 23.94it/s][A
  2%|▏         | 16/777 [00:00<00:30, 24.59it/s][A
  2%|▏         | 19/777 [00:00<00:30, 24.90it/s][A
  3%|▎         | 22/777 [00:00<00:30, 25.07it/s][A
  3%|▎         | 25/777 [00:01<00:30, 24.62it/s][A
  4%|▎         | 28/777 [00:01<00:30, 24.83it/s][A
  4%|▍         | 31/777 [00:01<00:29, 25.03it/s][A
  4%|▍         | 34/777 [00:01<00:29, 25.28it/s][A
  5%|▍         | 37/777 [00:01<00:29, 25.49it/s][A
  5%|▌         | 40/777 [00:01<00:28, 25.50it/s][A
  6%|▌         | 43/777 [00:01<00:28, 25.51it/s][A
  6%|▌         | 46/777 [00:01<00:28, 25.58it/s][A
  6%|▋         | 49/777 [00:02<00:28, 25.47it/s][A
  7%|▋         | 52/777 [00:02<00:28, 25.35it/s][A
  7%|▋         | 55/777

Epoch:   4| Train loss: -363348082709.08624



  0%|          | 0/777 [00:00<?, ?it/s][A
  0%|          | 1/777 [00:00<01:49,  7.07it/s][A
  1%|          | 4/777 [00:00<00:43, 17.64it/s][A
  1%|          | 7/777 [00:00<00:36, 21.15it/s][A
  1%|▏         | 10/777 [00:00<00:33, 23.07it/s][A
  2%|▏         | 13/777 [00:00<00:31, 23.95it/s][A
  2%|▏         | 16/777 [00:00<00:31, 24.45it/s][A
  2%|▏         | 19/777 [00:00<00:30, 24.83it/s][A
  3%|▎         | 22/777 [00:00<00:29, 25.18it/s][A
  3%|▎         | 25/777 [00:01<00:30, 24.64it/s][A
  4%|▎         | 28/777 [00:01<00:30, 24.94it/s][A
  4%|▍         | 31/777 [00:01<00:29, 25.23it/s][A
  4%|▍         | 34/777 [00:01<00:29, 25.30it/s][A
  5%|▍         | 37/777 [00:01<00:29, 25.36it/s][A
  5%|▌         | 40/777 [00:01<00:28, 25.47it/s][A
  6%|▌         | 43/777 [00:01<00:28, 25.62it/s][A
  6%|▌         | 46/777 [00:01<00:28, 25.60it/s][A
  6%|▋         | 49/777 [00:02<00:28, 25.33it/s][A
  7%|▋         | 52/777 [00:02<00:28, 25.20it/s][A
  7%|▋         | 55/777

#Evaluation

Test data:
* [MASK] the last item in the session for prediction
* model.eval()
* get the top-k item in prediction and evaluate by evaluation metrics MRR@20 and Recall@20

In [None]:
make_sequence_dataset = MakeSequenceDataSet(config)

In [None]:
def evaluate(model, session_train, session_valid, max_len, data_loader, bert4rec_dataset, make_sequence_dataset):
    model.eval()

    NDCG = 0.0 # NDCG@20
    HIT = 0.0 # HIT@20
    MRR = 0.0 # MRR@20
    RECALL = 0.0 # Recall@20 equivalent with HIT because just one last item is predicted.

    num_item_sample = 100

    sessions = [session for session in range(make_sequence_dataset.num_session)]

    for session in tqdm(sessions):
        seq = (session_train[session] + [make_sequence_dataset.num_item + 1])[-max_len:] # mask last token
        padding_len = max_len - len(seq)
        seq = [0] * padding_len + seq
        rated = session_train[session] + session_valid[session]
        items = session_valid[session] + bert4rec_dataset.random_neg_sampling(rated_item = rated, num_item_sample = num_item_sample)

        with torch.no_grad():
            seq = torch.LongTensor([seq]).to(device)
            predictions = -model(seq)
            #print(predictions.shape)
            predictions = predictions[0][-1][items] # sampling
            #print(predictions.shape, 'prediction sampling')
            rank = predictions.argsort().argsort()[0].item() # label
            #print(rank, 'rank')

        if rank < 20: #Top20
            NDCG += 1 / np.log2(rank + 2)
            HIT += 1
            RECALL += 1
            if rank == 0:
              MRR += 0
            else:
              MRR += 1/rank



    NDCG /= len(sessions)
    HIT /= len(sessions)
    RECALL /= len(sessions)
    MRR /= len(sessions)

    return NDCG, HIT, RECALL, MRR

In [210]:
ndcg, hit, recall, mrr = evaluate(
    model = model,
    session_train = session_train,
    session_valid = session_valid,
    max_len = config['max_len'],
    data_loader = None,
    make_sequence_dataset = make_sequence_dataset,
    bert4rec_dataset = bert4rec_dataset
    )

100%|██████████| 99351/99351 [07:06<00:00, 232.71it/s]


In [211]:
print(ndcg)
print(hit)
print(mrr)
print(recall)

0.21846051220966156
0.47336212015983736
0.1052904848249313
0.47336212015983736
