#Import libraries

In [1]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

from collections import Counter

import pandas as pd
import numpy as np

import time



#Import Datasets

In [2]:
#Get the data (session_id, item_id, date, Datetime, Timestamp)
url01 = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dataset_filtered/train_session01_seq.csv'

In [3]:
dataset01 = pd.read_csv(url01, index_col = 0, parse_dates=["date"])
dataset01 = dataset01.dropna()
dataset01 = dataset01.reset_index()
#fraction
dataset = dataset01.sample(frac=1)
dataset.shape

(516944, 8)

In [4]:
# Filter item less than 5 interactions
df_item_count = dataset[['item_id', 'session_id']].groupby('item_id').count().sort_values(by = 'session_id', ascending = False)
df_item_count.columns = ['CountItemId']
df_item_count_5 = df_item_count[df_item_count['CountItemId'] < 5]
# remove item_id less than 5 interactions
dataset = dataset[~dataset['item_id'].isin(list(df_item_count_5.index))]

In [5]:
# Filter session less than 2 iteractions
df_session_count = dataset[['item_id', 'session_id']].groupby('session_id').count().sort_values(by = 'item_id', ascending = False)
df_session_count.columns = ['items_in_session']
df_session_count_2 = df_session_count[df_session_count['items_in_session'] < 2]
# remove session_id less than 2 interactions
dataset = dataset[~dataset['session_id'].isin(list(df_session_count_2.index))]

In [6]:
dataset.shape

(253678, 8)

In [7]:
#Unique item_id in dataset
unique_item_id = dataset['item_id'].unique()

In [8]:
# Preventing ids to be written as integer or float data type
dataset["session_id"] = dataset["session_id"].apply(lambda x: f"session_{x}")

dataset["item_id"] = dataset["item_id"].apply(lambda x: f"item_{x}")

#Create Vocabulary

In [9]:
#list of unique item ids
item_ids = dataset.item_id.unique()

#Counter is used to feed items to item_vocab
item_counter = Counter(item_ids)

#Genarting vocabulary
item_vocab = vocab(item_counter, specials=['<unk>'])

#For indexing input ids
item_vocab_stoi = item_vocab.get_stoi()

#Session_id vocab
session_ids = dataset.session_id.unique()
session_counter = Counter(session_ids)
session_vocab = vocab(session_counter, specials=['<unk>'])
session_vocab_stoi = session_vocab.get_stoi()

#Config

In [31]:
config = {
    'num_items': len(item_vocab), #num_items + 1
    'num_sessions': len(session_vocab), #13542 +1
    'embedding_dim': 128,
    'hidden_d': 128,
    'trm_layers': 2,
    'n_head': 2,
    'dropout': 0.2,
    'lr': 0.1,
    'batch_size': 256,
    'epochs': 5,
    'label_smoothing': 0.1,
    'topk': 20
}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


#Preprocessing Data

In [11]:
#Group by session_id after sort_values by timestamp
sessions_groups = dataset.sort_values(by=["timestamp"]).groupby("session_id")
sessions_train = pd.DataFrame(data = {
        "session_id": list(sessions_groups.groups.keys()),
        "month" : list(sessions_groups.month.unique().explode()),
        "weekYear" : list(sessions_groups.weekYear.unique().explode()),
        "season" : list(sessions_groups.season.unique().explode()),
        "item_ids": list(sessions_groups.item_id.apply(list)),
        "durations": list(sessions_groups.duration.apply(list)),
        "timestamps": list(sessions_groups.timestamp.apply(list)),
    })

In [12]:
sequence_length = 6
step = 2
def create_sequences(values, sequence, step):
  start_idx = 0
  sec_list = []
  #Handle case < sequence:
  if len(values) < sequence:
    values = values * 2
  #Handle case >= sequence:
  while True:
    end_idx = start_idx + sequence
    sec = values[start_idx:end_idx]
    start_idx += step
    if end_idx >= len(values):
      sec = values[-sequence:]
      sec_list.append(sec)
      break
    sec_list.append(sec)
  return sec_list

In [13]:
sessions_train["item_ids"] = sessions_train["item_ids"].apply(
    lambda values: create_sequences(
        values,sequence_length, step))

sessions_train["durations"] = sessions_train["durations"].apply(
    lambda values: create_sequences(
        values,sequence_length, step))

sessions_train = sessions_train.drop(columns = ["timestamps"])

sessions_train = sessions_train.explode(column=["item_ids", "durations"]).reset_index(drop=True)

In [14]:
#drop weekYear and season
#convert type column month to string
sessions_train["month"] = sessions_train["month"].astype(str)
sessions_train = sessions_train.drop(columns = ["weekYear", "season"])
sessions_train.head(5)

Unnamed: 0,session_id,month,item_ids,durations
0,session_1000066.0,10.0,"[item_22239.0, item_10408.0, item_15687.0, ite...","[1.0, 1.0, 34975.0, 1.0, 1.0, 34975.0]"
1,session_1000097.0,12.0,"[item_21532.0, item_17992.0, item_17151.0, ite...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
2,session_1000109.0,5.0,"[item_23774.0, item_15429.0, item_23774.0, ite...","[1.0, 1.0, 1.0, 1.0]"
3,session_1000118.0,4.0,"[item_23272.0, item_18125.0, item_18326.0, ite...","[1.0, 1.0, 120.0, 1.0, 1.0, 120.0]"
4,session_1000220.0,9.0,"[item_12816.0, item_18743.0, item_12816.0, ite...","[1.0, 1.0, 1.0, 1.0]"


##Train split data test

In [15]:
#Train split data test
random_selection = np.random.rand(len(sessions_train.index)) <= 0.85
train_data = sessions_train[random_selection]
test_data = sessions_train[~random_selection]

In [16]:
train_data_raw = train_data[["session_id", "item_ids"]].values
test_data_raw = test_data[["session_id", "item_ids"]].values

In [17]:
train_data_raw

array([['session_1000066.0',
        list(['item_22239.0', 'item_10408.0', 'item_15687.0', 'item_22239.0', 'item_10408.0', 'item_15687.0'])],
       ['session_1000097.0',
        list(['item_21532.0', 'item_17992.0', 'item_17151.0', 'item_21532.0', 'item_17992.0', 'item_17151.0'])],
       ['session_1000109.0',
        list(['item_23774.0', 'item_15429.0', 'item_23774.0', 'item_15429.0'])],
       ...,
       ['session_999951.0',
        list(['item_20087.0', 'item_23689.0', 'item_3499.0', 'item_20087.0', 'item_23689.0', 'item_3499.0'])],
       ['session_999990.0',
        list(['item_18764.0', 'item_26067.0', 'item_6548.0', 'item_13620.0', 'item_18764.0', 'item_26067.0'])],
       ['session_999990.0',
        list(['item_6548.0', 'item_13620.0', 'item_18764.0', 'item_26067.0', 'item_6548.0', 'item_13620.0'])]],
      dtype=object)

#Data Input for model and Data Loader

In [18]:
# Pytorch Dataset for session interactions
class ItemSeqDataset(Dataset):
    # Initialize dataset
    def __init__(self, data, item_vocab_stoi, session_vocab_stoi):
        self.data = data

        self.item_vocab_stoi = item_vocab_stoi
        self.session_vocab_stoi = session_vocab_stoi


    def __len__(self):
        return len(self.data)

    # Fetch data from the dataset
    def __getitem__(self, idx):
        session, item_sequence = self.data[idx]
        # Directly index into the vocabularies
        item_data = [self.item_vocab_stoi[item] for item in item_sequence]
        session_data = self.session_vocab_stoi[session]
        return torch.tensor(item_data), torch.tensor(session_data)


# Collate function and padding
def collate_batch(batch):
    item_list = [item[0] for item in batch]
    session_list = [item[1] for item in batch]
    return pad_sequence(item_list, padding_value=item_vocab_stoi['<unk>'], batch_first=True), torch.stack(session_list)

In [19]:
BATCH_SIZE = config['batch_size']
# Create instances of your Dataset for each set
train_dataset = ItemSeqDataset(train_data_raw, item_vocab_stoi, session_vocab_stoi)
val_dataset = ItemSeqDataset(test_data_raw, item_vocab_stoi, session_vocab_stoi)
# Create DataLoaders
train_iter = DataLoader(train_dataset, batch_size=config['batch_size'],
                        shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=config['batch_size'],
                      shuffle=False, collate_fn=collate_batch)

#Model Architecture

In [20]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)

        # `div_term` is used in the calculation of the sinusoidal values.
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Initializing positional encoding matrix with zeros.
        pe = torch.zeros(max_len, 1, d_model)

        # Calculating the positional encodings.
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [21]:
class TransformerModel(nn.Module):
    def __init__(self, num_items: int, num_session: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # positional encoder
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Multihead attention mechanism.
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Embedding layers
        self.item_embedding = nn.Embedding(num_items, d_model)
        self.session_embedding = nn.Embedding(num_session, d_model)

        # Defining the size of the input to the model.
        self.d_model = d_model

        # Linear layer to map the output toitem vocabulary.
        self.linear = nn.Linear(2*d_model, num_items)

        self.init_weights()

    def init_weights(self) -> None:
        # Initializing the weights of the embedding and linear layers.
        initrange = 0.1
        self.item_embedding.weight.data.uniform_(-initrange, initrange)
        self.session_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, session: Tensor, src_mask: Tensor = None) -> Tensor:
        # Embedding item ids and sessionid
        #output item_embed (batch_size, sequence length, self.d_model)
        #multiply with math.sqrt(self.d_model) -> scaling operation used in transformer models for stablize the training process
        item_embed = self.item_embedding(src) * math.sqrt(self.d_model) #item_embed: [batch_size, 3, embed_dim = 128]
        session_embed = self.session_embedding(session) * math.sqrt(self.d_model) #session_embed: [batch_size, 1, embed_dim = 128]

        # positional encoding
        item_embed = self.pos_encoder(item_embed) #item_embed pe: [batch_size, 3, embed_dim = 128]

        # generating output with final layers
        output = self.transformer_encoder(item_embed, src_mask) #output: [batch_size, 3, embed_dim = 128]

        # Expand session_embed tensor along the sequence length dimension
        session_embed = session_embed.expand(-1, output.size(1), -1) #session_embed: [batch_size, 3, embed_dim = 128]

        # Concatenate session embeddings with transformer output
        output = torch.cat((output, session_embed), dim=-1) #output: [batch_size, 3, embed_dim *2]

        output = self.linear(output) #output linear: [batch_size, 3, num_items]
        return output

#Training

In [22]:
model = TransformerModel(num_items = config['num_items'],
                         num_session = config['num_sessions'],
                         d_model = config['embedding_dim'],
                         nhead = config['n_head'],
                         d_hid = config['hidden_d'],
                         nlayers = config['trm_layers'],
                         dropout = config['dropout'])
criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
optimizer = torch.optim.SGD(model.parameters(), lr = config['lr'])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)



In [23]:
print(model)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (item_embedding): Embedding(14576, 128)
  (session_embedding): Embedding(99352, 128)
  (linear): Linear(in_features=256, out_features=14576, bias=True)
)


##Train one epoch

In [24]:
def train(model: nn.Module, train_iter, epoch) -> None:
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    for i, (item_data, session_data) in enumerate(train_iter):
        # Load item sequence and session id
        item_data, session_data = item_data.to(device), session_data.to(device) #item_data [batch_size, seq_length], session_data [batch_size]: list of session_id
        session_data = session_data.reshape(-1, 1) #[[session1], [session2]...] reshape session_data [batch_size, 1]

        # Split item sequence to inputs and targets
        inputs, targets = item_data[:, :-1], item_data[:, 1:] #if max = 4: inputs[1,2,3]. Inputs [batch_size, 3]. targets [2,3,4]. targets [batch_size, 3]
        targets_flat = targets.reshape(-1) #targets_flat [batch_size * seq_length]

        # Predict items
        output = model(inputs, session_data)
        output_flat = output.reshape(-1, config['num_items']) # output_flat: [768, 8423] [batch_size *3, num_items]

        # Backpropogation process
        loss = criterion(output_flat, targets_flat)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # Results
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

#Evaluation

##Evaluation metric

In [25]:
#Evaluation metrics

def recall_metric(indices, targets):
  """
  Inputs: indices: [batch, top_k] indices of top_k items predicted
          targets: [batch, (seq_len-1: ex: 3)] targets items in the sequences
  Output: how many targets in indices

  """
  recall = 0
  check_recall_masked = torch.isin(indices, targets)
  check_recall_count = check_recall_masked.double()
  check_hits = torch.sum(check_recall_count) #return a torch.tensor([value.])
  hits = check_hits.item() #return an integer

  n = indices.numel() #return a total number of matrix targets

  recall = hits/n
  return recall

In [26]:
c_indices = torch.tensor([[2773, 136, 14, 56,77,88, 100, 89],
                             [612, 20, 26, 22, 2026, 67, 12 ,14]])
d_targets = torch.tensor([[2, 1], [20, 52]])

In [27]:
check_recall20 = recall_metric(indices = c_indices, targets = d_targets)
print(check_recall20)

0.0625


##Evaluation

In [32]:
#Evaluation
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()
    total_loss = 0.

    RECALL = []
    with torch.no_grad():
        for i, (item_data, session_data) in enumerate(eval_data):
            # Load item sequence and session id
            item_data, session_data = item_data.to(device), session_data.to(device)
            session_data = session_data.reshape(-1, 1)
            # Split item sequence to inputs and targets
            inputs, targets = item_data[:, :-1], item_data[:, 1:]  #if max = 4: inputs[1,2,3]. Inputs [batch_size, 3]. targets [2,3,4]. targets [batch_size, 3]
            targets_flat = targets.reshape(-1)
            # Predict items
            output = model(inputs, session_data)
            output_flat = output.reshape(-1, config['num_items']) # output_flat: [768, 8423] [batch_size *3, num_items]

            # Calculate loss
            loss = criterion(output_flat, targets_flat)
            total_loss += loss.item()

            #Get predictions
            targets_flat = targets.reshape(-1) #targets_flat [batch_size*3]. Ex: [768]
            # Reshape the output_flat to get top predictions
            outputs = output_flat.reshape(output_flat.shape[0] // inputs.shape[1],
                                          inputs.shape[1],
                                          output_flat.shape[1])[: , -1, :] #outputs: [batch, num_items]
            #values, indices = outputs.topk(k + inputs.shape[1], dim=-1) #dim = -1 sort
            values, indices = outputs.topk(k = config['topk'], dim=-1)

            #Evaluation
            recall = recall_metric(indices = indices, targets = targets)
            RECALL.append(recall)

    recall_avg = sum(RECALL)/len(RECALL)
    return total_loss / (len(eval_data) - 1), recall_avg

In [33]:
best_val_loss = float('inf')
epochs = config['epochs']

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        # Training
        train(model, train_iter, epoch)

        # Evaluation
        val_loss, recall_avg = evaluate(model, val_iter)

        # Compute the perplexity of the validation loss
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time

        # Results
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}  | '
            f'recall_avg {recall_avg}')
        print('-' * 89)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

| epoch   1 lr 0.10 | ms/batch 931.25 | loss  7.97 | ppl  2884.33
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 368.33s | valid loss  8.00 | valid ppl  2995.73  | recall_avg 0.45532275800945365
-----------------------------------------------------------------------------------------
| epoch   2 lr 0.10 | ms/batch 913.61 | loss  7.89 | ppl  2679.26
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 363.85s | valid loss  7.96 | valid ppl  2858.20  | recall_avg 0.47291910123424374
-----------------------------------------------------------------------------------------
| epoch   3 lr 0.09 | ms/batch 921.87 | loss  7.85 | ppl  2562.75
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 364.76s | valid loss  7.93 | valid ppl  2784.72  | recall_avg 0.46764476102941166
-------------------------