In [2]:
import torch
from torch import nn
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [3]:
url = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dressipi_recsys2022_datasets.zip'
!wget $url
!unzip dressipi_recsys2022_datasets.zip


--2024-05-05 03:15:51--  https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dressipi_recsys2022_datasets.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79384785 (76M) [application/zip]
Saving to: ‘dressipi_recsys2022_datasets.zip’


2024-05-05 03:15:53 (178 MB/s) - ‘dressipi_recsys2022_datasets.zip’ saved [79384785/79384785]

Archive:  dressipi_recsys2022_datasets.zip
   creating: dressipi_recsys2022_dataset/
  inflating: dressipi_recsys2022_dataset/README.txt  
  inflating: dressipi_recsys2022_dataset/candidate_items.csv  
  inflating: dressipi_recsys2022_dataset/item_features.csv  
  inflating: dressipi_recsys2022_dataset/test_final_purchases.csv  
  inflating: dressipi_recsys2022_dataset/test_final_sessions.csv  

In [4]:
train_sessions = pd.read_csv('dressipi_recsys2022_dataset/train_sessions.csv')
train_purchases = pd.read_csv('dressipi_recsys2022_dataset/train_purchases.csv')
item_features = pd.read_csv('dressipi_recsys2022_dataset/item_features.csv')
candidate_items = pd.read_csv('dressipi_recsys2022_dataset/candidate_items.csv')
test_full_sessions = pd.read_csv('dressipi_recsys2022_dataset/test_full_sessions.csv')
test_full_purchases = pd.read_csv('dressipi_recsys2022_dataset/test_full_purchases.csv')

In [None]:
#convert to timetsamp

In [None]:
#rename column

In [None]:
#remove short sessions <2

#Data Loader

In [2]:
class Dataset(object):
    def __init__(self, path, sep=',', session_key='SessionID', item_key='ItemID', time_key='Time', n_sample=-1, itemmap=None, itemstamp=None, time_sort=False):
        # Read csv
        self.df = pd.read_csv(path, sep=sep, dtype={session_key: int, item_key: int, time_key: float})
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        if n_sample > 0:
            self.df = self.df[:n_sample]

        # Add colummn item index to data
        self.add_item_indices(itemmap=itemmap)
        """
        Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        clicks within a session are next to each other, where the clicks within a session are time-ordered.
        """
        self.df.sort_values([session_key, time_key], inplace=True)
        self.click_offsets = self.get_click_offset()
        self.session_idx_arr = self.order_session_idx()

    def add_item_indices(self, itemmap=None):
        """
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()  # type is numpy.ndarray
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            # Build itemmap is a DataFrame that have 2 columns (self.item_key, 'item_idx)
            itemmap = pd.DataFrame({self.item_key: item_ids,
                                   'item_idx': item2idx[item_ids].values})
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')

    def get_click_offset(self):
        """
        self.df[self.session_key] return a set of session_key
        self.df[self.session_key].nunique() return the size of session_key set (int)
        self.df.groupby(self.session_key).size() return the size of each session_id
        self.df.groupby(self.session_key).size().cumsum() retunn cumulative sum
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()
        return offsets

    def order_session_idx(self):
        if self.time_sort:
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())
        return session_idx_arr

    @property
    def items(self):
        return self.itemmap[self.item_key].unique()

In [3]:
class DataLoader():
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.

        Args:
             dataset (SessionDataset): the session dataset to generate the batches from
             batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.

        Yields:
            input (B,): torch.FloatTensor. Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """
        # initializations
        df = self.dataset.df
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []  # indicator for the sessions to be terminated
        finished = False

        while not finished:
            minlen = (end - start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]

            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input)
                target = torch.LongTensor(idx_target)
                yield input, target, mask

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

#Model Architectures

In [4]:
class GRU4REC(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, final_act='tanh',
                 dropout_hidden=.5, dropout_input=0, batch_size=50, embedding_dim=-1, use_cuda=False):
        super(GRU4REC, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout_hidden = dropout_hidden
        self.dropout_input = dropout_input
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.onehot_buffer = self.init_emb()
        self.h2o = nn.Linear(hidden_size, output_size)
        self.create_final_activation(final_act)
        if self.embedding_dim != -1:
            self.look_up = nn.Embedding(input_size, self.embedding_dim)
            self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.num_layers, dropout=self.dropout_hidden)
        else:
            self.gru = nn.GRU(self.input_size, self.hidden_size, self.num_layers, dropout=self.dropout_hidden)
        self = self.to(self.device)

    def create_final_activation(self, final_act):
        if final_act == 'tanh':
            self.final_activation = nn.Tanh()
        elif final_act == 'relu':
            self.final_activation = nn.ReLU()
        elif final_act == 'softmax':
            self.final_activation = nn.Softmax()
        elif final_act == 'softmax_logit':
            self.final_activation = nn.LogSoftmax()
        elif final_act.startswith('elu-'):
            self.final_activation = nn.ELU(alpha=float(final_act.split('-')[1]))
        elif final_act.startswith('leaky-'):
            self.final_activation = nn.LeakyReLU(negative_slope=float(final_act.split('-')[1]))

    def forward(self, input, hidden):
        '''
        Args:
            input (B,): a batch of item indices from a session-parallel mini-batch.
            target (B,): torch.LongTensor of next item indices from a session-parallel mini-batch.

        Returns:
            logit (B,C): Variable that stores the logits for the next items in the session-parallel mini-batch
            hidden: GRU hidden state
        '''

        if self.embedding_dim == -1:
            embedded = self.onehot_encode(input)
            if self.training and self.dropout_input > 0: embedded = self.embedding_dropout(embedded)
            embedded = embedded.unsqueeze(0)
        else:
            embedded = input.unsqueeze(0)
            embedded = self.look_up(embedded)

        output, hidden = self.gru(embedded, hidden) #(num_layer, B, H)
        output = output.view(-1, output.size(-1))  #(B,H)
        logit = self.final_activation(self.h2o(output))

        return logit, hidden

    def init_emb(self):
        '''
        Initialize the one_hot embedding buffer, which will be used for producing the one-hot embeddings efficiently
        '''
        onehot_buffer = torch.FloatTensor(self.batch_size, self.output_size)
        onehot_buffer = onehot_buffer.to(self.device)
        return onehot_buffer

    def onehot_encode(self, input):
        """
        Returns a one-hot vector corresponding to the input
        Args:
            input (B,): torch.LongTensor of item indices
            buffer (B,output_size): buffer that stores the one-hot vector
        Returns:
            one_hot (B,C): torch.FloatTensor of one-hot vectors
        """
        self.onehot_buffer.zero_()
        index = input.view(-1, 1)
        one_hot = self.onehot_buffer.scatter_(1, index, 1)
        return one_hot

    def embedding_dropout(self, input):
        p_drop = torch.Tensor(input.size(0), 1).fill_(1 - self.dropout_input)
        mask = torch.bernoulli(p_drop).expand_as(input) / (1 - self.dropout_input)
        mask = mask.to(self.device)
        input = input * mask
        return input

    def init_hidden(self):
        '''
        Initialize the hidden state of the GRU
        '''
        try:
            h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        except:
            self.device = 'cpu'
            h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        return h0

#Loss Function

In [5]:
from torch.autograd import Variable
import torch.nn.functional as F

In [6]:
class SampledCrossEntropyLoss(nn.Module):
    """ CrossEntropyLoss with n_classes = batch_size = the number of samples in the session-parallel mini-batch """
    def __init__(self, use_cuda):
        """
        Args:
             use_cuda (bool): whether to use cuda or not
        """
        super(SampledCrossEntropyLoss, self).__init__()
        self.xe_loss = nn.CrossEntropyLoss()
        self.use_cuda = use_cuda

    def forward(self, logit):
        batch_size = logit.size(1)
        target = Variable(torch.arange(batch_size).long())
        if self.use_cuda:
            target = target.cuda()

        return self.xe_loss(logit, target)

In [7]:
class BPRLoss(nn.Module):
    def __init__(self):
        super(BPRLoss, self).__init__()

    def forward(self, logit):
        """
        Args:
            logit (BxB): Variable that stores the logits for the items in the mini-batch
                         The first dimension corresponds to the batches, and the second
                         dimension corresponds to sampled number of items to evaluate
        """
        # differences between the item scores
        diff = logit.diag().view(-1, 1).expand_as(logit) - logit
        # final loss
        loss = -torch.mean(F.logsigmoid(diff))
        return loss

In [8]:
class TOP1Loss(nn.Module):
    def __init__(self):
        super(TOP1Loss, self).__init__()
    def forward(self, logit):
        """
        Args:
            logit (BxB): Variable that stores the logits for the items in the mini-batch
                         The first dimension corresponds to the batches, and the second
                         dimension corresponds to sampled number of items to evaluate
        """
        diff = -(logit.diag().view(-1, 1).expand_as(logit) - logit)
        loss = torch.sigmoid(diff).mean() + torch.sigmoid(logit ** 2).mean()
        return loss

In [9]:
class TOP1_max(nn.Module):
    def __init__(self):
        super(TOP1_max, self).__init__()

    def forward(self, logit):
        logit_softmax = F.softmax(logit, dim=1)
        diff = -(logit.diag().view(-1, 1).expand_as(logit) - logit)
        loss = torch.mean(logit_softmax * (torch.sigmoid(diff) + torch.sigmoid(logit ** 2)))
        return loss

In [10]:
class BPR_max(nn.Module):
    def __init__(self):
        super(BPR_max, self).__init__()
    def forward(self, logit):
        logit_softmax = F.softmax(logit, dim=1)
        diff = logit.diag().view(-1, 1).expand_as(logit) - logit
        loss = -torch.log(torch.mean(logit_softmax * torch.sigmoid(diff)))
        return loss

In [11]:
class LossFunction(nn.Module):
  def __init__(self, loss_type = 'TOP1', use_cuda = False):
    ##Abstract loss function that can support many loss functions
    super(LossFunction, self).__init__()
    self.loss_type = loss_type
    self.use_cuda = use_cuda
    if loss_type == 'CrossEntropy':
      self._loss_fn = SampledCrossEntropyLoss(use_cuda)
    elif loss_type == 'TOP1':
      self._loss_fn = TOP1Loss()
    elif loss_type == 'BPR':
      self._loss_fn = BPRLoss()
    elif loss_type == 'TOP1_max':
      self._loss_fn = TOP1Loss()
    elif loss_type == 'BPR_max':
      self._loss_fn = BPRLoss()
    else:
      raise NotImplementedError

  def formward(self, logit):
    return self._loss_fn(logit)



#Optimizer

In [12]:
import torch.optim as optim

class Optimizer:
  def __init__(self, params, optimizer_type = 'Adagrad', lr = .05,
               momentum = 0, weight_decay = 0, eps = 1e-6):
    """
    Optimizer class for handling various kinds of optimizers.
    Usage is exactly the same as an instance of torch.optim

    Args:
      params: torch.nn.Parameter. The NN parameters to optimize
      optimizer_type: type of the optimizer to use
      lr: learning rate
      momentum: momentum, if needed
      weight_decay: weight decay, if needed. Equivalent to L2 regularization.
      eps: eps parameter, if needed
    """

    if optimizer_type == 'RMSProp':
            self.optimizer = optim.RMSprop(params, lr=lr, eps=eps, weight_decay=weight_decay, momentum=momentum)
    elif optimizer_type == 'Adagrad':
            self.optimizer = optim.Adagrad(params, lr=lr, weight_decay=weight_decay)
    elif optimizer_type == 'Adadelta':
            self.optimizer = optim.Adadelta(params, lr=lr, eps=eps, weight_decay=weight_decay)
    elif optimizer_type == 'Adam':
            self.optimizer = optim.Adam(params, lr=lr, eps=eps, weight_decay=weight_decay)
    elif optimizer_type == 'SparseAdam':
            self.optimizer = optim.SparseAdam(params, lr=lr, eps=eps)
    elif optimizer_type == 'SGD':
            self.optimizer = optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay)
    else:
        raise NotImplementedError

    def zero_grad(self):
      self.optimizer.zero_grad()

    def step(self):
      self.optimizer.step()


#Metric

In [13]:
#get_mrr metric
def get_mrr(indices, targets): #Mean Receiprocal Rank --> Average of rank of next item in the session.
    """
    Calculates the MRR score for the given predictions and targets
    Args:
        indices (Bxk): torch.LongTensor. top-k indices predicted by the model.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        mrr (float): the mrr score
    """
    tmp = targets.view(-1, 1)
    targets = tmp.expand_as(indices)
    hits = (targets == indices).nonzero()
    ranks = hits[:, -1] + 1
    ranks = ranks.float()
    rranks = torch.reciprocal(ranks)
    mrr = torch.sum(rranks).data / targets.size(0)
    return mrr

In [14]:
#get_recall metric
def get_recall(indices, targets): #recall --> wether next item in session is within top K=20 recommended items or not
    """
    Calculates the recall score for the given predictions and targets
    Args:
        indices (Bxk): torch.LongTensor. top-k indices predicted by the model.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        recall (float): the recall score
    """
    targets = targets.view(-1, 1).expand_as(indices)
    hits = (targets == indices).nonzero()
    if len(hits) == 0:
        return 0
    n_hits = (targets == indices).nonzero()[:, :-1].size(0)
    recall = float(n_hits) / targets.size(0)
    return recall

In [15]:
#evaluation function
def evaluate(indices, targets, k = 10):
  """
  Evaluates the model using metric scores: recall/ mrr/ hit rate

  Args:
    logits (B,C): torch.LongTensor. The predicted logit for the next items.
    targets (B): torch.LongTensor. actual target indices

  Returns:
    recall (float): the recall score
    mrr (float): the mrr score
    hitrate (float): the hit rate score
  """
  _, indices = torch.topk(indices, k, -1)
  recall = get_recall(indices, targets)
  mrr = get_mrr(indices, targets)
  return recall, mrr

#Evaluation

In [16]:
class Evaluation(object):
  def __init__(self, model, loss_func, use_cude, k=10):
    self.model = model
    self.loss_func = loss_func
    self.topk = k
    self.device = torch.device('cuda' if use_cuda else 'cpu')

  def eval(self, eval_data, batch_size):
    self.model.eval() #set model in evaluation mode. normalize layers using running statistics de-activate dropout.
    losses = []
    recalls = []
    mrrs = []
    dataloader = Dataloader(eval_data, batch_size) #Check bo lib
    with torch.no_grad(): #no backprobagation
      hidden = self.model.init_hidden()
      for ii, (input, target, mask) in tqdm(enumerate(dataloader), total = len(dataloader.dataset.df) //dataloader.batch_size, miniters = 1000):
      #for input, target, mask in dataloader:
        input = input.to(self.device)
        target = target.to(self.device)
        logit, hidden = self.model(input, hidden)
        logit_sampled = logit[:, target.view(-1)] #view(-1) to flatten the tensor
        loss = self.loss_func(logit_sampled)
        recall, mrr = evaluate(logit, target, k = self.topk) #check bo lib

        #torch.Tensor.item() to get a Python number from a tensor containing
        losses.append(loss.item())
        recalls.append(recall)
        mrrs.append(mrr)
    mean_losses = np.mean(losses)
    mean_recall = np.mean(recalls)
    mean_mrr = np.mean(mrrs)

    return mean_losses, mean_recall, mean_mrr


#Training

In [17]:
import os
import time
from tqdm import tqdm

In [18]:
class Trainer(object):
  def __init__(self, model, train_data, eval_data, optim, use_cuda, loss_func, batch_size, args):
    self.model = model
    self.train_data = train_data
    self.eval_data = eval_data
    self.optim = optim
    self.loss_func = loss_func
    self.evalution = evaluation(self.model, self.loss_func, use_cuda, k=args.k_eval)
    self.device = torch.device('cuda' if use_cuda else 'cpu')
    self.batch_size = batch_size
    self.args = args

  def train(self, start_epoch, end_epoch, start_time = None):
    if start_time is None:
      self.start_time = time.time()
    else:
      self.start_time = start_time

    for epoch in range(start_epoch, end_epoch + 1):
      st = time.time()
      print('Start Epoch #', epoch)
      train_loss = self.train_epoch(epoch)
      loss, recall, mrr = self.evaluation.eval(self.eval_data, self.batch_size)

      print("Epoch: {}, train loss: {:.4f}, loss: {:.4f}, recall: {:.4f}, mrr: {:.4f}, time: {}".format(epoch, train_loss, loss, recall, mrr, time.time() - st))
      checkpoint = {
          'model': self.model,
          'args': self.args,
          'epoch': epoch,
          'optim': self.optim,
          'loss': loss,
          'recall': recall,
          'mrr': mrr
      }
      model_name = os.path.join(self.args.checkpoint_dir, "model_{0:05d}.pt".format(epoch))
      torch.save(checkpoint, model_name)
      print('Save model as %s' %model_name)

  def train_epoch(self, epoch):
    self.model.train()
    losses = []

    def reset_hidden(hidden, mask):
      #check helper function that resets hidden state when some sessions terminate
      if len(mask) != 0:
        hidden[:, mask,:] = 0
        return hidden

    hidden = self.model.init_hidden()
    dataloader = DataLoader(self.train_data, self.batch_size)

    for ii, (input, target, mask) in tqdm(enumerate(dataloader), total=len(dataloader.dataset.df) // dataloader.batch_size, miniters = 1000):
      input = input.to(self.device)
      target = target.to(self.device)
      self.optim.zero_grad()
      hidden = reset_hidden(hidden, mask).detach() #.detach() Returns a new Tensor, detached from the current graph.
      logit, hidden = self.model(input, hidden)
      #output sampling
      logit_sampled = logit[:, target.view(-1)]
      loss = self.loss_func(logit_sampled)
      loss.backward()
      self.optim.step()

    mean_losses = np.mean(losses)
    return mean_losses

In [1]:
args = {
    'hidden_size': 100,
    'num_layers': 3,
    'batch_size': 50,
    'dropout_input': 0,
    'dropout_hidden': 0.5,
    'n_epochs': 5,
    'k_eval': 20,
    ###optimizer
    'optimizer_type': 'Adagrad',
    'final_act': 'tanh',
    'lr': 0.01,
    'weight_decay': 0,
    'momentum': 0,
    'eps': 1e-6,
    'seed': 22,
    'sigma': None,
    'embedding_dim': -1,
    ##loss function
    'loss_type': 'BPR'
    'time_sort': False,
    'model_name': 'GRU4REC-BPR'
    'save_dir': 'checkpoints/'
    'data_folder': '../dataset/dataset after/',
    #'train_data': '../train.txt',
    #'valid_data:: '../valid.txt',
    'is_eval': False,
    'load_model': None,
    'checkpoint_dir': 'checkpoints/',
}

SyntaxError: invalid syntax (<ipython-input-1-a7b90d4635fb>, line 21)

In [None]:
# n_items


In [None]:
def main():
  #Load train data

  #Load valid data

  #Set the parameters

  #Training
  if args['is_eval'] is False:
    ##Initialize the model
    model = GRU4REC(input_size, hidden_size, output_size, num_layers=1, final_act='tanh',
                 dropout_hidden=.5, dropout_input=0, batch_size=50, embedding_dim=-1, use_cuda=False)
    ##Weights initialization
    init_model(model)
    ##Optimizer
    optimizer = Optimizer(model.parameters(), optimizer_type = 'Adagrad', lr = .05,
               momentum = 0, weight_decay = 0, eps = 1e-6)
    ##Trainer class
    trainer = Trainer(model, train_data = args['train_data'], eval_data, optim, use_cuda, loss_func, batch_size, args)
    print('###START TRAINING...')
    trainer.train(0, n-epochs -1)

  #Testing
  else:
    if args['load_model'] is not None:
      print("Loading pre-trained model from {}".format(args['load_model']))
      try:
        checkpoint = torch.load(args['load_model'])
      except:
        checkpoint = torch.load(args['load_model'], map_location=lambda storage, loc: storage)
      model = checkpoint["model"]
      model.gru.flatten_parameters() #reset parameters data pointer so that they can use faster code paths
      evaluation = Evaluation(model, loss_function, use_cuda = args['use_cuda'], k = args['k_eval'])
      loss, recall, mrr = evaluation.eval(valid_data, batch_size)
      print("Final result: recall = {:.2f}, mrr = {:.2f}" .format(recall, mrr))
    else:
      print("No pretrained model was found!")

In [None]:
##train_model = main()