In [1]:
import numpy as np
import pandas as pd
import datetime
import json
from tqdm import tqdm
from sklearn.model_selection import GroupShuffleSplit
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [11]:
class Dataset(object):
    def __init__(self, data_dir, session_key='session', item_key='item', time_key='time', is_training=True):
        # Read csv
        self.data_dir = data_dir
        if is_training :
            self.df = pd.read_csv(self.data_dir + 'gru/train.csv')
        else :
            self.df = pd.read_csv(self.data_dir + 'gru/valid.csv')
        
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key

        self.add_item_indices()
        self.df.sort_values([session_key, time_key], inplace=True)
        self.click_offsets = self.get_click_offset()
        self.session_idx_arr = self.order_session_idx()

    def add_item_indices(self):
        with open(self.data_dir + 'zero_mapping.json', 'r') as f:
            dict_data= json.load(f)
        self.df['item']  = self.df['item'].map(lambda x : dict_data['item'][str(x)])

    def get_click_offset(self):
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()
        return offsets

    def order_session_idx(self):
        session_idx_arr = np.arange(self.df[self.session_key].nunique())
        return session_idx_arr

    @property
    def items(self):
        return self.df[self.item_key].unique()

In [10]:
class DataLoader():
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.
        Args:
             dataset (SessionDataset): the session dataset to generate the batches from
             batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,): torch.FloatTensor. Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """
        # initializations
        df = self.dataset.df
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []  # indicator for the sessions to be terminated
        finished = False

        while not finished:
            minlen = (end - start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df.item.values[start]

            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item.values[start + i + 1]
                input = torch.LongTensor(idx_input)
                target = torch.LongTensor(idx_target)
                yield input, target, mask

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [12]:
class GRU4REC(nn.Module):
    def __init__(self, input_size, hidden_size=100, num_layers=3, final_act='tanh',
                 dropout_hidden=0.5, dropout_input=0.5, batch_size=50, embedding_dim=-1, use_cuda=True):
        super(GRU4REC, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = self.input_size
        self.num_layers = num_layers
        self.dropout_hidden = dropout_hidden
        self.dropout_input = dropout_input
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.onehot_buffer = self.init_emb()
        self.h2o = nn.Linear(hidden_size, self.output_size)
        self.create_final_activation(final_act)
        if self.embedding_dim != -1:
            self.look_up = nn.Embedding(input_size, self.embedding_dim)
            self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.num_layers, dropout=self.dropout_hidden)
        else:
            self.gru = nn.GRU(self.input_size, self.hidden_size, self.num_layers, dropout=self.dropout_hidden)
        self = self.to(self.device)

    def create_final_activation(self, final_act):
        if final_act == 'tanh':
            self.final_activation = nn.Tanh()
        elif final_act == 'relu':
            self.final_activation = nn.ReLU()
        elif final_act == 'softmax':
            self.final_activation = nn.Softmax()
        elif final_act == 'softmax_logit':
            self.final_activation = nn.LogSoftmax()
        elif final_act.startswith('elu-'):
            self.final_activation = nn.ELU(alpha=float(final_act.split('-')[1]))
        elif final_act.startswith('leaky-'):
            self.final_activation = nn.LeakyReLU(negative_slope=float(final_act.split('-')[1]))

    def forward(self, input, hidden):
        '''
        Args:
            input (B,): a batch of item indices from a session-parallel mini-batch.
            target (B,): torch.LongTensor of next item indices from a session-parallel mini-batch.
        Returns:
            logit (B,C): Variable that stores the logits for the next items in the session-parallel mini-batch
            hidden: GRU hidden state
        '''

        if self.embedding_dim == -1:
            embedded = self.onehot_encode(input)
            if self.training and self.dropout_input > 0: embedded = self.embedding_dropout(embedded)
            embedded = embedded.unsqueeze(0)
        else:
            embedded = input.unsqueeze(0)
            embedded = self.look_up(embedded)

        output, hidden = self.gru(embedded, hidden) #(num_layer, B, H)
        output = output.view(-1, output.size(-1))  #(B,H)
        logit = self.final_activation(self.h2o(output))

        return logit, hidden

    def init_emb(self):
        '''
        Initialize the one_hot embedding buffer, which will be used for producing the one-hot embeddings efficiently
        '''
        onehot_buffer = torch.FloatTensor(self.batch_size, self.output_size)
        onehot_buffer = onehot_buffer.to(self.device)
        return onehot_buffer

    def onehot_encode(self, input):
        """
        Returns a one-hot vector corresponding to the input
        Args:
            input (B,): torch.LongTensor of item indices
            buffer (B,output_size): buffer that stores the one-hot vector
        Returns:
            one_hot (B,C): torch.FloatTensor of one-hot vectors
        """
        self.onehot_buffer.zero_()
        index = input.view(-1, 1)
        one_hot = self.onehot_buffer.scatter_(1, index, 1)
        return one_hot

    def embedding_dropout(self, input):
        p_drop = torch.Tensor(input.size(0), 1).fill_(1 - self.dropout_input)
        mask = torch.bernoulli(p_drop).expand_as(input) / (1 - self.dropout_input)
        mask = mask.to(self.device)
        input = input * mask
        return input

    def init_hidden(self):
        '''
        Initialize the hidden state of the GRU
        '''
        try:
            h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        except:
            self.device = 'cpu'
            h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        return h0

In [13]:
class TOP1_max(nn.Module):
    def __init__(self):
        super(TOP1_max, self).__init__()

    def forward(self, logit):
        logit_softmax = F.softmax(logit, dim=1)
        diff = -(logit.diag().view(-1, 1).expand_as(logit) - logit)
        loss = torch.mean(logit_softmax * (torch.sigmoid(diff) + torch.sigmoid(logit ** 2)))
        return loss

In [77]:
def reset_hidden(hidden, mask):
    """Helper function that resets hidden state when some sessions terminate"""
    if len(mask) != 0:
        hidden[:, mask, :] = 0
    return hidden

In [78]:
def get_recall(indices, targets, k=10): #recall --> wether next item in session is within top K=20 recommended items or not
    """
    Calculates the recall score for the given predictions and targets
    Args:
        indices (Bxk): torch.LongTensor. top-k indices predicted by the model.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        recall (float): the recall score
    """
    _, indices = torch.topk(indices, k, -1)
    targets = targets.view(-1, 1).expand_as(indices)
    hits = (targets == indices).nonzero()
    if len(hits) == 0:
        return 0
    n_hits = (targets == indices).nonzero()[:, :-1].size(0)
    recall = float(n_hits) / targets.size(0)
    return recall

In [14]:
data_dir = '/opt/ml/movie-recommendation/data/train/'
train_dataset = Dataset(data_dir)
valid_dataset = Dataset(data_dir, is_training=False)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GRU4REC(input_size=len(train_dataset.items)).to(device)
criterion = TOP1_max()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)

In [80]:
n_epochs = 10
for epoch in range(n_epochs):
    model.train() 
    train_losses = []
    train_loader = DataLoader(train_dataset)
    
    hidden = model.init_hidden()
    for ii, (input, target, mask) in tqdm(enumerate(train_loader), total=len(train_loader.dataset.df) // train_loader.batch_size, miniters = 1000):
        input = input.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        hidden = reset_hidden(hidden, mask).detach()
        logit, hidden = model(input, hidden)
        
        logit_sampled = logit[:, target.view(-1)]
        loss = criterion(logit_sampled)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    train_mean_losses = np.mean(train_losses)
    print(f'Train loss: {train_mean_losses} ({epoch}/{n_epochs})')

    model.eval()
    valid_losses = []
    recalls = []
    valid_loader = DataLoader(valid_dataset)

    with torch.no_grad():
        hidden = model.init_hidden()
        for ii, (input, target, mask) in tqdm(enumerate(valid_loader), total=len(valid_loader.dataset.df) // valid_loader.batch_size, miniters = 1000):
            input = input.to(device)
            target = target.to(device)
            logit, hidden = model(input, hidden)
            logit_sampled = logit[:, target.view(-1)]
            loss = criterion(logit_sampled)
            recall = get_recall(logit, target)

            valid_losses.append(loss.item())
            recalls.append(recall)
            
    valid_mean_losses = np.mean(valid_losses)
    valid_mean_recall = np.mean(recalls)
    print(f'Valid loss: {valid_mean_losses}, Valid recall@10: {valid_mean_recall} ({epoch}/{n_epochs})')

 97%|█████████▋| 88995/92057 [05:24<00:11, 274.48it/s]


Train loss: 0.01946418443238965 (0/1)


 96%|█████████▌| 9944/10353 [00:10<00:00, 968.26it/s] 

Valid loss: 0.019254385100150134, Valid recall@10: 0.056534593724859215 (0/1)



