# sequence-to-sequence approach to recommendation

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

from collections import defaultdict
import pandas as pd
from tqdm import tqdm

### read a sample interaction file and create the required input-output sequence


In [2]:
data_dir = "/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation"
file_name = "hnm_3w_sessionized.txt" # "hnm_big.txt"
seq_file_name = "seq_" + file_name

inp_seq_len, tgt_seq_len = 10, 12
colsep = "\t"

def get_ids(elems):
    ids = []
    for ii, e in enumerate(elems):
        if e not in prod_dict[ii]:
            prod_dict[ii][e] = len(prod_dict[ii]) + 1
        ids.append(prod_dict[ii][e])
    return ids

def break_sessions(seqs):    
    sids = sorted(list(set([x[-1] for x in seqs])))
    temp = [[] for _ in range(len(sids))]
    for seq in seqs:
        temp[seq[-1]].append(seq[:-1])
    return temp

if not os.path.isfile(os.path.join(data_dir, seq_file_name)):
    inp_file = os.path.join(data_dir, file_name)
    sample = pd.read_csv(inp_file, sep=colsep, nrows=5)
    ncol = sample.shape[1]

    num_prod_dim = ncol - 3  # other than u, i, t
    if num_prod_dim > 0:
        prod_dict = [{} for _ in range(num_prod_dim)]
        
    User = defaultdict(list)
    with open(os.path.join(data_dir, file_name), 'r') as fr:
        for line in tqdm(fr):
            if ncol == 3:
                u, i, _ = line.rstrip().split(colsep)
            elif ncol >= 4:
                elems = line.rstrip().split(colsep)
                u, i, t = elems[0], elems[1], elems[-1]
                pdims = elems[2:-1]
                pids = get_ids(pdims)
            u = int(u)
            i = int(i)
            t = int(t)
            if ncol >= 4:
                User[u].append([i] + pids + [t])
            else:
                User[u].append(i)
    print(f"Read {len(User)} user interactions")

    with open(os.path.join(data_dir, seq_file_name), 'w') as fw:
        for u in User:
            seqs = break_sessions(User[u])
            for ii in range(1, len(seqs)):
                inp, tgt = seqs[ii-1], seqs[ii]
                if len(inp) > inp_seq_len:
                    inp = inp[-inp_seq_len:] # taking the last 12
                if len(tgt) > tgt_seq_len:
                    tgt = tgt[:tgt_seq_len]  # taking the first 12
                inp = [str(ii[0]) for ii in inp]  # only the product-id
                tgt = [str(ii[0]) for ii in tgt]  # always only the product-id
                fw.write(" ".join(inp) + "\t" + " ".join(tgt) + "\n")
                

In [3]:
# write the test file - only the last session for each user
test_seq_file = "seq_test_" + file_name
if not os.path.isfile(os.path.join(data_dir, test_seq_file)):
    inp_file = os.path.join(data_dir, file_name)
    sample = pd.read_csv(inp_file, sep=colsep, nrows=5)
    ncol = sample.shape[1]

    num_prod_dim = ncol - 3  # other than u, i, t
    if num_prod_dim > 0:
        prod_dict = [{} for _ in range(num_prod_dim)]
        
    User = defaultdict(list)
    with open(os.path.join(data_dir, file_name), 'r') as fr:
        for line in tqdm(fr):
            if ncol == 3:
                u, i, _ = line.rstrip().split(colsep)
            elif ncol >= 4:
                elems = line.rstrip().split(colsep)
                u, i, t = elems[0], elems[1], elems[-1]
                pdims = elems[2:-1]
                pids = get_ids(pdims)
            u = int(u)
            i = int(i)
            t = int(t)
            if ncol >= 4:
                User[u].append([i] + pids + [t])
            else:
                User[u].append(i)
    print(f"Read {len(User)} user interactions")

    with open(os.path.join(data_dir, test_seq_file), 'w') as fw:
        for u in User:
            seqs = break_sessions(User[u])
            inp = seqs[-1]  # take the last session
            inp = inp[:inp_seq_len]
            inp = [str(ii[0]) for ii in inp]  # only the product-id
            fw.write(" ".join(inp) + "\n")


### Write the data in the required seq2seq form

In [4]:
class RecoDataset:
    def __init__(self, problem_type='reco'):
        self.problem_type = 'reco'
        self.inp_lang_tokenizer = None
        self.targ_lang_tokenizer = None
    

    def unicode_to_ascii(self, s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    ## Step 1 and Step 2 
    def preprocess_sentence_text(self, w):
        w = self.unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        w = w.strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        w = '<start> ' + w + ' <end>'
        return w
    
    def preprocess_sentence(self, w):
        w = w.lower().strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        w = '<start> ' + w + ' <end>'
        return w

    def create_dataset(self, path, num_examples=None):
        # path : path to spa-eng.txt file
        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)
        lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
        print(f"Read {len(lines)} examples")
        if num_examples:
            word_pairs = [[self.preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
        else:
            word_pairs = [[self.preprocess_sentence(w) for w in l.split('\t')]  for l in lines]

        return zip(*word_pairs)

    # Step 3 and Step 4
    def tokenize(self, lang):
        # lang = list of sentences in a language
        
        # print(len(lang), "example sentence: {}".format(lang[0]))
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
        lang_tokenizer.fit_on_texts(lang)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        tensor = lang_tokenizer.texts_to_sequences(lang) 

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='pre')

        return tensor, lang_tokenizer

    def load_dataset(self, path, num_examples=None):
        # creating cleaned input, output pairs
        targ_lang, inp_lang = self.create_dataset(path, num_examples)

        input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang)
        target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang)

        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

    def load_dataset_common(self, path, num_examples=None):
        # creating cleaned input, output pairs with the same tokenizer
        inp_lang, targ_lang = self.create_dataset(path, num_examples)
        
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
        lang_tokenizer.fit_on_texts(inp_lang + targ_lang)
        
        input_tensor = lang_tokenizer.texts_to_sequences(inp_lang) 
        target_tensor = lang_tokenizer.texts_to_sequences(targ_lang)
        
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding='pre')
        target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='pre')

        return input_tensor, target_tensor, lang_tokenizer

    def call(self, file_path, num_examples, BUFFER_SIZE, BATCH_SIZE):
        input_tensor, target_tensor, tokenizer = self.load_dataset_common(file_path, num_examples)
        self.inp_lang_tokenizer, self.targ_lang_tokenizer = tokenizer, tokenizer
        
        print("Example input:", input_tensor[0])
        print("Example target:", target_tensor[0])
        print("TENSOR SHAPE", input_tensor.shape, target_tensor.shape)
        
        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

        print("TRAIN:", input_tensor_train.shape, target_tensor_train.shape)
        print("VALID:", input_tensor_val.shape, target_tensor_val.shape)
        num_train, num_val = input_tensor_train.shape[0], input_tensor_val.shape[0]

        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)
        
        return train_dataset, val_dataset, self.inp_lang_tokenizer, self.targ_lang_tokenizer, num_train, num_val
    
    def get_test_data(self, file_path, num_examples, BUFFER_SIZE, BATCH_SIZE, inp_seq_len):
        
        lines = io.open(file_path, encoding='UTF-8').read().strip().split('\n')
        print(f"Read {len(lines)} test examples")
        sentences = [self.preprocess_sentence(l) for l in lines]
        
        # we take the target as input for 
        targ_lang = [x.split()[1:-1] for x in sentences]
        targ_lang = [x[:inp_seq_len] for x in targ_lang]
        targ_lang = [' '.join(x) for x in targ_lang]
        targ_lang = ['<start> ' + x + ' <end>' for x in targ_lang]
        target_tensor = self.inp_lang_tokenizer.texts_to_sequences(targ_lang)
        target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='pre')
        print(target_tensor.shape)

        test_dataset = tf.data.Dataset.from_tensor_slices(target_tensor)
        test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=False)
        
        return test_dataset


In [5]:
BUFFER_SIZE = 71460 # 1362281
BATCH_SIZE = 256
num_examples = None
file_path = os.path.join(data_dir, seq_file_name)

dataset_creator = RecoDataset('reco')
train_dataset, val_dataset, inp_lang, targ_lang, num_train, num_val = dataset_creator.call(file_path, num_examples, BUFFER_SIZE, BATCH_SIZE)
print(f"Total {num_train} training and {num_val} validation examples")

Read 71460 examples
Example input: [  0   0   0   0   0   0   0   0   0   2 945   3]
Example target: [    0     0     0     0     0     0     0     0     2 11992 13838  3548
    42     3]
TENSOR SHAPE (71460, 12) (71460, 14)
TRAIN: (57168, 12) (57168, 14)
VALID: (14292, 12) (14292, 14)
Total 57168 training and 14292 validation examples


In [6]:
test_file_path = os.path.join(data_dir, test_seq_file)
test_data = dataset_creator.get_test_data(test_file_path, num_examples, BUFFER_SIZE, BATCH_SIZE, inp_seq_len)

Read 48709 test examples
(48709, 12)


In [7]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([256, 12]), TensorShape([256, 14]))

In [8]:
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 256
units = 512
steps_per_epoch = num_train//BATCH_SIZE

In [9]:
print("max_length_input, max_length_target, vocab_size_input, vocab_size_target, steps")
max_length_input, max_length_output, vocab_inp_size, vocab_tar_size, steps_per_epoch

max_length_input, max_length_target, vocab_size_input, vocab_size_target, steps


(12, 14, 21464, 21464, 223)

In [10]:
##### 
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##-------- LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state = hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))] 

In [11]:
## Test Encoder Stack

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (256, 12, 512)
Encoder h vecotr shape: (batch size, units) (256, 512)
Encoder c vector shape: (batch size, units) (256, 512)


In [12]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type
    
    # Embedding Layer
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    
    #Final Dense layer on which softmax will be applied
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
   


    # Sampler
    self.sampler = tfa.seq2seq.sampler.TrainingSampler()

    # Create attention mechanism with memory = None
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_input], self.attention_type)

    # Wrap attention mechanism with the fundamental rnn cell of decoder
    self.rnn_cell = self.build_rnn_cell(batch_sz)

    # Define the decoder with respect to fundamental rnn cell
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    
  def build_rnn_cell(self, batch_sz):
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
    # ------------- #
    # typ: Which sort of attention (Bahdanau, Luong)
    # dec_units: final dimension of attention outputs 
    # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
    # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state


  def call(self, inputs, initial_state):
    x = self.embedding(inputs)
    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
    return outputs


In [13]:
# Test decoder stack

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)


Decoder Outputs Shape:  (256, 13, 21464)


In [14]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss  

In [15]:
checkpoint_dir = './reco_training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [16]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


        dec_input = targ[ : , :-1 ] # Ignore <end> token
        real = targ[ : , 1: ]         # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [17]:
def rel(true, pred):
    return 1 if true == pred else 0


def precision_k(actual, predicted, k) -> float:
    actual_set = set(actual[:k])
    predicted_set = set(predicted[:k])
    precision_k_value = len(actual_set & predicted_set) / k

    return precision_k_value


def mAP_k(actual, predicted) -> float:
    # actual = row['valid_true'].split() # prediction_string --> prediction list
    # predicted = row['valid_pred'].split() # prediction_string --> prediction list

    M = min(len(actual), len(predicted))
    K = min(M, 12)

    if M == 0:
        return 0
    else:
        score = 0
        for k in range(1, K + 1):
            precision_k_value = precision_k(actual, predicted, k)

            score += precision_k_value * rel(actual[k - 1], predicted[k - 1])
        return score


def map_batch(label, prediction):
    """
    label: (batch, 12)
    prediction: (batch, 12)
    """
    pred = prediction.numpy()
    label = label.numpy()
    maps = []
    for ii in range(prediction.shape[0]):
        l_ii = [x for x in label[ii,:] if x not in [0, 2, 3]]
        p_ii = [x for x in pred[ii,:] if x not in [2, 3]]
        if len(p_ii) > 0:
            maps.append(mAP_k(l_ii, p_ii))
        else:
            maps.append(0)
    return np.mean(maps)


def eval(dataset, encoder, decoder):
    enc_hidden = encoder.initialize_hidden_state()
    all_maps = []
    for (inp, targ) in tqdm(dataset):

        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)
        dec_input = targ[ : , :-1 ] # Ignore <end> token
        real = targ[ : , 1: ]         # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        prediction = tf.argmax(logits, axis=-1)
        mapr = map_batch(real, prediction)
        all_maps.append(mapr)
    return np.mean(all_maps)


## Train the Model

In [16]:
EPOCHS = 50
print(f"Training for {EPOCHS} epochs with {steps_per_epoch} steps per epoch")
best_val = 0.0
patience, max_patience = 0, 5
for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (inp, targ) in tqdm(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

#     if batch % 10 == 0:
#       print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
#                                                    batch,
#                                                    batch_loss.numpy()))

    # evaluate validation data
    map_val = eval(val_dataset, encoder, decoder)
    if map_val > best_val:
        best_val = map_val
        print("Performance improved ... saving the model")
        checkpoint.save(file_prefix = checkpoint_prefix)
    else:
        patience += 1
        if patience == max_patience:
            print(f"Maximum patience ({max_patience}) reached ... exiting!")
            break
            
  # saving (checkpoint) the model every 2 epochs
#     if (epoch + 1) % 2 == 0:
    print('Epoch {} Loss {:.4f}, val-MAP {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch, map_val))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Training for 50 epochs with 223 steps per epoch


223it [01:39,  2.25it/s]
55it [00:11,  4.91it/s]


Epoch 1 Loss 2.1180, val-MAP 0.0007
Time taken for 1 epoch 110.65342926979065 sec



223it [01:27,  2.54it/s]
55it [00:12,  4.52it/s]


Epoch 2 Loss 1.8303, val-MAP 0.0063
Time taken for 1 epoch 100.14052391052246 sec



223it [01:26,  2.57it/s]
55it [00:12,  4.33it/s]


Epoch 3 Loss 1.7859, val-MAP 0.0075
Time taken for 1 epoch 99.86541700363159 sec



223it [01:30,  2.48it/s]
55it [00:13,  4.21it/s]


Epoch 4 Loss 1.7599, val-MAP 0.0079
Time taken for 1 epoch 103.39815473556519 sec



223it [01:28,  2.52it/s]
55it [00:13,  4.11it/s]


Epoch 5 Loss 1.7304, val-MAP 0.0079
Time taken for 1 epoch 102.30819010734558 sec



223it [01:30,  2.47it/s]
55it [00:13,  4.14it/s]


Epoch 6 Loss 1.6973, val-MAP 0.0100
Time taken for 1 epoch 103.92908501625061 sec



223it [01:29,  2.50it/s]
55it [00:13,  4.18it/s]


Epoch 7 Loss 1.6557, val-MAP 0.0105
Time taken for 1 epoch 102.73507618904114 sec



223it [01:29,  2.49it/s]
55it [00:13,  4.18it/s]


Epoch 8 Loss 1.6059, val-MAP 0.0145
Time taken for 1 epoch 103.18530130386353 sec



223it [01:28,  2.52it/s]
55it [00:13,  4.14it/s]


Epoch 9 Loss 1.5516, val-MAP 0.0176
Time taken for 1 epoch 102.21515345573425 sec



223it [01:30,  2.47it/s]
55it [00:13,  4.04it/s]


Epoch 10 Loss 1.4943, val-MAP 0.0188
Time taken for 1 epoch 104.09590363502502 sec



223it [01:29,  2.48it/s]
55it [00:13,  4.06it/s]


Epoch 11 Loss 1.4304, val-MAP 0.0227
Time taken for 1 epoch 103.65496921539307 sec



223it [01:31,  2.45it/s]
55it [00:13,  4.01it/s]


Epoch 12 Loss 1.3594, val-MAP 0.0272
Time taken for 1 epoch 105.13080525398254 sec



223it [01:29,  2.49it/s]
55it [00:13,  4.17it/s]


Epoch 13 Loss 1.2819, val-MAP 0.0284
Time taken for 1 epoch 102.95392656326294 sec



223it [01:32,  2.42it/s]
55it [00:13,  4.14it/s]


Epoch 14 Loss 1.2017, val-MAP 0.0315
Time taken for 1 epoch 105.64360928535461 sec



223it [01:30,  2.47it/s]
55it [00:13,  4.13it/s]


Epoch 15 Loss 1.1240, val-MAP 0.0318
Time taken for 1 epoch 103.76611638069153 sec



223it [01:30,  2.46it/s]
55it [00:13,  4.11it/s]


Epoch 16 Loss 1.0490, val-MAP 0.0335
Time taken for 1 epoch 104.50241160392761 sec



223it [01:30,  2.46it/s]
55it [00:13,  3.99it/s]


Epoch 17 Loss 0.9782, val-MAP 0.0344
Time taken for 1 epoch 104.66958284378052 sec



223it [01:31,  2.43it/s]
55it [00:13,  4.01it/s]


Epoch 18 Loss 0.9117, val-MAP 0.0354
Time taken for 1 epoch 105.65183734893799 sec



223it [01:30,  2.45it/s]
55it [00:13,  3.98it/s]


Epoch 19 Loss 0.8505, val-MAP 0.0349
Time taken for 1 epoch 104.71269512176514 sec



223it [01:31,  2.44it/s]
55it [00:13,  4.08it/s]


Epoch 20 Loss 0.7932, val-MAP 0.0341
Time taken for 1 epoch 104.98081827163696 sec



223it [01:29,  2.48it/s]
55it [00:13,  4.10it/s]


Epoch 21 Loss 0.7396, val-MAP 0.0335
Time taken for 1 epoch 103.26875376701355 sec



223it [01:30,  2.48it/s]
55it [00:13,  4.10it/s]


Epoch 22 Loss 0.6917, val-MAP 0.0334
Time taken for 1 epoch 103.43218040466309 sec



223it [01:28,  2.53it/s]
55it [00:13,  4.10it/s]

Maximum patience (5) reached ... exiting!





## Use tf-addons BasicDecoder

In [18]:
def evaluate_sentence(sentence):
    sentence = dataset_creator.preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='pre')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
    end_token = targ_lang.word_index['<end>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(enc_out)

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()

def translate(sentence):
    result = evaluate_sentence(sentence)
    print(result)
    result = targ_lang.sequences_to_texts(result)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

## Restore the latest checkpoint & test

In [19]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f2c2bbe54d0>

In [20]:
translate('1186')

[[879 190 190 190 693 693 693 693 693 693 693 693   3]]
Input: 1186
Predicted translation: ['851 209 209 209 687 687 687 687 687 687 687 687 <end>']


In [21]:
translate('13112 16042 3871 35')

[[ 640 2138  666  607   39   39 3552 3552  188  188  188  188    3]]
Input: 13112 16042 3871 35
Predicted translation: ['555 2159 587 545 44 44 3353 3353 164 164 164 164 <end>']


In [22]:
def evaluate_multiple_sentence(sentences):
    sentences = [dataset_creator.preprocess_sentence(sentence) for sentence in sentences]
    inputs = inp_lang.texts_to_sequences(sentences)
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=inp_seq_len, padding='pre')
    inference_batch_size = inputs.shape[0]

    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
    end_token = targ_lang.word_index['<end>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(enc_out)

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix,
                                     start_tokens = start_tokens,
                                     end_token= end_token,
                                     initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()

def translate_multiple(sentences):
    result = evaluate_multiple_sentence(sentences)
    result = targ_lang.sequences_to_texts(result)
    result = [[r for r in res.split() if r != "<end>"] for res in result]
    result = [r[:tgt_seq_len] for r in result]
    result = [' '.join(r) for r in result]
    return result


In [23]:
translate_multiple(['1186', '13112 16042 3871 35'])

['851 209 209 209 1146 1146 1301 1301 4584 284 1448',
 '555 555 555 130 130 545 545 572 1719 572 1590']

In [24]:
test_seq_file

'seq_test_hnm_3w_sessionized.txt'

In [31]:
test_file_path

'/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation/seq_test_hnm_3w_sessionized.txt'

## Evaluate on Test Data

In [30]:
lines = io.open(test_seq_file, encoding='UTF-8').read().strip().split('\n')
print(f"Read {len(lines)} test examples")
batch_size, count = 32, 0
with open(os.path.join(data_dir, 'seq_test_pred.txt'), 'w') as fw:
    for line in tqdm(lines):
        result = evaluate_sentence(line)
        result = targ_lang.sequences_to_texts(result)[0]
        result = [r for r in result.split() if r != "<end>"]
        result = result[:tgt_seq_len]
        result = ' '.join(result)
        fw.write(result + '\n')
        count += 1
print(f"Written {count} lines in {test_file_path}")

Read 48709 test examples


100%|██████████| 48709/48709 [1:34:12<00:00,  8.62it/s]

Written 48709 lines in /recsys_data/RecSys/h_and_m_personalized_fashion_recommendation/seq_test_hnm_3w_sessionized.txt





In [25]:
lines = io.open(test_seq_file, encoding='UTF-8').read().strip().split('\n')
print(f"Read {len(lines)} test examples")
batch_size, count = 32, 0
with open(os.path.join(data_dir, 'seq_test_pred.txt'), 'w') as fw:
    for ii in tqdm(range(0, len(lines), batch_size)):
        end = ii + batch_size
        if end > len(lines):
            end = len(lines)
        inps = lines[ii:end]
#         result = translate_multiple(inps)
#         for res in result:
#             fw.write(res + '\n')
#             count += 1
print(f"Written {count} lines in {test_file_path}")

Read 48709 test examples


100%|██████████| 1523/1523 [00:00<00:00, 1272495.02it/s]

Written 0 lines in /recsys_data/RecSys/h_and_m_personalized_fashion_recommendation/seq_test_hnm_3w_sessionized.txt





In [26]:
inps

['12873',
 '5345 679',
 '7437 10120 10565 14782 1336 857 12314 1369 2335 2335',
 '180',
 '15']

## Use tf-addons BeamSearch Decoder

In [24]:
def beam_evaluate_sentence(sentence, beam_width=3):
  sentence = dataset_creator.preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='pre')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]
  result = ''

  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  dec_h = enc_h
  dec_c = enc_c

  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
  end_token = targ_lang.word_index['<end>']

  # From official documentation
  # NOTE If you are using the BeamSearchDecoder with a cell wrapped in AttentionWrapper, then you must ensure that:
  # The encoder output has been tiled to beam_width via tfa.seq2seq.tile_batch (NOT tf.tile).
  # The batch_size argument passed to the get_initial_state method of this wrapper is equal to true_batch_size * beam_width.
  # The initial state created with get_initial_state above contains a cell_state value containing properly tiled final state from the encoder.

  enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
  decoder.attention_mechanism.setup_memory(enc_out)
  print("beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] :", enc_out.shape)

  # set decoder_inital_state which is an AttentionWrapperState considering beam_width
  hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)
  decoder_initial_state = decoder.rnn_cell.get_initial_state(batch_size=beam_width*inference_batch_size, dtype=tf.float32)
  decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)

  # Instantiate BeamSearchDecoder
  decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell,beam_width=beam_width, output_layer=decoder.fc)
  decoder_embedding_matrix = decoder.embedding.variables[0]

  # The BeamSearchDecoder object's call() function takes care of everything.
  outputs, final_state, sequence_lengths = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)
  # outputs is tfa.seq2seq.FinalBeamSearchDecoderOutput object. 
  # The final beam predictions are stored in outputs.predicted_id
  # outputs.beam_search_decoder_output is a tfa.seq2seq.BeamSearchDecoderOutput object which keep tracks of beam_scores and parent_ids while performing a beam decoding step
  # final_state = tfa.seq2seq.BeamSearchDecoderState object.
  # Sequence Length = [inference_batch_size, beam_width] details the maximum length of the beams that are generated

  
  # outputs.predicted_id.shape = (inference_batch_size, time_step_outputs, beam_width)
  # outputs.beam_search_decoder_output.scores.shape = (inference_batch_size, time_step_outputs, beam_width)
  # Convert the shape of outputs and beam_scores to (inference_batch_size, beam_width, time_step_outputs)
  final_outputs = tf.transpose(outputs.predicted_ids, perm=(0,2,1))
  beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0,2,1))
  
  return final_outputs.numpy(), beam_scores.numpy()

def beam_translate(sentence):
  result, beam_scores = beam_evaluate_sentence(sentence)
  print(result.shape, beam_scores.shape)
  for beam, score in zip(result, beam_scores):
    print(beam.shape, score.shape)
    output = targ_lang.sequences_to_texts(beam)
    output = [a[:a.index('<end>')] for a in output]
    beam_score = [a.sum() for a in score]
    print('Input: %s' % (sentence))
    for i in range(len(output)):
      print('{} Predicted translation: {}  {}'.format(i+1, output[i], beam_score[i]))


In [51]:
beam_translate('75 76 77 78 79')

beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] : (3, 102, 1024)
(1, 3, 13) (1, 3, 13)
(3, 13) (3, 13)
Input: 75 76 77 78 79
1 Predicted translation: 24621 24622 24622 20323 24623 24624 468 8919 24625 15315 24622 24626   -34.05118942260742
2 Predicted translation: 21809 21809 31703 411 34156 1546 1546 1546 29535 6505 21752 11602   -36.71038055419922
3 Predicted translation: 21809 21809 31703 411 34156 1546 1546 1546 29535 6505 6505 2549   -73.9906997680664


In [69]:
x = [1,2,3,4,5]
x[:3:]

[1, 2, 3]