# Recommendations in Keras using triplet loss
Along the lines of BPR [1]. 

[1] Rendle, Steffen, et al. "BPR: Bayesian personalized ranking from implicit feedback." Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. AUAI Press, 2009.

This is implemented (more efficiently) in LightFM (https://github.com/lyst/lightfm). See the MovieLens example (https://github.com/lyst/lightfm/blob/master/examples/movielens/example.ipynb) for results comparable to this notebook.

## Set up the architecture
A simple dense layer for both users and items: this is exactly equivalent to a latent factor matrix when multiplied by binary (one-hot) user and item indices. There are three core inputs: users, positive items, and negative items; in this version each item is additionally described by a zero-padded vector of tag IDs. In the triplet objective we try to make the positive item rank higher than the negative item for that user.

Because we want just one single embedding for the items, we use shared weights for the positive and negative item inputs (a siamese architecture).

This is all very simple but could be made arbitrarily complex, with more layers, conv layers and so on. I expect we'll be seeing a lot of papers doing just that.


In [None]:
"""
Triplet loss network example for recommenders
"""

from __future__ import print_function

import numpy as np

from keras import backend as K
from keras.models import Model
from keras.layers import Embedding, Flatten, Input, Lambda
from keras.layers.merge import concatenate, dot
from keras.callbacks import TensorBoard
from keras.optimizers import Adam
from keras.engine.topology import Layer

import tensorflow as tf

import data
import metrics


class BprLoss(Layer):
    """Keras layer computing a BPR-style triplet loss for each example.

    The layer is called on a list of three tensors, in this order:
    ``[pos_item, neg_item, user]``. For every example it returns

        1 - sigmoid(sum(user * pos_item) - sum(user * neg_item))

    where the sums run over the last axis (kept as a size-1 axis). The value
    gets smaller as the user's score for the positive item exceeds the score
    for the negative item.
    """

    def __init__(self, **kwargs):
        # NOTE(review): the masking flag Keras reads is spelled
        # ``supports_masking``; this attribute is kept unchanged so the
        # layer's masking behaviour stays exactly as before.
        self.support_mask = True
        super(BprLoss, self).__init__(**kwargs)

    def build(self, input_shape):
        # No weights of its own; defer to the base class.
        super(BprLoss, self).build(input_shape)

    def call(self, inputs):
        """Return the per-example loss for ``[pos_item, neg_item, user]``."""
        assert len(inputs) == 3
        pos_item, neg_item, user = inputs

        # Difference between the user's affinity for the positive and the
        # negative item; keepdims leaves a trailing axis of size 1.
        score_gap = (
            K.sum(user * pos_item, axis=-1, keepdims=True) -
            K.sum(user * neg_item, axis=-1, keepdims=True))

        return 1.0 - K.sigmoid(score_gap)

    def compute_output_shape(self, input_shape):
        """Report the real output shape: the last axis is reduced to size 1.

        Without this override the base implementation echoes the *input*
        shapes (a list of three), which does not describe the single tensor
        returned by ``call``.
        """
        pos_shape = tuple(input_shape[0])
        return pos_shape[:-1] + (1,)


class ZeroMaskedEntries(Layer):
    """
    This layer is called after an Embedding layer.
    It zeros out all of the masked-out embeddings.
    It also swallows the mask without passing it on.
    You can change this to default pass-on behavior as follows:

    def compute_mask(self, x, mask=None):
        if not self.mask_zero:
            return None
        else:
            return K.not_equal(x, 0)

    If the layer is called without a mask, the input is returned unchanged.
    """

    def __init__(self, **kwargs):
        super(ZeroMaskedEntries, self).__init__(**kwargs)
        # Must be set *after* the base constructor, which resets the flag.
        # (The flag Keras consults is ``supports_masking``.)
        self.supports_masking = True

    def build(self, input_shape):
        # input_shape is indexed as (batch, input_shape[1], input_shape[2]);
        # the last axis is the one the mask has to be repeated along.
        self.output_dim = input_shape[1]
        self.repeat_dim = input_shape[2]
        super(ZeroMaskedEntries, self).build(input_shape)

    def call(self, x, mask=None):
        """Multiply ``x`` by its mask so masked-out entries become zero."""
        if mask is None:
            # Nothing was masked upstream, so there is nothing to zero out.
            return x

        mask = K.cast(mask, 'float32')
        # Repeat the mask ``repeat_dim`` times, then move that new axis last
        # so the mask lines up element-wise with ``x``.
        mask = K.repeat(mask, self.repeat_dim)
        mask = K.permute_dimensions(mask, (0, 2, 1))
        return x * mask

    def compute_mask(self, input_shape, input_mask=None):
        # Swallow the mask: downstream layers receive no mask.
        return None


def mask_aware_mean(x):
    """Average the rows of ``x`` along axis 1, ignoring all-zero rows.

    A row (a slice along axis 2) counts as "present" when the sum of its
    absolute values is non-zero; all-zero rows are treated as masked. The sum
    over axis 1 is divided by the number of present rows.

    If a sample has no present rows the division is 0 / 0; the
    ``tf.check_numerics`` op wrapped around the result exists to flag that
    case rather than let non-finite values propagate silently.
    """
    # Rebuild the mask from the data: True where a row has any non-zero entry.
    row_magnitude = K.sum(K.abs(x), axis=2, keepdims=True)
    is_present = K.not_equal(row_magnitude, 0)

    # How many rows of each sample are present.
    present_count = K.sum(K.cast(is_present, 'float32'), axis=1, keepdims=False)

    # Masked rows are all zeros, so they add nothing to the plain sum.
    row_total = K.sum(x, axis=1, keepdims=False)

    return tf.check_numerics(
        row_total / present_count,
        'unexpected nans found in mean -- check at least one entry is present')


def mask_aware_mean_output_shape(input_shape):
    """Return the output shape of ``mask_aware_mean`` for ``input_shape``.

    The input must have exactly three dimensions; the middle one is averaged
    away, leaving ``(input_shape[0], input_shape[2])``.
    """
    dims = tuple(input_shape)
    assert len(dims) == 3
    first_dim, last_dim = dims[0], dims[2]
    return (first_dim, last_dim)


def identity_loss(y_true, y_pred):
    """Return the mean of ``y_pred * y_true``.

    The model's output already is the per-example loss, so this function only
    averages it. The training loop in this file passes all-ones targets, in
    which case the result is simply the mean of ``y_pred``.
    """
    weighted = y_pred * y_true
    return K.mean(weighted)


def build_model(num_users, num_items, num_tags, max_tags,
                item_latent_dim, tag_latent_dim):
    """Build the triplet training model and the matching prediction model.

    Each item is represented by its own embedding concatenated with the mean
    of its tag embeddings; the same item and tag embedding layers are shared
    between the positive and the negative branch. Users get an embedding of
    size ``item_latent_dim + tag_latent_dim`` so it lines up element-wise
    with the item representation.

    Args:
        num_users: size of the user embedding table.
        num_items: size of the item embedding table.
        num_tags: size of the tag embedding table.
        max_tags: length of the tag ID vector supplied per item.
        item_latent_dim: width of the item embedding.
        tag_latent_dim: width of the tag embedding.

    Returns:
        ``(model, pred_model)``. ``model`` takes
        ``[positive_item_input, positive_tags, negative_item_input,
        negative_tags, user_input]``, outputs the ``BprLoss`` value and is
        compiled with ``identity_loss`` and ``Adam``. ``pred_model`` takes
        ``[positive_item_input, positive_tags, user_input]`` and outputs the
        dot product between the item representation and the user embedding.

    NOTE(review): the tag ``Embedding`` is created without ``mask_zero`` and
    ``ZeroMaskedEntries`` is not applied here, so padded tag positions do not
    appear to be zeroed before ``mask_aware_mean`` -- confirm whether the
    padding is meant to contribute to the mean.
    """
    # --- Inputs: item IDs and tag ID vectors for both branches ---
    pos_item_ids = Input((1, ), name='positive_item_input')
    neg_item_ids = Input((1, ), name='negative_item_input')
    pos_tag_ids = Input((max_tags, ), name='positive_tags')
    neg_tag_ids = Input((max_tags, ), name='negative_tags')

    # --- Embedding tables shared by the positive and negative branch ---
    item_lookup = Embedding(
        num_items, item_latent_dim, name='item_embedding', input_length=1)
    tag_lookup = Embedding(
        num_tags, tag_latent_dim, name='tag_embedding', input_length=max_tags)

    user_ids = Input((1, ), name='user_input')

    # --- Item ID embeddings, flattened to one vector per example ---
    pos_item_vec = Flatten()(item_lookup(pos_item_ids))
    neg_item_vec = Flatten()(item_lookup(neg_item_ids))

    # --- Tag embeddings, averaged over the tag axis ---
    pos_tag_mean = Lambda(
        mask_aware_mean,
        output_shape=mask_aware_mean_output_shape,
        name='pos_mean')
    pos_tag_vec = pos_tag_mean(tag_lookup(pos_tag_ids))

    neg_tag_mean = Lambda(
        mask_aware_mean,
        output_shape=mask_aware_mean_output_shape,
        name='neg_mean')
    neg_tag_vec = neg_tag_mean(tag_lookup(neg_tag_ids))

    # --- Full item representation: [item embedding | mean tag embedding] ---
    positive_vec = concatenate([pos_item_vec, pos_tag_vec])
    negative_vec = concatenate([neg_item_vec, neg_tag_vec])

    # The user vector must be as wide as the concatenated item vector,
    # because the two are multiplied element-wise.
    user_latent_dim = item_latent_dim + tag_latent_dim
    user_lookup = Embedding(
        num_users, user_latent_dim, name='user_embedding', input_length=1)
    user_vec = Flatten()(user_lookup(user_ids))

    # --- Training model: its output *is* the loss ---
    triplet_loss = BprLoss(name='bpr_loss')(
        [positive_vec, negative_vec, user_vec])

    training_inputs = [
        pos_item_ids, pos_tag_ids, neg_item_ids, neg_tag_ids, user_ids]
    model = Model(inputs=training_inputs, outputs=triplet_loss)
    model.compile(loss=identity_loss, optimizer=Adam())

    # --- Prediction model: one half of the siamese network plus the user ---
    score = dot([positive_vec, user_vec], axes=-1, name='user_dot_item')
    pred_model = Model(
        inputs=[pos_item_ids, pos_tag_ids, user_ids],
        outputs=score)

    return model, pred_model

## Load and transform data
We're going to load the Movielens 100k dataset and create triplets of (user, known positive item, randomly sampled negative item).

The success metric is AUC: in this case, the probability that a randomly chosen known positive item from the test set is ranked higher for a given user than a randomly chosen negative item.

In [None]:
# Embedding widths; the user embedding built by build_model is their sum.
item_latent_dim = 90
tag_latent_dim = 10

# Read data
train, test = data.get_movielens_data()
num_users, num_items = train.shape

item_features = data.get_movielens_item_metadata(use_item_ids=False)

# One row of tag IDs per item: the row width is the padded tag-vector length,
# and the largest ID (plus one) gives the size of the tag embedding table.
max_tags = item_features.shape[1]
num_tags = item_features.max() + 1

# Prepare the test triplets
test_uid, test_pid, test_nid = data.get_triplets(test)

model, pred_model = build_model(
    num_users, num_items, num_tags, max_tags,
    item_latent_dim, tag_latent_dim)

# Print the model structure. summary() prints the table itself and returns
# None, so it is called directly rather than wrapped in print().
print('Model for training:')
model.summary()
print()
print('Model for inference:')
pred_model.summary()
print()

# Sanity check, should be around 0.5
print('AUC before training %s' % metrics.full_auc(pred_model, test, item_features))

## Write metadata for TensorBoard

### TODO move this stuff to data.py (or maybe metadata.py)

In [None]:
import os
import shutil

log_dir = '/tmp/tfboard/triplet_keras/'

# Start from an empty log directory. Errors while removing the old one are
# printed instead of raised.
shutil.rmtree(log_dir, onerror=lambda f, p, e: print(e))

# "except ... as e" is valid on both Python 2.6+ and Python 3; the older
# "except Exception, e" form is a syntax error on Python 3.
try:
    os.makedirs(log_dir)
except OSError as e:
    print(str(e))

# Item labels for the embedding projector: one "<id> - <title>" line per item.
items_metadata = os.path.join(log_dir, 'items.txt')
with open(items_metadata, 'w') as f:
    # Dummy label for index 0, which has no entry in the raw metadata.
    print('0 - None', file=f)
    for line in data._get_movie_raw_metadata():
        fields = line.split('|')
        # Skip lines without a '|' separator (e.g. blank lines).
        if len(fields) > 1:
            print('%s - %s' % (fields[0], fields[1]), file=f)

# Tag (genre) labels, in the same "<id> - <name>" format.
tags_metadata = os.path.join(log_dir, 'tags.txt')
with open(tags_metadata, 'w') as f:
    # No need for dummy '0' as above -- this is already provided
    for line in data._get_genre_raw_metadata():
        fields = line.split('|')
        if len(fields) > 1:
            # Note fields are opposite way round from movies
            print('%s - %s' % (fields[1], fields[0]), file=f)

## Run the model
Run for 30 epochs, checking the AUC (and logging embeddings to TensorBoard) every 10 epochs.

In [None]:
# TensorBoard callback that logs the three embedding tables, labelling the
# item and tag embeddings with the metadata files written earlier.
tensorboard = TensorBoard(
    log_dir='/tmp/tfboard/triplet_keras/',
    embeddings_freq=1,
    embeddings_layer_names=['item_embedding', 'tag_embedding', 'user_embedding'],
    embeddings_metadata={
        'item_embedding': items_metadata,
        'tag_embedding': tags_metadata,
    })

num_epochs = 30
checkpoint_every = 10

for epoch in range(num_epochs):

    print('Epoch %s' % epoch)

    # Draw a fresh set of (user, positive, negative) triplets every epoch,
    # and look up the tag vectors of the sampled items.
    uid, pid, nid = data.get_triplets(train)
    ptags = item_features[pid]
    ntags = item_features[nid]

    # Keys match the names of the model's Input layers.
    X = dict([
        ('user_input', uid),
        ('positive_item_input', pid),
        ('negative_item_input', nid),
        ('positive_tags', ptags),
        ('negative_tags', ntags),
    ])

    # Every `checkpoint_every`-th epoch: log to TensorBoard and report AUC.
    checkpoint = ((epoch + 1) % checkpoint_every == 0)
    callbacks = [tensorboard] if checkpoint else []

    # The model outputs the loss itself, so the targets are all ones.
    targets = np.ones(len(uid))
    model.fit(
        X,
        targets,
        batch_size=64,
        epochs=1,
        verbose=1,
        shuffle=True,
        callbacks=callbacks)

    if checkpoint:
        print('AUC %s' % metrics.full_auc(pred_model, test, item_features))

The AUC is in the low-90s. At some point we start overfitting, so it would be a good idea to stop early or add some regularization.