# Implementation of an attention-based model for item recommendation.

Wang, Shoujin, Liang Hu, Longbing Cao, Xiaoshui Huang, Defu Lian, and Wei Liu.
"Attention-based transactional context embedding for next-item recommendation."
In Proceedings of the AAAI conference on artificial intelligence, vol. 32, no. 1. 2018.

In [None]:
import os
import sys
import json
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
from pathlib import Path
import matplotlib.pyplot as plt

sys.path.append("./../../")
from choice_learn.basket_models import TripDataset
from choice_learn.basket_models.basic_attention_model import AttentionBasedContextEmbedding
from choice_learn.basket_models.synthetic_dataset import SyntheticDataGenerator

### Parameters

In [None]:
# Optimisation and model hyper-parameters passed to models 1 and 2.
lr, epochs = 0.05, 30
embedding_dim, n_negative_samples = 4, 3

# Number of baskets requested from the synthetic generator.
n_baskets = 1000

# Single-row availability mask in which all 8 items are available.
full_assortment_matrix = np.ones((1, 8), dtype=int)
n_items = full_assortment_matrix.shape[1]


### Synthetic Data Generator

In [None]:
# Synthetic basket generator over 8 items split into 4 nests:
# items 0-2 -> nest 0, items 3-5 -> nest 1, item 6 -> nest 2, item 7 -> nest 3.
# nests_interactions[a][b] labels the relation between nests a and b: nests 0 and 1
# are marked "compl" with each other, every other pair of distinct nests is "neutral".
data_gen = SyntheticDataGenerator(
    proba_complementary_items=0.7,
    proba_neutral_items=0.3,
    noise_proba=0.15,
    items_nest={
        0: [0, 1, 2],
        1: [3, 4, 5],
        2: [6],
        3: [7],
    },
    nests_interactions=[
        ["", "compl", "neutral", "neutral"],
        ["compl", "", "neutral", "neutral"],
        ["neutral", "neutral", "", "neutral"],
        ["neutral", "neutral", "neutral", ""],
    ],
)

### Two functions to visualise distributions

* `visualise_tripdataset_trips`: computes the conditional item distribution P(i|j) observed in a TripDataset, to be displayed as a heatmap
* `get_model_representation`: computes the conditional item distribution P(i|j) obtained by calling a model's predict(), to be displayed as a heatmap

In [None]:
def visualise_tripdataset_trips(dataset, n_items):
    """
    Build the row-normalised item co-occurrence matrix of a trip dataset.

    Every ordered pair of distinct items found in the same basket adds one
    count to entry [i, j]. Each row holding at least one count is then divided
    by its total so that it sums to 1. The matrix is returned, not plotted.

    Parameters
    ----------
    dataset : TripDataset
        Dataset whose ``trips`` each expose a ``purchases`` sequence of item indices.
    n_items : int
        Number of unique items (side length of the square matrix).

    Returns
    -------
    np.ndarray
        Array of shape (n_items, n_items); rows without any co-occurrence stay at zero.
    """
    cooccurrences = np.zeros((n_items, n_items))
    for trip in dataset.trips:
        purchased = trip.purchases
        for anchor in purchased:
            for other in purchased:
                # An item is never counted as co-occurring with itself.
                if anchor == other:
                    continue
                cooccurrences[anchor, other] += 1

    totals = cooccurrences.sum(axis=1, keepdims=True)
    # Only rows with a non-zero total are rescaled, which avoids dividing by zero.
    has_counts = totals[:, 0] != 0
    cooccurrences[has_counts] = cooccurrences[has_counts] / totals[has_counts]

    return cooccurrences



def get_model_representation(model, n_items, test_dataset=None, assortment_matrix=None):
    """
    Build a row-normalised matrix summarising the model's next-item predictions.

    Two modes are supported:

    * ``test_dataset is None``: the model is queried once with every
      single-item context ``[i]``. Row ``i`` of the result is the prediction
      returned for context ``[i]``; rows of items flagged as unavailable in
      ``assortment_matrix[0]`` are zeroed.
    * ``test_dataset`` given: the dataset is iterated with
      ``iter_batch(1, data_method="aleacarta")``. For each batch the context is
      ``batch[1][0]`` stripped of ``-1`` entries, and the item with the highest
      predicted score is counted once against every item of that context
      (entry ``[predicted_item, context_item] += 1``).

    NOTE(review): rows are indexed by the context item in the first mode and by
    the predicted item in the second one -- confirm this is intended before
    comparing the two matrices side by side.

    Parameters
    ----------
    model : AttentionBasedContextEmbedding
        The trained model; only its ``predict`` method is used.
    n_items : int
        Number of unique items.
    test_dataset : TripDataset, optional
        Dataset for evaluation. If None, uses single-item contexts.
    assortment_matrix : array-like, optional
        Binary matrix whose first row indicates available items. Defaults to all
        items available. Only used when ``test_dataset`` is None; otherwise the
        availability of the last batch (``batch[-1][0]``) is passed to ``predict``.

    Returns
    -------
    np.ndarray
        Matrix with ``n_items`` columns in which every row with a non-zero sum
        has been divided by that sum.

    Raises
    ------
    ValueError
        If ``test_dataset`` is given but yields no batch.
    """
    if assortment_matrix is None:
        assortment_matrix = np.ones((1, n_items), dtype=int)

    if test_dataset is None:
        available_items = assortment_matrix[0]
        contexts = tf.constant([[i] for i in range(n_items)], dtype=tf.int32)
    else:
        raw_contexts = []
        available_items = None
        for batch in test_dataset.iter_batch(1, data_method="aleacarta"):
            raw_contexts.append(batch[1][0])
            # Tracked explicitly instead of relying on the loop variable leaking
            # out of the loop; the last batch's availability is the one used.
            available_items = batch[-1][0]
        if not raw_contexts:
            raise ValueError("test_dataset did not yield any batch.")
        # -1 entries are removed from each context before building the ragged tensor.
        contexts = tf.ragged.constant(
            [row[row != -1] for row in raw_contexts], dtype=tf.int32
        )

    context_prediction = model.predict(contexts, available_items=available_items)

    if test_dataset is None:
        distribution_matrix = np.stack(context_prediction)
        for i, is_available in enumerate(available_items):
            if is_available == 0:
                distribution_matrix[i] *= 0
    else:
        predicted_items = [
            np.argmax(context_prediction[i]) for i in range(context_prediction.shape[0])
        ]
        distribution_matrix = np.zeros((n_items, n_items))
        for i in range(contexts.shape[0]):
            for j in contexts[i]:
                distribution_matrix[predicted_items[i], j] += 1

    row_sums = distribution_matrix.sum(axis=1, keepdims=True)
    # Rows summing to zero are left untouched to avoid a division by zero.
    non_empty = row_sums[:, 0] != 0
    distribution_matrix[non_empty] = distribution_matrix[non_empty] / row_sums[non_empty]

    return distribution_matrix

### Generate full assortments synthetic dataset : train & test

In [None]:
# Two separate draws from the generator, both on the full assortment.
trip_dataset_train = data_gen.generate_trip_dataset(n_baskets, full_assortment_matrix)
trip_dataset_test = data_gen.generate_trip_dataset(n_baskets, full_assortment_matrix)

# Row-normalised co-occurrence matrix of the training baskets.
distribution_matrix = visualise_tripdataset_trips(trip_dataset_train, n_items)

fig, ax = plt.subplots(figsize=(4, 3))
heatmap = ax.imshow(
    distribution_matrix, vmin=0, vmax=1, interpolation='nearest', cmap="coolwarm"
)
fig.colorbar(heatmap, ax=ax, label='P(i|j)')
ax.set_title('Items distribution in the train dataset (A_full)')
ax.set_xlabel('j')
ax.set_ylabel('i')
fig.tight_layout()
plt.show()

### Instantiate and train the model 1 on A_full
-> The model uses the true NCE sampling distribution, which is aware of item frequencies.

In [None]:
# Model 1: trained on the full-assortment dataset with use_true_nce_distribution=True.
model1 = AttentionBasedContextEmbedding(
    epochs=epochs,
    lr=lr,
    embedding_dim=embedding_dim,
    n_negative_samples=n_negative_samples,
)

# The number of items is the number of columns of the assortment matrix.
model1.instantiate(n_items=full_assortment_matrix.shape[1])
history1 = model1.fit(trip_dataset_train, use_true_nce_distribution=True)


### Instantiate and train the model 2 on A_full
-> The model uses a uniform sampling distribution for NCE (1/(n_items-1))

In [None]:
# Model 2: same hyper-parameters and data as model 1, but fitted with
# use_true_nce_distribution=False.
model2 = AttentionBasedContextEmbedding(
    epochs=epochs,
    lr=lr,
    embedding_dim=embedding_dim,
    n_negative_samples=n_negative_samples,
)

# The number of items is the number of columns of the assortment matrix.
model2.instantiate(n_items=full_assortment_matrix.shape[1])
history2 = model2.fit(trip_dataset_train, use_true_nce_distribution=False)


Model 1 is using the true NCE distribution

The following plot of the evaluation on the test dataset shows P(i|j) in the predictions for the test dataset

In [None]:
# Matrices to compare: model 1 on single-item contexts, model 1 on the test
# baskets, and the co-occurrence matrix of the training baskets.
M1 = get_model_representation(model1, n_items)
M2 = get_model_representation(model1, n_items, trip_dataset_test)
M3 = visualise_tripdataset_trips(trip_dataset_train, n_items)

fig, axes = plt.subplots(1, 3, figsize=(10, 4), constrained_layout=True)

panels = (
    (M1, "Model1 evaluated on [[0], [1], ...]"),
    (M2, "Model1 evaluated on test_dataset"),
    (M3, "Training distribution"),
)
images = []
for ax, (matrix, panel_title) in zip(axes, panels):
    # All panels share the same colour scale so that they are comparable.
    images.append(
        ax.imshow(matrix, vmin=0.0, vmax=1, interpolation='nearest', cmap="coolwarm")
    )
    ax.set_title(panel_title)
im1, im2, im3 = images

cbar = fig.colorbar(im1, ax=axes, orientation='vertical', shrink=0.8)
cbar.set_label("Probability")

plt.show()

# Both losses are computed before anything is printed.
loss_train_dataset = model1.evaluate(trip_dataset_train)
loss_test_dataset = model1.evaluate(trip_dataset_test)
print(f"Loss of model1 on the train dataset {loss_train_dataset}")
print(f"Loss of model1 on the test dataset {loss_test_dataset}")
print("Used loss for evaluation: NLL")

Model 2 is using a uniform sampling distribution for NCE

The following plot of the evaluation on the test dataset shows P(i|j) in the predictions for the test dataset

In [None]:
# Matrices to compare: model 2 on single-item contexts, model 2 on the test
# baskets, and the co-occurrence matrix of the training baskets.
M1 = get_model_representation(model2, n_items)
M2 = get_model_representation(model2, n_items, trip_dataset_test)
M3 = visualise_tripdataset_trips(trip_dataset_train, n_items)

fig, axes = plt.subplots(1, 3, figsize=(10, 4), constrained_layout=True)

panels = (
    (M1, "Model2 evaluated on [[0], [1], ...]"),
    (M2, "Model2 evaluated on test_dataset"),
    (M3, "Training distribution"),
)
images = []
for ax, (matrix, panel_title) in zip(axes, panels):
    # All panels share the same colour scale so that they are comparable.
    images.append(
        ax.imshow(matrix, vmin=0.0, vmax=1, interpolation='nearest', cmap="coolwarm")
    )
    ax.set_title(panel_title)
im1, im2, im3 = images

cbar = fig.colorbar(im1, ax=axes, orientation='vertical', shrink=0.8)
cbar.set_label("Probability")

plt.show()

# Both losses are computed before anything is printed.
loss_train_dataset = model2.evaluate(trip_dataset_train)
loss_test_dataset = model2.evaluate(trip_dataset_test)
print(f"Loss of model2 on the train dataset {loss_train_dataset}")
print(f"Loss of model2 on the test dataset {loss_test_dataset}")
print("Used loss for evaluation: NLL")

### Remarks
* The uniform sampling distribution yields a smaller loss.
* The plot of the P(i|j) distribution, calculated after evaluation on the test dataset, differs from the base model

  Maybe we should focus on P(i|j1, j2) instead.

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

def plot_Pi_given_not_i(model, n_items, available_items=None, max_context_size=None):
    """
    Compute the model's average P(i | context not containing i) for each item i.

    For every item ``i``, all combinations of the other items with sizes from 1
    to ``max_context_size`` (or ``n_items - 1`` when it is None) are used as
    contexts. The model is called once per context and the score it gives to
    ``i`` is averaged. Despite its name, this function draws nothing: it returns
    the averages and leaves plotting to the caller.

    Parameters
    ----------
    model : object
        The trained model with a .predict() method.
    n_items : int
        Number of unique items.
    available_items : array-like or None, optional
        Binary array indicating available items. If None, all items are available.
    max_context_size : int or None, optional
        Maximum context size to consider for tractability. If None, uses all sizes.

    Returns
    -------
    np.ndarray
        Array of shape (n_items,) with the average score of each item; items for
        which no context was evaluated get 0.
    """
    # Neither the availability vector nor the context sizes depend on the item
    # or the context, so both are built once rather than inside the loops.
    if available_items is None:
        avail = np.ones(n_items, dtype=np.float32)
    else:
        avail = available_items
    largest_context = n_items - 1 if max_context_size is None else max_context_size
    context_sizes = range(1, largest_context + 1)

    Pi_given_not_i = np.zeros(n_items)
    counts = np.zeros(n_items)

    for i in range(n_items):
        context_candidates = [j for j in range(n_items) if j != i]
        for k in context_sizes:
            for context in itertools.combinations(context_candidates, k):
                context_tensor = tf.ragged.constant([list(context)], dtype=tf.int32)
                probas = model.predict(context_tensor, available_items=avail)
                Pi_given_not_i[i] += probas[0, i]
                counts[i] += 1

    # np.maximum guards against a division by zero when no context was evaluated.
    return Pi_given_not_i / np.maximum(counts, 1)

def empirical_Pi_given_not_i(tripdataset, n_items, max_context_size=None):
    """
    Estimate P(i | context not containing i) from the baskets of a TripDataset.

    Each purchased item is taken in turn as the target, the rest of its basket
    being the context. For every item absent from that context the denominator
    is incremented, and the numerator too when that item is the target.

    Parameters
    ----------
    tripdataset : TripDataset
        Dataset whose ``trips`` each expose a ``purchases`` sequence of item indices.
    n_items : int
        Number of unique items; targets with an index >= n_items are skipped.
    max_context_size : int or None
        If set, (target, context) pairs whose context is longer are skipped.

    Returns
    -------
    np.ndarray
        Array of shape (n_items,) holding numerator / max(denominator, 1) per item.
    """
    hits = np.zeros(n_items)
    opportunities = np.zeros(n_items)

    for trip in tripdataset.trips:
        purchased = list(trip.purchases)
        for position, target in enumerate(purchased):
            if target >= n_items:
                continue
            others = purchased[:position] + purchased[position + 1:]
            if max_context_size is not None and len(others) > max_context_size:
                continue
            for item in range(n_items):
                # Only items missing from the context can be predicted from it.
                if item in others:
                    continue
                opportunities[item] += 1
                if item == target:
                    hits[item] += 1

    # Items that never had an opportunity keep a ratio of 0 instead of 0/0.
    return hits / np.maximum(opportunities, 1)

# Per-item averages: empirical values from the training baskets, then models 1 and 2.
pi_not_i_train_dataset = empirical_Pi_given_not_i(trip_dataset_train, n_items)
pi_not_i_model1 = plot_Pi_given_not_i(model1, n_items)
pi_not_i_model2 = plot_Pi_given_not_i(model2, n_items)

fig, axes = plt.subplots(1, 3, figsize=(10, 4))

item_positions = np.arange(n_items)
bar_panels = (
    ("Model 1", pi_not_i_model1),
    ("Model 2", pi_not_i_model2),
    ("Train dataset", pi_not_i_train_dataset),
)
for ax, (panel_title, averages) in zip(axes, bar_panels):
    ax.bar(item_positions, averages)
    ax.set_title(panel_title)

# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel(r"Average $P(i\,|\,\mathrm{context\ not\ containing}\ i)$")

fig.supxlabel("Item index")
plt.tight_layout()
plt.show()

### Train on A1, A2, A3 and test on A4 (cf. the paper by J. Désir, V. Auriau and E. Malherbes)


In [None]:
# Assortments definition: one availability row per assortment (1 = item available).
assortment1 = np.array([[1, 1, 0, 1, 1, 1, 1, 1]])
assortment2 = np.array([[1, 0, 1, 0, 1, 1, 1, 1]])
assortment3 = np.array([[0, 1, 1, 1, 0, 1, 1, 1]])
assortment4 = np.array([[1, 1, 0, 0, 1, 1, 1, 1]])
assortment_full = np.array([[1, 1, 1, 1, 1, 1, 1, 1]])

n_baskets = 500

# Training data: n_baskets baskets per training assortment, generated in the
# order A1, A2, A3 and concatenated in that same order.
trip_dataset_1 = data_gen.generate_trip_dataset(n_baskets, assortment1)
trip_dataset_2 = data_gen.generate_trip_dataset(n_baskets, assortment2)
trip_dataset_3 = data_gen.generate_trip_dataset(n_baskets, assortment3)
paper_trip_dataset_train = trip_dataset_1.concatenate(trip_dataset_2).concatenate(
    trip_dataset_3
)

# Test data: ten times more baskets, on A4 and on the full assortment.
paper_trip_dataset_test_a4 = data_gen.generate_trip_dataset(10 * n_baskets, assortment4)
paper_trip_dataset_test_full = data_gen.generate_trip_dataset(
    10 * n_baskets, assortment_full
)

M1 = visualise_tripdataset_trips(paper_trip_dataset_train, n_items)
M2 = visualise_tripdataset_trips(paper_trip_dataset_test_a4, n_items)
M3 = visualise_tripdataset_trips(paper_trip_dataset_test_full, n_items)

fig, axes = plt.subplots(1, 3, figsize=(10, 4), constrained_layout=True)

panels = (
    (M1, "Train dataset (A1,A2,A3)"),
    (M2, "Test dataset (A4)"),
    (M3, "Test dataset (A_full)"),
)
images = []
for ax, (matrix, panel_title) in zip(axes, panels):
    # All panels share the same colour scale so that they are comparable.
    images.append(
        ax.imshow(matrix, vmin=0.0, vmax=1, interpolation='nearest', cmap="coolwarm")
    )
    ax.set_title(panel_title)
im1, im2, im3 = images

cbar = fig.colorbar(im1, ax=axes, orientation='vertical', shrink=0.8)
cbar.set_label("Probability")

plt.show()

### Instantiate and train model 3 on A1,A2,A3 with uniform sampling for NCE

In [None]:
# Model 3: its own hyper-parameters (50 epochs, 3-dimensional embeddings), fitted
# on the A1/A2/A3 training set with use_true_nce_distribution=False.
model3 = AttentionBasedContextEmbedding(
    epochs=50,
    lr=0.05,
    embedding_dim=3,
    n_negative_samples=3,
)

model3.instantiate(n_items=n_items)
history3 = model3.fit(paper_trip_dataset_train, use_true_nce_distribution=False)

In [None]:
# M1: model 3 queried with single-item contexts under the A4 availability row.
# M2: same query with every item available (default assortment).
# M3: co-occurrence matrix of the A1/A2/A3 training baskets.
M1 = get_model_representation(model3, n_items, assortment_matrix=[[1, 1, 0, 0, 1, 1, 1, 1]])
M2 = get_model_representation(model3, n_items)
M3 = visualise_tripdataset_trips(paper_trip_dataset_train, n_items)


fig, axes = plt.subplots(1, 3, figsize=(10, 4), constrained_layout=True)

# Panel order differs from the variable numbering: the training data comes first.
im1 = axes[0].imshow(M3, vmin=0.0, vmax=1, interpolation='nearest', cmap="coolwarm")
axes[0].set_title("Train dataset (A1,A2,A3)")

im2 = axes[1].imshow(M1, vmin=0.0, vmax=1, interpolation='nearest', cmap="coolwarm")
axes[1].set_title("Model3 evaluated on A4")

im3 = axes[2].imshow(M2, vmin=0.0, vmax=1, interpolation='nearest', cmap="coolwarm")
axes[2].set_title("Model3 evaluated on Afull")


# All panels share vmin/vmax, so one colorbar serves the three of them.
cbar = fig.colorbar(im1, ax=axes, orientation='vertical', shrink=0.8)
cbar.set_label("Probability")

plt.show()

loss_paper_train_dataset = model3.evaluate(paper_trip_dataset_train)
loss_paper_test_dataset_a4 = model3.evaluate(paper_trip_dataset_test_a4)
loss_paper_test_dataset_afull = model3.evaluate(paper_trip_dataset_test_full)
print(f"Loss of model3 on the train dataset {loss_paper_train_dataset}")
print(f"Loss of model3 on the test dataset A4 {loss_paper_test_dataset_a4}")
# This value comes from model3.evaluate, so the message names model3 (it used to say model2).
print(f"Loss of model3 on the test dataset Afull {loss_paper_test_dataset_afull}")
print("Used loss for evaluation : NLL")

In [None]:
# Empirical reference computed on the dataset model 3 was actually fitted on
# (paper_trip_dataset_train, i.e. A1/A2/A3) rather than on trip_dataset_train,
# which is the full-assortment training set of models 1 and 2.
pi_not_i_train_dataset = empirical_Pi_given_not_i(paper_trip_dataset_train, n_items)
# Model 3 averages under the A4 availability row, then with every item available.
pi_not_i_model3_a4 = plot_Pi_given_not_i(model3, n_items, available_items=[1, 1, 0, 0, 1, 1, 1, 1])
pi_not_i_model3_full = plot_Pi_given_not_i(model3, n_items)

fig, axes = plt.subplots(1, 3, figsize=(10, 4))
axes[0].bar(np.arange(n_items), pi_not_i_model3_full)
# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel(r"Average $P(i\,|\,\mathrm{context\ not\ containing}\ i)$")
axes[0].set_title("Model 3 on Afull")

axes[1].bar(np.arange(n_items), pi_not_i_model3_a4)
axes[1].set_title("Model 3 on A4")

axes[2].bar(np.arange(n_items), pi_not_i_train_dataset)
axes[2].set_title("Train dataset")

fig.supxlabel("Item index")
plt.tight_layout()
plt.show()

### Test save and load methods

In [None]:
# Create evaluation dataset
eval_dataset = data_gen.generate_trip_dataset(100, full_assortment_matrix)

# Evaluate model
loss_eval_dataset_1 = model1.evaluate(eval_dataset)
print(f"Loss of model1 on the evaluation dataset {loss_eval_dataset_1}")

# Files deleted at the end of this cell: the JSON passed to save_model and the
# .npy files expected next to it (presumably written by save_model -- confirm).
saved_files = [
    "attn_model.json",
    "attn_model_empty_context_embedding.npy",
    "attn_model_wa.npy",
    "attn_model_Wi.npy",
    "attn_model_Wo.npy",
]

try:
    # Save model
    model1.save_model("attn_model.json")

    # Create a model 3 without instantiating
    # NOTE: this rebinds `model3`; the model trained on A1/A2/A3 is no longer
    # reachable under that name once this cell has run.
    model3 = AttentionBasedContextEmbedding(
        epochs=epochs,
        lr=lr,
        embedding_dim=embedding_dim,
        n_negative_samples=n_negative_samples,
    )

    # Load first model and compare results on evaluation dataset
    model3.load_model("attn_model.json")
    loss_eval_dataset_3 = model3.evaluate(eval_dataset)
    print(f"Loss of model3 on the evaluation dataset {loss_eval_dataset_3}")
finally:
    # Clean up even when saving, loading or evaluating raised, so that no model
    # file is left behind in the working directory.
    for saved_file in saved_files:
        if os.path.exists(saved_file):
            os.remove(saved_file)