## Setup

In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import math
from zipfile import ZipFile
from urllib.request import urlretrieve

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.layers import StringLookup

In [2]:
#import train test split
from sklearn.model_selection import train_test_split
import random

In [3]:
!pip install -q keras-core
import keras_core as keras_core

Using TensorFlow backend


In [4]:
!pip install --upgrade keras
from keras import ops



## Prepare the data

### Download and prepare the DataFrames

In [5]:
url = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dressipi_recsys2022_datasets.zip'
!wget $url
!unzip dressipi_recsys2022_datasets.zip

--2024-05-26 15:58:03--  https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dressipi_recsys2022_datasets.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79384785 (76M) [application/zip]
Saving to: ‘dressipi_recsys2022_datasets.zip’


2024-05-26 15:58:04 (317 MB/s) - ‘dressipi_recsys2022_datasets.zip’ saved [79384785/79384785]

Archive:  dressipi_recsys2022_datasets.zip
   creating: dressipi_recsys2022_dataset/
  inflating: dressipi_recsys2022_dataset/README.txt  
  inflating: dressipi_recsys2022_dataset/candidate_items.csv  
  inflating: dressipi_recsys2022_dataset/item_features.csv  
  inflating: dressipi_recsys2022_dataset/test_final_purchases.csv  
  inflating: dressipi_recsys2022_dataset/test_final_sessions.csv  

In [6]:
item_features = pd.read_csv('dressipi_recsys2022_dataset/item_features.csv')

In [7]:
candidate_items = pd.read_csv('dressipi_recsys2022_dataset/candidate_items.csv')

In [8]:
#url = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dataset_filtered/train_sessions_duration003.csv'
#train_session fraction 10% ~ 500.000
url01 = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dataset_filtered/train_session01_seq.csv'

In [9]:
dataset01 = pd.read_csv(url01, index_col = 0, parse_dates=["date"])
dataset01 = dataset01.dropna()
dataset01 = dataset01.reset_index()
#fraction
dataset = dataset01.sample(frac=0.6)

In [10]:
dataset.dtypes

session_id           float64
item_id              float64
date          datetime64[ns]
timestamp            float64
month                float64
weekYear             float64
season               float64
duration             float64
dtype: object

In [11]:
dataset.shape

(310166, 8)

In [12]:
#Group by session_id after sort_values by timestamp
sessions_groups = dataset.sort_values(by=["timestamp"]).groupby("session_id")
sessions_train = pd.DataFrame(data = {
        "session_id": list(sessions_groups.groups.keys()),
        "month" : list(sessions_groups.month.unique().explode()),
        "weekYear" : list(sessions_groups.weekYear.unique().explode()),
        "season" : list(sessions_groups.season.unique().explode()),
        "item_ids": list(sessions_groups.item_id.apply(list)),
        "durations": list(sessions_groups.duration.apply(list)),
        "timestamps": list(sessions_groups.timestamp.apply(list)),
    })

In [13]:
#drop session has item_ids <2
sessions_train = sessions_train[sessions_train.item_ids.apply(len) >= 2]

In [14]:
sequence_length = 4
step = 2
def create_sequences(values, sequence, step):
  start_idx = 0
  sec_list = []
  #Handle case < sequence:
  if len(values) < sequence:
    values = values * 2
  #Handle case >= sequence:
  while True:
    end_idx = start_idx + sequence
    sec = values[start_idx:end_idx]
    start_idx += step
    if end_idx >= len(values):
      sec = values[-sequence:]
      sec_list.append(sec)
      break
    sec_list.append(sec)
  return sec_list

In [15]:
sessions_train["item_ids"] = sessions_train["item_ids"].apply(
    lambda values: create_sequences(
        values,sequence_length, step))

sessions_train["durations"] = sessions_train["durations"].apply(
    lambda values: create_sequences(
        values,sequence_length, step))

sessions_train = sessions_train.drop(columns = ["timestamps"])

sessions_train = sessions_train.explode(column=["item_ids", "durations"]).reset_index(drop=True)

In [16]:
#remove '[' ']' in a sequence
sessions_train.item_ids = sessions_train.item_ids.apply(
    lambda x: ",".join([str(v) for v in x]))

sessions_train.durations = sessions_train.durations.apply(
    lambda x: ",".join([str(v) for v in x]))


In [17]:
#drop weekYear and season
#convert type column month to string
sessions_train["month"] = sessions_train["month"].astype(str)
sessions_train = sessions_train.drop(columns = ["weekYear", "season"])
sessions_train.head(5)

Unnamed: 0,session_id,month,item_ids,durations
0,19.0,11.0,"20033.0,6704.0,20033.0,6704.0","1.0,1.0,1.0,1.0"
1,108.0,6.0,"13885.0,26130.0,13885.0,26130.0","1.0,67.0,1.0,67.0"
2,154.0,4.0,"21152.0,27613.0,21152.0,27613.0","1.0,1.0,1.0,1.0"
3,428.0,11.0,"1720.0,14376.0,1720.0,14376.0","1.0,1.0,1.0,1.0"
4,453.0,8.0,"19974.0,11386.0,19974.0,11386.0","1.0,1.0,1.0,1.0"


In [18]:
sessions_train = sessions_train.rename(columns = {"item_ids": "sequence_item_ids",
                                  "durations": "sequence_durations"})


In [19]:
sessions_train.shape

(59372, 4)

#Find similar item by item2vec

In [45]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [42]:
#convert to string and store all the sequence_item_ids to a list
all_sequence_items = []
for i in range(len(sessions_train.index)):
  list_seq = list(sessions_train.sequence_item_ids[i].split(","))
  all_sequence_items.append(list_seq)

In [43]:
all_sequence_items[:4]

[['20033.0', '6704.0', '20033.0', '6704.0'],
 ['13885.0', '26130.0', '13885.0', '26130.0'],
 ['21152.0', '27613.0', '21152.0', '27613.0'],
 ['1720.0', '14376.0', '1720.0', '14376.0']]

In [44]:
len(all_sequence_items)

59372

In [46]:
#Create a tokenizer and fit it on the sequence list
tok_obj = Tokenizer()
tok_obj.fit_on_texts(all_sequence_items)

In [48]:
#Create mapping dictionaries for items to ids and ids to items
item_to_id = tok_obj.word_index # Example output {'8060.0': 1, '26853.0': 2, '2447.0': 3, '17089.0': 4}
id_to_item = {v:k for k, v in item_to_id.items()}

In [49]:
item_ids_list = [[item_to_id[w] for w in sequence] for sequence in all_sequence_items]

In [50]:
item_ids_list[:4]

[[728, 967, 728, 967],
 [7424, 1230, 7424, 1230],
 [74, 24, 74, 24],
 [4838, 684, 4838, 684]]

In [51]:
#check vocab_size
vocab_size = len(item_to_id) + 2
vocab_size

15904

##Generate Skip grams

In [52]:
#GENERATE SKIP GRAMS
# generate skip-grams
skip_grams = [tf.keras.preprocessing.sequence.skipgrams(wid, vocabulary_size=vocab_size) for wid in item_ids_list]
# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]

In [53]:
len(skip_grams)

59372

In [54]:
for i in range(4):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id_to_item[pairs[i][0]], pairs[i][0],
          id_to_item[pairs[i][1]], pairs[i][1],
          labels[i]))

(6704.0 (967), 20033.0 (728)) -> 1
(6704.0 (967), 6704.0 (967)) -> 1
(20033.0 (728), 6608.0 (7397)) -> 0
(20033.0 (728), 10454.0 (2962)) -> 0


##Model Architectures

In [55]:
embedding_size = 100
from tensorflow.keras.layers import Concatenate, Dense, Embedding, Reshape
from tensorflow.keras.models import Model

# Define the input layers for the target and context words
target_word_input = tf.keras.Input(shape=(1,))
context_word_input = tf.keras.Input(shape=(1,))

# Build skip-gram architecture
target_word_model = Embedding(vocab_size, embedding_size,
                              embeddings_initializer="glorot_uniform")(target_word_input)
target_word_model = Reshape((embedding_size,))(target_word_model)

context_word_model = Embedding(vocab_size, embedding_size,
                               embeddings_initializer="glorot_uniform")(context_word_input)
context_word_model = Reshape((embedding_size,))(context_word_model)

# Concatenate the output of the target and context models
merged = Concatenate(axis=1)([target_word_model, context_word_model])

# Add a dense layer and sigmoid activation
output = Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid")(merged)

# Define the model
model_item2vec = Model(inputs=[target_word_input, context_word_input], outputs=output)

# Compile the model
model_item2vec.compile(loss="mean_squared_error", optimizer="adam")

# View model summary
print(model_item2vec.summary())

None


In [57]:
skip_grams_train_test = skip_grams[:500]
frac_train_test = round(len(skip_grams_train_test)*0.8)

In [58]:
frac_train_test

400

In [59]:
train_skip_grams = skip_grams[:frac_train_test]
test_skip_grams = skip_grams[frac_train_test:]

In [60]:
#TRAIN THE MODEL
# train the model on the skip-grams
for epoch in range(1, 4):
    total_loss = 0
    for i, elem in enumerate(train_skip_grams):
        #print('start', i)
        skip_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        #print(skip_first_elem, 'skip_first_elem', i)
        skip_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        #print(skip_second_elem, 'skip_second_elem', i)
        labels = np.array(elem[1], dtype='int32')
        #print(labels, 'labels', i)
        X = [skip_first_elem, skip_second_elem]
        #print(X, 'X', i)
        Y = labels
        #print(Y, 'Y', i)
        if i % 200 == 0:
            print('Processed {} skip-gram pairs'.format(i))
        total_loss += model_item2vec.train_on_batch(X,Y)
        #print(total_loss, 'total_loss', i)

    print('Epoch: {} Loss: {}'.format(epoch, total_loss))

Processed 0 skip-gram pairs




Processed 200 skip-gram pairs
Epoch: 1 Loss: 99.6143197864294
Processed 0 skip-gram pairs
Processed 200 skip-gram pairs
Epoch: 2 Loss: 85.4210883975029
Processed 0 skip-gram pairs
Processed 200 skip-gram pairs
Epoch: 3 Loss: 59.39291423559189


In [61]:
#save model keras
model_item2vec.save('model_item2vec.keras')

In [62]:
#TEST and EVALUATE THE MODEL
# train the model on the skip-grams
for epoch in range(1, 2):
    total_score = 0
    for i, elem in enumerate(test_skip_grams[:100]):
        #print('start', i)
        skip_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        #print(skip_first_elem, 'skip_first_elem', i)
        skip_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        #print(skip_second_elem, 'skip_second_elem', i)
        labels = np.array(elem[1], dtype='int32')
        #print(labels, 'labels', i)
        X = [skip_first_elem, skip_second_elem]
        #print(X, 'X', i)
        Y = labels
        #print(Y, 'Y', i)
        if i % 1000 == 0:
            print('Processed {} skip-gram pairs'.format(i))
        score = model_item2vec.evaluate(X,Y, verbose=0)
        total_score += score
        #print('Accuracy: %f' % (score*100))

    #average score
    score = total_score/100
    print('Epoch: {} Score: {}'.format(epoch, score))

Processed 0 skip-gram pairs
Epoch: 1 Loss: 0.316150251571089


##Get word embeddings to find similar items

In [64]:
#GET WORD EMBEDDINGS
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import euclidean_distances

# get the embeddings for the words in the vocabulary
#weights = model.layers[2].get_weights()[0]

embeddings = model_item2vec.layers[2].get_weights()[0]


# `embeddings` has a shape of (num_vocab, embedding_dim)

# `word_to_index` is a mapping (i.e. dict) from words to their index, e.g. `love`: 69
words_embeddings = {w:embeddings[idx] for w, idx in item_to_id.items()}

In [65]:
embeddings.shape

(15904, 100)

In [66]:
distance_matrix = euclidean_distances(embeddings)

In [67]:
distance_matrix.shape

(15904, 15904)

In [68]:
def similar_item(item):
    list = []
    try:
        similar_items = {search_term: [id_to_item[idx] for idx in distance_matrix[item_to_id[search_term]-1].argsort()[1:11]+1]
                       for search_term in [item]}
        list = similar_items[item]
    except KeyError:
        pass
    return list

In [69]:
similar_list_example = similar_item('1890.0')
print(similar_list_example)

['13393.0', '4211.0', '25191.0', '11980.0', '20117.0', '26802.0', '3460.0', '24708.0', '22075.0', '4740.0']


#DATA AUGMENTATION

In [20]:
import random

In [70]:
#Synonym Replacement
##Randomly choose an item in a sequence and replace it by a similar item
def aug_edaSR(seq):
  ##seq: item viewed in sequence
  ##n: the number of times the process has to be repeated
  ##itemList: danh sách unique các item - tương ứng với negative sample
  ##List ra các item có trong seq này, thay thế random 1 item trong chuỗi bằng 1 random bất kì trong item này nhưng không phải là chính nó.

  #convert string to list
  def stringToListConvert(string):
    li = list(string.split(","))
    return li
  #convert string to list
  seqList = stringToListConvert(seq)

  randomIndex = random.choice(range(len(seqList)))
  #find a similar item of seq(randomIndex)
  list_similarItem = similar_item(seqList[randomIndex])

  #if list_similarItem is null -> pass
  if len(list_similarItem) > 0:
      #replace an item in seq by an item in list_similarItem
      itemReplace = random.choice(list_similarItem[:5]) #random in top 5
      while itemReplace != seqList[randomIndex]:
        seqList[randomIndex] = itemReplace
      else:
        itemReplace = random.choice(list_similarItem)
  else:
    pass
  ##convert seq list to string
  seqList = list(map(str, seqList))
  seqString = ','.join(seqList)
  return seqString

In [71]:
#Test Synonym Replacement function
seqTest = '4816.0,26130.0,4816.0,26130.0'
#itemList = [111,222,333,444,555,12,22,32,42,52]
aug_edaSR(seqTest)

'4816.0,26130.0,27942.0,26130.0'

In [76]:
def buildAugDatasetRS(dataTrain, nAug, fraction, augMethod):
  #get fraction of dataset for augmentation: fraction*dataset for augmentation, the rest is remain
  ##dataFrac= dataTrain.sample(frac=fraction)
  leftFrac, rightFrac = train_test_split(dataTrain, random_state = 104, test_size = fraction, shuffle = True)
  #with every session in split_dataTrain -> Generate (N_aug-1) more session like that with one item swap/ random in aug strategy
  rightFracLen = len(rightFrac.index)
  ##print('check n',rightFracLen)
  for i in range (rightFracLen):
    currentRow = rightFrac.iloc[i]
    ##print('check currentRow', currentRow)
    for j in range (nAug-1):
      duplicateRow = currentRow.copy()
      duplicateRow['sequence_item_ids'] = augMethod(duplicateRow['sequence_item_ids'])
      ##add row to rightFrac
      rightFrac = pd.concat([rightFrac, duplicateRow.to_frame().T], ignore_index=True)
      ##print('check rightFrac', rightFrac)

  #check again
  #after augment the fraction*dataset, boost the number of input dataset => combine with the rest
  dataAug = pd.concat([rightFrac, leftFrac], ignore_index = True, sort = False)

  return dataAug

# Train Test Split (85%)

In [100]:
#dfAug = sessions_train

In [101]:
print(sessions_train.shape)

(59372, 4)


In [102]:
random_selection = np.random.rand(len(sessions_train.index)) <= 0.85
train_data = sessions_train[random_selection]
test_data = sessions_train[~random_selection]

In [103]:
print(train_data.shape)
print(test_data.shape)

(50621, 4)
(8751, 4)


In [None]:
dfAug = buildAugDatasetRS(train_data, 2, 0.4, aug_edaSR)

In [None]:
train_data = dfAug

In [None]:
train_data.to_csv("train_data.csv", index=False, sep="|", header=False)
test_data.to_csv("test_data.csv", index=False, sep="|", header=False)

In [None]:
test_train_data = pd.read_csv('train_data.csv')

In [None]:
test_train_data

In [None]:
print(train_data.shape)
print(test_data.shape)

# Define metadata

In [28]:
CSV_HEADER = list(sessions_train.columns)

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "session_id": list(sessions_train.session_id.unique()),
    "item_id": list(item_features.item_id.unique()),
    "item_category": list(item_features.feature_category_id.unique()),
    "item_category_value": list(item_features.feature_value_id.unique()),
    "month": list(sessions_train.month.unique())
}

OTHER_FEATURES = ["month"]

ITEM_FEATURES = ["item_category", "item_category_value"]

In [29]:
CSV_HEADER

['session_id', 'month', 'sequence_item_ids', 'sequence_durations']

# Create `tf.data.Dataset` for training and evaluation

In [30]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    def process(features):
        item_ids_string = features["sequence_item_ids"]
        sequence_item_ids = tf.strings.split(item_ids_string, ",").to_tensor()

        # The last movie id in the sequence is the target movie.
        features["target_item_id"] = sequence_item_ids[:, -1]
        features["sequence_item_ids"] = sequence_item_ids[:, :-1]

        durations_string = features["sequence_durations"]
        sequence_durations = tf.strings.to_number(
            tf.strings.split(durations_string, ","), tf.dtypes.float32
        ).to_tensor()

        # The last durations in the sequence is the target for the model to predict.
        target = sequence_durations[:, -1]
        features["sequence_durations"] = sequence_durations[:, :-1]

        return features, target

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    ).map(process)

    return dataset


In [31]:
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

In [32]:
train_dataset

<_MapDataset element_spec=(OrderedDict([('session_id', TensorSpec(shape=(None,), dtype=tf.float32, name=None)), ('month', TensorSpec(shape=(None,), dtype=tf.float32, name=None)), ('sequence_item_ids', TensorSpec(shape=(None, None), dtype=tf.string, name=None)), ('sequence_durations', TensorSpec(shape=(None, None), dtype=tf.float32, name=None)), ('target_item_id', TensorSpec(shape=(None,), dtype=tf.string, name=None))]), TensorSpec(shape=(None,), dtype=tf.float32, name=None))>

# Create model inputs

In [33]:
def create_model_inputs(): #to transform tensor-like object
    return {
        "session_id": keras.Input(name="session_id", shape=(1,), dtype="string"),
        "sequence_item_ids": keras.Input(
            name="sequence_item_ids", shape=(sequence_length - 1,), dtype="string"
        ),
        "target_item_id": keras.Input(
            name="target_item_id", shape=(1,), dtype="string"
        ),
        "sequence_durations": keras.Input(
            name="sequence_durations", shape=(sequence_length - 1,), dtype=tf.float32
        ),
        "month": keras.Input(name="month", shape=(1,), dtype="string"),
    }

In [34]:
#Build vocabulary of item features
#filter a subset of item_features table just from item from data train
dataset = dataset[dataset["item_id"].notnull()]
item_features_filtered = item_features.loc[dataset["item_id"]]

In [35]:
item_features_filtered = item_features_filtered.drop(columns = ['feature_value_id'])

In [36]:
#create one-hot encoding for items(feature_category_id)
one_hot_item_features_filtered = pd.get_dummies(item_features_filtered, prefix = 'feature_', columns = ['item_id', 'feature_category_id'])

In [37]:
one_hot_item_features_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 310166 entries, 13081 to 26381
Columns: 1488 entries, feature__2 to feature__73
dtypes: bool(1488)
memory usage: 442.5 MB


#Encode input features

The `encode_input_features` method works as follows:

1. Each categorical other feature (month) is encoded using `layers.Embedding`, with embedding
dimension equals to the square root of the vocabulary size of the feature.
The embeddings of these features are concatenated to form a single input tensor.

2. Each item in the item sequence and the target item is encoded `layers.Embedding`,
where the dimension size is the square root of the number of items.

3. A multi-hot cates vector for each item is concatenated with its embedding vector,
and processed using a non-linear `layers.Dense` to output a vector of the same item
embedding dimensions.

4. A positional embedding is added to each item embedding in the sequence, and then
multiplied by its durations from the durations sequence.

5. The target item embedding is concatenated to the sequence item embeddings, producing
a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected
by the attention layer for the transformer architecture.

6. The method returns a tuple of two elements:  `encoded_transformer_features` and
`encoded_other_features`.

In [38]:
def encode_input_features(
    inputs,
    include_session_id=False,
    include_other_features=False,
    include_item_features=True,
):
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_session_id:
        other_feature_names.append("session_id")
    if include_other_features:
        other_feature_names.extend(OTHER_FEATURES) #extend to add item in a list of other features

    ## Encode other features
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = [str(i) for i in CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]] #add item in a list of other features
        print(vocabulary)
        idx = StringLookup(vocabulary=vocabulary,
                           mask_token=None,
                           num_oov_indices=1,
                           )(inputs[feature_name])
        # Compute embedding dimensions
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the session features ##Skip
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a item embedding encoder
    item_vocabulary = [str(i) for i in CATEGORICAL_FEATURES_WITH_VOCABULARY['item_id']]
                      #tf.strings.as_string(CATEGORICAL_FEATURES_WITH_VOCABULARY["item_id"])
    item_embedding_dims = int(math.sqrt(len(item_vocabulary)))
    #print(item_vocabulary, 'item_vocabulary')
    # Create a lookup to convert string values to integer indices.
    item_index_lookup = StringLookup(
        vocabulary=item_vocabulary,
        mask_token=None,
        num_oov_indices=1,
        name="item_index_lookup",
    )
    # Create an item embedding layer with the specified dimensions.
    item_embedding_encoder = layers.Embedding(
        input_dim=len(item_vocabulary),
        output_dim=item_embedding_dims,
        name=f"item_embedding",
    )
    # Create a vector lookup for item category
    #cate_vectors present one-hot-encoding
    #input_dim presents a len of unique item_id and output_dim presents a len of unique
    cate_vectors = one_hot_item_features_filtered.to_numpy()
    # Create a lookup to convert string values to integer indices.
    item_cate_lookup = layers.Embedding(
        input_dim=cate_vectors.shape[0],
        output_dim=cate_vectors.shape[1],
        embeddings_initializer=keras.initializers.Constant(cate_vectors),
        trainable=False,
        name="cate_vector",
    )
    # Create a processing layer for cate.
    item_embedding_dims_embedding_processor = layers.Dense(
        units=item_embedding_dims,
        activation="relu",
        name="process_item_embedding_with_cate",
    )

    ## Define a function to encode a given item id.
    def encode_item(item_id):
        # Convert the string input values into integer indices.
        item_idx = item_index_lookup(item_id)
        # Item embedding
        item_embedding = item_embedding_encoder(item_idx)
        encoded_item = item_embedding
        if include_item_features:
            item_cate_vector = item_cate_lookup(item_idx)
            encoded_item = item_embedding_dims_embedding_processor (
                layers.concatenate([item_embedding, item_cate_vector])
            )
        return encoded_item

    ## Encoding target_item_id
    target_item_id = inputs["target_item_id"]
    encoded_target_item = encode_item(target_item_id)

    ## Encoding sequence item_ids.
    sequence_item_ids = inputs["sequence_item_ids"]
    encoded_sequence_items = encode_item(sequence_item_ids)
    # Create positional embedding.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=item_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encodded_positions = position_embedding_encoder(positions)
    # Retrieve sequence durations to incorporate them into the encoding of the item.
    sequence_durations = inputs["sequence_durations"]
    sequence_durations = keras.ops.expand_dims(sequence_durations, -1)
    # Add the positional encoding to the item encodings and multiply them by rating.
    encoded_sequence_items_with_position_and_rating = layers.Multiply()(
        [(encoded_sequence_items + encodded_positions), sequence_durations]
    )

    # Construct the transformer inputs.
    for i in range(sequence_length - 1):
        feature = encoded_sequence_items_with_position_and_rating[:, i, ...]
        feature = keras.ops.expand_dims(feature, 1)
        encoded_transformer_features.append(feature)
    encoded_transformer_features.append(encoded_target_item)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features


# Create a BST model

In [39]:
include_session_id = False
include_other_features = False
include_item_features = True

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3


def create_model():
    inputs = create_model_inputs()
    transformer_features, other_features = encode_input_features(
        inputs, include_session_id, include_other_features, include_item_features
    )

    # Create a multi-headed attention layer.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate
    )(transformer_features, transformer_features)

    # Transformer block.
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    x1 = layers.Add()([transformer_features, attention_output])
    x1 = layers.LayerNormalization()(x1)
    x2 = layers.LeakyReLU()(x1)
    x2 = layers.Dense(units=x2.shape[-1])(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    transformer_features = layers.Add()([x1, x2])
    transformer_features = layers.LayerNormalization()(transformer_features)
    features = layers.Flatten()(transformer_features)

    # Included the other features.
    if other_features is not None:
        features = layers.concatenate(
            [features, layers.Reshape([other_features.shape[-1]])(other_features)]
        )

    # Fully-connected layers.
    for num_units in hidden_units:
        features = layers.Dense(num_units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


model = create_model()

#Run training and evaluation experiment

In [40]:
from keras import losses

In [99]:
# Compile the model.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
    metrics=[keras.metrics.Precision(name = 'precision'),
             keras.metrics.Recall(name = 'recall')]
)

# Read the training data.
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

# Fit the model with the training data.
model.fit(train_dataset, epochs=5)

# Read the test data.
test_dataset = get_dataset_from_csv("test_data.csv", batch_size=265)

# Evaluate the model on the test data.

_, precision, recall = model.evaluate(test_dataset, verbose=0)
print(f"Test recall: {round(recall, 3)}")

Epoch 1/5


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 29ms/step - loss: 1.1814e-05 - precision: 1.0000 - recall: 0.4070
Epoch 2/5


  self.gen.throw(typ, value, traceback)


[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - loss: 1.1765e-05 - precision: 1.0000 - recall: 0.4060
Epoch 3/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - loss: 1.1614e-05 - precision: 1.0000 - recall: 0.4091
Epoch 4/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - loss: 1.1566e-05 - precision: 1.0000 - recall: 0.4080
Epoch 5/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - loss: 1.2127e-05 - precision: 1.0000 - recall: 0.4049
Test recall: 0.0
