## Setup

In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import math
from zipfile import ZipFile
from urllib.request import urlretrieve

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.layers import StringLookup

In [2]:
!pip install -q keras-core
import keras_core as keras_core

Using TensorFlow backend


In [3]:
!pip install --upgrade keras
from keras import ops



## Prepare the data

### Download and prepare the DataFrames

In [4]:
url = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dressipi_recsys2022_datasets.zip'
!wget $url
!unzip dressipi_recsys2022_datasets.zip

--2024-05-12 06:24:47--  https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dressipi_recsys2022_datasets.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79384785 (76M) [application/zip]
Saving to: ‘dressipi_recsys2022_datasets.zip’


2024-05-12 06:24:48 (155 MB/s) - ‘dressipi_recsys2022_datasets.zip’ saved [79384785/79384785]

Archive:  dressipi_recsys2022_datasets.zip
   creating: dressipi_recsys2022_dataset/
  inflating: dressipi_recsys2022_dataset/README.txt  
  inflating: dressipi_recsys2022_dataset/candidate_items.csv  
  inflating: dressipi_recsys2022_dataset/item_features.csv  
  inflating: dressipi_recsys2022_dataset/test_final_purchases.csv  
  inflating: dressipi_recsys2022_dataset/test_final_sessions.csv  

In [5]:
item_features = pd.read_csv('dressipi_recsys2022_dataset/item_features.csv')

In [6]:
url = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dataset_filtered/train_sessions_duration003.csv'

In [7]:
dataset = pd.read_csv(url)
dataset = dataset.dropna()
dataset = dataset.reset_index()
dataset

Unnamed: 0.1,index,Unnamed: 0,session_id,item_id,date,timestamp,month,weekYear,season,duration
0,0,6,19.0,25772.0,2020-11-02 16:31:18.543,1.604335e+09,11.0,45.0,3.0,1.0
1,1,38,31.0,2069.0,2021-04-20 19:40:21.952,1.618948e+09,4.0,16.0,1.0,1.0
2,2,52,48.0,8398.0,2020-04-15 17:17:42.594,1.586971e+09,4.0,16.0,1.0,1.0
3,3,96,113.0,19760.0,2021-05-05 13:18:49.495,1.620221e+09,5.0,18.0,1.0,1.0
4,4,114,127.0,26016.0,2020-01-15 08:51:02.273,1.579078e+09,1.0,3.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...
142310,142310,4743736,4439943.0,1815.0,2020-07-11 20:02:05.871,1.594498e+09,7.0,28.0,2.0,1.0
142311,142311,4743754,4439964.0,10093.0,2020-03-24 08:17:18.104,1.585038e+09,3.0,13.0,1.0,1.0
142312,142312,4743763,4439973.0,1942.0,2020-10-30 07:04:47.790,1.604041e+09,10.0,44.0,3.0,1.0
142313,142313,4743767,4439974.0,5508.0,2020-04-25 16:35:42.873,1.587833e+09,4.0,17.0,1.0,1.0


In [8]:
#Group by session_id after sort_values by timestamp
sessions_groups = dataset.sort_values(by=["timestamp"]).groupby("session_id")
sessions_train = pd.DataFrame(data = {
        "session_id": list(sessions_groups.groups.keys()),
        "month" : list(sessions_groups.month.unique().explode()),
        "weekYear" : list(sessions_groups.weekYear.unique().explode()),
        "season" : list(sessions_groups.season.unique().explode()),
        "item_ids": list(sessions_groups.item_id.apply(list)),
        "durations": list(sessions_groups.duration.apply(list)),
        "timestamps": list(sessions_groups.timestamp.apply(list)),
    })

In [9]:
#drop session has item_ids <2
sessions_train = sessions_train[sessions_train.item_ids.apply(len) >= 2]

In [10]:
sequence_length = 4
step = 2
def create_sequences(values, sequence, step):
  start_idx = 0
  sec_list = []
  #Handle case < sequence:
  if len(values) < sequence:
    values = values * 2
  #Handle case >= sequence:
  while True:
    end_idx = start_idx + sequence
    sec = values[start_idx:end_idx]
    start_idx += step
    if end_idx >= len(values):
      sec = values[-sequence:]
      sec_list.append(sec)
      break
    sec_list.append(sec)
  return sec_list

In [11]:
sessions_train["item_ids"] = sessions_train["item_ids"].apply(
    lambda values: create_sequences(
        values,sequence_length, step))

sessions_train["durations"] = sessions_train["durations"].apply(
    lambda values: create_sequences(
        values,sequence_length, step))

sessions_train = sessions_train.drop(columns = ["timestamps"])

sessions_train = sessions_train.explode(column=["item_ids", "durations"]).reset_index(drop=True)

In [12]:
#remove '[' ']' in a sequence
sessions_train.item_ids = sessions_train.item_ids.apply(
    lambda x: ",".join([str(v) for v in x]))

sessions_train.durations = sessions_train.durations.apply(
    lambda x: ",".join([str(v) for v in x]))


In [13]:
#drop weekYear and season
#convert type column month to string
sessions_train["month"] = sessions_train["month"].astype(str)
sessions_train = sessions_train.drop(columns = ["weekYear", "season"])
sessions_train.head(5)

Unnamed: 0,session_id,month,item_ids,durations
0,376.0,1.0,"20048.0,23276.0,23276.0,20048.0","1.0,1.0,1.0,1.0"
1,376.0,1.0,"23276.0,20048.0,23276.0,23276.0","1.0,1.0,1.0,1.0"
2,475.0,11.0,"22705.0,16127.0,22705.0,16127.0","1.0,8.0,1.0,8.0"
3,1531.0,2.0,"3244.0,21767.0,3244.0,21767.0","1.0,1.0,1.0,1.0"
4,1881.0,11.0,"27349.0,19840.0,27349.0,19840.0","1.0,1.0,1.0,1.0"


In [14]:
sessions_train = sessions_train.rename(columns = {"item_ids": "sequence_item_ids",
                                  "durations": "sequence_durations"})


In [15]:
random_selection = np.random.rand(len(sessions_train.index)) <= 0.85
train_data = sessions_train[random_selection]
test_data = sessions_train[~random_selection]

In [16]:
train_data.to_csv("train_data.csv", index=False, sep="|", header=False)
test_data.to_csv("test_data.csv", index=False, sep="|", header=False)

In [17]:
test_train_data = pd.read_csv('train_data.csv')

In [18]:
test_train_data

Unnamed: 0,376.0|1.0|20048.0,23276.0,23276.0.1,20048.0|1.0,1.0,1.0.1,1.0.2
0,376.0|1.0|23276.0,20048.0,23276.0,23276.0|1.0,1.0,1.0,1.0
1,1531.0|2.0|3244.0,21767.0,3244.0,21767.0|1.0,1.0,1.0,1.0
2,1881.0|11.0|27349.0,19840.0,27349.0,19840.0|1.0,1.0,1.0,1.0
3,2010.0|8.0|14330.0,11025.0,14330.0,11025.0|1.0,26.0,1.0,26.0
4,2032.0|4.0|17312.0,4891.0,17312.0,4891.0|1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
15910,4438280.0|3.0|5408.0,20474.0,5408.0,20474.0|1.0,1.0,1.0,1.0
15911,4438351.0|4.0|5358.0,18968.0,5358.0,18968.0|1.0,1.0,1.0,1.0
15912,4438923.0|7.0|4816.0,3563.0,4816.0,3563.0|1.0,1.0,1.0,1.0
15913,4439216.0|1.0|28117.0,11053.0,28117.0,11053.0|1.0,1.0,1.0,1.0


## Define metadata

In [19]:
CSV_HEADER = list(sessions_train.columns)

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "session_id": list(sessions_train.session_id.unique()),
    "item_id": list(item_features.item_id.unique()),
    "item_category": list(item_features.feature_category_id.unique()),
    "item_category_value": list(item_features.feature_value_id.unique()),
    "month": list(sessions_train.month.unique())
}

OTHER_FEATURES = ["month"]

ITEM_FEATURES = ["item_category", "item_category_value"]

In [20]:
CSV_HEADER

['session_id', 'month', 'sequence_item_ids', 'sequence_durations']

## Create `tf.data.Dataset` for training and evaluation

In [21]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    def process(features):
        item_ids_string = features["sequence_item_ids"]
        sequence_item_ids = tf.strings.split(item_ids_string, ",").to_tensor()

        # The last movie id in the sequence is the target movie.
        features["target_item_id"] = sequence_item_ids[:, -1]
        features["sequence_item_ids"] = sequence_item_ids[:, :-1]

        durations_string = features["sequence_durations"]
        sequence_durations = tf.strings.to_number(
            tf.strings.split(durations_string, ","), tf.dtypes.float32
        ).to_tensor()

        # The last durations in the sequence is the target for the model to predict.
        target = sequence_durations[:, -1]
        features["sequence_durations"] = sequence_durations[:, :-1]

        return features, target

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    ).map(process)

    return dataset


In [22]:
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

In [23]:
train_dataset

<_MapDataset element_spec=(OrderedDict([('session_id', TensorSpec(shape=(None,), dtype=tf.float32, name=None)), ('month', TensorSpec(shape=(None,), dtype=tf.float32, name=None)), ('sequence_item_ids', TensorSpec(shape=(None, None), dtype=tf.string, name=None)), ('sequence_durations', TensorSpec(shape=(None, None), dtype=tf.float32, name=None)), ('target_item_id', TensorSpec(shape=(None,), dtype=tf.string, name=None))]), TensorSpec(shape=(None,), dtype=tf.float32, name=None))>

## Create model inputs

In [24]:
def create_model_inputs(): #to transform tensor-like object
    return {
        "session_id": keras.Input(name="session_id", shape=(1,), dtype="string"),
        "sequence_item_ids": keras.Input(
            name="sequence_item_ids", shape=(sequence_length - 1,), dtype="string"
        ),
        "target_item_id": keras.Input(
            name="target_item_id", shape=(1,), dtype="string"
        ),
        "sequence_durations": keras.Input(
            name="sequence_durations", shape=(sequence_length - 1,), dtype=tf.float32
        ),
        "month": keras.Input(name="month", shape=(1,), dtype="string"),
    }

In [25]:
#Build vocabulary of item features
#filter a subset of item_features table just from item from data train
dataset = dataset[dataset["item_id"].notnull()]
item_features_filtered = item_features.loc[dataset["item_id"]]

In [26]:
item_features_filtered = item_features_filtered.drop(columns = ['feature_value_id'])

In [27]:
#create one-hot encoding for items(feature_category_id)
one_hot_item_features_filtered = pd.get_dummies(item_features_filtered, prefix = 'feature_', columns = ['item_id', 'feature_category_id'])

In [28]:
one_hot_item_features_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142315 entries, 25772 to 23303
Columns: 1487 entries, feature__2 to feature__73
dtypes: bool(1487)
memory usage: 202.9 MB


## Encode input features

The `encode_input_features` method works as follows:

1. Each categorical other feature (month) is encoded using `layers.Embedding`, with embedding
dimension equals to the square root of the vocabulary size of the feature.
The embeddings of these features are concatenated to form a single input tensor.

2. Each item in the item sequence and the target item is encoded `layers.Embedding`,
where the dimension size is the square root of the number of items.

3. A multi-hot cates vector for each item is concatenated with its embedding vector,
and processed using a non-linear `layers.Dense` to output a vector of the same item
embedding dimensions.

4. A positional embedding is added to each item embedding in the sequence, and then
multiplied by its durations from the durations sequence.

5. The target item embedding is concatenated to the sequence item embeddings, producing
a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected
by the attention layer for the transformer architecture.

6. The method returns a tuple of two elements:  `encoded_transformer_features` and
`encoded_other_features`.

In [32]:
def encode_input_features(
    inputs,
    include_session_id=False,
    include_other_features=False,
    include_item_features=True,
):
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_session_id:
        other_feature_names.append("session_id")
    if include_other_features:
        other_feature_names.extend(OTHER_FEATURES) #extend to add item in a list of other features

    ## Encode other features
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = [str(i) for i in CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]] #add item in a list of other features
        print(vocabulary)
        idx = StringLookup(vocabulary=vocabulary,
                           mask_token=None,
                           num_oov_indices=1,
                           )(inputs[feature_name])
        # Compute embedding dimensions
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the session features ##Skip
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a item embedding encoder
    item_vocabulary = [str(i) for i in CATEGORICAL_FEATURES_WITH_VOCABULARY['item_id']]
                      #tf.strings.as_string(CATEGORICAL_FEATURES_WITH_VOCABULARY["item_id"])
    item_embedding_dims = int(math.sqrt(len(item_vocabulary)))
    #print(item_vocabulary, 'item_vocabulary')
    # Create a lookup to convert string values to integer indices.
    item_index_lookup = StringLookup(
        vocabulary=item_vocabulary,
        mask_token=None,
        num_oov_indices=1,
        name="item_index_lookup",
    )
    # Create an item embedding layer with the specified dimensions.
    item_embedding_encoder = layers.Embedding(
        input_dim=len(item_vocabulary),
        output_dim=item_embedding_dims,
        name=f"item_embedding",
    )
    # Create a vector lookup for item category
    #cate_vectors present one-hot-encoding
    #input_dim presents a len of unique item_id and output_dim presents a len of unique
    cate_vectors = one_hot_item_features_filtered.to_numpy()
    # Create a lookup to convert string values to integer indices.
    item_cate_lookup = layers.Embedding(
        input_dim=cate_vectors.shape[0],
        output_dim=cate_vectors.shape[1],
        embeddings_initializer=keras.initializers.Constant(cate_vectors),
        trainable=False,
        name="cate_vector",
    )
    # Create a processing layer for cate.
    item_embedding_dims_embedding_processor = layers.Dense(
        units=item_embedding_dims,
        activation="relu",
        name="process_item_embedding_with_cate",
    )

    ## Define a function to encode a given item id.
    def encode_item(item_id):
        # Convert the string input values into integer indices.
        item_idx = item_index_lookup(item_id)
        # Item embedding
        item_embedding = item_embedding_encoder(item_idx)
        encoded_item = item_embedding
        if include_item_features:
            item_cate_vector = item_cate_lookup(item_idx)
            encoded_item = item_embedding_dims_embedding_processor (
                layers.concatenate([item_embedding, item_cate_vector])
            )
        return encoded_item

    ## Encoding target_item_id
    target_item_id = inputs["target_item_id"]
    encoded_target_item = encode_item(target_item_id)

    ## Encoding sequence item_ids.
    sequence_item_ids = inputs["sequence_item_ids"]
    encoded_sequence_items = encode_item(sequence_item_ids)
    # Create positional embedding.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=item_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encodded_positions = position_embedding_encoder(positions)
    # Retrieve sequence durations to incorporate them into the encoding of the item.
    sequence_durations = inputs["sequence_durations"]
    sequence_durations = keras.ops.expand_dims(sequence_durations, -1)
    # Add the positional encoding to the item encodings and multiply them by rating.
    encoded_sequence_items_with_position_and_rating = layers.Multiply()(
        [(encoded_sequence_items + encodded_positions), sequence_durations]
    )

    # Construct the transformer inputs.
    for i in range(sequence_length - 1):
        feature = encoded_sequence_items_with_position_and_rating[:, i, ...]
        feature = keras.ops.expand_dims(feature, 1)
        encoded_transformer_features.append(feature)
    encoded_transformer_features.append(encoded_target_item)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features


## Create a BST model

In [33]:
include_session_id = False
include_other_features = False
include_item_features = True

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3


def create_model():
    inputs = create_model_inputs()
    transformer_features, other_features = encode_input_features(
        inputs, include_session_id, include_other_features, include_item_features
    )

    # Create a multi-headed attention layer.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate
    )(transformer_features, transformer_features)

    # Transformer block.
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    x1 = layers.Add()([transformer_features, attention_output])
    x1 = layers.LayerNormalization()(x1)
    x2 = layers.LeakyReLU()(x1)
    x2 = layers.Dense(units=x2.shape[-1])(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    transformer_features = layers.Add()([x1, x2])
    transformer_features = layers.LayerNormalization()(transformer_features)
    features = layers.Flatten()(transformer_features)

    # Included the other features.
    if other_features is not None:
        features = layers.concatenate(
            [features, layers.Reshape([other_features.shape[-1]])(other_features)]
        )

    # Fully-connected layers.
    for num_units in hidden_units:
        features = layers.Dense(num_units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


model = create_model()

## Run training and evaluation experiment

In [34]:
# Compile the model.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.Precision( name = 'precision'),
             keras.metrics.Recall( name = 'recall')]
)

# Read the training data.
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

# Fit the model with the training data.
model.fit(train_dataset, epochs=5)

# Read the test data.
test_dataset = get_dataset_from_csv("test_data.csv", batch_size=265)

# Evaluate the model on the test data.
_, precision, recall = model.evaluate(test_dataset, verbose=0)
print(f"Test MAE: {round(recall, 3)}")

Epoch 1/5
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - loss: 583768.5000 - precision: 1.0000 - recall: 0.2788
Epoch 2/5


  self.gen.throw(typ, value, traceback)


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - loss: 271956.5000 - precision: 1.0000 - recall: 0.3125
Epoch 3/5
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - loss: 354804.8438 - precision: 1.0000 - recall: 0.4273
Epoch 4/5
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - loss: 378322.6562 - precision: 1.0000 - recall: 0.4631
Epoch 5/5
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 549103.5625 - precision: 1.0000 - recall: 0.4241
Test MAE: 0.005
