**Tutorial**: https://keras.io/examples/structured_data/movielens_recommendations_transformers/

## Setup

In [1]:
import os
import math
from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
!pip install pandas
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup

[0m

## Preprocessing

### Prepare data

In [2]:
raw_df = pd.read_csv('Office_Products.csv', usecols=['rating', 'reviewerID', 'product_id', 'date'])

In [3]:
# Prototyping 목적으로 50만 개의 행만 이용
raw_df = raw_df.iloc[:500000, :]

In [4]:
raw_df.rename(columns={'reviewerID': 'user_id'}, inplace=True)

In [5]:
raw_df['rating'] = raw_df['rating'].apply(lambda x: float(x))

In [6]:
raw_df.head()

Unnamed: 0,rating,user_id,product_id,date
0,3.0,A2WJLOXXIB7NF3,140503528,1162512000
1,5.0,A1RKICUK0GG6VF,140503528,1147132800
2,5.0,A1QA5E50M398VW,140503528,1142035200
3,5.0,A3N0HBW8IP8CZQ,140503528,980294400
4,5.0,A1K1JW1C5CUSUZ,140503528,964915200


### Transform data into sequences

In [7]:
df_group = raw_df.sort_values(by=['date']).groupby('user_id')

df = pd.DataFrame(
    data={
        'user_id': list(df_group.groups.keys()),
        'product_id': list(df_group.product_id.apply(list)),
        'rating': list(df_group.rating.apply(list)),
        'date': list(df_group.date.apply(list)),
    }
)

In [8]:
sequence_length = 3
step_size = 1


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break
        sequences.append(seq)
        start_index += step_size
    return sequences


df.product_id = df.product_id.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

df.rating = df.rating.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

del df['date']

In [9]:
# sequence_length 보다 product_id, rating 컬럼의 item 수가 적은 경우를 고려하지 않았기 때문에 값이 NaN으로 생성된 rows 존재
df_transformed = df.explode(['product_id', 'rating'], ignore_index=True)
df_transformed.info()

# TODO: item 수에 flexible 한 embedding 방식 고민

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441528 entries, 0 to 441527
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_id     441528 non-null  object
 1   product_id  46732 non-null   object
 2   rating      46732 non-null   object
dtypes: object(3)
memory usage: 10.1+ MB


In [10]:
df_transformed.dropna(axis=0, how='any', inplace=True)
df_transformed.isnull().any()

user_id       False
product_id    False
rating        False
dtype: bool

In [11]:
df_transformed.product_id = df_transformed.product_id.apply(lambda x: ",".join(x))
df_transformed.rating = df_transformed.rating.apply(lambda x: ",".join([str(v) for v in x]))
df_transformed.rename(columns={'product_id': 'seq_product_ids', 'rating': 'seq_ratings'}, inplace=True)

In [12]:
df_transformed.head()

Unnamed: 0,user_id,seq_product_ids,seq_ratings
16,A0220159ZRNBTRKLG08H,"8862930003,B00006IE7J,B00005249G","5.0,5.0,5.0"
17,A0220159ZRNBTRKLG08H,"B00006IE7J,B00005249G,B00006IEJC","5.0,5.0,5.0"
18,A0220159ZRNBTRKLG08H,"B00006IE7J,B00005249G,B00006IEJC","5.0,5.0,5.0"
31,A03492194F0T997EZQ04,"B00005249G,B00006JNNE,B00006IE7J","5.0,5.0,5.0"
32,A03492194F0T997EZQ04,"B00005249G,B00006JNNE,B00006IE7J","5.0,5.0,5.0"


In [13]:
random_selection = np.random.rand(len(df_transformed.index)) <= 0.85
train_data = df_transformed[random_selection]
test_data = df_transformed[~random_selection]

train_data.to_csv('train_data.csv', index=False, sep='|', header=False)
test_data.to_csv('test_data.csv', index=False, sep='|', header=False)

## Training

### Define metadata

In [14]:
CSV_HEADER = list(df_transformed.columns)

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    'user_id': list(df_transformed.user_id.unique()),
    'product_id': list(df_transformed.seq_product_ids.unique()),
}

### Create `tf.data.Dataset` for training and evaluation

In [15]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    def process(features):
        product_ids_string = features['seq_product_ids']
        seq_product_ids = tf.strings.split(product_ids_string, ',').to_tensor()

        # The last product id in the sequence is the target product.
        features['target_product_id'] = seq_product_ids[:, -1]
        features['seq_product_ids'] = seq_product_ids[:, :-1]

        ratings_string = features['seq_ratings']
        seq_ratings = tf.strings.to_number(
            tf.strings.split(ratings_string, ","), tf.dtypes.float32
        ).to_tensor()

        # The last rating in the sequence is the target for the model to predict.
        target = seq_ratings[:, -1]
        features['seq_ratings'] = seq_ratings[:, :-1]

        return features, target

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim='|',
        shuffle=shuffle,
    ).map(process)

    return dataset

### Create model inputs

In [16]:
def create_model_inputs():
    return {
        'user_id': layers.Input(name='user_id', shape=(1,), dtype=tf.string),
        'seq_product_ids': layers.Input(
            name='seq_product_ids', shape=(sequence_length - 1,), dtype=tf.string
        ),
        'target_product_id': layers.Input(
            name='target_product_id', shape=(1,), dtype=tf.string
        ),
        'seq_ratings': layers.Input(
            name='seq_ratings', shape=(sequence_length - 1,), dtype=tf.float32
        )
    }

### Encode input features

In [17]:
def encode_input_features(
    inputs,
    include_user_id = True,
    include_user_features = True,
    include_product_features = True
):

    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append('user_id')
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    ## Encode user features
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=1)(
            inputs[feature_name]
        )
        # Compute embedding dimensions
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f'{feature_name}_embedding',
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the user features
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a product embedding encoder
    product_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["product_id"]
    product_embedding_dims = int(math.sqrt(len(product_vocabulary)))
    # Create a lookup to convert string values to integer indices.
    product_index_lookup = StringLookup(
        vocabulary=product_vocabulary,
        mask_token=None,
        num_oov_indices=1,
        name='product_index_lookup',
    )
    # Create an embedding layer with the specified dimensions.
    product_embedding_encoder = layers.Embedding(
        input_dim=len(product_vocabulary),
        output_dim=product_embedding_dims,
        name=f'product_embedding',
    )

    ## Define a function to encode a given product id.
    def encode_product(product_id):
        # Convert the string input values into integer indices.
        product_idx = product_index_lookup(product_id)
        product_embedding = product_embedding_encoder(product_idx)
        encoded_product = product_embedding

        return encoded_product

    ## Encoding target_product_id
    target_product_id = inputs['target_product_id']
    encoded_target_product = encode_product(target_product_id)

    ## Encoding sequence product_ids.
    seq_product_ids = inputs['seq_product_ids']
    encoded_seq_products = encode_product(seq_product_ids)
    # Create positional embedding.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=product_embedding_dims,
        name='position_embedding',
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encodded_positions = position_embedding_encoder(positions)
    # Retrieve sequence ratings to incorporate them into the encoding of the product.
    seq_ratings = tf.expand_dims(inputs['seq_ratings'], -1)
    # Add the positional encoding to the product encodings and multiply them by rating.
    encoded_seq_products_with_poistion_and_rating = layers.Multiply()(
        [(encoded_seq_products + encodded_positions), seq_ratings]
    )

    # Construct the transformer inputs.
    for encoded_product in tf.unstack(
        encoded_seq_products_with_poistion_and_rating, axis=1
    ):
        encoded_transformer_features.append(tf.expand_dims(encoded_product, 1))
    encoded_transformer_features.append(encoded_target_product)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features


# num_oov_indices 속성의 역할은?

### Create a BST model

In [18]:
include_user_id = False
include_user_features = False
include_product_features = False

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3


def create_model():
    inputs = create_model_inputs()
    transformer_features, other_features = encode_input_features(
        inputs, include_user_id, include_user_features
    )

    # Create a multi-headed attention layer.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate
    )(transformer_features, transformer_features)

    # Transformer block.
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    x1 = layers.Add()([transformer_features, attention_output])
    x1 = layers.LayerNormalization()(x1)
    x2 = layers.LeakyReLU()(x1)
    x2 = layers.Dense(units=x2.shape[-1])(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    transformer_features = layers.Add()([x1, x2])
    transformer_features = layers.LayerNormalization()(transformer_features)
    features = layers.Flatten()(transformer_features)

    # Included the other features.
    if other_features is not None:
        features = layers.concatenate(
            [features, layers.Reshape([other_features.shape[-1]])(other_features)]
        )

    # Fully-connected layers.
    for num_units in hidden_units:
        features = layers.Dense(num_units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


model = create_model()

2022-04-28 10:25:08.217705: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-28 10:25:10.392694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22311 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:5e:00.0, compute capability: 8.6
2022-04-28 10:25:10.396019: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 23 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:d9:00.0, compute capability: 8.6


### Run training and evaluation experiment

In [19]:
# Compile the model.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

# Read the training data.
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

# Fit the model with the training data.
model.fit(train_dataset, epochs=5)

# Read the test data.
test_dataset = get_dataset_from_csv("test_data.csv", batch_size=265)

# Evaluate the model on the test data.
_, rmse = model.evaluate(test_dataset, verbose=0)
print(f"Test MAE: {round(rmse, 3)}")

Epoch 1/5


2022-04-28 10:25:14.788164: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


      1/Unknown - 4s 4s/step - loss: 24.3558 - mean_absolute_error: 4.7479

2022-04-28 10:25:15.977529: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test MAE: 0.411
