In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from src.utils.model.retrieval_model import RetrievalModel

In [2]:
val_rate = 0.2
test_rate = 0.1
batch_size = 100
embedding_dimension = 100
learning_rate = 0.1
early_stopping_flg = True
tensorboard_flg = False
max_epoch_num = 20

In [3]:
behaviors_df = pd.read_csv(
    "data/RentalProperties/user_activity.csv", names=("item_id", "user_id", "event_type", "create_timestamp")
)

In [4]:
# behaviors_df

In [5]:
# behaviors_df.query('event_type == "seen"')["user_id"].value_counts()
seen_df = behaviors_df.query('event_type == "seen"')
count_df = pd.DataFrame(seen_df["user_id"].value_counts()).reset_index().rename(columns={"index": "user_id", "user_id": "count"})

unique_user_ids = list(count_df.query("count >= 10")["user_id"])
seen_df = seen_df[seen_df["user_id"].isin(unique_user_ids)]

In [6]:
len(unique_user_ids)

2435

In [7]:
train_val_df, test_df = train_test_split(seen_df, test_size=0.1, stratify=seen_df["user_id"])
train_df, val_df = train_test_split(train_val_df, test_size=0.2, stratify=train_val_df["user_id"])

In [8]:
print(len(train_df["user_id"].unique()))
print(len(val_df["user_id"].unique()))
print(len(test_df["user_id"].unique()))

2435
2435
2435


In [9]:
# train_df

In [10]:
# val_df

In [11]:
# test_df

In [12]:
train_ratings = tf.data.Dataset.from_tensor_slices({"user_id": train_df["user_id"], "item_id": train_df["item_id"]})
val_ratings = tf.data.Dataset.from_tensor_slices({"user_id": val_df["user_id"], "item_id": val_df["item_id"]})
test_ratings = tf.data.Dataset.from_tensor_slices({"user_id": test_df["user_id"], "item_id": test_df["item_id"]})

2022-09-11 13:40:31.552776: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
train = train_ratings.batch(batch_size)
val = val_ratings.batch(batch_size)
test = test_ratings.batch(batch_size)

In [16]:
unique_user_ids = np.array(
    list(
        (set(train_df["user_id"].unique()) | set(val_df["user_id"].unique()) | set(test_df["user_id"].unique()))
    )
)
unique_item_ids = np.array(
    list(set(train_df["item_id"].unique()) | set(val_df["item_id"].unique()) | set(test_df["item_id"].unique()))
)
unique_item_dataset = tf.data.Dataset.from_tensor_slices(unique_item_ids)

In [15]:
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = RetrievalModel(
        unique_user_ids=unique_user_ids,
        unique_item_ids=unique_item_ids,
        user_dict_key="user_id",
        item_dict_key="item_id",
        embedding_dimension=embedding_dimension,
        metrics_candidate_dataset=unique_item_dataset,
    )
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


  return bool(asarray(a1 == a2).all())


In [16]:
callbacks = []
if early_stopping_flg:
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(
            monitor="total_loss",
            min_delta=0,
            patience=3,
            verbose=0,
            mode="auto",
            baseline=None,
            restore_best_weights=False,
        )
    )
if tensorboard_flg:
    tfb_log_path = log_path + datetime.now().strftime("%Y%m%d-%H%M%S")
    callbacks.append(
        tf.keras.callbacks.TensorBoard(
            log_dir=tfb_log_path,
            histogram_freq=1,
        )
    )

In [17]:
model.fit(x=train, validation_data=val, epochs=max_epoch_num, callbacks=callbacks)

Epoch 1/20


2022-09-11 13:40:32.046921: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_2"
op: "TensorSliceDataset"
input: "Placeholder/_0"
input: "Placeholder/_1"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 40668
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:0"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
      args {
      



2022-09-11 13:40:35.532750: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_2"
op: "TensorSliceDataset"
input: "Placeholder/_0"
input: "Placeholder/_1"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 10168
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:1"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
      args {
      

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

2022-09-11 13:41:02.895686: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.




<keras.callbacks.History at 0x136f7f0d0>

In [18]:
model.evaluate(test, return_dict=True)

2022-09-11 13:41:07.330466: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_2"
op: "TensorSliceDataset"
input: "Placeholder/_0"
input: "Placeholder/_1"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 5649
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:2"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
      shape {
      }
    }
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
      args {
       



{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0012391573982313275,
 'factorized_top_k/top_10_categorical_accuracy': 0.00495662959292531,
 'factorized_top_k/top_50_categorical_accuracy': 0.052929721772670746,
 'factorized_top_k/top_100_categorical_accuracy': 0.11081607639789581,
 'loss': 3917.065673828125,
 'regularization_loss': 0,
 'total_loss': 3917.065673828125}