In [2]:
import os
import pprint
import tempfile

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Start a Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName("retrival")
spark = session_builder.getOrCreate()

In [4]:
# HDFS directory holding the BX-* CSV files, and the two files read from it.
dataPath = "hdfs://localhost:9000/user/nhom7/book/data/"
bookFilePath = f"{dataPath}BX-Books.csv"
ratingsFilePath = f"{dataPath}BX-Book-Ratings.csv"

# Both files are read the same way: ';'-delimited, first row is the header,
# and Spark is asked to infer the column types.
csv_reader_options = dict(inferSchema="true", header="true", delimiter=';')
books_df = spark.read.options(**csv_reader_options).csv(bookFilePath)
rating_df = spark.read.options(**csv_reader_options).csv(ratingsFilePath)

In [5]:
# The rating value itself is not used by the retrieval model, so drop that
# column before collecting the Spark DataFrame into pandas.
ratings_without_score = rating_df.drop('Book-Rating')
rating_pd = ratings_without_score.toPandas()

# Only the ISBN column of the book table is collected.
book_isbn_only = books_df.select('ISBN')
book_pd = book_isbn_only.toPandas()

In [6]:
# Expose each pandas frame as a {column name: column} mapping so that every
# dataset element becomes a dict keyed by the original column names.
rating_columns = {name: column for name, column in rating_pd.items()}
book_columns = {name: column for name, column in book_pd.items()}

ratings = tf.data.Dataset.from_tensor_slices(rating_columns)
books = tf.data.Dataset.from_tensor_slices(book_columns)

In [7]:
def _to_rating_features(row):
  """Keep the two fields the model reads, under snake_case keys."""
  return {
      "isbn": row["ISBN"],
      "user_id": row["User-ID"],
  }


def _to_isbn(row):
  """Reduce a book record to its ISBN value."""
  return row["ISBN"]


# Named functions replace the original lambdas: a lambda spread over several
# lines inside `map(...)` makes AutoGraph emit a deprecation warning about
# locating lambda source code, while the resulting datasets are the same.
#
# NOTE: this cell rebinds `ratings` and `books`, so it must be run exactly once
# after the cell that creates them; re-running it alone would look up "ISBN"
# on already-mapped elements.
ratings = ratings.map(_to_rating_features)
books = books.map(_to_isbn)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [8]:
# Seed both the global generator and the shuffle, and disable per-iteration
# reshuffling, so that `take`/`skip` below always see the same ordering.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# The first 80,000 shuffled examples are used for training; the next 20,000
# form the held-out test set.
train = shuffled.take(80_000)
after_train = shuffled.skip(80_000)
test = after_train.take(20_000)

In [9]:
# Batch the raw values so they can be pulled out of tf.data in large chunks.
book_ISBN = books.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda features: features["user_id"])

# Materialise the batches, join them into one array each, and de-duplicate
# to obtain the lookup vocabularies.
book_isbn_batches = list(book_ISBN)
user_id_batches = list(user_ids)
unique_book_ISBN = np.unique(np.concatenate(book_isbn_batches))
unique_user_ids = np.unique(np.concatenate(user_id_batches))

In [10]:
# Size of each embedding vector; the same value is used for the output
# dimension of both the user and the book embedding layers.
embedding_dimension: int = 32

In [11]:
# User tower: raw integer user id -> vocabulary index -> embedding vector.
user_id_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=unique_user_ids, mask_token=None)
# The embedding table has one row more than the vocabulary to account for
# unknown ids.
user_embedding = tf.keras.layers.Embedding(
    input_dim=len(unique_user_ids) + 1, output_dim=embedding_dimension)

user_model = tf.keras.Sequential([user_id_lookup, user_embedding])

In [12]:
# Book tower: raw ISBN string -> vocabulary index -> embedding vector.
isbn_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_book_ISBN, mask_token=None)
# As in the user tower, the table is sized vocabulary + 1 so that ISBNs
# outside the vocabulary still have a row to map to.
isbn_embedding = tf.keras.layers.Embedding(
    input_dim=len(unique_book_ISBN) + 1, output_dim=embedding_dimension)

book_model = tf.keras.Sequential([isbn_lookup, isbn_embedding])

In [13]:
# Embed every book ISBN, in batches of 128, to serve as the candidate set
# for the top-K metrics.
candidate_embeddings = books.batch(128).map(book_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

# The retrieval task wraps the loss computation together with those metrics.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [14]:
class RetrivalModel(tfrs.Model):
  """Retrieval model that pairs a user embedding model with a book embedding model.

  Args:
    user_model: Keras model applied to `features["user_id"]`.
    book_model: Keras model applied to `features["isbn"]`.
    retrieval_task: Optional task layer that is called with the user and book
      embeddings and returns the loss. When omitted, the notebook-level
      `task` object is used, which is what this class always did before the
      parameter was added.
  """

  def __init__(self, user_model, book_model, retrieval_task=None):
    super().__init__()
    self.book_model: tf.keras.Model = book_model
    self.user_model: tf.keras.Model = user_model
    # Prefer an explicitly supplied task; otherwise fall back to the global
    # `task` so existing `RetrivalModel(user_model, book_model)` calls behave
    # exactly as before.
    self.task: tf.keras.layers.Layer = (
        task if retrieval_task is None else retrieval_task
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    Args:
      features: Mapping that must contain the keys "user_id" and "isbn".
      training: Accepted for interface compatibility; it is not used here.

    Returns:
      The value returned by the task for this batch.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the book features and pass them into the book model,
    # getting embeddings back.
    positive_book_embeddings = self.book_model(features["isbn"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_book_embeddings)

In [15]:
# Assemble the retrieval model from the two towers, then compile it with an
# Adagrad optimizer (learning rate 0.1).
model = RetrivalModel(user_model, book_model)
adagrad_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad_optimizer)

In [16]:
# Training input: shuffle, group into batches of 8192, then cache.
# NOTE(review): cache() comes after shuffle(), so later epochs presumably
# replay the first epoch's batch order — confirm this is intended.
train_batches = train.shuffle(100_000).batch(8192)
cached_train = train_batches.cache()

# Evaluation input: batches of 4096, cached, no shuffling.
test_batches = test.batch(4096)
cached_test = test_batches.cache()

In [17]:
# Train for three passes over the cached training batches; the returned
# History object is the cell's displayed value.
model.fit(x=cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x25edabde770>

In [18]:
# Score the trained model on the held-out batches and show the metrics as a
# {metric name: value} dict.
model.evaluate(x=cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.024000000208616257,
 'factorized_top_k/top_5_categorical_accuracy': 0.041450001299381256,
 'factorized_top_k/top_10_categorical_accuracy': 0.0442500002682209,
 'factorized_top_k/top_50_categorical_accuracy': 0.04740000143647194,
 'factorized_top_k/top_100_categorical_accuracy': 0.05065000057220459,
 'loss': 29616.52734375,
 'regularization_loss': 0,
 'total_loss': 29616.52734375}

In [20]:
# Create a model that takes in raw query features (user ids) and recommends
# the top 20 books out of the entire book dataset.
bruteforce = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=20)

# Index every book: each batch of ISBNs is paired with its embeddings.
isbn_batches = books.batch(128)
isbn_embeddings = isbn_batches.map(model.book_model)
bruteforce.index_from_dataset(
  tf.data.Dataset.zip((isbn_batches, isbn_embeddings))
)

# Get recommendations for user id 12. Despite its name, `titles` holds the
# identifiers the index was built with, i.e. ISBNs.
_, titles = bruteforce(tf.constant([12]))

for recommended in titles:
  print(recommended)

tf.Tensor(
[b'0345404793' b'0380841940' b'0451129040' b'0812510488' b'0553114271'
 b'1551668459' b'0425158632' b'0345308808' b'0373087187' b'0375724958'
 b'0312952066' b'0936672765' b'037325640X' b'0373271069' b'0345308921'
 b'0517708957' b'0441800106' b'0373168861' b'1854879995' b'0380807351'], shape=(20,), dtype=string)


In [21]:
# Export the brute-force retrieval layer as a SavedModel, then load it back
# from the same directory.
path = "./model/retrival_model/1/"
tf.saved_model.save(obj=bruteforce, export_dir=path)

loaded = tf.saved_model.load(export_dir=path)



INFO:tensorflow:Assets written to: ./model/retrival_model/1/assets


INFO:tensorflow:Assets written to: ./model/retrival_model/1/assets
