In [155]:
import os
import pprint
import tempfile
import pandas as pd

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import warnings
warnings.filterwarnings('ignore')

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = all messages, 1 = INFO and WARNING, 2 = ERROR and WARNING, 3 = ERROR only

import logging
tf.get_logger().setLevel(logging.ERROR)

### Data Load

In [109]:
#Users
u_cols = ['user_id', 'location', 'age']
users = pd.read_csv('BX-CSV-Dump/BX-Users.csv', sep=';', encoding='latin-1', on_bad_lines='warn')

#Books
i_cols = ['isbn', 'book_title' ,'book_author','year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
items = pd.read_csv('BX-CSV-Dump/BX-Books.csv', sep=';', encoding='latin-1',low_memory=False, on_bad_lines='warn')

#Ratings
r_cols = ['user_id', 'isbn', 'rating']
ratings = pd.read_csv('BX-CSV-Dump/BX-Book-Ratings.csv', sep=';', encoding='latin-1',low_memory=False, on_bad_lines='warn')

Skipping line 6452: expected 8 fields, saw 9
Skipping line 43667: expected 8 fields, saw 10
Skipping line 51751: expected 8 fields, saw 9
Skipping line 92038: expected 8 fields, saw 9
Skipping line 104319: expected 8 fields, saw 9
Skipping line 121768: expected 8 fields, saw 9
Skipping line 144058: expected 8 fields, saw 9
Skipping line 150789: expected 8 fields, saw 9
Skipping line 157128: expected 8 fields, saw 9
Skipping line 180189: expected 8 fields, saw 9
Skipping line 185738: expected 8 fields, saw 9
Skipping line 209388: expected 8 fields, saw 9
Skipping line 220626: expected 8 fields, saw 9
Skipping line 227933: expected 8 fields, saw 11
Skipping line 228957: expected 8 fields, saw 10
Skipping line 245933: expected 8 fields, saw 9
Skipping line 251296: expected 8 fields, saw 9
Skipping line 259941: expected 8 fields, saw 9
Skipping line 261529: expected 8 fields, saw 9



### Preprocessing

In [110]:
users = users.join(users['Location'].str.split(',', 2, expand=True).rename(columns={0:'City', 1:'Region', 2:'Country'}))
users['Country'] = users['Country'].fillna('usa')
users['Age'] = users['Age'].fillna(np.mean(users['Age']))
users.drop(columns=['City', 'Region', 'Location'], inplace=True)
users['User-ID'] = users['User-ID'].astype('str')
users.rename(columns={'User-ID':'user_id', 'Age':'user_age', 'Country':'user_country'}, inplace=True)
users.head()

Unnamed: 0,user_id,user_age,user_country
0,1,34.751434,usa
1,2,18.0,usa
2,3,34.751434,russia
3,4,17.0,portugal
4,5,34.751434,united kingdom


In [111]:
items = items[['ISBN', 'Book-Title']]
items.rename(columns={'ISBN':'book_id', 'Book-Title':'book_title'}, inplace=True)
items.head()

Unnamed: 0,book_id,book_title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [112]:
items_100000 = items.sample(n=100000)
users_10000 = users.sample(n=10000)

In [113]:
import random
from datetime import datetime, timedelta

# Function to generate a random date between the specified range
def random_date(start, end):
    delta = end - start
    random_days = random.randrange(delta.days)
    return start + timedelta(days=random_days)

# Set the date range
start_date = datetime(2020, 1, 1)
end_date = datetime(2022, 1, 1)

In [114]:
ratings.rename(columns={'User-ID':'user_id', 'ISBN':'book_id'}, inplace=True)
ratings['user_id'] = ratings['user_id'].astype('str')
ratings_df = pd.merge(ratings, users_10000, on='user_id', how='left')
ratings_df = pd.merge(ratings_df, items_100000, on='book_id', how='left')
ratings_df.dropna(inplace=True)
ratings_df['rating_timestamp'] = [random_date(start_date, end_date) for _ in range(len(ratings_df))]
ratings_df['rating_timestamp'] = ratings_df['rating_timestamp'].apply(lambda x: x.timestamp())
ratings_df.head()

Unnamed: 0,user_id,book_id,Book-Rating,user_age,user_country,book_title,rating_timestamp
85,276788,055310666X,10,34.751434,usa,False Memory,1583280000.0
133,276822,0060096195,10,11.0,canada,The Boy Next Door,1594426000.0
134,276822,0141310340,9,11.0,canada,Skin and Other Stories (Now in Speak!),1602374000.0
137,276822,0375821813,9,11.0,canada,Hoot (Newbery Honor Book),1600387000.0
140,276822,0439401399,6,11.0,canada,The Contest,1604362000.0


In [115]:
print('Total Users: ', ratings_df.user_id.nunique())
print('Total Books: ', ratings_df.book_id.nunique())
print('Total Interactions: ', ratings_df.shape[0])

Total Users:  1913
Total Books:  9506
Total Interactions:  13565


In [117]:
tf_ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df))

In [118]:
tf_ratings_map = tf_ratings.map(lambda x: {
    "book_id": x["book_id"],
    "user_id": x["user_id"],
    "user_rating": x["Book-Rating"],
    "user_age": x["user_age"],
    "user_country": x["user_country"],
    "book_title": x["book_title"],
    "rating_timestamp": x["rating_timestamp"]
})

In [141]:
book_ids = tf_ratings_map.batch(10_000).map(lambda x: x["book_id"])
book_titles = tf_ratings_map.batch(10_000).map(lambda x: x["book_title"])
user_ids = tf_ratings_map.batch(10_000).map(lambda x: x["user_id"])
user_countries = tf_ratings_map.batch(10_000).map(lambda x: x["user_country"])

unique_book_ids = np.unique(np.concatenate(list(book_ids)))
unique_book_titles = np.unique(np.concatenate(list(book_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
unique_user_countries = np.unique(np.concatenate(list(user_countries)))

In [142]:
# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = tf_ratings_map.shuffle(15_00_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(10_000)
test = shuffled.skip(10_000).take(4_000)

In [143]:
max_age = tf_ratings_map.map(lambda x: x["user_age"]).reduce(
    tf.cast(0, tf.float64), tf.maximum).numpy().max()
min_age = tf_ratings_map.map(lambda x: x["user_age"]).reduce(
    tf.cast(0, tf.float64), tf.minimum).numpy().min()

age_buckets = np.linspace(
    min_age, max_age, num=20)

print(f"Buckets: {age_buckets}")

Buckets: [ 0.          4.31578947  8.63157895 12.94736842 17.26315789 21.57894737
 25.89473684 30.21052632 34.52631579 38.84210526 43.15789474 47.47368421
 51.78947368 56.10526316 60.42105263 64.73684211 69.05263158 73.36842105
 77.68421053 82.        ]


In [144]:
timestamps = np.concatenate(list(tf_ratings_map.map(lambda x: x["rating_timestamp"]).batch(100)))

### User Model

In [145]:
class UserModel(tf.keras.Model):
  
    def __init__(self, unique_user_ids, unique_user_countries, age_buckets, timestamps, embedding_dimension):
        super().__init__()

        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
        ])
        
        self.country_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_countries, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_countries) + 1, embedding_dimension),
        ])
        
        self.age_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(age_buckets.tolist()),
            tf.keras.layers.Embedding(len(age_buckets) + 2, embedding_dimension)
        ])
        
        self.normalized_timestamp = tf.keras.layers.Normalization(
            axis=None
        )

        self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        # Take the input dictionary, pass it through each input layer,
        # and concatenate the result.
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.country_embedding(inputs["user_country"]),
            self.age_embedding(inputs["user_age"]),
            tf.reshape(self.normalized_timestamp(inputs["rating_timestamp"]), (-1, 1)),
        ], axis=1)

In [146]:
user_model = UserModel(unique_user_ids, unique_user_countries, age_buckets, timestamps, 32)
for row in tf_ratings_map.batch(1).take(1):
    print(f"Computed representations: {user_model(row)}")

Computed representations: [[-9.49854776e-03 -4.07686941e-02 -2.75819302e-02 -1.08468533e-02
   1.68311596e-03 -3.94163243e-02 -1.70231946e-02  3.36598493e-02
   1.80847533e-02  4.65952642e-02 -2.80122831e-03 -4.25349548e-03
  -2.30829008e-02  4.36505191e-02  3.44716348e-02 -9.02016088e-03
  -6.28281757e-03 -1.50293820e-02  1.89254321e-02  2.02165581e-02
   3.95418331e-03 -8.76940787e-04 -2.05943733e-03 -4.15004417e-03
   3.34905125e-02 -2.91977767e-02 -4.43983078e-02 -2.34945416e-02
  -9.03869793e-03  1.97106712e-02  7.85908848e-03 -2.27237474e-02
   1.72722600e-02  3.39537300e-02 -9.18791443e-03  3.42808031e-02
   1.35832094e-02 -2.77657267e-02 -4.61349487e-02 -9.73738357e-03
   4.29556631e-02 -1.66180246e-02 -1.91069134e-02 -1.79636367e-02
  -9.02209431e-03 -2.95696147e-02 -4.21343558e-02 -1.76161155e-02
  -1.42933242e-02 -7.78484344e-03  2.90289186e-02  3.47695686e-02
  -2.63884068e-02  4.00923565e-03 -2.86171678e-02  2.55995877e-02
  -1.11088753e-02 -1.04076788e-03 -9.56403092e-03 

### Book Model

In [147]:
class BookModel(tf.keras.Model):

    def __init__(self, unique_book_ids, unique_book_titles, embedding_dimension):
        super().__init__()

        max_tokens = 1_000

        self.book_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_book_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_book_ids) + 1, embedding_dimension),
        ])
    
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens)

        self.title_embedding = tf.keras.Sequential([
          self.title_vectorizer,
          tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
          tf.keras.layers.GlobalAveragePooling1D(),
        ])

        self.title_vectorizer.adapt(unique_book_titles)

    def call(self, inputs):
        return tf.concat([
            self.book_embedding(inputs["book_id"]),
            self.title_embedding(inputs["book_title"]),
        ], axis=1)

In [148]:
book_model = BookModel(unique_book_ids, unique_book_titles, 32)
for row in tf_ratings_map.batch(1).take(1):
    print(f"Computed representations: {book_model(row)}")

Computed representations: [[-3.94688137e-02 -3.75805497e-02 -4.79060411e-02 -7.67754391e-03
  -3.76792662e-02 -4.33177464e-02  3.94165181e-02 -1.28926039e-02
  -1.48759596e-02 -9.29572433e-03  7.58085400e-03  2.11796500e-02
  -3.00803427e-02  2.42065303e-02  2.30334513e-02 -4.81771007e-02
  -1.72335282e-02  2.25508846e-02  2.93181427e-02  1.00679025e-02
   3.37586142e-02  2.99189351e-02 -1.54346451e-02 -4.27184589e-02
  -3.38370204e-02  4.85745184e-02 -2.58558281e-02  3.24600972e-02
   1.80996768e-02  3.45756859e-03  4.76169847e-02 -4.66955565e-02
   1.54240150e-02  4.19015065e-04  1.87681783e-02 -8.50616023e-04
   5.78434486e-03  1.43453684e-02 -1.58941802e-02 -2.46956851e-03
  -1.01527553e-02  5.24986535e-04  1.43242022e-02 -2.59545520e-02
   1.99114010e-02  1.32593140e-03 -1.21377120e-02  6.47782162e-03
  -2.29139216e-02 -9.08130966e-03  9.33548622e-03  2.77774408e-04
  -3.12197953e-05 -4.70387712e-02  3.63731757e-04  3.42994221e-02
   2.10731905e-02 -9.61798523e-03 -1.21891387e-02 

### Book Recommender Model with Rating and Retrival tasks

In [149]:
tf_books = tf.data.Dataset.from_tensor_slices(dict(items_100000))
tf_books_map = tf_books.map(lambda x: {
    "book_id": x["book_id"], 
    "book_title": x["book_title"]
})

In [150]:
class BookRecModel(tfrs.models.Model):

    def __init__(self, rating_weight, retrieval_weight, unique_user_ids, unique_user_countries, 
                 unique_book_ids, unique_book_titles, age_buckets, timestamps):
        super().__init__()

        embedding_dimension = 32
        
        self.user_model = tf.keras.Sequential([
            UserModel(unique_user_ids, unique_user_countries, age_buckets, timestamps, embedding_dimension),
            tf.keras.layers.Dense(embedding_dimension)
        ])
        
        self.book_model = tf.keras.Sequential([
            BookModel(unique_book_ids, unique_book_titles, embedding_dimension),
            tf.keras.layers.Dense(embedding_dimension)
        ])
        
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1),
        ])
        
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
            
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates = tf_books_map.batch(128).map(self.book_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:        
        user_embeddings = self.user_model(features)
        book_embeddings = self.book_model(features)

        return (
            user_embeddings,
            book_embeddings,

            self.rating_model(
                tf.concat([user_embeddings, book_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

        ratings = features.pop("user_rating")

        user_embeddings, book_embeddings, rating_predictions = self(features)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, book_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

In [151]:
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [152]:
model = BookRecModel(0.5, 0.5, unique_user_ids, unique_user_countries, unique_book_ids, 
                     unique_book_titles, age_buckets, unique_user_age)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))



In [153]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a2a714fa00>

In [156]:
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Retrieval top-100 accuracy: 0.007.
Ranking RMSE: 4581.224.


### Recommendation

In [158]:
model.summary()

Model: "book_rec_model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_110 (Sequential)  multiple                 67043     
                                                                 
 sequential_113 (Sequential)  multiple                 338304    
                                                                 
 sequential_114 (Sequential)  (None, 1)                49665     
                                                                 
 ranking_9 (Ranking)         multiple                  0         
                                                                 
 retrieval_9 (Retrieval)     multiple                  1         
                                                                 
Total params: 455,013
Trainable params: 455,009
Non-trainable params: 4
_________________________________________________________________


In [161]:
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

In [181]:
# Transform book titles into embeddings using the book_model
test_map = tf_books_map.map(lambda x: {"book_embedding": model.book_model(x), "book_id": x["book_id"]})

# Use 'index_from_dataset'
index.index_from_dataset(
    tf_books_map.batch(100).map(lambda x: (x["book_title"], x["book_id"]))
)

ValueError: in user code:

    File "C:\Users\awast\AppData\Local\Temp\ipykernel_79924\1463076740.py", line 2, in None  *
        lambda x: {"book_embedding": model.book_model(x), "book_id": x["book_id"]}
    File "C:\Users\awast\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\awast\AppData\Local\Temp\__autograph_generated_filedywseif7.py", line 12, in tf__call
        retval_ = ag__.converted_call(ag__.ld(tf).concat, ([ag__.converted_call(ag__.ld(self).book_embedding, (ag__.ld(inputs)['book_id'],), None, fscope), ag__.converted_call(ag__.ld(self).title_embedding, (ag__.ld(inputs)['book_title'],), None, fscope)],), dict(axis=1), fscope)

    ValueError: Exception encountered when calling layer 'book_model_16' (type BookModel).
    
    in user code:
    
        File "C:\Users\awast\AppData\Local\Temp\ipykernel_79924\948094142.py", line 28, in call  *
            self.title_embedding(inputs["book_title"]),
        File "C:\Users\awast\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\awast\.conda\envs\tf\lib\site-packages\keras\engine\input_spec.py", line 235, in assert_input_compatibility
            raise ValueError(
    
        ValueError: Exception encountered when calling layer 'sequential_112' (type Sequential).
        
        Input 0 of layer "global_average_pooling1d_10" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 32)
        
        Call arguments received by layer 'sequential_112' (type Sequential):
          • inputs=tf.Tensor(shape=(), dtype=string)
          • training=None
          • mask=None
    
    
    Call arguments received by layer 'book_model_16' (type BookModel):
      • inputs={'book_id': 'tf.Tensor(shape=(), dtype=string)', 'book_title': 'tf.Tensor(shape=(), dtype=string)'}
