1) Set up Colab runtime environment

1.1. Download Dataset

In [1]:
# Clone the repository that holds the recommender-system data files; later cells read them from /content/deep-learning/recommender_system/data
!git clone https://github.com/balarcode/deep-learning.git

Cloning into 'deep-learning'...
remote: Enumerating objects: 1064, done.[K
remote: Counting objects: 100% (111/111), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 1064 (delta 62), reused 19 (delta 11), pack-reused 953 (from 1)[K
Receiving objects: 100% (1064/1064), 26.45 MiB | 13.62 MiB/s, done.
Resolving deltas: 100% (128/128), done.


1.2. Common Imports

In [2]:
# Standard library
import csv
import pickle # Python object serialization
from collections import defaultdict

# Third-party
import numpy as np
import numpy.ma as ma # numpy.ma supports data arrays with masks
import pandas as pd
import tabulate # To print tables
import tensorflow as tf
# display/HTML render HTML code in the output cell. They are imported from the public
# IPython.display module: importing display from IPython.core.display is deprecated
# and may be unavailable in newer IPython releases.
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split # To split and shuffle the dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler # Helpful routines from scikit-learn
from tensorflow import keras

pd.set_option("display.precision", 1) # Configure pandas to display numerical values with a precision of 1 decimal place

2) Set Up Configuration

In [3]:
# Neural network settings
ACTIVATION = 'relu'
FIRST_LAYER_UNITS = 256
SECOND_LAYER_UNITS = 128
NUM_OUTPUTS = 32
LEARNING_RATE = 0.01
EPOCHS = 40

# Directory (inside the cloned repository) that holds every input file read below
DATA_DIR = '/content/deep-learning/recommender_system/data'

# Input dataset (user, item and target)
user_train = np.genfromtxt(f'{DATA_DIR}/content_user_train.csv', delimiter=',')
item_train = np.genfromtxt(f'{DATA_DIR}/content_item_train.csv', delimiter=',')
y_train    = np.genfromtxt(f'{DATA_DIR}/content_y_train.csv', delimiter=',')

# The three arrays are fed to the model together (one user row, one item row and
# one target per example), so they must have the same number of rows.
assert len(user_train) == len(item_train) == len(y_train), \
    "user, item and target training arrays must have the same number of rows"

# Given two feature vectors (user features and item features), the task of the recommender
# system is to predict whether or not an item ‘i’ is a good match to the user ‘j’.
# This is achieved by content-based filtering algorithm.
# In this example, movies are considered as items.

# Column names: the first CSV row of each header file
with open(f'{DATA_DIR}/content_user_train_header.txt', newline='', encoding='utf-8') as f:
  user_features = next(csv.reader(f))

with open(f'{DATA_DIR}/content_item_train_header.txt', newline='', encoding='utf-8') as f:
  item_features = next(csv.reader(f))

# Item vectors that are scored against a single user at inference time
item_vecs = np.genfromtxt(f'{DATA_DIR}/content_item_vecs.csv', delimiter=',')

# movie_dict[movie_id] -> {"title": ..., "genres": ...}
# Titles contain non-ASCII characters, so the encoding is stated explicitly
# instead of relying on the platform default.
movie_dict = defaultdict(dict)
with open(f'{DATA_DIR}/content_movie_list.csv', newline='', encoding='utf-8') as csvfile:
  reader = csv.reader(csvfile, delimiter=',', quotechar='"')
  next(reader, None) # Skip the header
  for line in reader:
    movie_id = int(line[0])
    movie_dict[movie_id]["title"]  = line[1]
    movie_dict[movie_id]["genres"] = line[2]

# SECURITY: pickle.load can execute arbitrary code while deserializing;
# only load this file from a source you trust.
with open(f'{DATA_DIR}/content_user_to_genre.pickle', 'rb') as f:
  user_to_genre = pickle.load(f)

num_user_features = (user_train.shape[1] - 3)  # remove user id, rating count and average rating
num_item_features = (item_train.shape[1] - 1)  # remove movie id
print(f"Number of user features: {num_user_features}")
print(f"Number of item features: {num_item_features}")

print(f"Number of input user   vectors: {len(user_train)}")
print(f"Number of input item   vectors: {len(item_train)}")
print(f"Number of input target vectors: {len(y_train)}")

Number of user features: 14
Number of item features: 16
Number of input user   vectors: 50884
Number of input item   vectors: 50884
Number of input target vectors: 50884


  user_to_genre = pickle.load(f)


3) Create Custom Dataset

In [4]:
# Feature scaling. References to the raw arrays are kept so that the round trip
# can be verified below and so that unscaled rows remain available later on.
# NOTE(review): every scaler here is fitted on the full dataset, i.e. before the
# train/validation split performed in the next section, so statistics of the
# validation rows influence the scaling of the training rows.
item_train_unscaled, user_train_unscaled, y_train_unscaled = item_train, user_train, y_train

# Z-score normalization (zero mean, unit variance per column) for the item features
scalerItem = StandardScaler()
item_train = scalerItem.fit_transform(item_train_unscaled)

# Z-score normalization for the user features
scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_train_unscaled)

# The target is rescaled to the range [-1, 1]; the scaler expects a 2-D column
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_train_unscaled.reshape(-1, 1))

# Sanity check: inverse_transform() must recover the unscaled data
for raw, scaled, scaler in ((item_train_unscaled, item_train, scalerItem),
                            (user_train_unscaled, user_train, scalerUser)):
  print(np.allclose(raw, scaler.inverse_transform(scaled)))

True
True


4) Create Training and Validation (Test) Sets

In [5]:
# Split all three arrays in ONE call so that the same shuffled row permutation is
# applied to items, users and targets. Separate calls stay row-aligned only as
# long as every call happens to use the same seed and array length.
(item_train, item_test,
 user_train, user_test,
 y_train,    y_test) = train_test_split(item_train, user_train, y_train,
                                        train_size=0.80, shuffle=True, random_state=1)

print(f"Movie/Item training set shape: {item_train.shape}")
print(f"Movie/Item validation set shape: {item_test.shape}")

Movie/Item training set shape: (40707, 17)
Movie/Item validation set shape: (10177, 17)


5) Create Keras Sequential Model

In [6]:
def _make_tower():
  """Return a new, unbuilt dense network: two hidden layers followed by a linear output layer."""
  return tf.keras.models.Sequential([
      tf.keras.layers.Dense(units=FIRST_LAYER_UNITS,  activation=ACTIVATION, name='l1'),
      tf.keras.layers.Dense(units=SECOND_LAYER_UNITS, activation=ACTIVATION, name='l2'),
      tf.keras.layers.Dense(units=NUM_OUTPUTS, name='l3')
  ])


def _unit_embedding(tower, num_features):
  """Create an input of width num_features, pass it through tower and L2-normalize the result.

  Returns the (input tensor, normalized embedding tensor) pair.
  """
  features = tf.keras.layers.Input(shape=(num_features, ))
  embedding = tower(features)
  embedding = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(embedding)
  return features, embedding


tf.random.set_seed(1)

# One network per side: users and items (i.e. movies)
user_NN = _make_tower()
item_NN = _make_tower()

# vu: feature vector of the user, vm: feature vector of the item
input_user, vu = _unit_embedding(user_NN, num_user_features)
input_item, vm = _unit_embedding(item_NN, num_item_features)

# The dot product of vu and vm is the prediction of the content-based filtering model
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Two inputs (user features, item features), one output
model = tf.keras.Model([input_user, input_item], output)

model.summary()

# Mean squared error cost, minimized with the Adam optimizer
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=opt, loss=cost_fn)

6) Train the Model

In [7]:
# Fit the model on the training split
tf.random.set_seed(1)
# Leading columns that are not fed to the network:
#   users: columns 0-2 (user id, rating count, average rating)
#   items: column 0 (movie id)
u_idx, i_idx = 3, 1
train_inputs = [user_train[:, u_idx:], item_train[:, i_idx:]]
model.fit(train_inputs, y_train, epochs=EPOCHS)

Epoch 1/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.1305
Epoch 2/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.1152
Epoch 3/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1091
Epoch 4/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1045
Epoch 5/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1002
Epoch 6/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0973
Epoch 7/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0952
Epoch 8/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0934
Epoch 9/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0919
Epoch 10/40
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x79edcc3f3850>

In [8]:
# Compute the loss on the held-out validation (test) split, using the same
# feature columns that were used for training
test_inputs = [user_test[:, u_idx:], item_test[:, i_idx:]]
model.evaluate(test_inputs, y_test)

[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0854


0.08149166405200958

7) Inference or Predictions for an Existing User

Now that the recommender system model has been built using a neural network, predictions for an existing user can be made from that user's preferences, i.e., the ratings the user has provided for the items (movies). The predictions, y_p, can then be compared with the user's actual ratings, y.

In [9]:
user_id = 10 # Existing user with user ID: 10

# Fail fast on an unusable user_id. Merely printing an error and continuing would
# either crash further down with a NameError or silently reuse user_vecs / y_vecs
# left over from a previously executed cell.
if user_id not in user_to_genre:
  raise KeyError(f"error: unknown user_id {user_id}")

# First row of the unscaled user matrix that belongs to this user (column 0 is the user id)
user_rows = np.flatnonzero(user_train_unscaled[:, 0] == user_id)
if user_rows.size == 0:
  raise LookupError(f"error: did not find user_id {user_id} in user_train")
user_vec = user_train_unscaled[user_rows[0]]

# Pair the same user vector with every candidate item
num_items = len(item_vecs)
user_vecs = np.tile(user_vec, (num_items, 1))

# Rating given by the user to each item (column 0 of item_vecs is the movie id);
# 0 marks an item that the user has not rated
user_ratings = user_to_genre[user_id]['movies']
y_vecs = np.array([user_ratings[movie_id] if movie_id in user_ratings else 0
                   for movie_id in item_vecs[:, 0]], dtype=float)

# Scale user and item vectors with the scalers fitted earlier
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Inference or Prediction (skip the leading columns that are not model inputs)
u_idx = 3
i_idx = 1
y_p = model.predict([suser_vecs[:, u_idx:], sitem_vecs[:, i_idx:]])

# Map the predictions from the scaled target range back to the rating scale
y_pu = scalerTarget.inverse_transform(y_p)

# Sort the results such that highest prediction is first
sorted_index      = np.argsort(-y_pu, axis=0).reshape(-1).tolist()
sorted_ypu        = y_pu[sorted_index]
sorted_items      = item_vecs[sorted_index]
sorted_user       = user_vecs[sorted_index]
sorted_y          = y_vecs[sorted_index]
sorted_y_reshaped = sorted_y.reshape(-1,1)

# Table of the movies already rated by this user, ordered by predicted rating,
# so that the prediction y_p can be compared with the actual rating y
maxcount = 50 # Maximum number of rows in the table
disp = [["y_p", "y", "user", "user genre ave", "movie rating ave", "movie id", "title", "genres"]]
rated_rows = np.flatnonzero(sorted_y_reshaped[:, 0] != 0)[:maxcount] # Zero means that movie is not rated by the existing user
for i in rated_rows:
  movie_id = sorted_items[i, 0].astype(int)
  # Item columns 3+ that equal 1 select the user columns (same offset of 3)
  # shown under "user genre ave"
  offsets  = np.nonzero(sorted_items[i, 3:] == 1)[0]
  genre_ratings = sorted_user[i, 3 + offsets]
  disp.append([sorted_ypu[i, 0], # "y_p"
               sorted_y_reshaped[i, 0], # "y"
               sorted_user[i, 0].astype(int), # "user"
               np.array2string(genre_ratings,
                               formatter={'float_kind':lambda x: "%.1f" % x},
                               separator=',', suppress_small=True), # "user genre ave"
               sorted_items[i, 2].astype(float), # "movie rating ave"
               movie_id,
               movie_dict[movie_id]["title"],
               movie_dict[movie_id]["genres"]])

display(HTML(tabulate.tabulate(disp, tablefmt="html", headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".1f"])))

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.3,5.0,10,"[3.9,3.8,3.3,3.5]",3.7,79091,Despicable Me (2010),Animation|Children|Comedy|Crime
4.1,4.5,10,"[3.6,3.9,3.8,3.3,3.8,3.5]",3.9,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Romance
4.1,4.5,10,"[3.6,3.9,3.3,3.8,3.5]",3.6,106696,Frozen (2013),Adventure|Animation|Comedy|Fantasy|Romance
4.0,4.0,10,"[3.6,3.9,3.8,3.3]",4.0,68954,Up (2009),Adventure|Animation|Children|Drama
4.0,3.5,10,"[3.7,3.5,3.9]",3.8,8665,"Bourne Supremacy, The (2004)",Action|Crime|Thriller
4.0,3.5,10,"[3.7,3.5,3.9]",3.7,54286,"Bourne Ultimatum, The (2007)",Action|Crime|Thriller
3.9,4.0,10,"[3.9,3.8,3.3]",3.6,103335,Despicable Me 2 (2013),Animation|Children|Comedy
3.9,4.0,10,"[3.7,3.6,3.9,3.8]",3.5,95167,Brave (2012),Action|Adventure|Animation|Children
3.9,5.0,10,"[3.7,3.5]",3.9,33794,Batman Begins (2005),Action|Crime
3.8,3.5,10,"[3.9,3.8,3.3,3.8,3.5]",3.9,81847,Tangled (2010),Animation|Children|Comedy|Fantasy|Romance


In [10]:
user_id = 600 # Existing user with user ID: 600

# Fail fast on an unusable user_id. Merely printing an error and continuing would
# either crash further down with a NameError or silently reuse user_vecs / y_vecs
# left over from a previously executed cell.
if user_id not in user_to_genre:
  raise KeyError(f"error: unknown user_id {user_id}")

# First row of the unscaled user matrix that belongs to this user (column 0 is the user id)
user_rows = np.flatnonzero(user_train_unscaled[:, 0] == user_id)
if user_rows.size == 0:
  raise LookupError(f"error: did not find user_id {user_id} in user_train")
user_vec = user_train_unscaled[user_rows[0]]

# Pair the same user vector with every candidate item
num_items = len(item_vecs)
user_vecs = np.tile(user_vec, (num_items, 1))

# Rating given by the user to each item (column 0 of item_vecs is the movie id);
# 0 marks an item that the user has not rated
user_ratings = user_to_genre[user_id]['movies']
y_vecs = np.array([user_ratings[movie_id] if movie_id in user_ratings else 0
                   for movie_id in item_vecs[:, 0]], dtype=float)

# Scale user and item vectors with the scalers fitted earlier
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Inference or Prediction (skip the leading columns that are not model inputs)
u_idx = 3
i_idx = 1
y_p = model.predict([suser_vecs[:, u_idx:], sitem_vecs[:, i_idx:]])

# Map the predictions from the scaled target range back to the rating scale
y_pu = scalerTarget.inverse_transform(y_p)

# Sort the results such that highest prediction is first
sorted_index      = np.argsort(-y_pu, axis=0).reshape(-1).tolist()
sorted_ypu        = y_pu[sorted_index]
sorted_items      = item_vecs[sorted_index]
sorted_user       = user_vecs[sorted_index]
sorted_y          = y_vecs[sorted_index]
sorted_y_reshaped = sorted_y.reshape(-1,1)

# Table of the movies already rated by this user, ordered by predicted rating,
# so that the prediction y_p can be compared with the actual rating y
maxcount = 50 # Maximum number of rows in the table
disp = [["y_p", "y", "user", "user genre ave", "movie rating ave", "movie id", "title", "genres"]]
rated_rows = np.flatnonzero(sorted_y_reshaped[:, 0] != 0)[:maxcount] # Zero means that movie is not rated by the existing user
for i in rated_rows:
  movie_id = sorted_items[i, 0].astype(int)
  # Item columns 3+ that equal 1 select the user columns (same offset of 3)
  # shown under "user genre ave"
  offsets  = np.nonzero(sorted_items[i, 3:] == 1)[0]
  genre_ratings = sorted_user[i, 3 + offsets]
  disp.append([sorted_ypu[i, 0], # "y_p"
               sorted_y_reshaped[i, 0], # "y"
               sorted_user[i, 0].astype(int), # "user"
               np.array2string(genre_ratings,
                               formatter={'float_kind':lambda x: "%.1f" % x},
                               separator=',', suppress_small=True), # "user genre ave"
               sorted_items[i, 2].astype(float), # "movie rating ave"
               movie_id,
               movie_dict[movie_id]["title"],
               movie_dict[movie_id]["genres"]])

display(HTML(tabulate.tabulate(disp, tablefmt="html", headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".1f"])))

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.2,5.0,600,"[3.0,3.9,3.8]",4.2,5618,Spirited Away (Sen to Chihiro no kamikakushi) (2001),Adventure|Animation|Fantasy
4.1,4.5,600,"[3.0,3.8,2.5]",3.8,48394,"Pan's Labyrinth (Laberinto del fauno, El) (2006)",Drama|Fantasy|Thriller
4.1,4.0,600,"[3.9,3.0,3.8]",3.7,4873,Waking Life (2001),Animation|Drama|Fantasy
4.1,4.0,600,"[3.9,3.8,2.5]",3.7,66097,Coraline (2009),Animation|Fantasy|Thriller
4.1,5.0,600,"[2.3,3.0,3.0,3.8]",4.1,7153,"Lord of the Rings: The Return of the King, The (2003)",Action|Adventure|Drama|Fantasy
4.0,5.0,600,"[3.0,3.8]",4.1,4993,"Lord of the Rings: The Fellowship of the Ring, The (2001)",Adventure|Fantasy
4.0,5.0,600,"[3.0,3.8]",4.0,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
4.0,4.5,600,"[3.1,2.9]",4.0,8874,Shaun of the Dead (2004),Comedy|Horror
3.9,3.5,600,"[3.1,3.0,3.8,3.1]",3.9,48082,"Science of Sleep, The (La science des rêves) (2006)",Comedy|Drama|Fantasy|Romance
3.9,5.0,600,"[3.0,3.1,3.8]",3.5,30810,"Life Aquatic with Steve Zissou, The (2004)",Adventure|Comedy|Fantasy
