In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras

In [2]:
ratings = pd.read_csv('./data/movielens_small/ratings.csv')

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
ratings.drop(columns=['timestamp'], inplace=True)

In [5]:
ratings = ratings.pivot(columns='userId', index='movieId', values='rating')

In [6]:
# R[n, m]
# n - number of movies
# m - number of users
# R[i][j] = 1 if the user rated a movie, 0 otherwise

R = (~ratings.isna()).values.astype(int)
R

array([[1, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
# Y[n, m]
# n - number of movies
# m - number of users
# Y[i][j] = the rating (from .5 to 5 in .5 increments) user (j) gave for movie (i)

Y = ratings.fillna(0).values
Y

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [8]:
# Set random seed for reproducibility
tf.random.set_seed(42)

n_features = 100
n_movies, n_users = Y.shape
W = tf.Variable(tf.random.normal([n_users, n_features]))
X = tf.Variable(tf.random.normal([n_movies, n_features]))
b = tf.Variable(tf.zeros([1, n_users]))

Y = tf.convert_to_tensor(Y, dtype=tf.float32)
R = tf.convert_to_tensor(R, dtype=tf.float32)

In [9]:
# Perform mean normalization
# so when a new user comes his initial recommendations will be the mean of user ratings / movie

Y_mu = tf.expand_dims(tf.reduce_mean(Y, axis=1), 1)
Y -= Y_mu

In [10]:
def model(X: tf.Tensor, W: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
    return tf.matmul(X, W, transpose_b=True) + b


def cost_fn(Y: tf.Tensor, Y_hat: tf.Tensor, R: tf.Tensor, lam: float) -> float:
    error = tf.reduce_sum((R * (Y_hat - Y)) ** 2)
    l2_weight_reg = tf.reduce_sum(W ** 2)
    l2_x_reg = tf.reduce_sum(X ** 2)

    return .5 * error + (.5 * lam) * (l2_weight_reg + l2_x_reg)

In [11]:
optimizer = keras.optimizers.Adam(learning_rate=1e-1)
lam = 1
epochs = 400

for epoch in range(epochs):
    with tf.GradientTape() as tape:
        Y_hat = model(X, W, b)
        loss = cost_fn(Y, Y_hat, R, lam)
    
    if epoch % (epochs // 10) == 0:
        print(f'Epoch[{epoch}] loss: {loss:.4f}')
    
    grads = tape.gradient(loss, [X, W, b])
    optimizer.apply_gradients(zip(grads, [X, W, b]))

Epoch[0] loss: 6079012.5000
Epoch[40] loss: 126538.8438
Epoch[80] loss: 37831.5547
Epoch[120] loss: 17705.3320
Epoch[160] loss: 10999.0137
Epoch[200] loss: 8211.1348
Epoch[240] loss: 6845.9336
Epoch[280] loss: 6082.5552
Epoch[320] loss: 5609.2949
Epoch[360] loss: 5293.6650


In [12]:
movies_df = pd.read_csv('./data/movielens_small/movies.csv')

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
movies_df.set_index('movieId', inplace=True)

movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [14]:
# Predict the ratings that users will give for all users

Y_hat = model(X, W, b) + Y_mu

In [15]:
# Get the ratings of the 231th user

userId = 20

predicted_ratings = Y_hat[:, userId]
actual_ratings = Y[:, userId]
movies_watched = R[:, userId]
movies_not_watched = 1 - R[:, userId]

top_rated_watched = tf.argsort(actual_ratings * movies_watched, direction="DESCENDING").numpy()
top_recommended = tf.argsort(predicted_ratings * movies_not_watched, direction="DESCENDING").numpy()

# ratings.numpy()[idx_recommended[:10]]
# ratings.numpy()[idx_watched[:10]]

In [16]:
rating_diff = ((((predicted_ratings + Y_mu) - actual_ratings) * movies_watched) ** 2).numpy()

In [17]:
print(f'mean rating diff: {rating_diff.mean()}')
print(f'std rating diff: {rating_diff.std()}')

mean rating diff: 0.012858806177973747
std rating diff: 0.14651431143283844


In [18]:
movies_df.iloc[top_rated_watched[:20]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7541,100 Girls (2000),Comedy|Romance
47610,"Illusionist, The (2006)",Drama|Fantasy|Mystery|Romance
5847,Ragtime (1981),Drama
3079,Mansfield Park (1999),Comedy|Drama|Romance
1721,Titanic (1997),Drama|Romance
370,Naked Gun 33 1/3: The Final Insult (1994),Action|Comedy
2011,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
2010,Metropolis (1927),Drama|Sci-Fi
78116,Please Give (2010),Comedy|Drama
7492,Martin (1977),Drama|Horror


In [19]:
movies_df.iloc[top_recommended[:20]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama
4990,Jimmy Neutron: Boy Genius (2001),Adventure|Animation|Children|Comedy
349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller
1484,"Daytrippers, The (1996)",Comedy|Drama|Mystery|Romance
34437,Broken Flowers (2005),Comedy|Drama
1747,Wag the Dog (1997),Comedy
3173,Any Given Sunday (1999),Drama
1206,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
454,"Firm, The (1993)",Drama|Thriller
1379,Young Guns II (1990),Action|Western
