In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
movies = pd.read_csv('/Users/arun/Downloads/ml-latest-small/movies.csv')
ratings = pd.read_csv('/Users/arun/Downloads/ml-latest-small/ratings.csv')


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
user_item_matrix = ratings.pivot(index='movieId', columns='userId', values='rating')
user_item_matrix.fillna(0, inplace=True) 
user_item_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
R = user_item_matrix.values
R


array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [7]:
num_users, num_items = R.shape
num_factors = 10  # Number of latent factors

In [8]:
mask = (R > 0).astype(np.float32)  # Create mask where there are actual ratings
R = R.astype(np.float32)

In [9]:
class MatrixFactorization(tf.Module):
    def __init__(self, num_users, num_items, num_factors, reg=0.02):
        # Initialize user and item latent feature matrices
        self.user_factors = tf.Variable(tf.random.normal([num_users, num_factors]))
        self.item_factors = tf.Variable(tf.random.normal([num_items, num_factors]))
        self.reg = reg  # Regularization parameter

    def __call__(self, R, mask):
        # Predicted ratings: dot product of user and item latent features
        R_pred = tf.matmul(self.user_factors, self.item_factors, transpose_b=True)
        error = tf.reduce_mean(tf.square(tf.multiply(mask, R - R_pred)))
        
        # Regularization term to prevent overfitting
        reg_term = self.reg * (tf.reduce_sum(tf.square(self.user_factors)) + 
                               tf.reduce_sum(tf.square(self.item_factors)))
        return error + reg_term

In [10]:
model = MatrixFactorization(num_users, num_items, num_factors)

optimizer = tf.optimizers.Adam(learning_rate=0.01)

def train_step(R, mask):
    with tf.GradientTape() as tape:
        loss = model(R, mask)
    gradients = tape.gradient(loss, [model.user_factors, model.item_factors])
    optimizer.apply_gradients(zip(gradients, [model.user_factors, model.item_factors]))
    return loss

epochs = 1000
for epoch in range(epochs):
    loss = train_step(R, mask)
    if epoch % 100 == 0:  # Print loss every 100 epochs
        print(f'Epoch {epoch}, Loss: {loss.numpy()}')

Epoch 0, Loss: 2071.490966796875
Epoch 100, Loss: 413.9208984375
Epoch 200, Loss: 88.67325592041016
Epoch 300, Loss: 19.510852813720703
Epoch 400, Loss: 4.438979625701904
Epoch 500, Loss: 1.1467736959457397
Epoch 600, Loss: 0.42899543046951294
Epoch 700, Loss: 0.27184516191482544
Epoch 800, Loss: 0.23701521754264832
Epoch 900, Loss: 0.22918429970741272


In [11]:
# Predict the ratings
R_pred = tf.matmul(model.user_factors, model.item_factors, transpose_b=True)

In [12]:
def rmse(R, R_pred, mask):
    error = tf.reduce_sum(tf.square(tf.multiply(mask, R - R_pred)))
    count = tf.reduce_sum(mask)
    return tf.sqrt(error / count)

print(f'RMSE: {rmse(R, R_pred, mask).numpy()}')


RMSE: 3.6534583568573


In [13]:
def recommend_movies(user_id, R_pred, movies, num_recommendations):
    user_ratings = R_pred[user_id - 1].numpy()  # Get predicted ratings for the user
    top_movie_indices = np.argsort(user_ratings)[::-1][:num_recommendations]  # Sort movies by predicted rating

    # Retrieve movie titles
    recommended_movies = movies[movies['movieId'].isin(top_movie_indices)]
    return recommended_movies[['movieId', 'title']]

# Example: Recommend 5 movies for user 1
recommended_movies = recommend_movies(user_id=45, R_pred=R_pred, movies=movies, num_recommendations=7)
print(recommended_movies)

     movieId                               title
6          7                      Sabrina (1995)
12        13                        Balto (1995)
115      140        Up Close and Personal (1996)
187      219                    Cure, The (1995)
234      272  Madness of King George, The (1994)
304      346                     Backbeat (1993)
420      482                  Killing Zoe (1994)
