In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import sparse

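The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service (see https://grouplens.org/datasets/movielens/ ). The dataset description quoted here lists 100004 ratings and 1296 tag applications across 9125 movies; these figures differ between releases, and the copy loaded in this notebook contains 610 users, 9724 rated movies and more than 100,800 ratings (see the outputs below).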

# MovieLens Dataset

In [2]:
# Folder that holds the dataset CSV files; the path is relative to the
# notebook's working directory.
PATH = Path("ml-latest-small")
# Show the folder contents so a missing or misplaced download is spotted early.
list(PATH.iterdir())

[PosixPath('ml-latest-small/links.csv'),
 PosixPath('ml-latest-small/tags.csv'),
 PosixPath('ml-latest-small/ratings.csv'),
 PosixPath('ml-latest-small/README.txt'),
 PosixPath('ml-latest-small/movies.csv')]

In [3]:
# Load the ratings table (one row per user/movie rating) and preview the
# first rows; userId and movieId are still the raw dataset ids at this point.
df = pd.read_csv(PATH/"ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Encode Data

In [4]:
def proc_col(col):
    """
    Map the values of a pandas column onto contiguous integer codes.

    Codes are assigned in order of first appearance, so they run from
    0 to n-1 where n is the number of distinct values in the column.

    Returns:
      code_of: dict mapping each original value to its code
      codes: numpy array with the code of every entry of col, in order
      n: number of distinct values
    """
    distinct = col.unique()
    code_of = dict(zip(distinct, range(len(distinct))))
    codes = np.array([code_of[value] for value in col])
    return code_of, codes, len(distinct)

In [5]:
def encode_data(df_train):
    """
    Encodes rating data with continuous user and movie ids using proc_col.

    The input dataframe is left untouched: the encoded ids are written to a
    new dataframe, so the raw ids in df_train stay available (encode_valid
    needs them to rebuild the same mapping).

    Inputs:
      df_train: a dataframe with columns userId, movieId, rating

    Returns:
      df: a new dataframe with the encoded data
      num_users
      num_movies
    """
    # One proc_col pass per column yields both the codes and the count.
    _, user_codes, num_users = proc_col(df_train.userId)
    _, movie_codes, num_movies = proc_col(df_train.movieId)

    # assign() returns a copy, keeping column order and all other columns.
    df_encoded = df_train.assign(userId=user_codes, movieId=movie_codes)
    return df_encoded, num_users, num_movies

In [6]:
# From here on `df` is the encoded frame: userId / movieId are contiguous
# 0-based indices (order of first appearance), not the raw dataset ids.
df, num_users, num_movies = encode_data(df)
print(f'num_users : {num_users}')
print(f'num_movies : {num_movies}')
df

num_users : 610
num_movies : 9724


Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931
...,...,...,...,...
100831,609,3120,4.0,1493848402
100832,609,2035,5.0,1493850091
100833,609,3121,5.0,1494273047
100834,609,1392,5.0,1493846352


In [7]:
def encode_valid(df_val, df_train):
    """
    Encodes df_val with the same encoding as df_train.

    The mapping is rebuilt from df_train with proc_col, so df_train must
    hold ids in the same id space as df_val. Rows of df_val whose user or
    movie does not occur in df_train are dropped. Neither input dataframe
    is modified.

    Returns:
    df_val: new dataframe with the same encoding as df_train
    """
    name2idx_user = proc_col(df_train.userId)[0]
    name2idx_movie = proc_col(df_train.movieId)[0]

    # -1 marks ids that were never seen in the training data.
    user_codes = np.array([name2idx_user.get(x, -1) for x in df_val.userId])
    movie_codes = np.array([name2idx_movie.get(x, -1) for x in df_val.movieId])

    # A single combined mask avoids filtering an already-filtered frame with
    # a mask that was computed on the unfiltered one.
    known = (user_codes != -1) & (movie_codes != -1)
    return df_val.assign(userId=user_codes, movieId=movie_codes)[known]

# Create Embeddings

In [8]:
def create_embedings(n, K, seed=3):
    """
    Create a numpy random matrix of shape n, K
    The random matrix is initialized with uniform values in [0, 6/K)

    Inputs:
    n: number of items/users
    K: number of factors in the embedding
    seed: value handed to np.random.seed before sampling. The default (3)
          reproduces the previous hard-coded behaviour; pass a different
          value to get a different initialisation.

    Returns:
    emb: numpy array of shape (n, K)
    """
    # The global NumPy generator is reseeded on every call, so the result
    # depends only on (n, K, seed) and not on earlier random draws.
    np.random.seed(seed)
    emb = 6*np.random.random((n, K)) / K
    return emb

In [9]:
# K is the number of latent factors per user / movie.
K = 5
# create_embedings reseeds NumPy with the same fixed seed on every call, so
# both matrices below are drawn from the same random sequence.
emb_user = create_embedings(num_users,K)
emb_movie = create_embedings(num_movies,K)
print(emb_user)

[[0.66095748 0.84977739 0.34908569 0.61299313 1.07153635]
 [1.07555171 0.15070237 0.24869145 0.06176064 0.52897181]
 [0.03585145 0.54819987 0.77897286 0.33418474 0.81150588]
 ...
 [0.94263609 0.84922124 0.60350821 0.80325799 0.30124549]
 [0.00896068 0.26912132 0.36652313 0.23476005 0.32429156]
 [0.80063981 0.30614346 0.36021625 1.17441335 0.18542009]]


# Utility Matrix

In [10]:
def df2matrix(df, nrows, ncols, column_name="rating"):
    """
    Build a sparse (CSC) matrix of shape (nrows, ncols) from a dataframe.

    Row indices are read from df['userId'], column indices from
    df['movieId'], and the stored values from df[column_name].
    """
    entries = df[column_name].values
    col_idx, row_idx = df['movieId'].values, df['userId'].values
    return sparse.csc_matrix((entries, (row_idx, col_idx)), shape=(nrows, ncols))

In [11]:
# Ratings as a sparse users x movies matrix; printing it lists the stored
# (user index, movie index) -> rating entries.
Y = df2matrix(df, num_users, num_movies)
print(Y) 

  (0, 0)	4.0
  (4, 0)	4.0
  (6, 0)	4.5
  (14, 0)	2.5
  (16, 0)	4.5
  (17, 0)	3.5
  (18, 0)	4.0
  (20, 0)	3.5
  (26, 0)	3.0
  (30, 0)	5.0
  (31, 0)	3.0
  (32, 0)	3.0
  (39, 0)	5.0
  (42, 0)	5.0
  (43, 0)	3.0
  (44, 0)	4.0
  (45, 0)	5.0
  (49, 0)	3.0
  (53, 0)	3.0
  (56, 0)	5.0
  (62, 0)	5.0
  (63, 0)	4.0
  (65, 0)	4.0
  (67, 0)	2.5
  (70, 0)	5.0
  :	:
  (609, 9699)	3.5
  (609, 9700)	3.5
  (609, 9701)	4.0
  (609, 9702)	1.5
  (609, 9703)	5.0
  (609, 9704)	3.0
  (609, 9705)	3.0
  (609, 9706)	4.0
  (609, 9707)	4.0
  (609, 9708)	3.5
  (609, 9709)	3.0
  (609, 9710)	4.0
  (609, 9711)	0.5
  (609, 9712)	4.0
  (609, 9713)	4.0
  (609, 9714)	3.0
  (609, 9715)	3.5
  (609, 9716)	3.5
  (609, 9717)	3.5
  (609, 9718)	3.5
  (609, 9719)	2.5
  (609, 9720)	4.5
  (609, 9721)	3.0
  (609, 9722)	3.5
  (609, 9723)	3.5


# Loss Function MSE

In [12]:
def cost(df, emb_user, emb_movie):
    """
    Mean squared error between the observed ratings and the model's predictions.

    The prediction for a row with user i and movie j is the dot product of
    emb_user[i] and emb_movie[j].

    Inputs:
      df: dataframe with all data or a subset of the data
          (columns userId, movieId, rating; ids index into the embeddings)
      emb_user: embeddings for users
      emb_movie: embeddings for movies

    Returns:
      error(float): MSE
    """
    user_rows = emb_user[df["userId"].values]
    movie_rows = emb_movie[df["movieId"].values]
    # Row-wise dot product: one prediction per rating in df.
    pred = (user_rows * movie_rows).sum(axis=1)
    squared_err = (df['rating'] - pred) ** 2
    return squared_err.mean()

In [13]:
# Baseline: training MSE of the untrained, randomly initialised embeddings.
error = cost(df, emb_user, emb_movie)
error

4.4458561622796235

# Gradient descent with momentum

In [14]:
def sparse_multiply(df, emb_user, emb_movie):
    """
    This function returns U*V^T element wise multiplied by R as a sparse matrix.
    It avoids creating the dense matrix U*V^T: predictions are only computed
    for the (userId, movieId) pairs listed in df.

    The input dataframe is not modified.

    Returns:
      sparse CSC matrix of shape (emb_user.shape[0], emb_movie.shape[0])
      holding the predicted rating at every (user, movie) position in df
    """
    ind_user = df["userId"].values
    ind_movie = df["movieId"].values
    # Row-wise dot product of the matching user and movie embeddings.
    pred = np.sum(emb_user[ind_user]*emb_movie[ind_movie], axis=1)
    # Build the matrix from local arrays rather than writing a scratch
    # column onto the caller's dataframe.
    return sparse.csc_matrix((pred, (ind_user, ind_movie)),
                             shape=(emb_user.shape[0], emb_movie.shape[0]))


def gradient(df, Y, emb_user, emb_movie):
    """
    Computes the gradient of the MSE with respect to both embedding matrices.

    First compute prediction. Prediction for user i and movie j is the dot
    product of emb_user[i] and emb_movie[j].

    Inputs:
      df: dataframe with all data or a subset of the data
      Y: sparse representation of df
      emb_user: embeddings for users
      emb_movie: embeddings for movies

    Returns:
      d_emb_user: array with the same shape as emb_user
      d_emb_movie: array with the same shape as emb_movie
    """
    # R is the indicator matrix of the (user, movie) pairs present in df.
    val_one = np.ones(df['rating'].values.shape)
    ind_movie = df['movieId'].values
    ind_user = df['userId'].values
    R = sparse.csc_matrix((val_one,(ind_user, ind_movie)),shape=(emb_user.shape[0], emb_movie.shape[0]))

    # For a scipy sparse matrix, .size is the number of stored entries.
    N = Y.size
    Y_pred = sparse_multiply(df, emb_user, emb_movie)

    # The masked residual feeds both gradients, so compute it only once.
    residual = (Y - Y_pred).multiply(R)

    grad_user = -(2/N)*(residual.dot(emb_movie))
    grad_movie = -(2/N)*(residual.T.dot(emb_user))
    return grad_user, grad_movie

In [15]:
# Rebuild Y with its shape taken from the embedding matrices, then evaluate
# the gradient once at the initial embeddings to inspect its values.
Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])
grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
print(f'grad_user: {grad_user}')
print(f'grad_movie: {grad_movie}')

grad_user: [[-0.00600309 -0.00548105 -0.00619114 -0.00570191 -0.00566192]
 [-0.0007998  -0.0010409  -0.0011061  -0.00092375 -0.00079655]
 [-0.00032633 -0.00034514 -0.00041246 -0.00026068 -0.0003989 ]
 ...
 [-0.00866793 -0.00823638 -0.00911581 -0.00859423 -0.00981063]
 [-0.00119272 -0.00130126 -0.00103613 -0.00121082 -0.00110012]
 [-0.02806372 -0.03015728 -0.02990418 -0.02767845 -0.02963487]]
grad_movie: [[-4.04896780e-03 -4.01678866e-03 -4.30406079e-03 -4.46034678e-03
  -4.04664906e-03]
 [-1.10675347e-03 -1.33531774e-03 -1.14298145e-03 -1.31365357e-03
  -1.25228819e-03]
 [-2.62299899e-03 -2.79006386e-03 -2.69431155e-03 -2.93035665e-03
  -2.78203238e-03]
 ...
 [-2.47559405e-05 -9.46601622e-06 -1.11379573e-05 -3.63130922e-05
  -5.73322568e-06]
 [-2.91318207e-05 -1.11392369e-05 -1.31067117e-05 -4.27318239e-05
  -6.74663531e-06]
 [-3.36738874e-05 -1.28760030e-05 -1.51502352e-05 -4.93943253e-05
  -7.79853207e-06]]


In [16]:
def gradient_descent(df, emb_user, emb_movie, iterations=100, learning_rate=0.01, df_val=None):
    """
    Computes gradient descent with momentum (0.9) for a number of iterations.
    Prints training cost and validation cost (if df_val is not None) every 100 iterations.

    Inputs:
      df: training dataframe (encoded userId, movieId and rating columns)
      emb_user: initial user embeddings (the passed array is not modified)
      emb_movie: initial movie embeddings (the passed array is not modified)
      iterations: number of update steps
      learning_rate: step size applied to the momentum-averaged gradient
      df_val: optional validation dataframe, encoded with the same mapping as df

    Returns:
    emb_user: the trained user embedding
    emb_movie: the trained movie embedding
    """
    Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])
    # Running averages of the gradients; the scalar 0 broadcasts on first use.
    v1 = 0
    v2 = 0
    for i in range(iterations):
        grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
        v1 = 0.9*v1 + 0.1*grad_user
        v2 = 0.9*v2 + 0.1*grad_movie
        # Rebinding (not in-place update) leaves the caller's arrays intact.
        emb_user = emb_user - learning_rate*v1
        emb_movie = emb_movie - learning_rate*v2
        if (i+1)%100==0:
            mse = cost(df, emb_user, emb_movie)
            print(f'mse:{mse}')
            if df_val is not None:
                val_mse = cost(df_val, emb_user, emb_movie)
                print(f'val mse:{val_mse}')
    return emb_user, emb_movie

In [17]:
# Train for 1000 steps; gradient_descent returns new arrays, so the initial
# emb_user / emb_movie stay available for comparison.
final_emb_user, final_emb_movie = gradient_descent(df, emb_user, emb_movie, iterations=1000, learning_rate=0.01)

mse:4.345217977254249
mse:4.242467402101515
mse:4.146954416082002
mse:4.057768588243036
mse:3.974142310904746
mse:3.8954269508507693
mse:3.8210729033954185
mse:3.750613090310903
mse:3.683649297284778
mse:3.6198408543454956


# Predict the rating of encoded user 0 (raw userId 1) for encoded movie 3 (raw movieId 47)

In [18]:
# Predicted rating = dot product of the two trained embedding rows. The
# indices are the encoded positions produced by encode_data, not raw ids.
final_emb_user[0].dot(final_emb_movie[3])

1.8111835374887322