In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

# Sample Data

In [2]:
# Load the sample ratings; the bare `df` on the last line shows the frame as the cell output.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, presumably a saved
# index in the CSV - confirm whether it should be read with index_col=0.
df = pd.read_csv("data/tiny_training.csv")
df

Unnamed: 0,userId,movieId,rating
0,11,1,4
1,11,23,5
2,2,23,5
3,2,4,3
4,31,1,4
5,31,23,4
6,4,1,5
7,4,3,2
8,52,1,1
9,52,3,4


# Encode Data

In [3]:
def proc_col(col):
    """
    Map the values of a pandas column onto the integers 0..n-1,
    where n is the number of distinct values. Codes are assigned in
    order of first appearance.

    Returns:
      name2idx: dict from original value to its integer code
      codes: numpy array holding the code of every element of col
      n: number of distinct values
    """
    distinct = col.unique()
    n_distinct = len(distinct)
    name2idx = dict(zip(distinct, range(n_distinct)))
    codes = np.array([name2idx[value] for value in col])
    return name2idx, codes, n_distinct

In [4]:
def encode_data(train_csv):
    """
    Encodes rating data with contiguous user and movie ids using proc_col.

    Inputs:
      train_csv: a DataFrame of ratings that has userId and movieId columns
                 (despite the parameter name, the visible caller passes a
                 DataFrame, not a file path)

    Returns:
      df: a copy of the input whose userId and movieId hold 0..n-1 codes
      num_users: number of distinct values in userId
      num_movies: number of distinct values in movieId
    """
    # Work on the argument (not on a notebook-global `df`) and on a copy, so
    # the caller's frame is left untouched and the function is re-runnable.
    df = train_csv.copy()

    # One proc_col pass per column yields both the codes and the count.
    _, user_codes, num_users = proc_col(df.userId)
    _, movie_codes, num_movies = proc_col(df.movieId)

    df["userId"] = user_codes
    df["movieId"] = movie_codes
    return df, num_users, num_movies

In [5]:
# Rebind `df` to the encoded frame and keep the user/movie counts, which size
# the embedding tables and the utility matrix in the cells below.
df, num_users, num_movies = encode_data(df)
print(f'num_users : {num_users}')
print(f'num_movies : {num_movies}')
df

num_users : 7
num_movies : 4


Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [6]:
def encode_valid(df_val, df_train):
    """
    Encodes df_val with the same encoding as df_train.

    The value-to-code mappings are rebuilt from df_train's userId and movieId
    columns, so df_train must hold the same kind of ids as df_val. Rows of
    df_val whose user or movie does not occur in df_train are dropped.

    Inputs:
      df_val: DataFrame with userId and movieId columns to encode
      df_train: DataFrame with userId and movieId columns defining the encoding

    Returns:
      df_val: a new dataframe with the same encoding as df_train; the frame
              passed in is not modified
    """
    name2idx_user = proc_col(df_train.userId)[0]
    name2idx_movie = proc_col(df_train.movieId)[0]

    # Encode into a copy so the caller's validation frame keeps its raw ids.
    encoded = df_val.copy()
    # -1 marks ids that were never seen in the training data.
    encoded["userId"] = np.array([name2idx_user.get(x, -1) for x in df_val.userId], dtype=int)
    encoded["movieId"] = np.array([name2idx_movie.get(x, -1) for x in df_val.movieId], dtype=int)

    # A single combined mask: chaining df[m1][m2] applies a full-length mask to
    # an already filtered frame, which makes pandas reindex the mask and warn.
    known = (encoded.userId != -1) & (encoded.movieId != -1)
    return encoded[known]

# Create Embeddings

In [7]:
def create_embedings(n, K):
    """
    Build an (n, K) embedding matrix of uniform random values in [0, 6/K).

    The global NumPy seed is reset to 3 on every call, so the result is
    deterministic and repeated calls with the same shape return the same
    matrix.

    Inputs:
    n: number of items/users (rows)
    K: number of factors in the embedding (columns)

    Returns:
    emb: numpy array of shape (n, K)
    """
    np.random.seed(3)
    shape = (n, K)
    unit_samples = np.random.random(shape)
    # Scale the [0, 1) draws so that each entry lies in [0, 6/K).
    return 6 * unit_samples / K

In [8]:
# K is the number of factors (columns) in each embedding.
# create_embedings reseeds NumPy on every call, so emb_movie holds the same
# values as the leading rows of emb_user.
K = 5
emb_user = create_embedings(num_users,K)
emb_movie = create_embedings(num_movies,K)
print(emb_user)

[[0.66095748 0.84977739 0.34908569 0.61299313 1.07153635]
 [1.07555171 0.15070237 0.24869145 0.06176064 0.52897181]
 [0.03585145 0.54819987 0.77897286 0.33418474 0.81150588]
 [0.70903538 0.02877826 0.67062491 0.31110294 0.49812144]
 [0.3402301  0.8317655  0.52854446 0.18824129 0.65357882]
 [0.93637772 0.36763624 0.26634946 0.46556551 1.12366038]
 [1.17119451 0.80686041 1.08340093 1.01490105 0.45359285]]


# Utility Matrix

In [9]:
def df2matrix(df, nrows, ncols, column_name="rating"):
    """
    Build a sparse CSC matrix of shape (nrows, ncols) from a dataframe.

    Rows are indexed by the 'userId' column, columns by the 'movieId'
    column, and the stored values come from `column_name`.
    """
    row_idx = df['userId'].values
    col_idx = df['movieId'].values
    data = df[column_name].values
    coords = (row_idx, col_idx)
    return sparse.csc_matrix((data, coords), shape=(nrows, ncols))

In [10]:
# Sparse user x movie matrix of the ratings in the encoded frame.
Y = df2matrix(df, num_users, num_movies)
print(Y) #<7x4 sparse matrix of type with 13 elements>

  (0, 0)	4
  (2, 0)	4
  (3, 0)	5
  (4, 0)	1
  (0, 1)	5
  (1, 1)	5
  (2, 1)	4
  (6, 1)	1
  (1, 2)	3
  (3, 3)	2
  (4, 3)	4
  (5, 3)	5
  (6, 3)	3


# Loss Function MSE

In [11]:
def cost(df, emb_user, emb_movie):
    """
    Mean squared error between the ratings in df and the model predictions.

    The prediction for a row is the dot product of the user embedding and
    the movie embedding selected by that row's userId and movieId.

    Inputs:
      df: dataframe with all data or a subset of the data
      emb_user: embeddings for users, one row per encoded userId
      emb_movie: embeddings for movies, one row per encoded movieId

    Returns:
      error(float): MSE
    """
    user_rows = emb_user[df["userId"].values]
    movie_rows = emb_movie[df["movieId"].values]
    # Row-wise dot product of the paired user and movie vectors.
    pred = (user_rows * movie_rows).sum(axis=1)
    residual = df['rating'] - pred
    return (residual ** 2).mean()

In [12]:
# Training MSE with the initial, untrained embeddings.
error = cost(df, emb_user, emb_movie)
error

6.318975545037869

# Gradient descent with momentum

In [13]:
def sparse_multiply(df, emb_user, emb_movie):
    """
    Returns U*V^T element-wise multiplied by R as a sparse matrix, i.e. the
    predictions only at the (userId, movieId) pairs present in df.
    It avoids creating the dense matrix U*V^T.

    Inputs:
      df: dataframe with encoded userId and movieId columns
      emb_user: embeddings for users, one row per encoded userId
      emb_movie: embeddings for movies, one row per encoded movieId

    Returns:
      sparse CSC matrix of shape (emb_user.shape[0], emb_movie.shape[0])
    """
    ind_user = df["userId"].values
    ind_movie = df["movieId"].values
    # Row-wise dot product of the paired user and movie vectors.
    pred = np.sum(emb_user[ind_user]*emb_movie[ind_movie], axis=1)
    # Build the matrix directly instead of writing a "Prediction" column into
    # df: the caller's frame is not mutated as a side effect of this call.
    shape = (emb_user.shape[0], emb_movie.shape[0])
    return sparse.csc_matrix((pred, (ind_user, ind_movie)), shape=shape)


def gradient(df, Y, emb_user, emb_movie):
    """
    Computes the gradient of the MSE with respect to both embedding tables.

    The prediction for user i and movie j is the dot product of emb_user[i]
    and emb_movie[j]; only the (user, movie) pairs present in df contribute.

    Inputs:
      df: dataframe with all data or a subset of the data
      Y: sparse representation of df
      emb_user: embeddings for users
      emb_movie: embeddings for movies

    Returns:
      d_emb_user: array with the same shape as emb_user
      d_emb_movie: array with the same shape as emb_movie
    """
    # R is the 0/1 indicator matrix of the observed (user, movie) pairs.
    val_one = np.ones(df['rating'].values.shape)
    ind_movie = df['movieId'].values
    ind_user = df['userId'].values
    R = sparse.csc_matrix((val_one,(ind_user, ind_movie)),shape=(emb_user.shape[0], emb_movie.shape[0]))

    # Number of stored entries in Y, used to normalise the gradient.
    N = Y.size
    Y_pred = sparse_multiply(df, emb_user, emb_movie)

    # The masked residual feeds both gradients, so compute it once.
    masked_residual = (Y - Y_pred).multiply(R)
    scale = -(2/N)

    grad_user = scale*(masked_residual.dot(emb_movie))
    grad_movie = scale*((masked_residual.T).dot(emb_user))
    return grad_user, grad_movie

In [14]:
# Rebuild Y with its shape taken from the embedding tables, then show the
# gradients at the initial embeddings.
Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])
grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
print(f'grad_user: {grad_user}')
print(f'grad_movie: {grad_movie}')

grad_user: [[-0.69564081 -0.23669632 -0.19693492 -0.14568195 -0.47938463]
 [-0.58733568 -0.26908101 -0.40082497 -0.14793654 -0.56185469]
 [-0.75538639 -0.35793374 -0.24000433 -0.23483023 -0.6200397 ]
 [-0.43736869 -0.46699903 -0.26297135 -0.36822101 -0.63873094]
 [-0.23207446  0.10856947 -0.25908272 -0.05551977 -0.07598536]
 [-0.37549602 -0.0152406  -0.35515433 -0.16475612 -0.26379871]
 [ 0.06195886  0.01821505 -0.05412412 -0.03297054  0.01028791]]
grad_movie: [[-0.47175274 -0.23523022 -0.61385481 -0.36682109 -0.64543274]
 [-0.77388169 -0.68867308 -0.54810338 -0.37770056 -1.19213767]
 [-0.36985705 -0.05182302 -0.08551917 -0.02123804 -0.18190102]
 [-0.88750327 -0.69021782 -0.60346291 -0.50402758 -1.01141043]]


In [15]:
def gradient_descent(df, emb_user, emb_movie, iterations=100, learning_rate=0.01, df_val=None):
    """
    Computes gradient descent with momentum (0.9) for a number of iterations.
    Prints training cost and validation cost (if df_val is not None) every 50 iterations.

    Inputs:
      df: training dataframe with encoded userId, movieId and rating columns
      emb_user: initial user embedding (not modified; updates create new arrays)
      emb_movie: initial movie embedding (not modified; updates create new arrays)
      iterations: number of update steps
      learning_rate: step size applied to the momentum-averaged gradient
      df_val: optional validation dataframe; its userId/movieId values are used
              as row indices into the embeddings, so it must use the same
              encoding as df

    Returns:
    emb_user: the trained user embedding
    emb_movie: the trained movie embedding
    """
    Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])
    # Running averages of the gradients; the scalar 0 becomes an array after
    # the first update.
    v1 = 0
    v2 = 0
    for i in range(iterations):
        grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
        v1 = 0.9*v1 + 0.1*grad_user
        v2 = 0.9*v2 + 0.1*grad_movie
        emb_user = emb_user - learning_rate*v1
        emb_movie = emb_movie - learning_rate*v2
        if (i+1)%50==0:
            mse = cost(df, emb_user, emb_movie)
            print(f'mse:{mse}')
            # Report the validation cost as the docstring promises; the
            # training-only output is unchanged when df_val is None.
            if df_val is not None:
                val_mse = cost(df_val, emb_user, emb_movie)
                print(f'validation mse:{val_mse}')
    return emb_user, emb_movie

In [16]:
# Train for 200 iterations; gradient_descent prints the training MSE every 50 iterations.
final_emb_user, final_emb_movie = gradient_descent(df, emb_user, emb_movie, iterations=200, learning_rate=0.01)

mse:2.7318752608990735
mse:1.268898984379745
mse:0.889579113290915
mse:0.7676439996999788


# Predict userId 0's rating for movieId 3

In [35]:
# Predicted rating = dot product of a trained user row and a trained movie row.
# The indices 0 and 3 are encoded ids (row positions in the embedding tables).
# NOTE(review): this cell's execution count (35) is out of sequence with the
# cells above - re-run top to bottom to confirm the value.
final_emb_user[0].dot(final_emb_movie[3])

4.297753720698854