In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [3]:
# Defining the functions for importing the data
import numpy as np
import pandas as pd
from numpy import loadtxt

def normalizeRatings(Y, R):
    """
    Mean-normalize the ratings matrix row by row.

    For every row of Y, the mean is taken over the entries flagged in R
    (R(i,j)=1 marks a real rating) and subtracted from those entries only;
    entries with R(i,j)=0 are left unchanged.

    Args:
        Y: 2-D array of ratings (per the original notes, one row per movie).
        R: 2-D indicator array with the same shape as Y.

    Returns:
        (Ynorm, Ymean): the normalized ratings, and the per-row mean rating
        as a column vector of shape (rows, 1).
    """
    rated_total = np.sum(Y * R, axis=1)
    # The tiny offset keeps rows without any rating from dividing by zero.
    rated_count = np.sum(R, axis=1) + 1e-12
    Ymean = (rated_total / rated_count).reshape(-1, 1)

    # Ymean * R is zero wherever R is zero, so only rated entries are shifted.
    Ynorm = Y - np.multiply(Ymean, R)
    return Ynorm, Ymean

def load_precalc_params_small():
    """Load the pre-computed parameter matrices from the ./data directory.

    Reads small_movies_X.csv, small_movies_W.csv and small_movies_b.csv as
    comma-delimited numeric text. Each file is opened inside a ``with`` block
    so its handle is closed as soon as it has been parsed.

    Returns:
        tuple: (X, W, b, num_movies, num_features, num_users), where b is
        reshaped to a single row, (num_movies, num_features) is X.shape and
        num_users is the number of rows of W.
    """
    with open('./data/small_movies_X.csv', 'rb') as file:
        X = loadtxt(file, delimiter = ",")

    with open('./data/small_movies_W.csv', 'rb') as file:
        W = loadtxt(file, delimiter = ",")

    with open('./data/small_movies_b.csv', 'rb') as file:
        b = loadtxt(file, delimiter = ",")

    # Force b into a (1, n) row vector regardless of how loadtxt shaped it.
    b = b.reshape(1,-1)
    num_movies, num_features = X.shape
    num_users,_ = W.shape
    return(X, W, b, num_movies, num_features, num_users)
    
def load_ratings_small():
    """Load the ratings matrix Y and the indicator matrix R from ./data.

    Reads small_movies_Y.csv and small_movies_R.csv as comma-delimited
    numeric text. Each file is opened inside a ``with`` block so its handle
    is closed as soon as it has been parsed.

    Returns:
        tuple: (Y, R) as NumPy arrays, in that order.
    """
    with open('./data/small_movies_Y.csv', 'rb') as file:
        Y = loadtxt(file, delimiter = ",")

    with open('./data/small_movies_R.csv', 'rb') as file:
        R = loadtxt(file, delimiter = ",")

    return(Y,R)

def load_Movie_List_pd():
    """
    Read ./data/small_movie_list.csv into a DataFrame.

    The first CSV column becomes the index and the first row the header.
    Per the original notes, rows are in the same order as the rows of the
    Y matrix.

    Returns:
        (mlist, df): the "title" column as a plain list, and the DataFrame.
    """
    movie_table = pd.read_csv(
        './data/small_movie_list.csv',
        sep=',',
        quotechar='"',
        header=0,
        index_col=0,
    )
    titles = movie_table["title"].to_list()
    return titles, movie_table

In [86]:
# Load the pre-computed parameters and the ratings data
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

# Report the array shapes, then the problem dimensions, one per line
print("Y", Y.shape, "R", R.shape)
for label, value in (
    ("X", X.shape),
    ("W", W.shape),
    ("b", b.shape),
    ("num_features", num_features),
    ("num_movies", num_movies),
    ("num_users", num_users),
):
    print(label, value)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [87]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Loop-based collaborative-filtering cost.

    Adds half the summed squared prediction error over every (i, j) with
    R[i, j] != 0 to an L2 penalty, lambda_/2 times the sum of squares of the
    first Y.shape[1] rows of W and the first Y.shape[0] rows of X.

    Args:
        X: 2-D array indexed as X[i, :], one row per row of Y.
        W: 2-D array indexed as W[j, :], one row per column of Y.
        b: 2-D array indexed as b[0, j].
        Y: 2-D array of ratings.
        R: 2-D indicator array; entries equal to 0 are skipped.
        lambda_: regularization strength.

    Returns:
        The scalar cost (squared-error term plus penalty).
    """
    num_rows, num_cols = Y.shape

    # Squared error, accumulated one rated (row, column) pair at a time.
    squared_error = 0
    for row, col in np.ndindex(num_rows, num_cols):
        if R[row, col] == 0:
            continue  # unrated pairs do not contribute
        residual = np.dot(W[col, :], X[row, :]) + b[0, col] - Y[row, col]
        squared_error += residual**2
    squared_error = squared_error/2

    # L2 penalty: squared norm of each W row, then of each X row.
    penalty = 0
    for col in range(num_cols):
        penalty += np.dot(W[col, :], W[col, :])
    for row in range(num_rows):
        penalty += np.dot(X[row, :], X[row, :])
    penalty = lambda_/2*penalty

    return penalty+squared_error

def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Vectorized collaborative-filtering cost built from TensorFlow ops.

    Computes 0.5 * sum(((X @ W^T + b - Y) * R)**2) plus the L2 penalty
    (lambda_/2) * (sum(X**2) + sum(W**2)).

    Args:
        X: 2-D matrix, one row per row of Y.
        W: 2-D matrix, one row per column of Y.
        b: bias, added to X @ W^T by broadcasting.
        Y: ratings matrix.
        R: indicator matrix; multiplying by it zeroes every entry where R is 0.
        lambda_: regularization strength.

    Returns:
        The scalar cost, as returned by the TensorFlow ops.
    """
    # Prediction error for every pair, masked so unrated pairs contribute 0.
    masked_residual = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R

    squared_error = 0.5 * tf.reduce_sum(masked_residual**2)
    penalty = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return squared_error + penalty

# Reduce the data set size so that this runs faster:
# keep only the first few movies, users and features.
num_users_r = 4
num_movies_r = 5 
num_features_r = 3

# Slice every array consistently: rows of X / Y / R are movies,
# rows of W and columns of b / Y / R are users.
X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r,  :num_features_r]
b_r = b[0, :num_users_r].reshape(1,-1)  # keep b as a (1, num_users_r) row
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Evaluate cost function on the reduced problem.
# lambda_ = 0 zeroes the regularization term, leaving only the squared error.
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 0)
print(f"Cost: {J}")

Cost: 13.670725805579915


In [88]:
def load_Movie_List_pd():
    """
    Read ./data/small_movie_list.csv into a DataFrame.

    The first CSV column becomes the index and the first row the header.
    Per the original notes, rows are in the same order as the rows of the
    Y matrix.

    Note: this repeats the definition from the data-import helpers cell
    earlier in the notebook and shadows it; keep the two in sync.

    Returns:
        (mlist, df): the "title" column as a plain list, and the DataFrame.
    """
    movie_table = pd.read_csv(
        './data/small_movie_list.csv',
        sep=',',
        quotechar='"',
        header=0,
        index_col=0,
    )
    titles = movie_table["title"].to_list()
    return titles, movie_table

def normalizeRatings(Y, R):
    """
    Mean-normalize the ratings matrix row by row.

    For every row of Y, the mean is taken over the entries flagged in R
    (R(i,j)=1 marks a real rating) and subtracted from those entries only;
    entries with R(i,j)=0 are left unchanged.

    Note: this repeats the definition from the data-import helpers cell
    earlier in the notebook and shadows it; keep the two in sync.

    Args:
        Y: 2-D array of ratings (per the original notes, one row per movie).
        R: 2-D indicator array with the same shape as Y.

    Returns:
        (Ynorm, Ymean): the normalized ratings, and the per-row mean rating
        as a column vector of shape (rows, 1).
    """
    rated_total = np.sum(Y * R, axis=1)
    # The tiny offset keeps rows without any rating from dividing by zero.
    rated_count = np.sum(R, axis=1) + 1e-12
    Ymean = (rated_total / rated_count).reshape(-1, 1)

    # Ymean * R is zero wherever R is zero, so only rated entries are shifted.
    Ynorm = Y - np.multiply(Ymean, R)
    return Ynorm, Ymean

# Load the movie titles (as a list) and the full movie table (as a DataFrame).
movieList, movieDataframe = load_Movie_List_pd()

In [89]:
my_ratings = np.zeros(num_movies)          #  Initialize my ratings

# Ratings for the new user, keyed by movie id.
# Check the file small_movie_list.csv for the id of each movie in our dataset;
# for example, Toy Story 3 (2010) has ID 2700, so 2700: 5 rates it "5".
my_ratings_by_id = {
    2700: 5,   # Toy Story 3 (2010)
    2609: 2,   # Persuasion (2007) -- not enjoyed
    929:  5,   # Lord of the Rings: The Return of the King, The
    246:  5,   # Shrek (2001)
    2716: 3,   # Inception
    1150: 5,   # Incredibles, The (2004)
    382:  2,   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
    366:  5,   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
    622:  5,   # Harry Potter and the Chamber of Secrets (2002)
    988:  3,   # Eternal Sunshine of the Spotless Mind (2004)
    2925: 1,   # Louis Theroux: Law & Disorder (2008)
    2937: 1,   # Nothing to Declare (Rien à déclarer)
    793:  5,   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
}
my_ratings[list(my_ratings_by_id)] = list(my_ratings_by_id.values())

# Indices of the movies that received a rating
my_rated = [idx for idx, stars in enumerate(my_ratings) if stars > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if not my_ratings[i] > 0:
        continue  # unrated movie
    print(f'Rated {my_ratings[i]} for  {movieDataframe.loc[i,"title"]}')


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 3.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


In [90]:
# Reload ratings so this cell starts from the on-disk data every time it runs
# (the lines below prepend a column, so re-using Y / R would not be repeatable).
Y, R = load_ratings_small()

# Add new user ratings to Y as the first column (user no. 0)
Y = np.c_[my_ratings, Y]

# Add new user indicator matrix to R (1 where a rating was given, else 0)
R = np.c_[(my_ratings != 0).astype(int), R]

# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

numMovies, numUsers = Y.shape
numFeatures = 100
tf.random.set_seed(1234) # for consistent results

# Parameters to learn, randomly initialized and tracked by TensorFlow
W = tf.Variable(tf.random.normal((numUsers, numFeatures), dtype=np.float64), name="W")
X = tf.Variable(tf.random.normal((numMovies, numFeatures), dtype=np.float64), name="X")
b = tf.Variable(tf.random.normal((1, numUsers), dtype=np.float64), name="b")

optimizer = tf.keras.optimizers.Adam(learning_rate=.1)

iterations = 200
lambda_ = 1
for i in range(iterations):
    # Record the cost computation so it can be differentiated
    with tf.GradientTape() as tape:
        cost = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Gradients of the cost w.r.t. each parameter, then one Adam update step
    grad = tape.gradient(cost, [X, W, b])
    optimizer.apply_gradients(zip(grad, [X, W, b]))

    # Inform the user of the progress
    if i % 20 == 0:
        print(f"Iteration {i}")


Iteration 0
Iteration 20
Iteration 40
Iteration 60
Iteration 80
Iteration 100
Iteration 120
Iteration 140
Iteration 160
Iteration 180


In [97]:
# Recommended movies based upon my_ratings:
# predict every (movie, user) rating from the learned parameters
predNorm = X.numpy() @ W.numpy().T + b.numpy()

# Add the per-movie mean back to undo the normalization
predReal =  predNorm + Ymean

# Column 0 holds the new user (my_ratings was prepended as the first column)
myPredictions = predReal[:, 0]
movieDataframe["Myratings"] = myPredictions

In [103]:
# Rank the movies by predicted rating, best first, and display the table
SortedDf = movieDataframe.sort_values(by="Myratings", ascending=False)
SortedDf

Unnamed: 0,mean rating,number of ratings,title,Myratings
1150,3.836000,125,"Incredibles, The (2004)",4.898892
246,3.867647,170,Shrek (2001),4.897137
929,4.118919,185,"Lord of the Rings: The Return of the King, The...",4.887054
622,3.598039,102,Harry Potter and the Chamber of Secrets (2002),4.878342
793,3.778523,149,Pirates of the Caribbean: The Curse of the Bla...,4.874936
...,...,...,...,...
3649,0.500000,1,God's Not Dead (2014),-0.097306
1938,0.500000,1,I Know Who Killed Me (2007),-0.121898
3680,0.500000,1,Midnight Chronicles (2009),-0.122060
2644,0.500000,1,Case 39 (2009),-0.122123
