In [13]:
#Implementing regularized Non-negative Matrix factorization using Regularized gradient descent
import sys, numpy as np
from numpy import genfromtxt
import codecs
from numpy import linalg as LA

In [14]:
#build movie dicitionary with line no as numpy movie id ,its actual movie id as the key.
def build_movies_dict(movies_file):
    i = 0
    movie_id_dict = {}
    with codecs.open(movies_file, 'r', 'latin-1') as f:
        for line in f:
            if i == 0:
                i = i+1
            else:
                movieId,title,genres = line.split(',')
                movie_id_dict[int(movieId)] = i-1
                i = i +1
    return movie_id_dict


In [15]:
#Each line of i/p file represents one tag applied to one movie by one user,
#and has the following format: userId,movieId,tag,timestamp
#return the sparse matrix as a numpy array
def read_data(input_file,movies_dict):
    #no of users
    users = 718
    #no of movies
    movies = 8927
    X = np.zeros(shape=(users,movies))
    i = 0
    with open(input_file,'r') as f:
        for line in f:
            if i == 0:
                i = i +1
            else:
                user,movie_id,rating,timestamp = line.split(',')
                #get the movie id for the numpy array consrtruction
                id = movies_dict[int(movie_id)]
                X[int(user)-1,id] = float(rating)
                i = i+1
    return X

In [16]:
# non negative regulaized matrix factorization implemention
def matrix_factorization(X,P,Q,K,steps,alpha,beta):
    Q = Q.T
    for step in range(steps):
        print(step)
        #for each user
        for i in range(X.shape[0]):
            #for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0 :

                    #calculate the error of the element
                    eij = X[i][j] - np.dot(P[i,:],Q[:,j])
                    #second norm of P and Q for regularilization
                    sum_of_norms = 0
                    #for k in xrange(K):
                    #added regularized term to the error
                    sum_of_norms += LA.norm(P) + LA.norm(Q)
                    eij += ((beta/2) * sum_of_norms)
                    #compute the gradient from the error
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * ( 2 * eij * Q[k][j] - (beta * P[i][k]))
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - (beta * Q[k][j]))

        #compute total error
        error = 0
        #for each user
        for i in range(X.shape[0]):
            #for each item
            for j in range(X.shape[1]):
                if X[i][j] > 0:
                    error += np.power(X[i][j] - np.dot(P[i,:],Q[:,j]),2)
        if error < 0.001:
            break
    return P, Q.T

In [17]:
def start_func(X,K):
    #no of users
    N= X.shape[0]
    #no of movies
    M = X.shape[1]
    #P: an initial matrix of dimension N x K, where is n is no of users and k is hidden latent features
    P = np.random.rand(N,K)
    #Q : an initial matrix of dimension M x K, where M is no of movies and K is hidden latent features
    Q = np.random.rand(M,K)
    #steps : the maximum number of steps to perform the optimisation, hardcoding the values
    #alpha : the learning rate, hardcoding the values
    #beta  : the regularization parameter, hardcoding the values
    steps = 10 #5000
    alpha = 0.0002
    beta = float(0.02)
    estimated_P, estimated_Q = matrix_factorization(X,P,Q,K,steps,alpha,beta)
    #Predicted numpy array of users and movie ratings
    modeled_X = np.dot(estimated_P,estimated_Q.T)
    np.savetxt('mf_result.txt', modeled_X, delimiter=',')

In [50]:
ratings_file =  "/content/ratings.csv"
no_of_features = 8
movies_mapping_file = "/content/movies_nmf.csv"

#build a dictionary of movie id mapping with counter of no of movies
movies_dict = build_movies_dict(movies_mapping_file)
#read data and return a numpy array
numpy_arr = read_data(ratings_file,movies_dict)
non_zero_indices = numpy_arr.nonzero()

# # Normalize non-zero elements between 0 and 1
normalized_ratings = numpy_arr[non_zero_indices] / 5.0
# Create a new matrix with normalized values
normalized_matrix = np.zeros_like(numpy_arr, dtype=float)
normalized_matrix[non_zero_indices] = normalized_ratings
#start function
start_func(normalized_matrix,no_of_features)

0
1
2
3
4
5
6
7
8
9


In [56]:
#this python file reads the Predicted Matrix for users and movie ratings and recommends
#50 tops movies for each user based on his/her unrated movies
#this can be used after building the model to recommend any no. of times
import sys, numpy as np
import codecs
import operator

In [57]:
#function to return a dictionary with actual movie id as key and its line no as movie id for numpy array
def dict_with_movie_and_id(movies_file):
    movies_names_dict = {}
    movies_id_dict ={}
    i = 0
    with codecs.open(movies_file, 'r', 'latin-1') as f:
        for line in f:
            if i == 0:
                i = i+1
            else:
                movie_id, movie_name, genre = line.split(',')
                movies_names_dict[int(movie_id)] = movie_name
                movies_id_dict[int(movie_id)] = i-1
                i = i +1
    return movies_names_dict, movies_id_dict

In [58]:
#function to return a dictionary with users along with non-rated movie
def dict_with_user_unrated_movies(rating_file,movie_mapping_id):
    #no of users
    users = 718
    #no of movie ids
    movies = 8927
    dict_with_unrated_movies_users ={}
    X = np.zeros(shape=(users,movies))
    i = 0
    with open(rating_file,'r') as f:
        for line in f:
            if i == 0:
                i = i +1
            else:
                user,movie,rating,timestamp = line.split(',')
                id = movie_mapping_id[int(movie)]
                X[int(user)-1,id] = float(rating)
                i = i+1

    for row in range(X.shape[0]):
        unrated_movi_ids = np.nonzero(X[row] == 0)
        unrated_movi_ids = list(unrated_movi_ids[0])
        unrated_movi_ids = map(lambda x: x+1,unrated_movi_ids)
        dict_with_unrated_movies_users[row+1] = unrated_movi_ids
    return dict_with_unrated_movies_users

In [59]:
#build predicted numpy array from the comma seperated file
def build_predicted_numpy_array(pred_file):
    #no of users
    users = 718
    #no of movie ids
    movies = 8927
    X = np.zeros(shape=(users,movies))
    user = 0
    with open(pred_file,'r') as f:
        for line in f:
            ratings = line.split(',')
            for movie_id,rating in enumerate(ratings):
                X[user,movie_id] = rating
            user = user+1
    return X

In [60]:
#recommend top 25 movies for user specified
def top_25_recommended_movies(pred_rating_file,users,unrated_movies_per_user,movies_mapping_names,movie_mapping_id):
    #dicitonary with numpy movie id as key and actual movie id as value
    reverse_movie_id_mapping = {}
    for key,val in movie_mapping_id.items():
        reverse_movie_id_mapping[val] = key
    #for each user, predict top 25 movies
    for user in users:
        dict_pred_unrated_movies = {}
        unrated_movies = unrated_movies_per_user[int(user)]
        for unrated_movie in unrated_movies:
            dict_pred_unrated_movies[int(unrated_movie)] = pred_rating_file[int(user)-1][int(unrated_movie)-1]
        #recommend top k movies
        SortedMovies = sorted(dict_pred_unrated_movies.items(), key=operator.itemgetter(1), reverse=True)
        print ("Top 25 movies recommendation for the user", user)
        for i in range(25):
            movie_id, rating = SortedMovies[i]
            actual_movie_id = reverse_movie_id_mapping[movie_id]
            #recommend movies only if the predicted rating is greater than 3.5
            if rating >= 3.5 :
                print ("{} with Movie rating value {}".format(movies_mapping_names[actual_movie_id],rating))
        print("\n")


In [61]:
#main method
def recommend_movies_for_users(orig_rating_file,pred_rating_file,movies_file,users):
    #method to get the mapping between movie names, actual movie id and numpy movie id
    movies_mapping_names,movie_mapping_id = dict_with_movie_and_id(movies_file)
    #build predicted numpy movie id from the saved predicted matrix of user and movie ratings
    predicted_rating_numpy_array = build_predicted_numpy_array(pred_rating_file)
    #dictionary of unrated movies for each user
    dict_with_unrated_movies_users = dict_with_user_unrated_movies(orig_rating_file,movie_mapping_id)
    #method which actually recommends top 25 unrated movies based on their the predicted score
    top_25_recommended_movies(predicted_rating_numpy_array,users,dict_with_unrated_movies_users,movies_mapping_names,movie_mapping_id)

In [62]:
orig_rating_file = "/content/ratings.csv"
pred_rating_file = "/content/mf_result.txt"
movies_file = "/content/movies_nmf.csv"
list_of_users = "/content/users.txt"
with open (list_of_users,'r') as f:
  users = f.readline().split(',')
recommend_movies_for_users(orig_rating_file,pred_rating_file,movies_file,users)

Top 25 movies recommendation for the user 1
Life of David Gale  The (2003) with Movie rating value 3.755162936653236
Fall of the House of Usher  The (House of Usher) (1960) with Movie rating value 3.663115310969039
Monsters vs. Aliens (2009) with Movie rating value 3.6230172476076166
Bollywood/Hollywood (2002) with Movie rating value 3.605428153911866
Gone (2012) with Movie rating value 3.6005432293755937
Love Happens (2009) with Movie rating value 3.584811982304347
Random Hearts (1999) with Movie rating value 3.5808661065161025
Martyrs (2008) with Movie rating value 3.5786012444836044
Schizopolis (1996) with Movie rating value 3.5641794332209735
Ides of March  The (2011) with Movie rating value 3.5506842584579474
Dark Knight  The (2008) with Movie rating value 3.5479333151952637
My Boyfriend's Back (1993) with Movie rating value 3.546939227818836
Kick-Ass (2010) with Movie rating value 3.534307115328048
Leaving Normal (1992) with Movie rating value 3.5223474639193237
Keeping the Promi

In [65]:
import numpy as np

def calculate_rmse(actual, predicted):
    # Find indices where actual ratings are available
    non_zero_indices = np.nonzero(actual)

    # Calculate the squared differences between actual and predicted ratings
    squared_errors = np.power(actual[non_zero_indices] - predicted[non_zero_indices], 2)

    # Calculate the mean squared error
    mean_squared_error = np.mean(squared_errors)

    # Calculate the root mean squared error
    rmse = np.sqrt(mean_squared_error)

    return rmse

predicted_rating_numpy_array = build_predicted_numpy_array(pred_rating_file)
rmse = calculate_rmse(normalized_matrix, predicted_rating_numpy_array)
print("RMSE:", rmse)


RMSE: 1.933431334650169
