In [64]:
#Importing the important packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from IPython.display import Image

In [65]:
#Importing the important csv files in pandas ratings
df=pd.read_csv('ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [66]:
movies = pd.read_csv("movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [67]:
ratings=df.pivot(index='userId', columns='movieId', values='rating')
# ratings = df

In [68]:
#Removing all the Nans from the ratings and changing it to 0's
ratings=ratings.fillna(0).values


In [69]:
##Defing the train and split for traiing and validation. 
'''
Parameters of nonZeros: Means this only takes if the user has rated over that particular value. 
              SizeRate: the randomised size of ratings for validation 
'''
def train_test_split(ratings, nonZeros, sizeRate):
    
    validation = np.zeros(ratings.shape)
    train = ratings.copy() 
    
    for user in np.arange(ratings.shape[0]):
        if len(ratings[user,:].nonzero()[0])>=nonZeros:
            val_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=sizeRate, 
                                        replace=False)
            train[user, val_ratings] = 0
            validation[user, val_ratings] = ratings[user, val_ratings]
    return train, validation

In [70]:
##Defining our distance functions
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def mae(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return mean_absolute_error(prediction, ground_truth)

In [71]:
##Gradient gradient descent for The lower rank matrices
#P is the user matrix, Q is the item feature matrix
def prediction(P,Q):
    return np.dot(P.T,Q)

In [74]:
from tqdm import tnrange, tqdm_notebook

def prediction_final():
    ##Got the best parameters and have been used in this. 
    train, val = train_test_split(ratings, 35, 15)
    lmbda = 1e-3 # Regularization parameter
    k = 2 
    m, n = train.shape  # Number of users and items
    n_epochs = 100  # Number of epochs
    alpha=0.01  # Learning rate

    P = 3 * np.random.rand(k,m) # Latent user feature matrix
    Q = 3 * np.random.rand(k,n) # Latent movie feature matrix

    
    #Only consider items with ratings 
    users,items = train.nonzero()      
    for epoch in tqdm_notebook(range(n_epochs)):
        for u, i in zip(users,items):
            e = train[u, i] - prediction(P[:,u],Q[:,i])  # Calculate error for gradient update
            P[:,u] += alpha * ( e * Q[:,i] - lmbda * P[:,u]) # Update latent user feature matrix
            Q[:,i] += alpha * ( e * P[:,u] - lmbda * Q[:,i])  # Update latent item feature matrix
        print(f"Epoch number: {epoch}, training rmse: {rmse(prediction(P,Q),train)}, Testing RMSE: {rmse(prediction(P,Q),val)}, MAE: {mae(prediction(P,Q),val)}")

    return(prediction(P,Q))

In [75]:
pred = prediction_final()

HBox(children=(IntProgress(value=0), HTML(value='')))

Epoch number: 0, training rmse: 1.2587038746083887, Testing RMSE: 1.312509138244366, MAE: 1.0472514639539818
Epoch number: 1, training rmse: 1.0967476586802911, Testing RMSE: 1.132684581324057, MAE: 0.8756006445740715
Epoch number: 2, training rmse: 1.0360941710046971, Testing RMSE: 1.0800752885931326, MAE: 0.8281924016981892
Epoch number: 3, training rmse: 0.9999525427426658, Testing RMSE: 1.0522659224072834, MAE: 0.8034123499988005
Epoch number: 4, training rmse: 0.9752739277936399, Testing RMSE: 1.0349382946546744, MAE: 0.7883833760907936
Epoch number: 5, training rmse: 0.9570878062097216, Testing RMSE: 1.023151317482526, MAE: 0.7781247607090531
Epoch number: 6, training rmse: 0.9430038851625983, Testing RMSE: 1.014673108385425, MAE: 0.7705641409721297
Epoch number: 7, training rmse: 0.931707072300599, Testing RMSE: 1.0083321282650515, MAE: 0.764810251917137
Epoch number: 8, training rmse: 0.9224045355040433, Testing RMSE: 1.003452139198556, MAE: 0.7604191459230955
Epoch number: 9, 

In [76]:
def get_movie_name(index):
    return(movies['title'].values[index])

In [77]:
##GETTING THE TOP 5 MOVIES FOR A PARTICULAR USER:
def getLikedMovies(userID , ratings = ratings, prediction = pred, movies = movies):
    user = ratings[userID]
    movie_ids = np.nonzero(user)
    list_of_movies = []
    list_of_alrady_liked = []
    ratingAbove4 = []
    for i in movie_ids[0]:
        list_of_movies.append(get_movie_name(i))
        list_of_alrady_liked.append(i)
        if user[i] >4.5:
            ratingAbove4.append(get_movie_name(i))
    
    list_user_pred = pred[userID]
    movie_dict = {}
    for i in range(len(list_user_pred)):
        if list_user_pred[i] > 4.5:
            movie_dict[list_user_pred[i]] = (i, get_movie_name(i))
    ##Getting the top 5 movies:
    
    top_movies = []
    for j in sorted(movie_dict.keys())[::-1]:
        if movie_dict[j][0] not in list_of_alrady_liked:
            top_movies.append(movie_dict[j])
    
    
    if len(ratingAbove4) >=11:
        return(ratingAbove4[:10], top_movies[:5]) 
    else:
        return(ratingAbove4, top_movies[:5])
        
    

In [80]:
getLikedMovies(11)

(['First Knight (1995)',
  'Circle of Friends (1995)',
  'Junior (1994)',
  'Emma (1996)',
  'Star Trek: First Contact (1996)',
  'Sweet Hereafter, The (1997)',
  'And the Band Played On (1993)',
  'Day of the Beast, The (Día de la Bestia, El) (1995)',
  'Matrix, The (1999)',
  'Go (1999)'],
 [(6034, 'Fog, The (2005)'),
  (5192, 'Blackboard Jungle (1955)'),
  (4039, 'Eyewitness (Janitor, The) (1981)'),
  (8919, "Bruce Lee: A Warrior's Journey (2000)"),
  (6952, 'My Bloody Valentine 3-D (2009)')])