COS80024

DATA SCIENCE PROJECT 1

PROJECT 4: MOVIE RECOMMENDATION SYSTEM

# S3.4.2: For memory-based technique based on users  (Executor: Promita)

This task aims to select appropriate performance metrics and use them for model performance evaluation, comparison and analysis.

Task Leader: Promita

In [1]:
# Loading relevant Python libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import math
import time
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Model built without using surprise package

In [2]:
#Loading ratings_small
rating = pd.read_csv('ratings_small.csv')

In [3]:
#Getting list of users
users_list = rating['userId'].unique().tolist()

In [4]:
#Getting list of movies
movies_list = rating['movieId'].unique().tolist()

In [5]:
#Calculating mean rating
mean_rate = rating['rating'].mean()

In [6]:
#Converting userIds to index
user2idx = {o:i for i, o in enumerate(users_list)}

In [7]:
#Converting movieIds to index
movie2idx = {o:i for i, o in enumerate(movies_list)}

In [8]:
#Total number of users
n_users = len(user2idx)

In [9]:
#Total number of movies
n_movies = len(movie2idx)

In [10]:
#Loading files

#File 1: uploading the training data
train = pd.read_csv('train_df.csv')

#File 2: uploading the testing data
test = pd.read_csv('test_df.csv')

In [11]:
#Remove all information except User ID, Movie ID and Rating
train = train[['userId','movieId','rating']]
test = test[['userId','movieId','rating']]

In [12]:
#Creating matrix of the size of total number of users and total number of movies
train_matrix = np.zeros((n_users, n_movies))

In [13]:
#Filling up train_matrix with ratings for user,movie tuple
for entry in train.itertuples():
    user_idx = user2idx[entry[1]] #entry[1] refers to userId
    movie_idx = movie2idx[entry[2]]  #entry[1] refers to movieId
    train_matrix[user_idx, movie_idx] = entry[3]  #entry[1] refers to rating
print(train_matrix.shape)
mean_user_rating = train_matrix.mean(axis=1)

(671, 9066)


In [14]:
#Computing cosine silimarity between users
user_similarity = np.cos(pairwise_distances(train_matrix, metric ='cosine'))

In [15]:
#Function to predict rating for user,movie tuple
def cf_user(user, movie, k):
    useridx = user2idx[user] 
    movieidx = movie2idx[movie] 
    mean = mean_user_rating[useridx]
    movie_rateduser = train[train['movieId'] == movie]['userId'].tolist()
    movie_rateduseridx = [user2idx[usr] for usr in movie_rateduser]
    movie_rating = [train_matrix[usr][movieidx] for usr in movie_rateduseridx]
    sim_idx = np.argsort(-user_similarity[useridx][movie_rateduseridx])[1:]
    sim = user_similarity[useridx][movie_rateduseridx][sim_idx]
    
    if k>len(movie_rateduser):
        k_sim = np.array(sim)
        k_sim_rating = np.array([train_matrix[movie_rateduseridx[i]][movieidx] for i in sim_idx])
    else:
        k_sim = np.array(sim[:k])
        k_sim_rating = np.array([train_matrix[movie_rateduseridx[i]][movieidx] for i in sim_idx[:k]])
        
    ratings_diff = k_sim_rating - mean
    
    pred = mean + k_sim.dot(ratings_diff)/(np.abs(k_sim).sum())
    
    if math.isnan(pred):
        pred = mean
    
    return pred  

In [16]:
# Function to evaluate performance of model
def score(k):
    test_ratings = np.array(test['rating'].tolist())
    pred_test = []
    for i, entry in enumerate(test.itertuples()):
        user = entry[1]
        movie = entry[2]
        pred_test.append(cf_user(user,movie,k))
    pred_test = np.array(pred_test)
    print('RMSE: ',round(rmse(pred_test, test_ratings),4))
    print('MAE: ',round(mae(pred_test, test_ratings),4))

In [17]:
#Function to calculate rmse
def rmse(prediction, test_matrix):
    return sqrt(mean_squared_error(prediction, test_matrix))

In [18]:
#Function to calculate mae
def mae(prediction, test_matrix):
    return mean_absolute_error(prediction, test_matrix)

In [19]:
score(5)

  pred = mean + k_sim.dot(ratings_diff)/(np.abs(k_sim).sum())


RMSE:  1.1564
MAE:  0.8695


# Model built using surprise package

In [20]:
# Load Surprise libraries
from surprise import KNNWithMeans
from surprise import Reader
from surprise import Dataset
from surprise import accuracy

In [21]:
# Read the data into a Surprise dataset
reader = Reader(rating_scale = (1, 5))
data_train = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
data_test = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)

In [22]:
# Build full trainset
data_train = data_train.build_full_trainset()
data_test = data_test.build_full_trainset()

In [23]:
mean = data_train.global_mean
print('Train rating', mean)

Train rating 3.5399152683169963


In [24]:
mean = data_test.global_mean
print('Test rating', mean)

Test rating 3.645603576751118


In [25]:
# Create the trainset and testset
data_trainset = data_train.build_testset()
data_testset = data_test.build_testset()

In [26]:
# Create kNN algorithms with cosine similarity for user-based CF
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNWithMeans(sim_options=sim_options)

In [27]:
# Train the algorithm on the trainset
user_based = algo.fit(data_train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [28]:
# Importing pickle file
filename = 'user-based_CF.pickle'
pickle.dump(user_based, open(filename, 'wb'))

In [29]:
#Predicting for trainset
train_pred = algo.test(data_trainset)

In [30]:
# Calculate RMSE and MAE for trainset
accuracy.rmse(train_pred)
accuracy.mae(train_pred)

RMSE: 0.7842
MAE:  0.5916


0.5916013111205957

In [31]:
# Predicting for testset
test_pred = algo.test(data_testset)

In [32]:
# Calculate RMSE and MAE for testset
accuracy.rmse(test_pred)
accuracy.mae(test_pred)

RMSE: 0.9336
MAE:  0.7125


0.7124530038365493

In [33]:
# Predictions for trainset
predict_train = []
for i in range(len(train_pred)):
    temp = train_pred[i].est
    temp1 = round(temp,2)
    predict_train.append(temp1)

In [34]:
# Saving predictions
predict_train = pd.DataFrame(predict_train, columns=['predicted_rating'])

In [35]:
# Converting to dataframe
predict_train_df = pd.DataFrame(predict_train, columns=['predicted_rating'])

In [36]:
#loading onto train_df
df1 = train.join(predict_train_df)

In [37]:
#Export the train dataframe with the predicted ratings 
df1.to_csv("train_df_cf_user.csv", index=False)

In [38]:
# Predicting for testset
predict_test = []
for i in range(len(test_pred)):
    temp = test_pred[i].est
    temp1 = round(temp,2)
    predict_test.append(temp1)

In [39]:
# Saving predictions
predict_test_df = pd.DataFrame(predict_test, columns=['predicted_rating'])

In [40]:
# Converting to dataframe
df2 = test.join(predict_test_df)

In [41]:
#Export the test dataframe with the predicted ratings 
df2.to_csv("test_df_cf_user.csv", index=False)

# Model Selection

Model without surprise package: RMSE score = 1.1864

Model without surprise package: RMSE score = 0.9363

Since, the model built with surprise package gives a better performance score, we will be using the surprise package to build the final model