COS80024

DATA SCIENCE PROJECT 1

PROJECT 4: MOVIE RECOMMENDATION SYSTEM

# S3.4.3: Model-based Collaborative Filter  (Executor: Nakib)

This task aims to develop, select, train and tune the parameters of a model-based collaborative filter that uses matrix factorisation (the SVD algorithm from the Surprise library).

Task Leader: Promita

In [1]:
# Load the Python libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import time

# Load Surprise libraries
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import accuracy

# Load plotting libraries
import matplotlib.pyplot as plt

In [2]:
# Read the training and testing rating tables from CSV files in the working directory:
#   train <- 'train_df.csv'
#   test  <- 'test_df.csv'
train, test = (pd.read_csv(csv_name) for csv_name in ('train_df.csv', 'test_df.csv'))

In [3]:
# Keep only the three columns the recommender needs: user ID, movie ID and rating
train_data = train.loc[:, ['userId','movieId','rating']]

In [4]:
# Keep only the three columns the recommender needs: user ID, movie ID and rating
test_data = test.loc[:, ['userId','movieId','rating']]

In [5]:
# Wrap both rating frames as Surprise datasets that share a single Reader.
# NOTE(review): rating_scale=(1, 5) is assumed to cover every rating in the files —
# confirm there are no ratings below 1 (e.g. half-star values) before relying on it.
reader = Reader(rating_scale=(1, 5))
data_train, data_test = (
    Dataset.load_from_df(ratings_frame[['userId', 'movieId', 'rating']], reader)
    for ratings_frame in (train_data, test_data)
)

In [6]:
#Build full trainset
# Rebinds data_train / data_test to the objects returned by build_full_trainset();
# from here on the names no longer refer to the Dataset objects created above.
# NOTE(review): because the names are reused, re-running this cell on its own will
# presumably fail (the rebound objects are not Datasets) — re-run the loading cell first.
data_train = data_train.build_full_trainset()
data_test = data_test.build_full_trainset()

In [7]:
# Report the `global_mean` attribute of the training trainset (printed as 'Train rating').
# The name `mean` is reassigned by the test-set cell that follows.
mean = data_train.global_mean
print('Train rating', mean)

Train rating 3.5399152683169963


In [8]:
# Report the `global_mean` attribute of the test-side trainset (printed as 'Test rating').
# This overwrites the `mean` value computed for the training data in the previous cell.
mean = data_test.global_mean
print('Test rating', mean)

Test rating 3.645603576751118


In [9]:
# Turn both trainset objects into the rating lists that algo.test() is called with later:
# data_trainset is used for in-sample evaluation, data_testset for held-out evaluation.
data_trainset, data_testset = (
    full_set.build_testset() for full_set in (data_train, data_test)
)

In [59]:
#Create SVD algorithm with 5 factors
k_factors = 5
# Fixed seed so the random initialisation of the factors (init_mean / init_std_dev)
# is the same on every run and the reported metrics can be reproduced.
random_seed = 42
# NOTE(review): reg_all=0 switches regularisation off while training for 200 epochs;
# the RMSE recorded in this notebook is much lower on train than on test, which may
# point to overfitting — consider tuning reg_all / n_epochs.
algo = SVD(
    n_factors=k_factors,
    n_epochs=200,
    biased=True,
    lr_all=0.005,
    reg_all=0,
    init_mean=0,
    init_std_dev=0.01,
    random_state=random_seed,
)

In [60]:
#Train the algorithm on the trainset
# NOTE(review): Surprise's fit() presumably returns the fitted algorithm itself, so
# model_based and algo would be the same object — confirm before relying on either name.
model_based = algo.fit(data_train)

In [61]:
#Exporting pickle file
filename = 'model-based_CF.pickle'
# The context manager closes the file handle (flushing the pickle to disk) even if
# pickle.dump raises; the previous bare open() call never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(model_based, model_file)

In [62]:
#Calculate RMSE and MAE for training dataset
# These are in-sample errors: data_trainset holds the same ratings the model was fitted on.
train_pred = algo.test(data_trainset)
accuracy.rmse(train_pred)
# Kept as the last expression so the cell also echoes the returned MAE value as its output
accuracy.mae(train_pred)

RMSE: 0.6144
MAE:  0.4508


0.4508355237046407

In [63]:
#Storing predicted rating for the train dataset
# algo.test() returns predictions in the order of data_trainset, which was produced by
# build_testset() and is not guaranteed to follow train_data's row order
# (NOTE(review): Surprise appears to group ratings by user — confirm). Joining the
# estimates positionally could therefore attach a prediction to the wrong row, so each
# row's estimate is looked up by its (userId, movieId) key instead.
est_by_pair = {(pred.uid, pred.iid): round(pred.est, 2) for pred in train_pred}
pred_rating = [
    est_by_pair[(user_id, movie_id)]
    for user_id, movie_id in zip(train_data['userId'], train_data['movieId'])
]
# Reuse train_data's index so join() lines the rows up even if the index is not 0..n-1
pred_rating_df = pd.DataFrame(pred_rating, columns=['pred_rating'], index=train_data.index)
df = train_data.join(pred_rating_df)

In [64]:
#Export the train dataframe with the predicted ratings 
# index=False leaves the DataFrame index out of the file, so only the data columns are written
df.to_csv("train_df_cf_model.csv", index=False)

In [65]:
#Calculate RMSE and MAE for test dataset
# data_testset is built from the test file, so these are the held-out errors.
test_pred = algo.test(data_testset)
accuracy.rmse(test_pred)
# Kept as the last expression so the cell also echoes the returned MAE value as its output
accuracy.mae(test_pred)

RMSE: 0.9554
MAE:  0.7236


0.7236316307697901

In [66]:
#Storing predicted rating for the test dataset
# algo.test() returns predictions in the order of data_testset, which was produced by
# build_testset() and is not guaranteed to follow test_data's row order
# (NOTE(review): Surprise appears to group ratings by user — confirm). Joining the
# estimates positionally could therefore attach a prediction to the wrong row, so each
# row's estimate is looked up by its (userId, movieId) key instead.
est_by_pair = {(pred.uid, pred.iid): round(pred.est, 2) for pred in test_pred}
pred_rating = [
    est_by_pair[(user_id, movie_id)]
    for user_id, movie_id in zip(test_data['userId'], test_data['movieId'])
]
# Reuse test_data's index so join() lines the rows up even if the index is not 0..n-1
pred_rating_df = pd.DataFrame(pred_rating, columns=['pred_rating'], index=test_data.index)
df1 = test_data.join(pred_rating_df)

In [18]:
#Export the test dataframe with the predicted ratings 
# index=False leaves the DataFrame index out of the file, so only the data columns are written
df1.to_csv("test_df_cf_model.csv", index=False)