In [1]:
# install Surprise for this notebook

!pip install surprise



In [2]:
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
import pandas as pd
import json

from surprise import \
SVD, \
NMF, \
CoClustering

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the project files
# under that path can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Root folder of the project on the mounted Drive. Keep the trailing slash:
# file paths are built by plain string concatenation (PROJECT_DIR + '...').
PROJECT_DIR = "/content/drive/My Drive/Project_EnsembleLearning/"

In [5]:
# Read the training split and keep only the three columns used downstream,
# in (user, item, rating) order.

df_train = pd.read_csv(
    PROJECT_DIR + 'dataset_split/training_set.csv'
).loc[:, ['userId', 'movieId', 'rating']]

df_train.head()

Unnamed: 0,userId,movieId,rating
0,1,481,3.5
1,1,1591,1.5
2,1,2478,4.0
3,1,2840,3.0
4,1,3698,3.5


In [6]:
# Derive the rating scale from the training data instead of hard-coding (1, 5):
# the ratings come in half-star steps (e.g. 1.5, 3.5), so the real minimum may
# lie below 1, and Surprise clips every prediction to this range.
reader = Reader(rating_scale=(float(df_train['rating'].min()),
                              float(df_train['rating'].max())))

In [7]:
# Wrap the training frame in a Surprise Dataset and turn all of its rows into
# a single trainset (no folds are split off here).
trainset = (
    Dataset
    .load_from_df(df_train, reader)
    .build_full_trainset()
)

In [8]:
# pick algorithm for training

# All three algorithms start from randomly initialised parameters, so fix the
# seed to make a Restart & Run All reproduce the same fitted models.
RANDOM_STATE = 42

model_svd = SVD(random_state=RANDOM_STATE)
model_nmf = NMF(random_state=RANDOM_STATE)
model_cocluster = CoClustering(random_state=RANDOM_STATE)

In [9]:
# training the models

# Fit SVD on the full training set. fit() hands back the fitted algorithm
# object, which is why its repr appears as the cell output.
model_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f989c2604a8>

In [10]:
# Fit the NMF model on the same full training set.
model_nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f989c260908>

In [11]:
# Fit the CoClustering model on the same full training set.
model_cocluster.fit(trainset)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x7f989c260c88>

In [12]:
# Read the validation split with the same three columns, in the same order,
# as the training frame.

df_validate = pd.read_csv(
    PROJECT_DIR + 'dataset_split/validation_set.csv'
).loc[:, ['userId', 'movieId', 'rating']]

df_validate.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,1590,2.5
2,1,3424,4.5
3,2,170,3.5
4,2,1296,4.5


In [13]:
# Accumulators filled by get_predicted_ratings, one per algorithm. Each entry
# is [userId, movieId, predicted_rating].
predicted_attributes_svd = []
predicted_attributes_nmf = []
predicted_attributes_cocluster = []


def get_predicted_ratings(x, algo):
  """Predict one rating with a trained model and record the result.

  Args:
    x: one validation row whose first two positions hold the user id and the
      movie id -- e.g. the Series that ``DataFrame.apply(..., axis=1)`` passes
      in, or any plain sequence.
    algo: name of the trained model to use: "SVD", "NMF" or "CoClustering".

  Returns:
    None. ``[userId, movieId, predicted_rating]`` is appended to the
    module-level list that belongs to ``algo``.

  Raises:
    ValueError: if ``algo`` is not one of the three supported names.
  """
  # Resolve the model and its output list together so the two cannot drift
  # apart; only the selected branch touches its model global.
  if algo == "SVD":
    model, collected = model_svd, predicted_attributes_svd
  elif algo == "NMF":
    model, collected = model_nmf, predicted_attributes_nmf
  elif algo == "CoClustering":
    model, collected = model_cocluster, predicted_attributes_cocluster
  else:
    raise ValueError(
        "unknown algo %r; expected 'SVD', 'NMF' or 'CoClustering'" % (algo,))

  # Access by position explicitly: integer keys on a labelled Series are
  # deprecated as positional lookups, while plain sequences have no .iloc.
  row = x.iloc if hasattr(x, "iloc") else x

  # The prediction is a tuple laid out as (uid, iid, true rating, estimate, ...).
  uid, iid, _true_rating, estimate = model.predict(row[0], row[1])[:4]

  # Ids can come back as floats (a row taken from a mixed int/float frame is
  # upcast), so store them as ints again.
  collected.append([int(uid), int(iid), estimate])

In [14]:
# predict ratings for the validation set using the trained SVD model

# get_predicted_ratings appends to a module-level list, so empty it first;
# otherwise re-running this cell would stack a second copy of every prediction.
predicted_attributes_svd.clear()
df_validate.apply(lambda x: get_predicted_ratings(x, "SVD"), axis=1)

df_result_svd = pd.DataFrame.from_records(predicted_attributes_svd,
                                      columns=['userId', 'movieId', 'predicted_rating'])
# Predictions were appended in row order, so attach the true ratings by position.
df_result_svd['rating'] = df_validate['rating'].values

df_result_svd.head()

Unnamed: 0,userId,movieId,predicted_rating,rating
0,1,307,4.048858,3.5
1,1,1590,3.039132,2.5
2,1,3424,3.895035,4.5
3,2,170,3.021717,3.5
4,2,1296,4.037089,4.5


In [15]:
# predict ratings for the validation set using the trained NMF model

# get_predicted_ratings appends to a module-level list, so empty it first;
# otherwise re-running this cell would stack a second copy of every prediction.
predicted_attributes_nmf.clear()
df_validate.apply(lambda x: get_predicted_ratings(x, "NMF"), axis=1)

df_result_nmf = pd.DataFrame.from_records(predicted_attributes_nmf,
                                      columns=['userId', 'movieId', 'predicted_rating'])
# Predictions were appended in row order, so attach the true ratings by position.
df_result_nmf['rating'] = df_validate['rating'].values

df_result_nmf.head()

Unnamed: 0,userId,movieId,predicted_rating,rating
0,1,307,3.717875,3.5
1,1,1590,3.014896,2.5
2,1,3424,4.330001,4.5
3,2,170,3.017285,3.5
4,2,1296,3.999259,4.5


In [16]:
# predict ratings for the validation set using the trained CoClustering model

# get_predicted_ratings appends to a module-level list, so empty it first;
# otherwise re-running this cell would stack a second copy of every prediction.
predicted_attributes_cocluster.clear()
df_validate.apply(lambda x: get_predicted_ratings(x, "CoClustering"), axis=1)

df_result_cocluster = pd.DataFrame.from_records(predicted_attributes_cocluster,
                                      columns=['userId', 'movieId', 'predicted_rating'])
# Predictions were appended in row order, so attach the true ratings by position.
df_result_cocluster['rating'] = df_validate['rating'].values

df_result_cocluster.head()

Unnamed: 0,userId,movieId,predicted_rating,rating
0,1,307,3.333259,3.5
1,1,1590,2.476887,2.5
2,1,3424,2.994355,4.5
3,2,170,3.161215,3.5
4,2,1296,3.709965,4.5


In [17]:
# Write each model's validation predictions to its own CSV under the project
# folder, leaving out the DataFrame index.

for result_frame, relative_path in (
    (df_result_svd, 'predictions/prediction_svd.csv'),
    (df_result_nmf, 'predictions/prediction_nmf.csv'),
    (df_result_cocluster, 'predictions/prediction_cocluster.csv'),
):
    result_frame.to_csv(PROJECT_DIR + relative_path, index=False)