# Project 5 - Social Recommendation Systems (CiaoDVD Dataset)
## Continuation
### Tasks: 2. Implementation
### Task 2.2 Collaborative Filtering Recommender with Probabilistic Matrix Factorization.


In [19]:
import numpy as np
import time
import logging

from support import data_loading_analysis as dla
import model_based_cf as mbcf
from support import evaluation_metrics as em

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Root logger: emit INFO and above, formatted as the bare message text
logging.basicConfig(level=logging.INFO, format='%(message)s')

In [20]:
# Results log, opened in append mode so that repeated runs accumulate in the same file.
# NOTE(review): no visible cell closes this handle -- confirm it is closed once the
# experiments are finished.
results_pmf = open('results/results_pmf1', mode='a+')

# Load the CiaoDVD movie ratings and report the dataset statistics
path = 'Ciao-DVD-Datasets/movie-ratings.txt'
data = dla.load_dataset(path, pmf=True)
dla.get_information(data)
# The genreID column is not required for this analysis, so drop it
data = data.drop(columns='genreID')

Users: 17615
Movies: 16121
Categories: 17
Ratings count: 72665
Density: 0.000256
Sparsity: 0.999744



In [21]:
# Split the ratings into train (60%), validation (20%) and test (20%) sets.
# The split itself is delegated to the project helper in support.data_loading_analysis.
train_data, validation_data, test_data = dla.train_validate_test_split_pmf(data)
# Bare expressions: with ast_node_interactivity = "all" each shape is shown as its own output
train_data.shape
validation_data.shape
test_data.shape

(43599, 3)

(14533, 3)

(14533, 3)

In [22]:
# Matrix dimensions: one row per distinct user, one column per distinct movie
num_users = data.userId.unique().shape[0]
num_items = data.movieId.unique().shape[0]

# Dense user x movie rating matrix; cells without a training rating stay at 0
rating_matrix = np.zeros((num_users, num_items))

# Write every training rating with a single vectorised indexed assignment instead of a
# Python-level loop over each row (same result; a repeated (user, movie) pair keeps
# the last rating, exactly as the row-by-row loop did).
# NOTE(review): assumes columns 0 / 1 of train_data hold zero-based user / movie indices
# smaller than num_users / num_items -- confirm against dla.load_dataset(pmf=True).
user_idx = train_data[:, 0].astype(int)
item_idx = train_data[:, 1].astype(int)
rating_matrix[user_idx, item_idx] = train_data[:, 2].astype(float)
rating_matrix.shape

(17615, 16121)

In [23]:
# Hyper-parameters shared by every PMF run in the sweep
lambda_u = 0.02
lambda_v = 0.02
latent_dims = (5, 50)   # (first, last) latent dimension of the sweep, both inclusive
learn_rate = 0.005
# NOTE(review): the saved output of this cell reports 10000 iterations, so it was
# produced with a different value than the one below -- confirm which is intended.
num_iters = 1000
bounds = (1, 5)         # (low, high) interval that predictions are clamped to

# Build the summary once so the console and the results file get identical text
params_summary = ('Parameters for the model are: \nlambda_u: {:f}, \nlambda_v: {:f}, '
                  '\nLearning rate: {:f}, \nNumber of iterations: {:d}'
                  ).format(lambda_u, lambda_v, learn_rate, num_iters)
print(params_summary)
# Saving the parameters to a file
print(params_summary, file=results_pmf)

Parameters for the model are: 
lambda_u: 0.020000, 
lambda_v: 0.020000, 
Learning rate: 0.005000, 
Number of iterations: 10000


In [24]:
# Sweep the latent dimension over the inclusive range given by latent_dims: for each value
# a PMF model is trained, timed, scored on the test set, and the figures are both echoed
# and appended to results_pmf.
low, high = bounds  # rating bounds are loop-invariant, so unpack them once
for latent_dim in range(latent_dims[0], latent_dims[1] + 1):
    # Constructing the PMF model
    logging.info('\nBuilding the PMF model with {:d} latent dimensions....'.format(latent_dim))
    # Saving the latent dimensions to a file
    print('\nPMF model with {:d} latent dimensions....'.format(latent_dim), file=results_pmf)

    time_start = time.time()
    pmf_model = mbcf.PMF(rating_matrix=rating_matrix, lambda_u=lambda_u, lambda_v=lambda_v, latent_dim=latent_dim,
                         learn_rate=learn_rate, momentum=0.9, num_iters=num_iters, seed=1)
    U, V = pmf_model.train(train_data=train_data, validation_data=validation_data)
    time_elapsed = time.time() - time_start
    logging.info('Completed model building in {0:.5f} seconds'.format(time_elapsed))

    # Saving the build time to a file
    print('Time to build model: {0:.5f} seconds'.format(time_elapsed), file=results_pmf)

    logging.info('Testing the PMF model with {:d} latent dimensions....'.format(latent_dim))
    time_start = time.time()
    predictions = pmf_model.predict(data=test_data)
    time_elapsed = time.time() - time_start
    logging.info('Completed model testing in {0:.5f} seconds'.format(time_elapsed))
    # Saving the test time to a file
    print('Time to test model: {0:.5f} seconds'.format(time_elapsed), file=results_pmf)

    # Clamp the predictions so that they lie within the rating bounds
    predictions = np.clip(predictions, low, high)

    # Calculating the RMSE and MAE between the test data and the predicted data
    test_rmse = em.RMSE(test_data[:, 2], predictions)
    test_mae = em.MAE(test_data[:, 2], predictions)

    print('RMSE on test data: {:f}'.format(test_rmse))
    print('MAE on test data: {:f}'.format(test_mae))

    # Saving the errors to a file
    print('RMSE on test data: {:f}'.format(test_rmse), file=results_pmf)
    print('MAE on test data: {:f}'.format(test_mae), file=results_pmf)

    # Push this model's results to disk now: the sweep runs for a long time and the file
    # handle is never closed here, so buffered lines would be lost on an interrupt.
    results_pmf.flush()

th 20 latent dimensions....
RMSE on test data: 2.668683
MAE on test data: 2.302857
Iteration: 1, Loss: 725574.617336, Validation RMSE: 4.095505, Time: 20.360283 seconds
Iteration: 2, Loss: 649993.327413, Validation RMSE: 3.907433, Time: 19.636598 seconds
Iteration: 3, Loss: 578906.657012, Validation RMSE: 3.809660, Time: 19.282868 seconds
Iteration: 4, Loss: 515569.701573, Validation RMSE: 3.683550, Time: 18.650082 seconds
Iteration: 5, Loss: 441893.338311, Validation RMSE: 3.484563, Time: 18.460482 seconds
Iteration: 6, Loss: 403159.092341, Validation RMSE: 3.392323, Time: 18.137632 seconds
Completed model building in 135.98212 seconds
Testing the PMF model with 20 latent dimensions....
Completed model testing in 0.02600 seconds

Building the PMF model with 21 latent dimensions....
Iteration: 7, Loss: 379375.356846, Validation RMSE: 3.412982, Time: 19.170453 seconds
The model converged at iteration:  7
RMSE on test data: 2.659314
MAE on test data: 2.290871
Iteration: 1, Loss: 723276.9