Off-the-shelf method: collaborative filtering with the Surprise library (the scikit-surprise package).

The Surprise library offers matrix-factorization-based prediction algorithms such as SVD, SVD++, and NMF.

In [1]:
import numpy as np
from surprise import Dataset, evaluate, Reader, accuracy
from surprise.prediction_algorithms import predictions, matrix_factorization
from surprise.model_selection import cross_validate, GridSearchCV

In [2]:
# Both ratings files are tab-separated; one Reader parses them both.
r = Reader(sep='\t')

# Training ratings: keep the Dataset itself and the Trainset built from all of it.
train_data = Dataset.load_from_file('data/train.txt', reader=r)
train_Y = train_data.build_full_trainset()

# Test ratings: wrap every rating in a Trainset, then turn that into the
# testset (what `model.test()` is called with in the cells below).
test_Y = Dataset.load_from_file('data/test.txt', reader=r).build_full_trainset()
test_data = test_Y.build_testset()

In [None]:
# Three matrix-factorization candidates, all with library-default settings.
svd = matrix_factorization.SVD()      # plain SVD
svdpp = matrix_factorization.SVDpp()  # SVD++: SVD extended to take implicit ratings into account
nmf = matrix_factorization.NMF()      # non-negative matrix factorization

# Train each candidate on the training set and report its test-set RMSE.
for model in (svd, svdpp, nmf):
    print(model.fit(train_Y))  # fit() returns the fitted model, so this prints its repr
    accuracy.rmse(model.test(test_data))  # prints the RMSE
    print()

# Results: (SVD++ algo had best RMSE)
# <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x0000015C1957C240>
# RMSE: 0.9316

# <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x0000015C1957C9E8>
# RMSE: 0.9098

# <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x0000015C1957CBE0>
# RMSE: 0.9566

In [None]:
# Use grid search to find the best hyperparameters for the SVD++ algorithm:
# the number of latent factors and the number of training epochs.
params = {'n_factors': [5, 10, 20, 40, 50], 'n_epochs': [5, 10, 20, 40]}

# 3-fold cross-validation; every parameter combination is scored by RMSE and MAE.
grid = GridSearchCV(matrix_factorization.SVDpp, params, measures=['rmse', 'mae'], cv=3)

# GridSearchCV.fit() builds the cross-validation folds itself, so it must be
# given the Dataset (train_data). A Trainset such as train_Y has already been
# converted to inner ids and carries no raw ratings to split.
grid.fit(train_data)

In [None]:
# Best cross-validated RMSE, then the parameter combination that achieved it.
print(grid.best_score['rmse'], grid.best_params['rmse'], sep='\n')
# Recorded run:
#   RMSE: 0.9296894324594304
#   {'n_factors': 5, 'n_epochs': 20}

In [None]:
# Retrain SVD++ on the full training set with the grid-search winner
# (n_factors=5, n_epochs=20), then score it on the test set.
model = matrix_factorization.SVDpp(n_epochs=20, n_factors=5).fit(train_Y)
accuracy.rmse(model.test(test_data))  # RMSE: 0.9201790944404813

In [3]:
def matrix_factor(train_Y, n_factors=5, n_epochs=20):
    """Fit an SVD++ model and return its latent factors as k x m and k x n matrices.

    Args:
        train_Y: Surprise trainset to fit on (e.g. the result of
            ``Dataset.build_full_trainset()``).
        n_factors: Number of latent factors k. Defaults to 5, the value
            selected by the grid search.
        n_epochs: Number of training epochs. Defaults to 20, the value
            selected by the grid search.

    Returns:
        A tuple ``(U, V)``:
            U: k x m array, the transposed user factors (``model.pu``).
            V: k x n array, the transposed item factors (``model.qi``).
        m and n are the numbers of users and items present in ``train_Y``.
        Columns follow Surprise's inner ids, not the raw ids of the data
        file; use ``train_Y.to_raw_uid`` / ``train_Y.to_raw_iid`` to map back.
    """
    # train/fit model
    model = matrix_factorization.SVDpp(n_factors=n_factors, n_epochs=n_epochs)
    model.fit(train_Y)

    # model.pu is (m, k); transpose so that each column is one user
    U = np.transpose(model.pu)
    print("U shape:", U.shape)  # k x m

    # model.qi is (n, k); transpose so that each column is one item
    V = np.transpose(model.qi)
    print("V shape:", V.shape)  # k x n

    return U, V
    
def SVD(U, V):
    """Project U and V into 2-D using the two leading left singular vectors of V.

    Decomposes ``V = A @ diag(s) @ B`` and keeps the first two columns of A
    (the directions belonging to the two largest singular values) as a
    k x 2 projection basis.

    Args:
        U: k x m array (one column per user).
        V: k x n array (one column per item).

    Returns:
        A tuple ``(U_tilda, V_tilda)`` of shapes 2 x m and 2 x n, the
        projections ``A[:, :2].T @ U`` and ``A[:, :2].T @ V``.

    Raises:
        IndexError: if V has fewer than two rows or fewer than two columns,
            in which case two projection directions do not exist.
    """
    # Only A is needed. The reduced SVD gives the same leading columns of A
    # without building the unused n x n right-singular matrix.
    A, _, _ = np.linalg.svd(V, full_matrices=False)
    # take first two columns of A
    A = A[:, [0, 1]]  # k x 2
    print("A shape: " + str(A.shape))

    basis_T = np.transpose(A)  # 2 x k

    # project U into 2-D space
    U_tilda = np.dot(basis_T, U)
    print("U shape: " + str(U_tilda.shape))  # 2 x m

    # project V into 2-D space
    V_tilda = np.dot(basis_T, V)
    print("V shape: " + str(V_tilda.shape))  # 2 x n

    return U_tilda, V_tilda

In [4]:
U, V = matrix_factor(train_Y)

U shape: (5, 943)
V shape: (5, 1668)


In [5]:
U_tilda, V_tilda = SVD(U, V)

A shape: (5, 2)
U shape: (2, 943)
V shape: (2, 1668)
