In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
from surprise.reader import Reader
from surprise import SVD, Dataset, accuracy, SVDpp
from sklearn.decomposition import PCA
np.random.seed(69)
%matplotlib inline

In [4]:
# Column layouts of the tab-separated, header-less input files.
MOVIE_COLUMNS = [
    "Movie Id", "Movie Title",
    # One column per genre.
    "Unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
RATING_COLUMNS = ["User Id", "Movie Id", "Rating"]


def _read_tsv(path, columns):
    """Read a tab-separated file without a header row, labelling its columns."""
    return pd.read_csv(path, delimiter='\t', header=None, names=columns)


# Movie metadata.
movie_df = _read_tsv('data/movies_c.txt', MOVIE_COLUMNS)

# Ratings: one full file plus separate train and test files.
data_df = _read_tsv('data/data_c.txt', RATING_COLUMNS)
train_df = _read_tsv('data/train_c.txt', RATING_COLUMNS)
test_df = _read_tsv('data/test_c.txt', RATING_COLUMNS)

# 5)

In [16]:
# Train a Surprise SVD model on the training ratings and report its test error.

# Raw training triples (User Id, Movie Id, Rating) as a NumPy array.
Y = train_df.values

# Wrap the training frame for Surprise; ratings are declared to lie in [1, 5].
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df, reader)
trainset = data.build_full_trainset()

# `data` is rebound here: from this point on it refers to the *test* dataset.
data = Dataset.load_from_df(test_df, reader)
testset = data.construct_testset(data.raw_ratings)

# Toggle for the bias terms; later cells branch on this flag as well.
biased=False
algo = SVD(n_factors = 20, n_epochs = 10, reg_all = 1e-3, biased=biased)
algo.fit(trainset)

# Learned latent factors: pu holds the user factors, qi the item factors.
# NOTE(review): Surprise indexes these rows by its own inner ids, not by the
# raw ids stored in train_df/test_df -- confirm before indexing them with Y.
U = algo.pu
V = algo.qi
if biased:
    # User biases, item biases and the global offset (mean training rating).
    # A, B and mu are only defined when biased is True.
    A = algo.bu
    B = algo.bi
    mu = Y[:, 2].mean()

# accuracy.rmse prints the RMSE and returns it; squaring the return value
# makes the cell display the mean squared error on the test set.
predictions = algo.test(testset)
accuracy.rmse(predictions)**2

RMSE: 0.9687


0.9384473950212315

In [17]:
def get_err_biased(U, V, A, B, mu, Y, reg=0.0):
    """
    Mean regularized squared error of a biased matrix-factorization model.

    Y_ij is estimated as mu + A[i] + B[j] + dot(U[i], V[j]), i.e. the dot
    product of the ith row of U and the jth column of V^T plus the bias terms.

    Parameters
    ----------
    U : array-like, shape (n_users, k)
        User factor matrix, one row per user.
    V : array-like, shape (n_movies, k)
        Movie factor matrix, one row per movie.
    A : array-like, shape (n_users,)
        Per-user biases.
    B : array-like, shape (n_movies,)
        Per-movie biases.
    mu : float
        Overall rating offset.
    Y : array-like, shape (n_ratings, 3)
        Triples (i, j, Y_ij) where i is a row index into U/A, j is a row index
        into V/B and Y_ij is user i's rating of movie j. The index columns may
        be stored as floats (as happens when a DataFrame with a float rating
        column is converted with ``.values``); they are cast to int.
    reg : float, optional
        Regularization strength applied to U, V, A and B.

    Returns
    -------
    float
        (reg/2 * (||U||^2 + ||V||^2 + ||A||^2 + ||B||^2)
         + 1/2 * sum of squared residuals) / len(Y)

    Raises
    ------
    ValueError
        If Y is not a sequence of 3-element rows.
    """
    U = np.asarray(U)
    V = np.asarray(V)
    A = np.asarray(A)
    B = np.asarray(B)
    Y = np.asarray(Y)
    if Y.size == 0:
        # Keep a well-formed (0, 3) shape so the column slicing below works.
        Y = Y.reshape(0, 3)
    if Y.ndim != 2 or Y.shape[1] != 3:
        raise ValueError("Y must be a sequence of (i, j, Y_ij) triples")

    # Cast explicitly: a float-typed Y would otherwise be rejected as an index.
    users = Y[:, 0].astype(int)
    movies = Y[:, 1].astype(int)
    ratings = Y[:, 2]

    # Row-wise dot products for the rated pairs only; this avoids building the
    # full dense users x movies prediction matrix.
    interactions = np.einsum('ij,ij->i', U[users], V[movies])
    residuals = ratings - mu - (interactions + A[users] + B[movies])
    err = np.dot(residuals, residuals)

    penalty = (np.linalg.norm(U) ** 2 + np.linalg.norm(V) ** 2
               + np.linalg.norm(A) ** 2 + np.linalg.norm(B) ** 2)
    return (reg / 2 * penalty + 0.5 * err) / len(Y)

def get_err(U, V, Y, reg=0.0):
    """
    Mean regularized squared error of an unbiased matrix-factorization model.

    Y_ij is estimated as dot(U[i], V[j]), i.e. the dot product of the ith row
    of U and the jth column of V^T.

    Parameters
    ----------
    U : array-like, shape (n_users, k)
        User factor matrix, one row per user.
    V : array-like, shape (n_movies, k)
        Movie factor matrix, one row per movie.
    Y : array-like, shape (n_ratings, 3)
        Triples (i, j, Y_ij) where i is a row index into U, j is a row index
        into V and Y_ij is user i's rating of movie j. The index columns may
        be stored as floats (as happens when a DataFrame with a float rating
        column is converted with ``.values``); they are cast to int.
    reg : float, optional
        Regularization strength applied to U and V.

    Returns
    -------
    float
        (reg/2 * (||U||^2 + ||V||^2) + 1/2 * sum of squared residuals) / len(Y)

    Raises
    ------
    ValueError
        If Y is not a sequence of 3-element rows.
    """
    U = np.asarray(U)
    V = np.asarray(V)
    Y = np.asarray(Y)
    if Y.size == 0:
        # Keep a well-formed (0, 3) shape so the column slicing below works.
        Y = Y.reshape(0, 3)
    if Y.ndim != 2 or Y.shape[1] != 3:
        raise ValueError("Y must be a sequence of (i, j, Y_ij) triples")

    # Cast explicitly: a float-typed Y would otherwise be rejected as an index.
    users = Y[:, 0].astype(int)
    movies = Y[:, 1].astype(int)
    ratings = Y[:, 2]

    # Row-wise dot products for the rated pairs only; this avoids building the
    # full dense users x movies prediction matrix.
    residuals = ratings - np.einsum('ij,ij->i', U[users], V[movies])
    err = np.dot(residuals, residuals)

    penalty = np.linalg.norm(U) ** 2 + np.linalg.norm(V) ** 2
    return (reg / 2 * penalty + 0.5 * err) / len(Y)

In [18]:
# Report the (unregularized) mean error on the training and test ratings.
# NOTE(review): U and V come from Surprise, which indexes factor rows by its
# own inner ids, while Y and Y_test carry the raw ids from the data files.
# Unless the two numberings coincide, the ids should be mapped with
# trainset.to_inner_uid / trainset.to_inner_iid before scoring -- confirm.
Y_test = test_df.values
if biased:
    # The biased model needs the bias-aware error; get_err only accepts
    # (U, V, Y, reg) and would raise TypeError with these arguments.
    print(get_err_biased(U, V, A, B, mu, Y))
    print(get_err_biased(U, V, A, B, mu, Y_test))
else:
    print(get_err(U, V, Y))
    print(get_err(U, V, Y_test))

1.0141860216178074
1.0415904214278142


In [8]:
# Fit a 3-component PCA on the movie factors V, then project the user factors
# U with the same fitted components so both share one coordinate system.
pca = PCA(n_components=3)
V_p = pca.fit_transform(V)
U_p = pca.transform(U)