# UV decomposition

Import the necessary packages

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_squared_error

**Data parsing**

Parse the data and retrieve the ratings

In [5]:
#Read the <userID, movieID, rating> triples from the ratings file (fields are separated by '::').
data = np.genfromtxt(
    './ratings.dat',
    dtype = int,
    delimiter = '::',
    usecols = (0, 1, 2),
)

#Wrap the raw array in a data frame with named columns.
ratings = pd.DataFrame(data, columns = ['user', 'movie', 'rating'])

In [6]:
#Build the user x movie utility matrix from the full set of ratings: one row per user, one column per movie.
#User/movie pairs without a rating are left as NaN by the pivot.
rating_mat = ratings.pivot(
    values = 'rating',
    index = 'user',
    columns = 'movie',
)

In [7]:
#Switch to numpy containers, which are faster to work with than pandas objects.

#rating_mat_np is the sparse user x movie matrix of ratings used by the optimization algorithm.
rating_mat_np = np.array(rating_mat)

#ratings_np holds the ratings data frame, one row per rating encoded as <userID, movieID, rating>.
ratings_np = np.matrix(ratings)

**Implement the UV decomposition algorithm**

The UV matrix decomposition algorithm aims to decompose a matrix $X$ into two lower-rank matrices $U$ and $M$ such that their product is a good approximation of $X$. This method can be used to build a recommendation system that approximates the utility matrix $X$ as the matrix product of a user matrix $U$ and an item matrix $M$. The optimization treats the elements of $U$ and $M$ as variables and optimizes them one at a time. When a single element of one of the two matrices is treated as the only free variable, the squared error between the observed entries of the utility matrix $X$ and the product $UM$ is a quadratic function of that variable, so its minimizer can be computed in closed form. Once the optimal value of the variable representing one element of either $U$ or $M$ has been found, we move on to the next element of the same matrix.

We therefore optimize all elements of $U$ in turn and then all elements of $M$. A single traversal is, however, not enough to complete the optimization. Indeed, once a value has been updated, the values optimized before it may no longer be optimal and can be further improved in subsequent cycles of the algorithm, which prompts us to repeat the same procedure until convergence. In practice, we let the algorithm run for approximately 40 cycles and evaluate its performance at the end.

In [8]:
def initialize(ratings, n_users, n_movies, K):
    '''
    Build the starting matrices for the UV decomposition.

    Every entry of U (n_users x K) and M (K x n_movies) is set to sqrt(mean rating / K), so that the product U.M
    starts close to the mean rating, and is then perturbed with standard normal noise. U is drawn before M from
    the global numpy random state. Returns the tuple (U, M).
    '''
    start_value = np.sqrt(np.mean(ratings['rating'])/K)

    #The noise of U is drawn first and the noise of M second.
    noise_U = np.random.normal(0, 1, (n_users, K))
    noise_M = np.random.normal(0, 1, (K, n_movies))

    U = start_value + noise_U
    M = start_value + noise_M
    return U,M

def RMSE(ratings, X, P):
    '''
    Root mean squared error between the matrices X and P.

    Entries where the difference X - P is NaN (missing ratings) do not contribute to the sum of squares. The sum is
    divided by len(ratings): ratings holds one <userID, itemID, rating> row per recorded rating, so its length is
    the number of non-missing ratings.
    '''
    residual = X - P
    squared_error = np.nansum(residual * residual)
    return np.sqrt(squared_error/len(ratings))


def UV_decomp(ratings, X, U, M, K = 10, iter = 40):
    '''
    Implement the UV matrix decomposition algorithm.

    ratings: list of the recorded ratings; it is only forwarded to RMSE, which uses its length.
    X:       utility matrix (numpy array) in which the missing ratings are NaN.
    U, M:    initial factors of shape (rows of X, K) and (K, columns of X). They are updated IN PLACE.
    K:       number of latent features; it must match the shapes of U and M.
    iter:    number of full optimization cycles (all elements of U, then all elements of M).

    Returns the tuple (U.dot(M), res), where res is the list of the RMSE values measured after every cycle.
    '''
    res = []
    #cols_X and rows_X simply contain the indexes of the rows and columns of the
    #matrix X. They will be used for cycling.
    cols_X = np.arange(np.size(X,1))
    rows_X = np.arange(np.size(X,0))
    #Features is a vector from 0 to K-1.
    features = np.arange(K)
    #For every feature k, the indexes of all the other features. They do not depend on
    #the element being optimized, so they are computed only once.
    other_features = [np.delete(features, k) for k in features]
    #X never changes, so the mask of the observed (non-NaN) entries is also computed once.
    observed = ~ np.isnan(X)

    pred = None
    for _ in range(int(iter)):
        #Cycle across the rows and columns of U.
        for r in rows_X:
            #To implement update formulas you need to get the columns
            #of X such that their value at the row r is not NaN.
            non_missing_cols = cols_X[observed[r, :]]
            if non_missing_cols.size == 0:
                #Nothing is observed for this row: its entries of U cannot be updated.
                continue
            X_obs = X[r, non_missing_cols]
            #M is constant while U is being updated, so this slice is valid for all features.
            M_obs = M[:, non_missing_cols]
            for s in features:
                others = other_features[s]
                num = np.sum(M_obs[s] * (X_obs - U[r, others].dot(M_obs[others])))
                den = np.sum(M_obs[s]**2)
                #A zero denominator would write NaN/inf into U and corrupt every
                #later update; in that case the current value is kept.
                if den > 0:
                    U[r,s] = num/den
        #Repeat the process to update M
        for s in cols_X:
            non_missing_rows = rows_X[observed[:, s]]
            if non_missing_rows.size == 0:
                continue
            X_obs = X[non_missing_rows, s]
            #U is constant while M is being updated.
            U_obs = U[non_missing_rows]
            for r in features:
                others = other_features[r]
                num = np.sum(U_obs[:, r] * (X_obs - U_obs[:, others].dot(M[others, s])))
                den = np.sum(U_obs[:, r]**2)
                if den > 0:
                    M[r,s] = num/den
        #Compute the matrix product between the new U and M.
        pred = U.dot(M)
        res.append(RMSE(ratings, X, pred))
    if pred is None:
        #No cycle was run: the prediction is the product of the initial matrices.
        pred = U.dot(M)
    return pred, res

**Test by cross-validation**

In [11]:
#Cross validation function
def cv(ratings, K, iter):
    '''
    Estimate the training and test RMSE of the UV decomposition by 5-fold cross-validation.

    ratings: data frame with the columns 'user', 'movie' and 'rating', one row per rating.
    K:       number of latent features of the decomposition.
    iter:    number of optimization cycles run on every fold.

    The RMSE averaged over the folds is printed for the training and the test set. Nothing is returned.
    The global numpy random seed is reset to 888 at every call.
    '''
    #Set a seed for reproducibility
    np.random.seed(888)

    #Generate the 5 folds for all rows of the ratings data frame of input.
    nfolds = 5
    folds = np.arange(len(ratings)) % nfolds
    np.random.shuffle(folds)

    #Set up the vectors of errors.
    errors_5folds_rmse_training = []
    errors_5folds_rmse_test = []
    #RMSE trajectory (one value per optimization cycle) of every fold. It is only collected for
    #inspection while debugging and is not returned, so that the return value of cv stays None.
    partial_error = []

    #We will cycle through the possible folds and leave one set of observations (test set) out of the
    #training process. Then, we use it to perform predictions and compute the error.
    for i in range(nfolds):
        print('Fold {0}'.format(i))
        #Fix the rows of the ratings Data Frame that will be the training set.
        train = ratings.loc[folds!=i, :]
        train_mat = train.pivot(index='user', columns='movie', values='rating')
        #train_np is only used for its length (the number of training ratings).
        train_np = np.array(train)
        train_mat_np = np.array(train_mat)

        #Create lookup dictionary associating the indices of the numpy array to the
        #respective movieID and userID.
        col_names_train = {movie: j for j, movie in enumerate(train_mat.columns)}
        row_names_train = {user: j for j, user in enumerate(train_mat.index)}

        #Initialize U and M and run the prediction.
        U, M = initialize(train, np.size(train_mat_np,0), np.size(train_mat_np,1), K)
        pred, res = UV_decomp(train_np, train_mat_np, U, M, K = K, iter = iter)

        #Initialize test set
        test =  ratings.loc[folds == i, :]

        #The global mean of the training matrix will be used as a prediction for the observations that have a missing
        #movie or user in the training set.
        global_mean = np.mean(train['rating'])

        #Create the prediction vector of the test set. If a user or item are not present
        #in the training set, predict their rating as the global mean of the training matrix.
        test_pred = []
        for user, movie in zip(test['user'], test['movie']):
            if user in row_names_train and movie in col_names_train:
                test_pred.append(pred[row_names_train[user], col_names_train[movie]])
            else:
                test_pred.append(global_mean)

        test_pred = np.array(test_pred)
        test_obs = np.array(test['rating'])

        #Extra fallback rules to improve performance. We round to 1 all predictions lower than 1 and to 5 all prediction higher
        #than 5
        test_pred = np.where(test_pred > 5, 5, test_pred)
        test_pred = np.where(test_pred < 1, 1, test_pred)

        #Calculate and store the errors on the training set and test set.
        errors_5folds_rmse_training.append(RMSE(train_np,train_mat_np, pred))

        #Calculate the error on the test set and store it.
        test_RMSE = np.sqrt(np.sum((test_pred-test_obs)**2)/len(test_pred))
        errors_5folds_rmse_test.append(test_RMSE)

        partial_error.append(res)

    #Average across the folds to compute the errors.
    cv_error_rmse_train = np.mean(errors_5folds_rmse_training)
    cv_error_rmse_test = np.mean(errors_5folds_rmse_test)

    print('The average rmse over the training set for iter = ',iter,' and K = ',K,' is: ', cv_error_rmse_train)
    print('The average rmse over the test set for iter = ',iter,' and K = ',K,' is: ', cv_error_rmse_test)
    return

In [None]:
#To call cross-validation function, use:
#cv(ratings, value of K, number of iterations)