In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold

In [15]:
class MovieLensRecommenderSystem():
    """Matrix-factorization recommender trained with SGD on (user, movie, rating) data.

    On construction the rating file is read and the original user / movie IDs
    are remapped to contiguous indices in [0, N - 1], so they can be used
    directly as row indices of the embedding matrices P (users) and Q (movies).

    Attributes:
        DATA_DIR: directory the rating file is read from.
        train_data_path: file name of the rating file inside DATA_DIR.
        seed: seed given to the constructor (stored only; the training methods
            take their own ``seed`` argument).
        train_df: ratings with remapped 'user' and 'movie' columns.
        inv_map_users / inv_map_movies: index -> original ID (sorted lists).
        map_users / map_movies: original ID -> index (dicts).
        num_users / num_movies: number of distinct users / movies.
    """


    def read_data(self, path):
        """Read a tab-separated rating file without a header row.

        The first three columns are renamed to 'user', 'movie' and 'rating';
        any further columns keep their integer labels.
        """
        data = pd.read_csv(path, delimiter = '\t', header = None).rename(columns = {0: 'user', 1: 'movie', 2: 'rating'})
        return data


    def __init__(self, train_data_path, seed = 17, data_dir = 'ml-100k/'):
        """Load the training ratings and build the ID <-> index mappings.

        Args:
            train_data_path: rating file name, relative to ``data_dir``.
            seed: stored on the instance as ``self.seed``.
            data_dir: directory containing the rating file.
        """
        # Set parameters
        self.DATA_DIR = data_dir
        self.train_data_path = train_data_path
        self.seed = seed

        # Read the training data
        self.train_df = self.read_data(os.path.join(self.DATA_DIR, self.train_data_path))

        # Find the list of unique user IDs (sorted, so the mapping does not
        # depend on set iteration order)
        self.inv_map_users = sorted(set(self.train_df['user']))
        self.num_users = len(self.inv_map_users)

        # Find the list of unique movie IDs
        self.inv_map_movies = sorted(set(self.train_df['movie']))
        self.num_movies = len(self.inv_map_movies)

        # Find the mapping of the original IDs to [0, N - 1]
        self.map_users = {x: i for i, x in enumerate(self.inv_map_users)}
        self.map_movies = {x: i for i, x in enumerate(self.inv_map_movies)}

        # Apply the mapping on the actual data
        self.train_df['user'] = self.train_df['user'].map(self.map_users)
        self.train_df['movie'] = self.train_df['movie'].map(self.map_movies)


    def predict(self, P, Q, u, i):
        """Predict the rating of user u for movie i as the dot product P[u] . Q[i]."""
        return P[u].dot(Q[i])


    def _rmse(self, P, Q, users, movies, ratings):
        """Return the RMSE of the predictions P[u] . Q[i] against ``ratings``.

        ``users``, ``movies`` and ``ratings`` are equally long 1-D arrays; all
        predictions are computed at once as row-wise dot products.
        """
        predictions = np.sum(P[users] * Q[movies], axis = 1)
        return np.sqrt(np.mean(np.square(ratings - predictions)))


    def cross_validate(self, n_splits = 5, K = 50, learning_rate = .01, regularization_rate = .017, bias = True, max_it = 500, seed = 17):
        """Run shuffled K-fold cross-validation and report the overall RMSE.

        Args:
            n_splits: number of folds.
            K: number of latent factors.
            learning_rate: SGD step size.
            regularization_rate: L2 regularization strength.
            bias: whether to learn per-user and per-movie bias terms.
            max_it: maximum number of SGD epochs per fold.
            seed: seed for both the fold shuffling and the embedding init.

        Returns:
            The RMSE over all validation ratings, i.e. the fold RMSEs combined
            with weights proportional to the fold sizes. Progress and the final
            error are also printed.
        """
        # Initialize KFold object
        kf = KFold(n_splits = n_splits, random_state = seed, shuffle = True)

        # Evaluate RMSE for every fold
        total_error = 0
        for it, (train_idx, val_idx) in enumerate(kf.split(self.train_df)):
            # KFold yields positional indices, so select rows by position
            train_data = self.train_df.iloc[train_idx]
            val_data = self.train_df.iloc[val_idx]

            # Get the RMSE for the it-th fold
            print('Fold: %d/%d' % (it + 1, n_splits))
            cur_error = self.evaluate_matrix_factorization(train_data, val_data, K = K, learning_rate = learning_rate, bias = bias,
                                                           regularization_rate = regularization_rate, max_it = max_it, seed = seed)
            print()

            # Accumulate the sum of squared errors of this fold
            total_error += cur_error ** 2 * len(val_data)

        # Calculate the total RMSE for all n-folds (weighted, useful if the validation datasets have different sizes)
        final_error = float(np.sqrt(total_error / len(self.train_df)))
        print('Error: %.5f' % final_error)

        return final_error


    def evaluate_matrix_factorization(self, train_data, val_data, K = 50, learning_rate = .01, regularization_rate = .017, bias = True, max_it = 500, seed = 17):
        """Train P and Q with SGD on ``train_data``, early-stopping on ``val_data``.

        After every epoch the RMSE on ``val_data`` is computed; training stops
        as soon as it fails to improve or after ``max_it`` epochs.

        Args:
            train_data: frame with 'user', 'movie', 'rating' columns (remapped IDs).
            val_data: frame with the same columns, used for early stopping.
            K: number of latent factors.
            learning_rate: SGD step size.
            regularization_rate: L2 regularization strength.
            bias: if True, two extra columns are appended to the embeddings to
                hold the bias terms.
            max_it: maximum number of epochs (must be >= 1).
            seed: seed for numpy's global random generator.

        Returns:
            The lowest validation RMSE reached before stopping.

        Raises:
            ValueError: if ``max_it`` < 1 or ``val_data`` is empty.
        """
        if max_it < 1:
            raise ValueError('max_it must be at least 1')
        if len(val_data) == 0:
            raise ValueError('val_data must contain at least one rating')

        np.random.seed(seed)

        # Latent space dimension
        dim = K + (2 if bias else 0)

        # Initialize matrices P, Q ... the embeddings of the users and movies
        P = np.random.choice([-0.01, 0.01], size = self.num_users * dim).reshape((self.num_users, dim))
        Q = np.random.choice([-0.01, 0.01], size = self.num_movies * dim).reshape((self.num_movies, dim))

        # Pin one column of each matrix to 1 so that the matching column of the
        # other matrix acts as a bias: Q[i, K] is added for movie i and
        # P[u, K + 1] is added for user u
        if bias:
            P[:, K] = 1
            Q[:, K + 1] = 1

        # Extract the columns once instead of on every epoch
        train_users = train_data['user'].tolist()
        train_movies = train_data['movie'].tolist()
        train_ratings = train_data['rating'].tolist()
        val_users = val_data['user'].to_numpy()
        val_movies = val_data['movie'].to_numpy()
        val_ratings = val_data['rating'].to_numpy(dtype = float)

        # Perform SGD on P, Q until RMSE on the validation dataset stops decreasing
        error = 1e9
        for it in range(max_it):
            # For every entry in the training data
            for u, i, r in zip(train_users, train_movies, train_ratings):
                # Get the prediction error
                e_ui = r - self.predict(P, Q, u, i)

                # Find the updates for the appropriate rows in P and Q
                # (both are computed from the old values before either is stored)
                nP_u = P[u] + learning_rate * (e_ui * Q[i] - regularization_rate * P[u])
                nQ_i = Q[i] + learning_rate * (e_ui * P[u] - regularization_rate * Q[i])

                # Keep the pinned bias columns at 1
                if bias:
                    nP_u[K] = 1
                    nQ_i[K + 1] = 1

                # Update the matrices
                P[u] = nP_u
                Q[i] = nQ_i

            # Calculate RMSE on the validation dataset
            cur_error = self._rmse(P, Q, val_users, val_movies, val_ratings)

            # If RMSE on validation dataset does not decrease, stop the learning
            if cur_error >= error:
                break
            error = cur_error

        # Report the last epoch index and its validation RMSE
        print('-------------------')
        print('%3d\t%.5f' % (it, cur_error))

        return error

In [16]:
# Build the recommender: reads the rating file 'u.data' from the class's data
# directory and remaps user / movie IDs to contiguous indices.
x = MovieLensRecommenderSystem('u.data')

In [20]:
# Cross-validate the matrix factorization with bias terms enabled. For each fold
# this prints the epoch index at which training stopped and the validation RMSE
# of that epoch, followed by the overall RMSE across folds.
x.cross_validate(learning_rate = .01, regularization_rate = .051, bias = True, seed = 17)

Fold: 1/5
-------------------
 34	0.90407

Fold: 2/5
-------------------
 34	0.90705

Fold: 3/5
-------------------
 33	0.90589

Fold: 4/5
-------------------
 34	0.91258

Fold: 5/5
-------------------
 34	0.90136

Error: 0.90614
