## Import Packages

In [1]:
# Import packages
import json
import os
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset
from surprise import KNNBasic
from surprise import Reader
from surprise import SVD
from surprise import Trainset
from surprise import accuracy
from surprise.model_selection import LeaveOneOut

## Define Working Directory

In [2]:
# Directory holding the preprocessed CSV / JSON inputs. Set the W207_CLEAN_DATA_DIR
# environment variable to point the notebook at another location; the original
# local path remains the fallback so existing setups keep working.
# os.path.join(..., '') guarantees a trailing separator, which is required because
# later cells build file paths with plain string concatenation.
path_clean_data = os.path.join(
    os.environ.get(
        'W207_CLEAN_DATA_DIR',
        '/Users/rathin/Documents/Projects/mids/W207/W207_movies/data/clean_data/',
    ),
    '',
)

## Import Preprocessed Data

In [3]:
# Read the train / dev / test rating splits from the clean-data directory.
# The file names are listed in the same order as the variables they populate.
train_df, dev_df, test_df = [
    pd.read_csv(path_clean_data + ratings_csv, low_memory=False)
    for ratings_csv in (
        "cf_train_ratings.csv",
        "cf_dev_ratings.csv",
        "cf_test_ratings.csv",
    )
]

In [4]:
# Drop the 'Unnamed: 0' column (presumably the row index that was saved with the
# CSVs -- confirm against the preprocessing step).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')
dev_df = dev_df.drop(columns='Unnamed: 0', errors='ignore')
test_df = test_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Inspect the first rows of train_df; the columns used by the rest of the
# notebook are userId, movieId and rating
train_df.head()

Unnamed: 0,userId,movieId,rating
0,4,223,4.0
1,4,415,4.0
2,4,648,4.0
3,4,1097,5.0
4,4,1197,4.0


In [6]:
# Load dev and test evaluation pools. The context managers close each file as
# soon as its JSON has been parsed instead of leaving the handles open.
with open(path_clean_data + 'dev_evaluation_pools.json') as f1:
    dev_evaluation_pools = json.load(f1)
with open(path_clean_data + 'test_evaluation_pools.json') as f2:
    test_evaluation_pools = json.load(f2)

# JSON object keys are always strings; convert them to int so they can be
# compared with the userId values when the pools are evaluated
dev_evaluation_pools = {int(k): v for k, v in dev_evaluation_pools.items()}
test_evaluation_pools = {int(k): v for k, v in test_evaluation_pools.items()}

## Load Data into Surprise Data Structures

In [7]:
# A Reader with default settings is what Surprise needs to wrap a DataFrame in a Dataset.
reader = Reader()

# Training ratings: use every row as-is (no folding) via .build_full_trainset().
train_data = Dataset.load_from_df(train_df, reader).build_full_trainset()

# Dev ratings: wrap them in a full trainset, then convert that into a testset
# with .build_testset() so the result can be passed to algo.test().
dev_data = Dataset.load_from_df(dev_df, reader).build_full_trainset().build_testset()

# Test ratings: same two-step conversion as for the dev ratings.
test_data = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

## Run and Evaluate Matrix Factorization Models

### Define Custom Functions

In [8]:
def sort_evaluation_pools(model, evaluation_pools):
    '''
    Rank every user's evaluation pool by the ratings a trained model predicts,
    best predicted rating first.

    Parameters
    ----------
    model : A trained surprise model (anything exposing predict(user, movie))
    evaluation_pools : A dictionary from user to the pool of movies on
                       which to evaluate the recommender system

    Returns
    -------
    sorted_evaluation_pools : A dictionary from user to that user's pool of movies,
                              ordered by descending predicted rating. Movies with
                              equal predictions keep their original pool order and
                              a movie listed more than once appears only once.
    '''

    def rank_pool(user, movie_pool):
        # Element 3 of the object returned by predict() is the predicted rating
        scores = {movie: model.predict(user, movie)[3] for movie in movie_pool}

        # sorted() is stable, so ties stay in first-seen order
        return sorted(scores, key=scores.get, reverse=True)

    return {
        user: rank_pool(user, movie_pool)
        for user, movie_pool in evaluation_pools.items()
    }

In [9]:
def calc_hit_rate_pools(sorted_evaluation_pools, test_df, top_n=10):
    '''
    Calculate hit rate given a dictionary of sorted evaluation pools and the
    corresponding test data frame. A hit is defined as finding a test movie in
    the top_n of the sorted evaluation pool of a user.

    Parameters
    ----------
    sorted_evaluation_pools : A dictionary from user to the pool of movies where each
                              movie is sorted in descending order of predicted movie
                              rating
    test_df : A dataframe of ratings for movies being tested in the evaluation pools;
              must contain the columns 'userId' and 'movieId'
    top_n : How many movies from the front of each sorted pool count as recommended

    Returns
    -------
    hit_rate : hits / total over all test movies of the users present in
               sorted_evaluation_pools, or 0.0 when there is no test movie to
               evaluate (instead of raising ZeroDivisionError)
    '''

    # Index the test movies by user in a single pass so the data frame is not
    # re-filtered for every user in the pools
    test_movies_by_user = {}
    for user_id, movie_id in zip(test_df['userId'], test_df['movieId']):
        test_movies_by_user.setdefault(user_id, []).append(movie_id)

    # Start with hits and totals at 0
    hits = 0
    total = 0

    for user, sorted_pool in sorted_evaluation_pools.items():

        # Find top_n movies from the pool
        top_movies = sorted_pool[:top_n]

        # Users without any test rows simply contribute nothing
        for test_movie_id in test_movies_by_user.get(user, ()):

            # If test movie is in top_movies, then add one to hits
            if test_movie_id in top_movies:
                hits += 1

            # Add one to total for each test movie
            total += 1

    # Nothing was evaluated: report 0.0 rather than dividing by zero
    if total == 0:
        return 0.0

    hit_rate = hits / total
    return hit_rate

In [14]:
def calc_hit_rate_pools_with_cutoff(sorted_evaluation_pools, test_df, top_n=10, rating_cutoff=0):
    '''
    Calculate hit rate given a dictionary of sorted evaluation pools and a
    corresponding test data frame, considering only test movies whose rating is
    greater than or equal to rating_cutoff. A hit is defined as finding such a
    test movie in the top_n of the sorted evaluation pool of a user.

    Parameters
    ----------
    sorted_evaluation_pools : A dictionary from user to the pool of movies where each
                              movie is sorted in descending order of predicted movie
                              rating
    test_df : A dataframe of ratings for movies being tested in the evaluation pools;
              must contain the columns 'userId', 'movieId' and 'rating'
    top_n : How many movies from the front of each sorted pool count as recommended
    rating_cutoff : Minimum rating (inclusive) a test movie needs to be evaluated

    Returns
    -------
    hit_rate : hits / total over the qualifying test movies of the users present
               in sorted_evaluation_pools, or 0.0 when no test movie qualifies
               (instead of raising ZeroDivisionError)
    '''

    # Index the qualifying test movies by user in a single pass so the data frame
    # is not re-filtered for every user in the pools
    test_movies_by_user = {}
    for user_id, movie_id, rating in zip(test_df['userId'], test_df['movieId'], test_df['rating']):
        # The cutoff is inclusive: a rating equal to rating_cutoff is evaluated
        if rating >= rating_cutoff:
            test_movies_by_user.setdefault(user_id, []).append(movie_id)

    # Start with hits and totals at 0
    hits = 0
    total = 0

    for user, sorted_pool in sorted_evaluation_pools.items():

        # Find top_n movies from the pool
        top_movies = sorted_pool[:top_n]

        # Users without qualifying test rows simply contribute nothing
        for test_movie_id in test_movies_by_user.get(user, ()):

            # If test movie is in top_movies, then add one to hits
            if test_movie_id in top_movies:
                hits += 1

            # Add one to total for each evaluated test movie
            total += 1

    # Nothing was evaluated: report 0.0 rather than dividing by zero
    if total == 0:
        return 0.0

    return hits / total

### SVD

In [11]:
# Train and test algorithm (Matrix Factorization for Collaborative Filtering).
# A fixed random_state seeds SVD's random initialisation so the metrics below
# are reproducible across kernel restarts.
algo = SVD(random_state=42)

# Train on train_data (per the original note: all users that have liked 30+ movies,
# trained on the n-2 movie set)
algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe76653b210>

In [12]:
# Predict a rating for every entry of the dev testset (presumably one held-out
# dev movie per user -- confirm against the preprocessing step); these
# predictions are what the RMSE is computed from
dev_prediction = algo.test(dev_data)

# Sort each user's dev evaluation pool by the SVD model's predicted ratings,
# highest predicted rating first
sorted_dev_evaluation_pools = sort_evaluation_pools(algo, dev_evaluation_pools)

In [15]:
# Calculate our evaluation metrics of interest:
#   - hit rate over all dev movies (top_n defaults to 10)
#   - hit rate restricted to dev movies rated 4 or higher (the cutoff is inclusive)
#   - RMSE of the predicted dev ratings
hit_rate_pools = calc_hit_rate_pools(sorted_dev_evaluation_pools, dev_df)
hit_rate_pools_with_cutoff = calc_hit_rate_pools_with_cutoff(sorted_dev_evaluation_pools, dev_df, rating_cutoff=4)
rmse = accuracy.rmse(dev_prediction, verbose = False)

# Print our evaluation metrics of interest. The cutoff label reads '>= 4'
# because calc_hit_rate_pools_with_cutoff keeps ratings equal to the cutoff.
print('HR@10 random pools: %.3f' % hit_rate_pools)
print('HR@10 random pools >= 4: %.3f' % hit_rate_pools_with_cutoff)
print('Root Mean Squared Error: %.3f' % rmse)

HR@10 random pools: 0.411
HR@10 random pools > 4: 0.553
Root Mean Squared Error: 0.828
