## Import Packages

In [1]:
# Import required packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
import json

# Use the same thread count for TensorFlow's intra-op and inter-op parallelism settings
NUM_TF_THREADS = 16
tf.config.threading.set_intra_op_parallelism_threads(NUM_TF_THREADS)
tf.config.threading.set_inter_op_parallelism_threads(NUM_TF_THREADS)

## Define Working Directory

In [2]:
# Directory holding the preprocessed CSV and JSON inputs. The trailing slash is required
# because the loading cells build file paths by plain string concatenation.
# NOTE(review): this is a machine-specific absolute path; adjust it before running elsewhere.
path_clean_data = '/Users/rathin/Documents/Projects/mids/W207/W207_movies/data/clean_data/'

## Import Preprocessed Data

In [3]:
# Read the train, dev and test rating splits (in that order) from the clean-data directory
train_df, dev_df, test_df = (
    pd.read_csv(path_clean_data + file_name, low_memory=False)
    for file_name in ("cf_train_ratings.csv", "cf_dev_ratings.csv", "cf_test_ratings.csv")
)

In [4]:
# Drop the 'Unnamed: 0' column from every split.
# The frames are rebound instead of mutated in place, and errors='ignore' makes the cell
# safe to re-run: a second execution no longer raises KeyError for the missing column.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')
dev_df = dev_df.drop(columns='Unnamed: 0', errors='ignore')
test_df = test_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Inspect the first rows of train_df (rendered as this cell's output)
train_df.head()

Unnamed: 0,userId,movieId,rating
0,4,223,4.0
1,4,415,4.0
2,4,648,4.0
3,4,1097,5.0
4,4,1197,4.0


In [6]:
# Load dev and test evaluation pools. The context managers close each file as soon as
# it has been parsed, so no file handle is left open for the rest of the session.
with open(path_clean_data + 'dev_evaluation_pools.json') as dev_pools_file:
    dev_evaluation_pools = json.load(dev_pools_file)
with open(path_clean_data + 'test_evaluation_pools.json') as test_pools_file:
    test_evaluation_pools = json.load(test_pools_file)

# JSON object keys are always strings, so convert the keys back from str to int
dev_evaluation_pools = {int(k): v for k, v in dev_evaluation_pools.items()}
test_evaluation_pools = {int(k): v for k, v in test_evaluation_pools.items()}

## Format Data for Keras

In [7]:
## Collect the distinct movie ids seen in any of the train, dev and test splits,
## and the distinct user ids seen in the training split only
per_split_movie_ids = [np.unique(split['movieId']) for split in (train_df, dev_df, test_df)]
unique_movie_ids = np.unique(np.concatenate(per_split_movie_ids))
unique_user_ids = np.unique(train_df['userId'])

## Count the distinct movies and users and report both numbers
num_movies = unique_movie_ids.size
num_users = unique_user_ids.size
print("Number of Movies: %i " % num_movies)
print("Number of Users: %i " % num_users)

Number of Movies: 44975 
Number of Users: 136362 


In [8]:
## Map raw movie and user ids to contiguous 0-based indices, plus the inverse mappings
movie2movie_encoded = dict(zip(unique_movie_ids, range(len(unique_movie_ids))))
movie_encoded2movie = dict(enumerate(unique_movie_ids))
user2user_encoded = dict(zip(unique_user_ids, range(len(unique_user_ids))))
userencoded2user = dict(enumerate(unique_user_ids))

In [9]:
## Attach the encoded movie and user indices to the train, dev and test frames.
## Series.map leaves NaN wherever an id has no entry in the lookup dictionary.
for ratings_df in (train_df, dev_df, test_df):
    ratings_df["movieEncoded"] = ratings_df["movieId"].map(movie2movie_encoded)
    ratings_df["userEncoded"] = ratings_df["userId"].map(user2user_encoded)

train_df.head()

Unnamed: 0,userId,movieId,rating,movieEncoded,userEncoded
0,4,223,4.0,220,0
1,4,415,4.0,411,0
2,4,648,4.0,640,0
3,4,1097,5.0,1075,0
4,4,1197,4.0,1172,0


## Run and Evaluate Deep Learning Models 

### Define Custom Functions

In [10]:
def sort_evaluation_pools(model, evaluation_pools):
    '''
    Rank every user's evaluation pool by the ratings a trained model predicts,
    highest predicted rating first.

    Parameters
    ----------
    model : An object exposing ``predict(user, movie)`` whose return value holds the
            predicted rating at index 3 (presumably a trained surprise model, whose
            prediction tuple stores the estimate there -- confirm against the caller)
    evaluation_pools : A dictionary from user to the pool of movies on
                       which to evaluate the recommender system

    Returns
    -------
    sorted_evaluation_pools : A dictionary from user to that user's pool of movies,
                              ordered by descending predicted rating. Movies with equal
                              predictions keep their relative order from the input pool.
    '''

    def rank_pool(user, movie_pool):
        # Predicted rating for every movie in this user's pool, queried in pool order
        estimates = {movie: model.predict(user, movie)[3] for movie in movie_pool}

        # Order the movies by their estimate, best first (the sort is stable)
        return sorted(estimates, key=estimates.get, reverse=True)

    # One ranked pool per user, in the same user order as the input dictionary
    return {user: rank_pool(user, movie_pool)
            for user, movie_pool in evaluation_pools.items()}

In [11]:
def calc_hit_rate_pools(sorted_evaluation_pools, test_df, top_n=10):
    '''
    Calculate hit rate given a dictionary of sorted evaluation pools and the corresponding
    test data frame. A hit is defined as finding a test movie in the top_n of the sorted
    evaluation pool of a user.

    Parameters
    ----------
    sorted_evaluation_pools : A dictionary from user to the pool of movies where each
                              movie is sorted in descending order of predicted movie
                              rating
    test_df : A dataframe of ratings for movies being tested in the evaluation pools;
              must contain the columns 'userId' and 'movieId'
    top_n : The number of leading movies of each sorted pool in which a test movie
            must appear to be called a hit

    Returns
    -------
    hit_rate : hits / total, where total is the number of test rows belonging to users
               present in sorted_evaluation_pools. Returns 0.0 when there are no such
               rows, instead of raising ZeroDivisionError.
    '''

    # Collect each user's test movie ids in a single pass over test_df, so the frame is
    # not re-filtered once per user in the pools
    test_movies_by_user = {
        user: movie_ids.tolist()
        for user, movie_ids in test_df.groupby('userId')['movieId']
    }

    # Start with hits and totals at 0
    hits = 0
    total = 0

    # Loop through each key-value pair in the input dictionary
    for user, sorted_pool in sorted_evaluation_pools.items():

        # Top_n movies from the pool; a set gives constant-time membership checks
        top_movies = set(sorted_pool[:top_n])

        # Users without any test rows contribute nothing to hits or total
        user_test_movies = test_movies_by_user.get(user, [])

        # Every test movie counts towards the total; those in top_movies are hits
        hits += sum(movie in top_movies for movie in user_test_movies)
        total += len(user_test_movies)

    # No test movies were evaluated, so the rate is reported as 0.0
    if total == 0:
        return 0.0

    hit_rate = hits / total
    return hit_rate

In [12]:
def calc_hit_rate_pools_with_cutoff(sorted_evaluation_pools, test_df, top_n=10, rating_cutoff=0):
    '''
    Calculate hit rate given a dictionary of sorted evaluation pools and a corresponding
    test data frame. Only test movies whose rating is greater than or equal to
    rating_cutoff are evaluated; a hit is defined as finding such a movie in the top_n
    of the sorted evaluation pool of a user.

    Parameters
    ----------
    sorted_evaluation_pools : A dictionary from user to the pool of movies where each
                              movie is sorted in descending order of predicted movie
                              rating
    test_df : A dataframe of ratings for movies being tested in the evaluation pools;
              must contain the columns 'userId', 'movieId' and 'rating'
    top_n : The number of leading movies of each sorted pool in which a test movie
            must appear to be called a hit
    rating_cutoff : The minimum rating (inclusive) a test movie needs to be evaluated

    Returns
    -------
    hit_rate : hits / total, where total is the number of test rows at or above the
               cutoff belonging to users present in sorted_evaluation_pools. Returns
               0.0 when there are no such rows, instead of raising ZeroDivisionError.
    '''

    # Keep only the test rows at or above the cutoff, then collect each user's movie ids
    # in a single pass so the frame is not re-filtered once per user in the pools
    evaluated_rows = test_df[test_df['rating'] >= rating_cutoff]
    test_movies_by_user = {
        user: movie_ids.tolist()
        for user, movie_ids in evaluated_rows.groupby('userId')['movieId']
    }

    # Start with hits and totals at 0
    hits = 0
    total = 0

    # Loop through each key-value pair in the input dictionary
    for user, sorted_pool in sorted_evaluation_pools.items():

        # Top_n movies from the pool; a set gives constant-time membership checks
        top_movies = set(sorted_pool[:top_n])

        # Users without any evaluated test rows contribute nothing to hits or total
        user_test_movies = test_movies_by_user.get(user, [])

        # Every evaluated test movie counts towards the total; those in top_movies are hits
        hits += sum(movie in top_movies for movie in user_test_movies)
        total += len(user_test_movies)

    # No test movies were evaluated, so the rate is reported as 0.0
    if total == 0:
        return 0.0

    return hits / total

### Define Deep Learning Model

Source: https://heartbeat.fritz.ai/build-train-and-deploy-a-book-recommender-system-using-keras-tensorflow-js-b96944b936a7

In [13]:
def embedding_model(num_movies, num_users, embedding_size=15, d1_size=128):
    '''
    Build an (uncompiled) Keras model that predicts a single value from a movie index
    and a user index via learned embeddings.

    Parameters
    ----------
    num_movies : Number of distinct movies; the movie embedding table gets
                 num_movies + 1 rows
    num_users : Number of distinct users; the user embedding table gets
                num_users + 1 rows
    embedding_size : Width of each embedding vector
    d1_size : Number of units in the hidden dense layer

    Returns
    -------
    model : A keras.Model taking [movie input, user input] (each of shape (1,)) and
            producing one ReLU-activated output per example
    '''

    def embedding_branch(vocabulary_size):
        # One scalar id in, one flattened embedding vector out
        branch_input = layers.Input(shape=[1])
        embedded = layers.Embedding(vocabulary_size + 1, embedding_size)(branch_input)
        return branch_input, layers.Flatten()(embedded)

    # Movie branch is created first, then the user branch, so layers are built in the
    # order: movie input/embedding/flatten, user input/embedding/flatten
    input_movies, movies_out = embedding_branch(num_movies)
    input_users, users_out = embedding_branch(num_users)

    # Join both embeddings and pass them through the hidden dense layer
    merged = layers.Concatenate()([movies_out, users_out])
    hidden = layers.Dense(d1_size, activation='relu')(merged)

    # Single-unit output layer
    prediction = layers.Dense(1, activation='relu')(hidden)

    return keras.Model(inputs=[input_movies, input_users], outputs=prediction)

In [14]:
# Build the embedding model with its default layer sizes, compile it with Adam and a
# mean-squared-error loss, then print the layer summary
model_1 = embedding_model(num_movies=num_movies, num_users=num_users)
opt = keras.optimizers.Adam(learning_rate=1e-3)
model_1.compile(loss='mean_squared_error', optimizer=opt)
model_1.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 15)        674640      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 15)        2045445     input_2[0][0]                    
______________________________________________________________________________________________

## Fit Model

In [15]:
# Model inputs are ordered [movie index, user index], matching embedding_model's inputs
train_inputs = [train_df.movieEncoded, train_df.userEncoded]
dev_inputs = [dev_df.movieEncoded, dev_df.userEncoded]

# Train on the training split and report the loss on the dev split after every epoch
hist = model_1.fit(
    x=train_inputs,
    y=train_df.rating,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(dev_inputs, dev_df.rating),
)

Epoch 1/5
 14824/372774 [>.............................] - ETA: 1:28:58 - loss: 0.9283

KeyboardInterrupt: 

In [None]:
# Plot the per-epoch training and validation loss recorded by fit().
# The validation curve is computed on the dev split passed as validation_data, so it is
# labelled "dev" rather than "test" to avoid confusing it with the held-out test split.
plt.plot(hist.history["loss"], label="train")
plt.plot(hist.history["val_loss"], label="dev")
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(loc="upper left")
plt.show()