In [1]:
import random
import numpy as np

# Seed Python's `random` module and NumPy's global RNG with the same fixed value
# so that code drawing from either generator is repeatable across runs.
my_seed = 1337
random.seed(my_seed)
np.random.seed(my_seed)

In [2]:
import pandas as pd
import numpy as np
from typing import *
from IPython.display import display, HTML, Markdown

import warnings
# Silence every warning for the rest of the session (no category or module filter
# is given). NOTE(review): this also hides deprecation and pandas copy warnings.
warnings.filterwarnings('ignore')


def display_best_and_worse_recommendations(recommendations: pd.DataFrame):
    """Render the 10 highest and the 10 lowest predictions as two titled tables.

    The caller's frame is left untouched: sorting and column relabelling are
    done on copies.

    Args:
        recommendations: Two-column frame containing an 'Estimated Prediction'
            column. Columns are relabelled by position, so the prediction is
            expected first and the movie title second.

    Raises:
        KeyError: if there is no 'Estimated Prediction' column.
        ValueError: if the frame does not have exactly two columns.
    """
    # Non-mutating sort: the original sorted the caller's DataFrame in place.
    ranked = recommendations.sort_values('Estimated Prediction', ascending=False)

    # .copy() so that renaming columns never writes through to a slice of `ranked`.
    top_recommendations = ranked.iloc[:10].copy()
    top_recommendations.columns = ['Prediction (sorted by best)', 'Movie Title']

    # Bottom ten rows of the descending ranking (the lowest prediction is last).
    worse_recommendations = ranked.iloc[-10:].copy()
    worse_recommendations.columns = ['Prediction (sorted by worse)', 'Movie Title']

    display(HTML("<h1>Recommendations your user will love</h1>"))
    display(top_recommendations)

    display(HTML("<h1>Recommendations your user will hate</h1>"))
    display(worse_recommendations)
    

def load_movies_dataset(path: str = 'datasets/ml-100k/u.item') -> pd.DataFrame:
    """Load the movie metadata table.

    The file is read as pipe-separated, ISO-8859-1 encoded text without a
    header row; the column names below are applied by position.

    Args:
        path: Location of the item file. Defaults to the MovieLens 100k
            `u.item` file under `datasets/ml-100k/`.

    Returns:
        A DataFrame indexed by `movie_id`, with `release_date` parsed to
        datetimes and the remaining columns as read from the file.
    """
    movie_data_columns = [
        'movie_id', 'title', 'release_date', 'video_release_date', 'url',
        # The remaining columns are presumably 0/1 genre flags -- verify against the dataset README.
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western'
    ]

    movie_data = pd.read_csv(
        path,
        sep='|',
        encoding="ISO-8859-1",
        header=None,
        names=movie_data_columns,
        index_col='movie_id'
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings(path: str = 'datasets/ml-100k/u.data') -> pd.DataFrame:
    """Load the raw ratings table.

    The file is read as tab-separated, ISO-8859-1 encoded text without a header
    row; its four columns are named user_id, movie_id, rating, timestamp.

    Args:
        path: Location of the ratings file. Defaults to the MovieLens 100k
            `u.data` file under `datasets/ml-100k/`.

    Returns:
        A DataFrame with the columns `user_id`, `movie_id`, `rating`
        (the `timestamp` column is dropped).
    """
    ratings_data = pd.read_csv(
        path,
        sep='\t',
        encoding="ISO-8859-1",
        header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp']
    )
    return ratings_data[['user_id', 'movie_id', 'rating']]

def load_movielens() -> pd.DataFrame:
    """Build the shuffled (user_id, movie_title, rating) table.

    Ratings are joined to movie titles on `movie_id`, numeric user ids are
    turned into labels of the form "User <id>", and every row is returned in
    random order via `sample(frac=1)`.
    """
    ratings = load_ratings()
    movies = load_movies_dataset()

    # Replace the numeric user id with a readable string label.
    labelled = ratings.assign(
        user_id=ratings['user_id'].map(lambda uid: "User %d" % uid)
    )

    # Look up each rating's title through the movie_id index of the movies table.
    by_movie = labelled.set_index('movie_id')
    with_titles = by_movie.join(movies['title']).reset_index()
    with_titles = with_titles.rename(columns={'title': 'movie_title'})

    wanted_columns = ['user_id', 'movie_title', 'rating']
    return with_titles[wanted_columns].sample(frac=1)

# Table of contents
# 1) Training a SVD model

    Downloading and exploring the MovieLens dataset
    Training a SVD using Surprise in 4 simple steps

# 2) Generating recommendations

    Recommendations via Matrix Reconstruction: Using the predict() API inside of Surprise
    Recommendations via Item Similarity: Finding similarity between vectors



In [3]:
# Load the shuffled (user_id, movie_title, rating) table and preview its first rows.
movielens_df = load_movielens()
movielens_df.head(5)

Unnamed: 0,user_id,movie_title,rating
36649,User 742,Jerry Maguire (1996),4
2478,User 908,"Usual Suspects, The (1995)",3
82838,User 758,Real Genius (1985),4
69729,User 393,Things to Do in Denver when You're Dead (1995),3
36560,User 66,Jerry Maguire (1996),4


In [4]:
# Remove movies with few ratings: keep only titles that were rated more than 50 times.
# transform('size') broadcasts each title's rating count back onto its own rows, so the
# filter is a plain boolean mask (no join, and no helper column to delete afterwards).
ratings_per_title = movielens_df.groupby('movie_title')['rating'].transform('size')

# Renumber the surviving rows 0..n-1, then shuffle them.
movie_ratings = (
    movielens_df[ratings_per_title > 50]
    .reset_index(drop=True)
    .sample(frac=1)
)

# Later cells read the filtered ratings through `movielens_df`.
movielens_df = movie_ratings

# Training a SVD using Surprise in 4 simple steps

In [5]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

# Step 1: create a Reader.
# The Reader is told the lower and upper bound of the rating scale.
# MovieLens ratings are from 1 to 5
reader = Reader(rating_scale=(1, 5))

In [6]:
# Step 2: create a new Dataset instance with a DataFrame and the reader
# The DataFrame needs to have 3 columns in this specific order: [user_id, product_id, rating]
# (movielens_df supplies them as user_id, movie_title, rating).
data = Dataset.load_from_df(movielens_df, reader)

In [7]:
# Step 3: hold out 25% of the ratings as a test set; the rest becomes the trainset
trainset, testset = train_test_split(data, test_size=.25)

In [8]:
# NOTE(review): this cell rebuilds `reader` and `data` exactly as in Steps 1-2 and then
# re-splits with test_size=.01, replacing the 25% hold-out made in Step 3. The model
# fitted below is trained on this split -- confirm the override is intended.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movielens_df, reader)
trainset, testset = train_test_split(data, test_size=.01)

In [9]:
# Step 4: train a new SVD with 256 latent features (number was chosen arbitrarily)
model = SVD(n_factors=256)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd55514de80>

In [10]:
# Normalization
# Squared L2 norm of the first row of model.qi before scaling. The value is discarded
# here because only the last expression of a cell is displayed.
pd.DataFrame(model.qi).iloc[0].pow(2).sum()
# Divide every row of model.qi by its own L2 norm, in place.
# NOTE(review): this overwrites the trained factors. If model.predict() reads model.qi,
# later predictions are computed from the rescaled vectors -- confirm this is intended.
model.qi /= np.linalg.norm(model.qi, ord=2, axis=1).reshape(-1, 1)
# Same check after scaling; this is the cell's displayed output and should be ~1.0.
pd.DataFrame(model.qi).iloc[0].pow(2).sum()

0.9999999999999999

# Inspecting our Product Matrix

Surprise SVD stores the product matrix under the model.qi attribute

In [11]:
# Presumably (number of items in the trainset, n_factors) -- n_factors was set to 256 above.
model.qi.shape

(596, 256)

# Recommendations via Matrix Reconstruction
# Use cases:
 
    Predict a score between any combination of user and a product
# Recommendations via Matrix Reconstruction: Using the predict() API inside of Surprise

    Computes the rating prediction for given user and movie with model.predict(). Pick a random user and movie, and calculate the score between them

In [12]:
# Refresher: the ratings DataFrame (user_id, movie_title, rating) the Dataset was built from.
movielens_df.head(2)

Unnamed: 0,user_id,movie_title,rating
49469,User 437,Monty Python and the Holy Grail (1974),3
12181,User 85,Butch Cassidy and the Sundance Kid (1969),4


In [13]:
# Pick one user label and one movie title, written exactly as they appear in
# movielens_df, and ask the trained model for its estimated rating of that pair.
a_user = "User 196"
a_product = "Toy Story (1995)"
model.predict(a_user, a_product)

Prediction(uid='User 196', iid='Toy Story (1995)', r_ui=None, est=4.121260358723221, details={'was_impossible': False})

# Recommendations via Item Similarity: Finding similarity between vectors

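Two products are "similar" when the score returned by `cosine_distance` is close to 1. Despite its name, the function returns the cosine similarity rescaled to the range [0, 1]: 1 means the vectors point in the same direction, 0.5 means they are orthogonal, and 0 means they point in opposite directions.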

In [15]:
def get_vector_by_movie_title(movie_title: str, trained_model: SVD) -> np.array:
    """Look up the latent-feature row of `movie_title` in `trained_model.qi`.

    The title is translated to a row index through the trainset's
    `_raw2inner_id_items` mapping (a private attribute, judging by its leading
    underscore); a title missing from that mapping raises `KeyError`.
    """
    title_to_row = trained_model.trainset._raw2inner_id_items
    return trained_model.qi[title_to_row[movie_title]]


def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Return the cosine similarity of two vectors, rescaled to [0, 1].

    Despite the name, larger means more alike: 1.0 for vectors pointing the
    same way, 0.5 for orthogonal vectors and 0.0 for opposite vectors.
    """
    dot_product = np.multiply(vector_a, vector_b).sum()
    squared_norm_a = np.square(vector_a).sum()
    squared_norm_b = np.square(vector_b).sum()
    cosine = dot_product / np.sqrt(squared_norm_a * squared_norm_b)
    # Map the cosine from [-1, 1] onto [0, 1].
    return 0.5 + 0.5 * cosine

In [16]:
# Fetch the vectors of "Star Wars", "Return of the Jedi" and "Aladdin"
movie1_vec = get_vector_by_movie_title('Star Wars (1977)', model)
movie2_vec = get_vector_by_movie_title('Return of the Jedi (1983)', model)
movie3_vec = get_vector_by_movie_title('Aladdin (1992)',model)
# Score each pair with cosine_distance. Despite its name it returns a similarity
# rescaled to [0, 1], so the larger the number, the more similar the two movies are
cos_score = cosine_distance(movie1_vec, movie2_vec)
print("The similarity between Star Wars and Return of the Jedi is %.2f"%((cos_score)*100)+'%')
cos_score = cosine_distance(movie1_vec, movie3_vec)
print("The similarity between Star Wars and Aladdin is %.2f"%((cos_score)*100)+'%')

The similarity between Star Wars and Return of the Jedi is 82.51%
The similarity between Star Wars and Aladdin is 54.14%


# Finding similar movies by ranking

In [17]:
def _top_similarity_rows(similarity_table, top_n: int = 5) -> pd.DataFrame:
    """Turn (score, title) pairs into a frame of the `top_n` highest scores.

    Deliberately not called `display`: a function of that name would replace
    the `display` imported from IPython.display at the top of the notebook and
    break every later call that renders HTML or DataFrames with it.
    """
    ranked = pd.DataFrame(
        similarity_table,
        columns=['Similarity', 'movie title']
    ).sort_values('Similarity', ascending=False)
    # Renumber the rows 0..top_n-1 so the index reads as a rank.
    return ranked.head(top_n).reset_index(drop=True)


def get_top_similarities(movie_title: str, model: SVD, top_n: int = 5) -> pd.DataFrame:
    """Returns the `top_n` (default 5) most similar movies to a specified movie

    This function iterates over every movie known to the model's trainset and
    scores it against the `movie_title` vector with `cosine_distance`, where a
    higher score means more similar.

    Args:
        movie_title: Title of the reference movie; it must be present in the
            trainset, otherwise the vector lookup raises KeyError.
        model: Trained model exposing `trainset` and `qi`.
        top_n: How many neighbours to return.

    Returns:
        A DataFrame with the columns 'Similarity' and 'movie title', ordered
        from most to least similar. The reference movie itself is excluded.
    """
    # Get the reference movie vector
    movie_vector = get_vector_by_movie_title(movie_title, model)

    # Score every other movie. The reference title is skipped explicitly rather than
    # assuming it ends up in the first row after sorting.
    similarity_table = [
        (
            cosine_distance(
                get_vector_by_movie_title(other_movie_title, model),
                movie_vector,
            ),
            other_movie_title,
        )
        for other_movie_title in model.trainset._raw2inner_id_items.keys()
        if other_movie_title != movie_title
    ]

    # Rank by descending similarity and keep the best `top_n`
    return _top_similarity_rows(similarity_table, top_n)

In [18]:
# Rank the movies whose latent vectors score highest against "Star Wars (1977)".
get_top_similarities('Star Wars (1977)', model)

Unnamed: 0,Similarity,movie title
594,0.825065,Return of the Jedi (1983)
593,0.824013,"Empire Strikes Back, The (1980)"
592,0.706024,Raiders of the Lost Ark (1981)
591,0.629825,Pinocchio (1940)
590,0.608031,"Sting, The (1973)"
