# Scaffolding for RecSys and User x Item Feedback

Using the IMDB and MovieLens data, we build a feature matrix that we can use for user-user and item-item recommendation.

In [1]:
%matplotlib inline

In [2]:
import json
import random
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from collections import Counter

from scipy.sparse import lil_matrix
from scipy.stats import pearsonr
from scipy.stats import spearmanr

from sklearn.neighbors import DistanceMetric
from sklearn.metrics import mean_squared_error
from sklearn.metrics import jaccard_score
from sklearn.metrics import pairwise_distances

In [3]:
known_movies = set()

user_ratings = {} # List of all our movie ratings for specific users
movie_ids = []

with open("../data/user_ratings.json", "r") as in_file:
    for line in in_file:
        
        this_rating = json.loads(line)
        
        known_movies.add(this_rating["title_id"])
        
        if this_rating["title_id"] not in movie_ids:
            movie_ids.append(this_rating["title_id"])
        
        this_users_ratings = user_ratings.get(this_rating["userId"], [])
        this_users_ratings.append((this_rating["title_id"], this_rating["rating"]))
        
        user_ratings[this_rating["userId"]] = this_users_ratings
        
        

In [4]:
movie_id_to_index = {m:i for i,m in enumerate(movie_ids)}

In [5]:
print("Known Users:", len(user_ratings))
print("Known Movies:", len(known_movies))


Known Users: 2244
Known Movies: 4465


In [6]:
actor_id_to_name_map = {}     # Map Actor IDs to actor names
actor_id_to_index_map = {}    # Map actor IDs to a unique index of known actors
index_to_actor_ids = []       # Array mapping unique index back to actor ID (invert of actor_id_to_index_map)

index_counter = 0    # Unique actor index; increment for each new actor
known_actors = set()

movie_actor_map = {} # List of all our movies and their actors

test_count = 0
with open("../data/imdb_recent_movies.json", "r") as in_file:
    for line in in_file:
        
        this_movie = json.loads(line)
        
        # Restrict to known movies
        if this_movie["title_id"] not in known_movies:
            continue
            
        # Keep track of all the actors in this movie
        for actor_id,actor_name in zip(this_movie['actor_ids'],this_movie['actor_names']):
            
            # Keep names and IDs
            actor_id_to_name_map[actor_id] = actor_name
            
            # If we've seen this actor before, skip...
            if actor_id in known_actors:
                continue
                
            # ... Otherwise, add to known actor set and create new index for them
            known_actors.add(actor_id)
            actor_id_to_index_map[actor_id] = index_counter
            index_to_actor_ids.append(actor_id)
            index_counter += 1
            
        # Finished with this film
        movie_actor_map[this_movie["title_id"]] = ({
            "movie": this_movie["title_name"],
            "actors": set(this_movie['actor_ids']),
            "genres": this_movie["title_genre"]
        })

In [7]:
print("Known Actors:", len(known_actors))
print("Known Movies:", len(movie_actor_map))

Known Actors: 5224
Known Movies: 4465


## Generate DataFrame using Sparse Matrics

Convert our Movie Ratings data into a DataFrame that we can use for analysis.

In [8]:
# With sparse matrix, initialize to size of Users x Movies of 0s
matrix_sparse = lil_matrix((len(user_ratings), len(known_movies)), dtype=float)

# Update the matrix, user by user, setting non-zero values for the appropriate actors
for row,this_user in enumerate(user_ratings): 
    this_user_ratings = user_ratings[this_user]
    
    for movie_id,rating in this_user_ratings:
        this_movie_index = movie_id_to_index[movie_id]
        matrix_sparse[row,this_movie_index] = rating

In [9]:
df = pd.DataFrame.sparse.from_spmatrix(
    matrix_sparse, 
    index=[u for u in user_ratings],
    columns=movie_ids
).T
df

Unnamed: 0,10,37,51,126,152,263,284,448,626,706,...,162002,162073,162207,162257,162363,162420,162434,162464,162499,162537
tt0274309,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0298203,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0315733,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0337563,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0463854,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt4241904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt1666800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt6806448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0844671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [10]:
print("Movies x Users dimensionality:", df.shape)

Movies x Users dimensionality: (4465, 2244)


## Hold Out Data for Evaluation

We want to extract a set of test users on which we can evaluate our recsys. For each test user, we need to split their behavior into a seen set that we can use for recommendation and an unseen set we can use for evaluation.

In [11]:
user_by_movie_df = df.T

In [12]:
held_out_users = np.random.choice(user_by_movie_df.index, size=100)
held_out_users

array(['36115', '96080', '135344', '92750', '149321', '60241', '4230',
       '46518', '9671', '157961', '130204', '88791', '83746', '56359',
       '1505', '106308', '115079', '121238', '158873', '155545', '97971',
       '90975', '149389', '133268', '101752', '106018', '104449',
       '138600', '99622', '139538', '61631', '66643', '140711', '79317',
       '38228', '100764', '111399', '147045', '54825', '92720', '5180',
       '140549', '115786', '59621', '28732', '35034', '46632', '20242',
       '37544', '123763', '83573', '87592', '90162', '34927', '60475',
       '126741', '72909', '92695', '85586', '5057', '78866', '86055',
       '142039', '157362', '25903', '60786', '80431', '55152', '133405',
       '43732', '149321', '135048', '16243', '139175', '1400', '113730',
       '153862', '2087', '31518', '32891', '51066', '139162', '66129',
       '113426', '158242', '94573', '159943', '124700', '117581',
       '112452', '62022', '159645', '151353', '43703', '127031', '13923',
   

In [13]:
held_out_movies = np.random.choice(user_by_movie_df.columns, size=500)

In [14]:
# This new DataFrame has zeros where ratings used to be
training_df = user_by_movie_df.sparse.to_dense()
for test_user in held_out_users:
    for test_movie in held_out_movies:
        training_df.loc[test_user][test_movie] = 0.0

### Baseline Recs using Most Popular Movies

As a baseline, let's rank movies by their global popularity

In [15]:
# Let's make a dataframe of just the training users, so we can evaluate
held_in_users_df = training_df.loc[[u for u in training_df.index if u not in held_out_users]]

In [16]:
agg_movie_ratings_ = held_in_users_df.sum(axis=0)
agg_movie_ratings = 5 * (agg_movie_ratings_ / agg_movie_ratings_.max())

for movie_id,sum_rating in agg_movie_ratings.sort_values(ascending=False).head(20).items():
    print(movie_id, movie_actor_map[movie_id]["movie"], sum_rating)

tt0209144 Memento 5.0
tt0172495 Gladiator 4.8754419800707165
tt1375666 Inception 4.813564770170363
tt0126029 Shrek 4.4567663130826105
tt0338013 Eternal Sunshine of the Spotless Mind 4.141755062680811
tt0266543 Finding Nemo 3.9561234329797488
tt0198781 Monsters, Inc. 3.872549019607843
tt0266697 Kill Bill: Vol. 1 3.7455801992928315
tt0372784 Batman Begins 3.400835744133719
tt0910970 WALL·E 3.399228543876567
tt0240772 Ocean's Eleven 3.2875281260045
tt0378194 Kill Bill: Vol. 2 3.252973320475731
tt0246578 Donnie Darko 3.163773706203793
tt0120903 X-Men 3.152523304403729
tt0181689 Minority Report 3.104307296689168
tt1049413 Up 3.0118932819029256
tt0264464 Catch Me If You Can 2.953230472516876
tt0145487 Spider-Man 2.838315654130505
tt0162222 Cast Away 2.7820636451301834
tt0361748 Inglourious Basterds 2.771616843458695


### Evaluate This Baseline on the Held-Out Users

In [17]:
top_k = 5
metrics = []

for u in held_out_users:

    this_held_out_movies = user_by_movie_df.loc[u][held_out_movies]
    if sum(this_held_out_movies) == 0:
        print("Skipping user with no ratings in held-out set:", u)
        continue

    # Calculate mean squared error
    mse = mean_squared_error(this_held_out_movies, agg_movie_ratings[held_out_movies])
    
    # Pearson correlation for rankings
    corr_p = pearsonr(this_held_out_movies, agg_movie_ratings[held_out_movies])[0]
    
    # Spearman correlation for *rankings* rather than overall ratings
    corr_s = spearmanr(this_held_out_movies, agg_movie_ratings[held_out_movies]).correlation
    
    # Top k rankings, i.e., what's the similarity between the top-k for this user and all the dataset?
    this_top_k = set(this_held_out_movies.sort_values(ascending=False).head(top_k).keys())
    global_top_k = set(agg_movie_ratings[held_out_movies].sort_values(ascending=False).head(top_k).keys())

    # What's the Precision between this user's top-k and global top-k?
    precision_at_k = len(this_top_k.intersection(global_top_k)) / len(this_top_k)
    
    metrics.append({
        "mse": mse,
        "pearson": corr_p,
        "spearman": corr_s,
        "pr@k": precision_at_k,
    })
    

Skipping user with no ratings in held-out set: 130204
Skipping user with no ratings in held-out set: 83746
Skipping user with no ratings in held-out set: 106308
Skipping user with no ratings in held-out set: 115079
Skipping user with no ratings in held-out set: 121238
Skipping user with no ratings in held-out set: 101752
Skipping user with no ratings in held-out set: 139538
Skipping user with no ratings in held-out set: 66643
Skipping user with no ratings in held-out set: 140711
Skipping user with no ratings in held-out set: 79317
Skipping user with no ratings in held-out set: 140549
Skipping user with no ratings in held-out set: 35034
Skipping user with no ratings in held-out set: 46632
Skipping user with no ratings in held-out set: 126741
Skipping user with no ratings in held-out set: 78866
Skipping user with no ratings in held-out set: 86055
Skipping user with no ratings in held-out set: 142039
Skipping user with no ratings in held-out set: 43732
Skipping user with no ratings in hel

In [18]:
metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,mse,pearson,spearman,pr@k
0,0.141909,0.508491,0.152029,0.250000
1,0.142504,0.694896,0.331195,0.400000
2,0.201653,0.191528,0.064760,0.000000
3,0.161518,0.384974,0.146898,0.200000
4,0.198313,0.347841,0.179527,0.000000
...,...,...,...,...
72,0.177693,0.264456,0.075859,0.000000
73,0.220126,0.017649,0.083234,0.000000
74,0.212834,0.416011,0.170807,0.250000
75,0.190352,0.291287,0.106293,0.000000


In [19]:
metrics_df.mean()

mse         0.245499
pearson     0.320553
spearman    0.153482
pr@k        0.168615
dtype: float64