# RecSys and User x User Feedback

Using the IMDB and MovieLens data, we build a feature matrix that we can use for user-user and item-item recommendation.

Then, we use this scaffold to create recommendations by finding most similar users.

In [1]:
%matplotlib inline

In [2]:
import json
import random
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from collections import Counter

from scipy.sparse import lil_matrix

from sklearn.neighbors import DistanceMetric

from sklearn.metrics import jaccard_score
from sklearn.metrics import pairwise_distances

In [3]:
known_movies = set()

user_ratings = {} # List of all our movie ratings for specific users
movie_ids = []

with open("../data/user_ratings.json", "r") as in_file:
    for line in in_file:
        
        this_rating = json.loads(line)
        
        known_movies.add(this_rating["title_id"])
        
        if this_rating["title_id"] not in movie_ids:
            movie_ids.append(this_rating["title_id"])
        
        this_users_ratings = user_ratings.get(this_rating["userId"], [])
        this_users_ratings.append((this_rating["title_id"], this_rating["rating"]))
        
        user_ratings[this_rating["userId"]] = this_users_ratings
        
        

In [4]:
movie_id_to_index = {m:i for i,m in enumerate(movie_ids)}

In [5]:
print("Known Users:", len(user_ratings))
print("Known Movies:", len(known_movies))


Known Users: 2244
Known Movies: 4465


In [6]:
actor_id_to_name_map = {}     # Map Actor IDs to actor names
actor_id_to_index_map = {}    # Map actor IDs to a unique index of known actors
index_to_actor_ids = []       # Array mapping unique index back to actor ID (invert of actor_id_to_index_map)

index_counter = 0    # Unique actor index; increment for each new actor
known_actors = set()

movie_actor_map = {} # List of all our movies and their actors

test_count = 0
with open("../data/imdb_recent_movies.json", "r") as in_file:
    for line in in_file:
        
        this_movie = json.loads(line)
        
        # Restrict to known movies
        if this_movie["title_id"] not in known_movies:
            continue
            
        # Keep track of all the actors in this movie
        for actor_id,actor_name in zip(this_movie['actor_ids'],this_movie['actor_names']):
            
            # Keep names and IDs
            actor_id_to_name_map[actor_id] = actor_name
            
            # If we've seen this actor before, skip...
            if actor_id in known_actors:
                continue
                
            # ... Otherwise, add to known actor set and create new index for them
            known_actors.add(actor_id)
            actor_id_to_index_map[actor_id] = index_counter
            index_to_actor_ids.append(actor_id)
            index_counter += 1
            
        # Finished with this film
        movie_actor_map[this_movie["title_id"]] = ({
            "movie": this_movie["title_name"],
            "actors": set(this_movie['actor_ids']),
            "genres": this_movie["title_genre"]
        })

In [7]:
print("Known Actors:", len(known_actors))
print("Known Movies:", len(movie_actor_map))

Known Actors: 5224
Known Movies: 4465


## Generate DataFrame using Sparse Matrics

Convert our Movie Ratings data into a DataFrame that we can use for analysis.

In [8]:
# With sparse matrix, initialize to size of Users x Movies of 0s
matrix_sparse = lil_matrix((len(user_ratings), len(known_movies)), dtype=float)

# Update the matrix, user by user, setting non-zero values for the appropriate actors
for row,this_user in enumerate(user_ratings): 
    this_user_ratings = user_ratings[this_user]
    
    for movie_id,rating in this_user_ratings:
        this_movie_index = movie_id_to_index[movie_id]
        matrix_sparse[row,this_movie_index] = rating

In [9]:
df = pd.DataFrame.sparse.from_spmatrix(
    matrix_sparse, 
    index=[u for u in user_ratings],
    columns=movie_ids
).T
df

Unnamed: 0,10,37,51,126,152,263,284,448,626,706,...,162002,162073,162207,162257,162363,162420,162434,162464,162499,162537
tt0274309,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0298203,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0315733,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0337563,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0463854,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt4241904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt1666800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt6806448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0844671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [111]:
### 
# Here, we center our user ratings such that "zero" means average rather than unknown
###
user_means_df = pd.DataFrame(
    [(idx, (np.mean([r for r in row if r > 0]))) for idx,row in df.T.iterrows()],
    columns=["user_id","mean"]
).set_index("user_id")

uid_centered_rows = []
for uid,ratings in df.T.iterrows():
    mean_rating = np.mean(ratings[ratings > 0])
    ratings[ratings > 0] = ratings[ratings > 0] - mean_rating
    uid_centered_rows.append(ratings)

centered_df = pd.DataFrame(uid_centered_rows).T

In [10]:
print("Movies x Users dimensionality:", df.shape)

Movies x Users dimensionality: (4465, 2244)


## Find a Query User and Measure Similarity against all users

In [120]:
# query_user = np.random.choice(df.columns)
query_user = "126"
print("Query User:", query_user)

Query User: 126


In [121]:
query_ranking_count = (df[query_user] > 0).sum()
print("How many movies has this user ranked:", query_ranking_count)

How many movies has this user ranked: 14


In [122]:
user_rating_df = df.T # Original DF is movies X users, let's transpose

In [123]:
# Find movies this query user has ranked
query_ranked_movies = user_rating_df.columns[user_rating_df.loc[query_user] > 0]

# Print this user's rated movies, ranked by rating
for movie_id, rating in user_rating_df.loc[query_user].sort_values(ascending=False).items():
    # Only print movies that have a rating
    if rating == 0:
        continue
        
    print(movie_id, movie_actor_map[movie_id]["movie"], rating)

tt0172495 Gladiator 5.0
tt0375679 Crash 5.0
tt2267998 Gone Girl 5.0
tt1375666 Inception 4.5
tt0414387 Pride & Prejudice 4.5
tt0327056 Mystic River 4.5
tt0758758 Into the Wild 4.5
tt1853728 Django Unchained 4.0
tt0352248 Cinderella Man 4.0
tt2582802 Whiplash 4.0
tt2179136 American Sniper 4.0
tt1130884 Shutter Island 4.0
tt2084970 The Imitation Game 3.5
tt2872732 Lucy 3.5


In [124]:
user_rating_centered_df = centered_df.T

# Find just the user's ratings and calc pairwise distances between that rating and all others
user_distances = pairwise_distances([user_rating_centered_df.loc[query_user]], user_rating_centered_df, metric="cosine")

In [125]:
# Convert to similarity rather than distance
user_sims = 1-user_distances

In [126]:
# Create a new DF of users based on similarity to the query user
user_user_sim_df = pd.DataFrame(user_sims[0,:], index=user_rating_df.index, columns=["similarity"])

In [127]:
top_k = 5

# Find the top-k most similar users 
relevant_users_df = user_user_sim_df.sort_values(by="similarity").tail(top_k + 1)

# Create a map of user IDs to similarity, since we use that later for re-weighting ratings
relevant_users_sims = {uid:row["similarity"] for uid,row in relevant_users_df.iterrows()}
relevant_users_df

Unnamed: 0,similarity
103322,0.275659
45614,0.275659
56162,0.275659
159304,0.279569
123966,0.293853
126,1.0


In [128]:
# We only care about rating movies that the user has NOT seen
query_unranked_movies = user_rating_df.columns[user_rating_df.loc[query_user] == 0]

In [129]:
# Get the IDs for the top-k similar users, excluding the query user, who will always have a high sim score
sim_user_index = [u for u in relevant_users_df.index if u != query_user]
unseen_sim_user_rankings_df = user_rating_df[query_unranked_movies].loc[sim_user_index]
unseen_sim_user_rankings_df

Unnamed: 0,tt0274309,tt0298203,tt0315733,tt0337563,tt0463854,tt0450385,tt0208092,tt0209144,tt0335266,tt0266697,...,tt4602066,tt7690670,tt8439854,tt6182908,tt5789976,tt4241904,tt1666800,tt6806448,tt0844671,tt0472198
103322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45614,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# List for storing our predicted ratings
inferred_movie_ratings = []

# Transpose the similar-user X movies DF, and iterate through each movie...
for movie_id,ratings in unseen_sim_user_rankings_df.T.iterrows():
    
    # If no similar user has rated any movie here, skip this movie
    non_zero_ratings = ratings[ratings > 0]
    if non_zero_ratings.shape[0] < 1:
        continue
    
    # Weight the ratings for this movie based on the relevant users and their similarities
    this_summed_score = sum([
        relevant_users_sims[this_user_id] * this_rating  for this_user_id,this_rating in non_zero_ratings.items()
    ])    
    
    # Normalize based on user similarity
    this_summed_sim = sum([relevant_users_sims[this_user_id]  for this_user_id,this_rating in non_zero_ratings.items()])
    
    # Calculate inferred score
    inferred_rating = this_summed_score / this_summed_sim    
    inferred_movie_ratings.append((movie_id, inferred_rating))


In [131]:
for movie_id,inferred_rating in sorted(inferred_movie_ratings, key=lambda tup: tup[1], reverse=True)[:20]:
    print(movie_id, movie_actor_map[movie_id]["movie"], inferred_rating)

tt0181689 Minority Report 4.5
tt0338013 Eternal Sunshine of the Spotless Mind 4.0
tt0181984 Boiler Room 4.0
tt0186151 Frequency 4.0
tt0120630 Chicken Run 4.0
tt0246578 Donnie Darko 4.0
tt0165929 Romeo Must Die 4.0
tt0195778 Here on Earth 4.0
tt0162222 Cast Away 3.0
tt0218817 Antitrust 3.0
tt0174480 Autumn in New York 3.0
tt0175142 Scary Movie 1.0
