# Rank Most Similar Movies

Using two different measures — Jaccard similarity over shared actors and cosine distance over user ratings — rank the movies most similar to a query movie.

In [2]:
%matplotlib inline

In [25]:
import pandas as pd

import json

In [14]:
# Build two lookup tables from the IMDB dump, which is read as one JSON object per line:
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: title id -> {"movie": title, "actors": set of actor ids, "genres": genres}
actor_name_map = {}
movie_actor_map = {}

# JSON text is UTF-8; state it explicitly instead of relying on the platform's
# default encoding, which can mangle non-ASCII text on some systems.
with open("../data/imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:
        this_movie = json.loads(line)

        # Ids and names are paired positionally (assumes the two sequences are aligned)
        actor_name_map.update(zip(this_movie["actor_ids"], this_movie["actor_names"]))

        # Actors are stored as a set so that overlap between movies is cheap to compute
        movie_actor_map[this_movie["title_id"]] = {
            "movie": this_movie["title_name"],
            "actors": set(this_movie["actor_ids"]),
            "genres": this_movie["title_genre"],
        }

In [18]:
# Tabulate the movie records (one row per title id), then show the row for the query film
df = pd.DataFrame(list(movie_actor_map.values()), index=list(movie_actor_map))
df.loc[df["movie"].eq("Star Wars: Episode II - Attack of the Clones")]

Unnamed: 0,movie,actors,genres
tt0121765,Star Wars: Episode II - Attack of the Clones,"{nm0000489, nm0159789, nm0000191}","[Action, Adventure, Fantasy]"


In [30]:
# IMDB title id of the query movie (the Star Wars: Episode II row displayed above)
target_movie_id = "tt0121765"

In [31]:
# Look up the query movie's record (title, actor-id set, genres) and display it
target_movie = movie_actor_map[target_movie_id]
target_movie

{'movie': 'Star Wars: Episode II - Attack of the Clones',
 'actors': {'nm0000191', 'nm0000489', 'nm0159789'},
 'genres': ['Action', 'Adventure', 'Fantasy']}

In [20]:
# Score every movie against the query by the Jaccard similarity of their actor sets:
#   |actors in both movies| / |actors in either movie|
# Note: despite its name, `distances` holds similarities (higher = more alike).
target_actors = target_movie["actors"]

distances = [
    {
        "movie": candidate,
        "similarity": len(target_actors & candidate["actors"])
        / len(target_actors | candidate["actors"]),
    }
    for candidate in movie_actor_map.values()
]

In [21]:
# Show the ten highest-scoring movies, each followed by its cast
top_matches = sorted(distances, key=lambda entry: entry["similarity"], reverse=True)[:10]
for match in top_matches:
    film = match["movie"]
    print(film["movie"], match["similarity"])
    for actor_id in film["actors"]:
        print("\t", actor_name_map[actor_id])

Star Wars: Episode II - Attack of the Clones 1.0
	 Christopher Lee
	 Hayden Christensen
	 Ewan McGregor
Star Wars: Episode III - Revenge of the Sith 0.5
	 Samuel L. Jackson
	 Hayden Christensen
	 Ewan McGregor
Faster 0.3333333333333333
	 Ewan McGregor
Whales of Atlantis: In Search of Moby Dick 0.3333333333333333
	 Christopher Lee
Troy's Story 0.3333333333333333
	 Ewan McGregor
Miss Potter 0.3333333333333333
	 Ewan McGregor
The Final Fix 0.3333333333333333
	 Ewan McGregor
Perfect Sense 0.3333333333333333
	 Ewan McGregor
Fastest 0.3333333333333333
	 Ewan McGregor
Charge 0.3333333333333333
	 Ewan McGregor


## Movie Similarity via Users' Movie Ratings

The MovieLens dataset contains ratings people have given to movies. This data gives a different way to evaluate similarity between movies based on user ratings. That is, two movies should be similar if users rated them similarly.

In [35]:
from scipy.sparse import lil_matrix # Needed for building the matrix of user ratings
import scipy.spatial.distance # Needed for calculating pairwise distances

In [9]:
# Read the ratings dump (one JSON object per line) into three structures:
#   known_movies: set of every title id seen (constant-time membership test)
#   movie_ids:    the same title ids as a list, in first-seen order
#   user_ratings: userId -> list of (title id, rating) tuples
known_movies = set()

user_ratings = {}
movie_ids = []

with open("../data/user_ratings.json", "r", encoding="utf-8") as in_file:
    for line in in_file:
        this_rating = json.loads(line)
        title_id = this_rating["title_id"]

        # Test membership against the set, not the list: scanning the list for
        # every rating line would make the whole load quadratic.
        if title_id not in known_movies:
            known_movies.add(title_id)
            movie_ids.append(title_id)

        # Append to this user's rating list, creating it on first sight of the user
        user_ratings.setdefault(this_rating["userId"], []).append(
            (title_id, this_rating["rating"])
        )
        
        

In [10]:
# Give each IMDB title id a numeric index: its position in movie_ids
movie_id_to_index = dict(zip(movie_ids, range(len(movie_ids))))

In [23]:
# Sanity check: number of distinct movies, number of users, and length of the ordered
# id list. The first and last counts should agree, since movie_ids is meant to hold
# each title id exactly once.
len(known_movies), len(user_ratings), len(movie_ids)

(4465, 2244, 4465)

In [27]:
# Sparse Users x Movies matrix of floats; entries not written below stay at 0
matrix_sparse = lil_matrix((len(user_ratings), len(known_movies)), dtype=float)

# Fill one row per user, writing each rating into the column assigned to that movie
for user_row, rated_movies in enumerate(user_ratings.values()):
    for rated_id, score in rated_movies:
        matrix_sparse[user_row, movie_id_to_index[rated_id]] = score

In [29]:
# Label the sparse matrix (rows = users, columns = movies), then transpose so that
# each row of the frame is one movie's vector of user ratings
df = pd.DataFrame.sparse.from_spmatrix(
    matrix_sparse,
    index=list(user_ratings),
    columns=movie_ids,
).T
df

Unnamed: 0,10,37,51,126,152,263,284,448,626,706,...,162002,162073,162207,162257,162363,162420,162434,162464,162499,162537
tt0274309,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0298203,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0315733,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0337563,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0463854,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt4241904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt1666800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt6806448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0844671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [44]:
# Cosine distance from every movie's rating vector to the query movie's vector
target_movie_ratings = df.loc[target_movie_id]
distances = scipy.spatial.distance.cdist(
    df, [target_movie_ratings], metric="cosine"
)[:, 0]  # only one query row was passed, so keep that single column

# Pair each movie id with its distance to the query
query_distances = [(movie_id, distance) for movie_id, distance in zip(df.index, distances)]

In [52]:
# List the ten movies closest to the query (smallest cosine distance first) with their casts
closest = sorted(query_distances, key=lambda pair: pair[1])[:10]
for similar_movie_id, similar_movie_score in closest:
    similar_movie = movie_actor_map[similar_movie_id]
    print(similar_movie_id, similar_movie["movie"], similar_movie_score)

    for actor_id in similar_movie["actors"]:
        print("\t", actor_name_map[actor_id])

tt0121765 Star Wars: Episode II - Attack of the Clones 0.0
	 Christopher Lee
	 Hayden Christensen
	 Ewan McGregor
tt0121766 Star Wars: Episode III - Revenge of the Sith 0.33186057101580557
	 Samuel L. Jackson
	 Hayden Christensen
	 Ewan McGregor
tt0145487 Spider-Man 0.4717880003216154
	 Tobey Maguire
	 Willem Dafoe
	 James Franco
tt0126029 Shrek 0.5065301456299656
	 Eddie Murphy
	 Mike Myers
	 John Lithgow
tt0290334 X2: X-Men United 0.5097195292363826
	 Ian McKellen
	 Patrick Stewart
	 Hugh Jackman
tt0240772 Ocean's Eleven 0.516582512079766
	 Brad Pitt
	 Matt Damon
	 George Clooney
tt0295297 Harry Potter and the Chamber of Secrets 0.5176754479737018
	 Rupert Grint
	 Richard Harris
	 Daniel Radcliffe
tt0316654 Spider-Man 2 0.5277418355211236
	 Alfred Molina
	 Tobey Maguire
	 James Franco
tt0181689 Minority Report 0.5313261461659751
	 Colin Farrell
	 Max von Sydow
	 Tom Cruise
tt0120903 X-Men 0.5361871745920443
	 Ian McKellen
	 Patrick Stewart
	 Hugh Jackman
