# Rank Most Similar Actors By Genre

Using various distance metrics, identify the most similar actors to a given query actor by the genres in which they've starred.

In [3]:
%matplotlib inline


In [4]:
import pandas as pd

import json

import matplotlib.pyplot as plt

In [15]:
# Lookup tables filled in a single pass over the movie file
actor_name_map = {}    # actor id -> actor name
movie_actor_map = {}   # imdb id -> {"movie": title, "actors": set of actor ids, "genres": genre list}
actor_genre_map = {}   # actor id -> {genre: number of the actor's movies tagged with it}
actor_rating_map = {}  # actor id -> sum of "votes" over the actor's rated movies

# NOTE(review): the genre matrix built from actor_genre_map shows a column with an
# empty name, so some movies appear to carry an empty-string genre -- confirm
# whether those should be filtered out here.

# JSON is UTF-8 by specification; naming the encoding keeps non-ASCII names and
# titles decoding the same way regardless of the platform's default encoding.
with open("../data/imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Tolerate blank lines (e.g. a stray newline at the end of the file)
        if not line.strip():
            continue

        # Each non-empty line holds one movie as a JSON object
        this_movie = json.loads(line)

        # A missing or empty rating contributes no votes
        rating = this_movie.get("rating") or {}
        votes = rating.get("votes")

        for actor_id, actor_name in this_movie['actors']:
            # Remember the display name for this actor id
            actor_name_map[actor_id] = actor_name

            # Increment this actor's count for every genre of the movie
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

            # Accumulate the movie's vote count onto the actor's running total
            if votes is not None:
                actor_rating_map[actor_id] = actor_rating_map.get(actor_id, 0) + votes

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {actor_id for actor_id, _ in this_movie['actors']},
            "genres": this_movie["genres"],
        }
        

In [16]:
# Spot-check the genre counts collected for one actor:
# Hugh Jackman, actor ID nm0413168
actor_genre_map['nm0413168']

{'Comedy': 7,
 'Fantasy': 3,
 'Romance': 5,
 'Action': 14,
 'Adventure': 11,
 'Sci-Fi': 10,
 'Crime': 6,
 'Thriller': 2,
 'Animation': 4,
 'Drama': 12,
 'Mystery': 5,
 'Biography': 4,
 'Musical': 2,
 'History': 1}

In [17]:
# Row labels are the actor ids; each row is that actor's {genre: count} dict.
# Keys and values of one dict iterate in the same order, so labels and rows line up.
index = actor_genre_map.keys()
rows = list(actor_genre_map.values())

# Build the actor-by-genre frame; a genre an actor never appeared in comes out
# as NaN, which here simply means a count of zero
df = pd.DataFrame(rows, index=index).fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Add Votes to the Genre-Count Matrix

In [19]:
# One vote total per row, in index order; actors without an entry get 0
votes_in_row_order = [actor_rating_map.get(actor_id, 0) for actor_id in df.index]
df["Votes"] = votes_in_row_order

In [20]:
# Display the frame again, now including the Votes column
df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Documentary,Sport,News,Family,Music,Unnamed: 17,Western,Short,Reality-TV,Votes
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,269915
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9278649
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2356649
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,535171
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1756


In [26]:
# Ids of the ten actors with the largest vote totals, highest first
top_voted_ids = df.sort_values(by="Votes", ascending=False).head(10).index

# Print each id next to the actor's name
for actor_id in top_voted_ids:
    print(actor_id, actor_name_map[actor_id])

nm0000138 Leonardo DiCaprio
nm0000288 Christian Bale
nm0000375 Robert Downey Jr.
nm0000093 Brad Pitt
nm0262635 Chris Evans
nm0424060 Scarlett Johansson
nm0005212 Ian McKellen
nm0000168 Samuel L. Jackson
nm0089217 Orlando Bloom
nm0413168 Hugh Jackman


In [27]:
# Full feature rows of the ten actors with the most votes, highest first
df.sort_values(by="Votes", ascending=False).iloc[:10]

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Documentary,Sport,News,Family,Music,Unnamed: 17,Western,Short,Reality-TV,Votes
nm0000138,4.0,0.0,4.0,15.0,1.0,4.0,3.0,4.0,4.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13861068
nm0000288,4.0,2.0,2.0,23.0,2.0,2.0,16.0,8.0,10.0,1.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,12956090
nm0000375,10.0,0.0,3.0,12.0,5.0,1.0,11.0,3.0,6.0,1.0,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,11448057
nm0000093,11.0,2.0,3.0,18.0,1.0,6.0,7.0,4.0,10.0,3.0,...,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11380108
nm0262635,7.0,1.0,5.0,11.0,0.0,6.0,16.0,2.0,5.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10016972
nm0424060,16.0,2.0,10.0,23.0,3.0,3.0,9.0,3.0,5.0,1.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,9992784
nm0005212,2.0,4.0,1.0,9.0,5.0,3.0,7.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9795242
nm0000168,12.0,5.0,0.0,23.0,7.0,9.0,28.0,3.0,19.0,2.0,...,5.0,3.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,9556934
nm0089217,2.0,5.0,3.0,12.0,0.0,2.0,12.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9296763
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9278649


## Actor Similarity Using Counts

In [28]:
from scipy.sparse import lil_matrix # Needed for building the matrix of user ratings
import scipy.spatial.distance # Needed for calculating pairwise distances

In [29]:
# Query actor that every similarity search below compares against.
# Swap which line is commented out to query a different actor.
# target_actor_id = 'nm1165110' # Chris Hemsworth
target_actor_id = 'nm0413168' # Hugh Jackman
# target_actor_id = 'nm0005351' # Ryan Reynolds

### Metric Value without Votes

In [49]:
# Genre counts only: leave the Votes column out of the comparison
df_without_votes = df.drop(columns=["Votes"])

# Feature row of the query actor
target_actor_ratings = df_without_votes.loc[target_actor_id]

# Euclidean distance from every actor to the query actor; cdist returns one
# column per query row, so keep the single column
distances = scipy.spatial.distance.cdist(
    df_without_votes,
    [target_actor_ratings],
    metric="euclidean",
)[:, 0]

# Pair each actor id with its distance
query_distances = list(zip(df.index, distances))

# Ten nearest actors, smallest distance first.
# NOTE(review): the last printed value sums every column of df, so it adds the
# genre counts to the Votes total -- confirm that mix is intended.
nearest = sorted(query_distances, key=lambda pair: pair[1])[:10]
for similar_actor_id, similar_genre_score in nearest:
    print(
        similar_actor_id,
        actor_name_map[similar_actor_id],
        similar_genre_score,
        df.loc[similar_actor_id].sum(),
    )

nm0413168 Hugh Jackman 0.0 9278735.0
nm0000375 Robert Downey Jr. 7.937253933193772 11448132.0
nm0262635 Chris Evans 8.366600265340756 10017046.0
nm1517976 Chris Pine 10.770329614269007 3857317.0
nm1475594 Channing Tatum 10.954451150103322 3838612.0
nm0185819 Daniel Craig 11.445523142259598 5844258.0
nm0000226 Will Smith 11.916375287812984 6516606.0
nm0757855 Zoe Saldana 12.328828005937952 4877179.0
nm0004937 Jamie Foxx 12.529964086141668 5086800.0
nm0000234 Charlize Theron 12.922847983320086 5191872.0


### Metric Value with Votes Column

In [50]:
# Feature row of the query actor: genre counts plus the raw Votes total
target_actor_ratings = df.loc[target_actor_id]

# Euclidean distance from every actor to the query actor over all columns;
# cdist returns one column per query row, so keep the single column
distances = scipy.spatial.distance.cdist(
    df,
    [target_actor_ratings],
    metric="euclidean",
)[:, 0]

# Pair each actor id with its distance
query_distances = list(zip(df.index, distances))

# Ten nearest actors, smallest distance first
nearest = sorted(query_distances, key=lambda pair: pair[1])[:10]
for similar_actor_id, similar_genre_score in nearest:
    print(
        similar_actor_id,
        actor_name_map[similar_actor_id],
        similar_genre_score,
        df.loc[similar_actor_id].sum(),
    )

nm0413168 Hugh Jackman 0.0 9278735.0
nm0089217 Orlando Bloom 18114.00582422342 9296812.0
nm0749263 Mark Ruffalo 194590.00078112955 9084149.0
nm0000168 Samuel L. Jackson 278285.0013098083 9557089.0
nm0000354 Matt Damon 507210.0004298022 8771551.0
nm0005212 Ian McKellen 516593.0001848651 9795293.0
nm0000136 Johnny Depp 678847.0001642491 8599897.0
nm0424060 Scarlett Johansson 714135.0002198464 9992878.0
nm0262635 Chris Evans 738323.0000474048 10017046.0
nm0000323 Michael Caine 1426902.0001100285 7851817.0


## Calculate Distance After Min/Max Column Normalization

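Above, the Euclidean distances differ sharply with and without the `Votes` column, because vote totals are orders of magnitude larger than genre counts and dominate the distance. Below, we min/max-rescale every column (the genre counts as well as `Votes`) to `[0,1]` and recalculate.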

In [42]:
# Per-column minimum and span (max - min) for min/max scaling
df_min = df.min(axis=0)
df_range = df.max(axis=0) - df_min

# A column whose values are all identical has a span of 0; dividing by it would
# turn the whole column into NaN and make every distance computed from the
# frame NaN as well. Dividing by 1 instead leaves such a column at a constant 0.
df_range = df_range.replace(0, 1)

# Rescale every column to [0, 1]
df_col_norm = (df - df_min) / df_range
df_col_norm

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Documentary,Sport,News,Family,Music,Unnamed: 17,Western,Short,Reality-TV,Votes
nm0000212,0.212121,0.1,0.428571,0.086957,0.05,0.031746,0.017857,0.090909,0.068966,0.076923,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.019473
nm0413168,0.212121,0.3,0.357143,0.173913,0.25,0.031746,0.250000,0.363636,0.206897,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.669404
nm0000630,0.242424,0.2,0.428571,0.202899,0.10,0.047619,0.071429,0.454545,0.034483,0.076923,...,0.21875,0.075,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.170019
nm0005227,0.303030,0.1,0.142857,0.028986,0.00,0.015873,0.017857,0.000000,0.000000,0.000000,...,0.00000,0.025,0.000000,0.076923,0.0,0.0,0.0,0.0,0.0,0.038610
nm0864851,0.030303,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.000000,0.0,0.000000,0.000000,0.00,0.015873,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000033
nm10592896,0.000000,0.0,0.000000,0.000000,0.00,0.015873,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000033
nm7216750,0.000000,0.0,0.000000,0.000000,0.00,0.015873,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000033
nm0936300,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.076923,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000127


In [44]:
# Sanity check: after rescaling, every column should share the same maximum
df_col_norm.max()

Comedy         1.0
Fantasy        1.0
Romance        1.0
Drama          1.0
Mystery        1.0
Thriller       1.0
Action         1.0
Biography      1.0
Crime          1.0
War            1.0
Adventure      1.0
Sci-Fi         1.0
Animation      1.0
Musical        1.0
History        1.0
Horror         1.0
Documentary    1.0
Sport          1.0
News           1.0
Family         1.0
Music          1.0
               1.0
Western        1.0
Short          1.0
Reality-TV     1.0
Votes          1.0
dtype: float64

In [48]:
# Feature row of the query actor in the column-normalized frame
target_actor_ratings = df_col_norm.loc[target_actor_id]

# Euclidean distance from every actor to the query actor on the rescaled
# columns; cdist returns one column per query row, so keep the single column
distances = scipy.spatial.distance.cdist(
    df_col_norm,
    [target_actor_ratings],
    metric="euclidean",
)[:, 0]

# Pair each actor id with its distance
query_distances = list(zip(df.index, distances))

# Ten nearest actors, smallest distance first; the trailing value is the
# row sum taken from the un-normalized df
nearest = sorted(query_distances, key=lambda pair: pair[1])[:10]
for similar_actor_id, similar_genre_score in nearest:
    print(
        similar_actor_id,
        actor_name_map[similar_actor_id],
        similar_genre_score,
        df.loc[similar_actor_id].sum(),
    )

nm0413168 Hugh Jackman 0.0 9278735.0
nm0000375 Robert Downey Jr. 0.5962602575177932 11448132.0
nm0010736 Amy Adams 0.6174947730739638 6257011.0
nm0262635 Chris Evans 0.6577934358095161 10017046.0
nm0124930 Gerard Butler 0.7055878776461664 5022148.0
nm0000129 Tom Cruise 0.7208010824176451 7250781.0
nm0000191 Ewan McGregor 0.7584657529417943 5224264.0
nm0000234 Charlize Theron 0.7885787241568242 5191872.0
nm0005351 Ryan Reynolds 0.7927022223157756 7243962.0
nm0000553 Liam Neeson 0.8001152435196612 6609435.0
