# Rank Most Similar Actors by Costar

Using various distance metrics, identify the actors most similar to a given query actor, based on the costars with whom they have appeared.

In [1]:
%matplotlib inline


In [2]:
import pandas as pd

import json

In [5]:
# actor_id -> actor name, used later to turn IDs back into readable names
actor_name_map = {}
# imdb_id -> {"movie": title, "actors": set of actor IDs, "genres": genres}
movie_actor_map = {}
# actor_id -> {costar_id: number of films in which both actors appear}
actor_costar_map = {}


# The file holds one JSON document per line. Decode it explicitly as UTF-8 so
# that non-ASCII actor names do not depend on the platform's locale encoding.
with open("../data/imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Tolerate blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject
        if not line.strip():
            continue

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # Each entry of this_movie["actors"] unpacks to (actor_id, actor_name)
        cast = this_movie["actors"]
        cast_ids = [a_id for a_id, _ in cast]

        for actor_id, actor_name in cast:
            # Add the actor to the id->name map
            actor_name_map[actor_id] = actor_name

            # Fetch (or create) this actor's costar tally
            this_actors_costars = actor_costar_map.setdefault(actor_id, {})

            # Increment the shared-film count for every cast member of this movie.
            # The actor is counted against themselves as well, so the entry under
            # their own ID ends up as the number of films they appear in.
            for a_id in cast_ids:
                this_actors_costars[a_id] = this_actors_costars.get(a_id, 0) + 1

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": set(cast_ids),
            "genres": this_movie["genres"],
        }

In [7]:
# Sanity-check the costar tally for a single actor (ID nm0413168, Hugh Jackman
# according to the original note). The entry keyed by the actor's own ID is the
# self-count recorded by the loading loop; every other entry is a costar.
actor_costar_map['nm0413168']

{'nm0000212': 1,
 'nm0413168': 30,
 'nm0000630': 2,
 'nm0005227': 1,
 'nm0001772': 5,
 'nm0005212': 4,
 'nm0000463': 2,
 'nm0000237': 1,
 'nm0000932': 3,
 'nm0000332': 1,
 'nm0000295': 1,
 'nm0746896': 1,
 'nm0378175': 1,
 'nm0000704': 1,
 'nm0005261': 1,
 'nm0000245': 1,
 'nm0001838': 1,
 'nm0859503': 1,
 'nm0000995': 1,
 'nm0000701': 1,
 'nm0000606': 1,
 'nm1431940': 1,
 'nm2023672': 1,
 'nm1107001': 1,
 'nm0000173': 1,
 'nm0011354': 1,
 'nm1812267': 1,
 'nm0424060': 2,
 'nm0242319': 1,
 'nm0061106': 1,
 'nm0005351': 1,
 'nm0396812': 1,
 'nm0000288': 1,
 'nm0000323': 1,
 'nm0000191': 1,
 'nm0931329': 2,
 'nm0022883': 1,
 'nm5473782': 1,
 'nm8030441': 1,
 'nm0366846': 1,
 'nm0350453': 1,
 'nm0205626': 1,
 'nm0502425': 1,
 'nm0498449': 1,
 'nm5148840': 1,
 'nm3822462': 1,
 'nm3948952': 1,
 'nm0000164': 1,
 'nm0000368': 1,
 'nm0000285': 1,
 'nm0279545': 1,
 'nm1517976': 1,
 'nm1374980': 1,
 'nm3918035': 1,
 'nm0000128': 1,
 'nm0004266': 1,
 'nm1086543': 1,
 'nm1663205': 1,
 'nm2353862':

In [8]:
# Row labels of the frame: one row per actor ID
index = actor_costar_map.keys()

# One {costar_id: shared-film count} dictionary per actor, in index order
rows = list(map(actor_costar_map.get, index))

# Build the actor-by-costar count matrix. A missing cell means the two actors
# never appeared in the same film, so it is filled with zero.
df = pd.DataFrame(rows, index=index).fillna(0)

df

Unnamed: 0,nm0000212,nm0413168,nm0000630,nm0005227,nm0000473,nm0001435,nm0000527,nm0749263,nm0000492,nm1476796,...,nm9468314,nm9468359,nm9468358,nm9468360,nm9468385,nm9468384,nm9484309,nm9484310,nm9484308,nm1644256
nm0000212,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,1.0,30.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,1.0,2.0,31.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,1.0,1.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0864851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from scipy.sparse import lil_matrix # Needed for building the matrix of user ratings
import scipy.spatial.distance # Needed for calculating pairwise distances

In [10]:
# ID of the query actor whose most similar actors we want to find
# (the ranking printed below resolves this ID to Chris Hemsworth)
target_actor_id = 'nm1165110'

In [11]:
# Costar-count vector (one row of df) for the query actor
target_actor_ratings = df.loc[target_actor_id]

# Cosine distance from every actor's row to the query row. Only one query row
# is passed, so the result has a single column, which is what we keep.
distance_matrix = scipy.spatial.distance.cdist(
    df,
    [target_actor_ratings],
    metric="cosine",
)
distances = distance_matrix[:, 0]

# Pair each actor ID with its distance to the query actor
query_distances = [(actor_id, distance) for actor_id, distance in zip(df.index, distances)]

In [12]:
# Show the ten closest actors, smallest distance first. The query actor is part
# of df as well, so it is ranked alongside everyone else.
closest_ten = sorted(query_distances, key=lambda pair: pair[1])[:10]
for similar_actor_id, similar_costar_score in closest_ten:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_costar_score)


nm1165110 Chris Hemsworth 0.0
nm1437125 Victoria Profeta 0.48600375513240246
nm1393354 Kristen Connolly 0.48600375513240246
nm2325018 Tang Wei 0.48600375513240246
nm0910966 Leehom Wang 0.48600375513240246
nm11523062 Bryon Lerum 0.48600375513240246
nm11523063 Ryder Lerum 0.48600375513240246
nm7287299 Rudhraksh Jaiswal 0.48600375513240246
nm11807845 Mark Paguio 0.48600375513240246
nm1089991 Tom Hiddleston 0.63727771538366


In [14]:
from sklearn.metrics import DistanceMetric
from sklearn.metrics.pairwise import cosine_distances

In [15]:
# Choose the distance function used by the next cell. cosine_distances is a plain
# function and is called as dist(X, Y). The commented-out alternative creates a
# DistanceMetric object instead, which is invoked as dist.pairwise(X, Y).
# dist = DistanceMetric.get_metric("euclidean")
dist = cosine_distances

In [16]:
# Costar-count vector (one row of df) for the query actor
target_actor_ratings = df.loc[target_actor_id]

# Distance from every actor's row to the query row using the selected `dist`
# function; a single query row is passed, so only column 0 is kept.
# (If `dist` is switched to a DistanceMetric object, call
# dist.pairwise(df, [target_actor_ratings]) here instead.)
pairwise_result = dist(df, [target_actor_ratings])
distances = pairwise_result[:, 0]

# Pair each actor ID with its distance to the query actor
query_distances = [(actor_id, distance) for actor_id, distance in zip(df.index, distances)]

# Show the ten closest actors, smallest distance first
closest_ten = sorted(query_distances, key=lambda pair: pair[1])[:10]
for similar_actor_id, similar_costar_score in closest_ten:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_costar_score)



nm1165110 Chris Hemsworth 9.992007221626409e-16
nm1437125 Victoria Profeta 0.48600375513240246
nm1393354 Kristen Connolly 0.48600375513240246
nm2325018 Tang Wei 0.48600375513240246
nm0910966 Leehom Wang 0.48600375513240246
nm11523062 Bryon Lerum 0.48600375513240246
nm11523063 Ryder Lerum 0.48600375513240246
nm7287299 Rudhraksh Jaiswal 0.48600375513240246
nm11807845 Mark Paguio 0.48600375513240246
nm1089991 Tom Hiddleston 0.63727771538366
