# Cluster Actors using a Matrix of Actor x Genre

Build an actor × genre matrix in which each cell counts how many times an actor has appeared in a movie of that genre. We then apply k-Means to this matrix to group actors with similar genre profiles in an unsupervised fashion.

In [1]:
%matplotlib inline


In [2]:
import pandas as pd

import json

In [3]:
# Lookup tables built in a single pass over the movie file:
actor_name_map = {}   # actor ID -> actor name
movie_actor_map = {}  # movie IMDb ID -> {"movie": title, "actors": set of actor IDs, "genres": genre list}
actor_genre_map = {}  # actor ID -> {genre: number of times the actor appeared in that genre}

# The file holds one JSON-encoded movie per line. The encoding is given
# explicitly so non-ASCII actor names decode the same way on every platform
# instead of depending on the locale default.
with open("../data/imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject
        if not line.strip():
            continue

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # Each cast entry is an (actor_id, actor_name) pair
        for actor_id, actor_name in this_movie["actors"]:
            # Remember the display name for this actor ID
            actor_name_map[actor_id] = actor_name

            # Fetch (or create) this actor's genre counter, then credit every
            # genre of the current movie to it
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie["actors"]},
            "genres": this_movie["genres"],
        }

In [4]:
# Sanity check: show the genre counts collected for one well-known actor
# (Hugh Jackman, actor ID nm0413168). Each value is how often that genre
# appeared across the loaded movies that list this actor.
actor_genre_map['nm0413168']

{'Comedy': 7,
 'Fantasy': 3,
 'Romance': 5,
 'Action': 14,
 'Adventure': 11,
 'Sci-Fi': 10,
 'Crime': 6,
 'Thriller': 2,
 'Animation': 4,
 'Drama': 12,
 'Mystery': 5,
 'Biography': 4,
 'Musical': 2,
 'History': 1}

In [5]:
# Get all actors as an index for a dataframe
index = actor_genre_map.keys()

# Get the genre-counts for each actor in the index
rows = [actor_genre_map[k] for k in index]

# Create the data frame from these rows, with the actors as index
df = pd.DataFrame(rows, index=index)

# Fill NAs with zero, as NA means the actor has not starred in that genre
df = df.fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Apply k-Means with a Fixed K

In [6]:
from sklearn.cluster import KMeans

In [8]:
# Number of clusters for k-Means. k=8 is picked for illustrative purposes only;
# a real analysis would need a more principled approach to choosing k
# (e.g. an elbow plot or silhouette scores).
k = 8

In [9]:
# k-Means starts from randomly chosen centroids, so without a fixed seed the
# cluster numbering and sizes change on every run. Pin random_state for
# reproducible results, and set n_init explicitly because its default differs
# between scikit-learn versions.
cluster_model = KMeans(n_clusters=k, n_init=10, random_state=42)

In [10]:
# Fit k-Means on the actor x genre count matrix (one row per actor,
# one column per genre).
cluster_model.fit(df)

KMeans()

In [13]:
# Label every actor with the cluster the fitted model assigns to them, and
# keep those labels in a one-column frame keyed by actor ID.
cluster_labels = cluster_model.predict(df)
actor_cluster_df = pd.DataFrame({"cluster": cluster_labels}, index=df.index)

In [15]:
# Number of actors assigned to each cluster, largest cluster first.
actor_cluster_df["cluster"].value_counts()

0    29089
6     2728
4      935
7      305
2      280
3      135
1      131
5        6
Name: cluster, dtype: int64

In [19]:
# Print the size of every cluster together with a few example members, so the
# clusters can be eyeballed by actor name.
for cluster, actors in actor_cluster_df.groupby("cluster"):
    print("Cluster:", cluster, "Size:", actors.shape[0])

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the cluster size (k-Means can yield tiny clusters).
    n_examples = min(5, actors.shape[0])

    # A fixed seed keeps the printed examples stable across re-runs
    for a_id in actors.sample(n_examples, random_state=0).index:
        print("\t", a_id, actor_name_map[a_id])

Cluster: 0 Size: 29089
	 nm10743341 143 Anand
	 nm0573037 Julian McMahon
	 nm1944318 Nicole Holt
	 nm0384060 Bernard Hill
	 nm5607314 Gillian Broderick
Cluster: 1 Size: 131
	 nm0482320 Mohanlal
	 nm0000461 Michael Ironside
	 nm0000198 Gary Oldman
	 nm3606487 Samantha Ruth Prabhu
	 nm0124930 Gerard Butler
Cluster: 2 Size: 280
	 nm0000729 Casey Affleck
	 nm1256532 Jon Bernthal
	 nm1596350 Nawazuddin Siddiqui
	 nm2057859 Andrea Riseborough
	 nm0000630 Liev Schreiber
Cluster: 3 Size: 135
	 nm0792156 Dave Sheridan
	 nm0001062 Jeffrey Combs
	 nm0004760 Jennifer Blanc-Biehn
	 nm0068551 Tobin Bell
	 nm0186225 Barbara Crampton
Cluster: 4 Size: 935
	 nm0001718 Kyra Sedgwick
	 nm1494818 Lawrence Michael Levine
	 nm0005269 Corin Nemec
	 nm1559927 Booboo Stewart
	 nm0809793 Roger Guenveur Smith
Cluster: 5 Size: 6
	 nm0000115 Nicolas Cage
	 nm0001744 Tom Sizemore
	 nm0000616 Eric Roberts
	 nm0000514 Michael Madsen
	 nm0000246 Bruce Willis
Cluster: 6 Size: 2728
	 nm2460093 Jaime Zevallos
	 nm0000272 