# Cluster Actors using a Matrix of Actor x Genre

Create a matrix of actors and the genres in which they've starred. Then, we'll use k-Means to extract clusters from the data in an unsupervised fashion.

In [1]:
%matplotlib inline


In [2]:
import pandas as pd

import json

In [4]:
movie_data_map = {}

with open("../data/imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        # Read the movie on this line and parse its json
        this_movie = json.loads(line)
            
        # Finished with this film
        movie_data_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })

In [5]:
# Check the output for movie, Kate & Leopold, IMDB ID = tt0035423
movie_data_map['tt0035423']

{'movie': 'Kate & Leopold',
 'actors': {'nm0000212', 'nm0000630', 'nm0005227', 'nm0413168'},
 'genres': ['Comedy', 'Fantasy', 'Romance']}

In [11]:
# Get all actors as an index for a dataframe
index = movie_data_map.keys()

# Get the genre-counts for each actor in the index
rows = [movie_data_map[k]["genres"] for k in index]

# Unique genres
all_genres = {g for r in rows for g in r}

# Convert rows to binary vectors
def generate_genre_vector(row):
    this_genre_map = {g:0 for g in all_genres}
    this_genre_map.update({g:1 for g in row})
    
    return this_genre_map
genre_vectors = [generate_genre_vector(r) for r in rows]

# Create the data frame from these rows, with the actors as index
df = pd.DataFrame(genre_vectors, index=index)

# Fill NAs with zero, as NA means the actor has not starred in that genre
df = df.fillna(0)

df

Unnamed: 0,Unnamed: 1,Drama,Adventure,Fantasy,Reality-TV,History,Horror,Musical,Mystery,Animation,...,Crime,Comedy,Short,Documentary,Sci-Fi,Biography,War,Sport,Music,Western
tt0035423,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
tt0088751,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
tt0096056,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0113092,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0116391,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9906278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
tt9906644,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
tt9906844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
tt9907032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Apply k-Means with a Fixed K

In [12]:
from sklearn.cluster import KMeans

In [41]:
k = 16 # We pick k for illustrative purposes only. You would need a more principled approach here.

In [42]:
cluster_model = KMeans(n_clusters=k)

In [43]:
cluster_model.fit(df)

KMeans(n_clusters=16)

In [44]:
cluster_labels = cluster_model.predict(df)
movie_cluster_df = pd.DataFrame(cluster_labels, index=df.index, columns=["cluster"])

In [45]:
movie_cluster_df["cluster"].value_counts()

5     3626
1     2509
11    1927
0     1705
2     1376
10    1333
6     1123
13     947
7      906
8      847
4      846
12     841
9      771
14     709
3      635
15     519
Name: cluster, dtype: int64

In [47]:
for cluster,movies in movie_cluster_df.groupby("cluster"):
    print("Cluster:", cluster, "Size:", movies.shape[0])
    
    top_genres = df.loc[movies.index].sum().sort_values(ascending=False).head(5) / movies.shape[0]
    print("\t", "Top Genres:")
    for this_g,rate in top_genres.items():
        print("\t\t", this_g, "[%0.4f]" % rate)
    
    print("\t", "Movie Sample:")
    for m_id in movies.sample(10).index:
        print("\t\t", m_id, movie_data_map[m_id]["movie"])

Cluster: 0 Size: 1705
	 Top Genres:
		 Drama [1.0000]
		 Comedy [1.0000]
		 Crime [0.0868]
		 Family [0.0716]
		 Music [0.0499]
	 Movie Sample:
		 tt1881109 Zeroville
		 tt0388181 The King of Bollywood
		 tt6460276 My Daddy's in Heaven
		 tt10198952 The Come Up
		 tt0273453 Bark!
		 tt2055765 The English Teacher
		 tt4882548 Burn Your Maps
		 tt6865630 Ana
		 tt0462392 LOL
		 tt3485938 The Two Dogs
Cluster: 1 Size: 2509
	 Top Genres:
		 Horror [0.7210]
		 Mystery [0.1487]
		 Comedy [0.1263]
		 Drama [0.1140]
		 Fantasy [0.0873]
	 Movie Sample:
		 tt18548186 Pig Killer
		 tt1127180 Drag Me to Hell
		 tt2317484 The Penny Dreadful Picture Show
		 tt4460572 Tenement
		 tt5815078 Wunderland
		 tt3377240 Texas Rein
		 tt3463106 The Cured
		 tt1773768 The Strawberry Shortcake Movie: Sky's the Limit
		 tt0134084 Scream 3
		 tt14392056 A Little More Flesh II
Cluster: 2 Size: 1376
	 Top Genres:
		 Thriller [1.0000]
		 Drama [1.0000]
		 Crime [0.2427]
		 Mystery [0.1199]
		 Action [0.1090]
	 Movi