# Probabilities and k-Means Clustering

Using the IMDB data, construct a feature matrix, and apply `k-Means` to the data to extract clusters. 

We then inspect various aspects of probability associated with these clusterings.

In [2]:
import json

import pandas as pd
import numpy as np

In [3]:
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        # Read the movie on this line and parse its json
        this_movie = json.loads(line)
                    
        # Add all actors to the id->name map
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name
            
        # For each actor, add this movie's genres to that actor's list
        for actor_id,actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            # Update the map
            actor_genre_map[actor_id] = this_actors_genres
            
        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })

In [4]:
print("Known Actors:", len(actor_name_map))
print("Known Movies:", len(movie_actor_map))

Known Actors: 33609
Known Movies: 20620


## Read CSV of Movies to Cluster IDs

Using the provided movie-to-cluster mapping CSV file, we assess the distributions of movies per cluster and ask questions about genres and actors in each cluster.

In [5]:
cluster_df = pd.read_csv("movie_to_cluster.csv")

In [6]:
cluster_df["cluster"].value_counts()

cluster
6     3177
0     3097
15    1754
13    1705
2     1503
12    1466
1     1376
3     1240
14     893
8      774
10     761
4      655
11     640
7      635
5      560
9      384
Name: count, dtype: int64

In [7]:
cluster_pr_map = {cluster_id:cluster_pr for cluster_id,cluster_pr in (cluster_df["cluster"].value_counts() / cluster_df.shape[0]).items()}
cluster_pr_df = pd.DataFrame(cluster_df["cluster"].value_counts() / cluster_df.shape[0])

cluster_pr_df

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
6,0.154074
0,0.150194
15,0.085063
13,0.082687
2,0.07289
12,0.071096
1,0.066731
3,0.060136
14,0.043307
8,0.037536


## Assess Genre-Specific Cluster Probabilities

We want to determine, for a new movie with a known genre, to which cluster is it most likely to be assigned?

In [8]:
# For each genre, count the number of movies
genre_counts = {}

# For each movie, get its genres and update the genre count
for movie_id in movie_actor_map.keys():
    for genre in movie_actor_map[movie_id]["genres"]:
        genre_counts[genre] = genre_counts.get(genre, 0) + 1
        
genre_prs = []
for genre,g_count in genre_counts.items():
    genre_prs.append((genre, g_count/len(movie_actor_map)))
    
genre_prs_df = pd.DataFrame(genre_prs, columns=["genre", "probability"])
genre_pr_map = {row["genre"]:row["probability"] for idx,row in genre_prs_df.iterrows()}

genre_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,genre,probability
5,Drama,0.49258
0,Comedy,0.291804
10,Thriller,0.19418
6,Action,0.181523
3,Horror,0.149224
8,Crime,0.134481
2,Romance,0.12226
7,Adventure,0.080844
9,Mystery,0.074442
4,Sci-Fi,0.051164


In [9]:
#Setting the genre we will be investigating
# target_genre = "Sci-Fi"
# target_genre = "Western"
target_genre = "Sci-Fi"

### Calculate Conditional Probabilities of the given Genre for each cluster

We want to calculate the conditional probability of a movie being assigned to a cluster given its genre. This probability can be calculated as follows:

$$ Pr(Y=Cluster <ID> | X=Genre) = \frac{Pr(X=Genre | Y=Cluster <ID>) Pr(Y=Cluster <ID>)}{Pr(X=Genre)}  $$

We have $Pr(X=Genre)$ above via `genre_prs_df`, and we have $Pr(Y=Cluster <ID>)$ above via `cluster_pr_df`, so all we need to calculate $Pr(X=Genre | Y=Cluster <ID>)$.


In [20]:
per_cluster_prs = []
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_genre_count = sum([
        1 if target_genre in movie_actor_map[m]["genres"] else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability of the target genre given this cluster
    ## Pr(X=Genre | Y=Cluster <ID>)
    pr_genre_given_cluster = this_cluster_genre_count / len(group)
    
    # Then add this calculated probability to `per_cluster_prs`,
    ## so we can track this conditional probability
    per_cluster_prs.append(pr_genre_given_cluster)

In [22]:
# For each cluster ID, calculate the posterior probability given the target genre
for cluster_id,cluster_genre_pr in enumerate(per_cluster_prs):

    # combine this cluster_genre_pr, cluster_pr_df, and genre_prs_df
    ## to calculate Pr(Cluster|Genre)
    pr_cluster_given_genre = (cluster_genre_pr * cluster_pr_map[cluster_id]) / genre_pr_map["Sci-Fi"]

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_genre), "\t", pr_cluster_given_genre)
    

Pr[Cluster 00 | Adventure]: 	 0.4492890995260664
Pr[Cluster 01 | Adventure]: 	 0.01137440758293839
Pr[Cluster 02 | Adventure]: 	 0.6199052132701421
Pr[Cluster 03 | Adventure]: 	 0.1279620853080569
Pr[Cluster 04 | Adventure]: 	 0.01800947867298578
Pr[Cluster 05 | Adventure]: 	 0.01990521327014218
Pr[Cluster 06 | Adventure]: 	 0.11753554502369669
Pr[Cluster 07 | Adventure]: 	 0.0
Pr[Cluster 08 | Adventure]: 	 0.009478672985781991
Pr[Cluster 09 | Adventure]: 	 0.001895734597156398
Pr[Cluster 10 | Adventure]: 	 0.014218009478672987
Pr[Cluster 11 | Adventure]: 	 0.006635071090047393
Pr[Cluster 12 | Adventure]: 	 0.022748815165876776
Pr[Cluster 13 | Adventure]: 	 0.07488151658767772
Pr[Cluster 14 | Adventure]: 	 0.017061611374407582
Pr[Cluster 15 | Adventure]: 	 0.06919431279620854


In [23]:
target_genre = "Fantasy"
# For each cluster ID, calculate the posterior probability given the target genre
for cluster_id,cluster_genre_pr in enumerate(per_cluster_prs):

    # combine this cluster_genre_pr, cluster_pr_df, and genre_prs_df
    ## to calculate Pr(Cluster|Genre)
    pr_cluster_given_genre = (cluster_genre_pr * cluster_pr_map[cluster_id]) / genre_pr_map["Fantasy"]

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_genre), "\t", pr_cluster_given_genre)
    

Pr[Cluster 00 | Fantasy]: 	 0.540478905359179
Pr[Cluster 01 | Fantasy]: 	 0.013683010262257699
Pr[Cluster 02 | Fantasy]: 	 0.7457240592930444
Pr[Cluster 03 | Fantasy]: 	 0.15393386545039908
Pr[Cluster 04 | Fantasy]: 	 0.021664766248574684
Pr[Cluster 05 | Fantasy]: 	 0.02394526795895097
Pr[Cluster 06 | Fantasy]: 	 0.14139110604332952
Pr[Cluster 07 | Fantasy]: 	 0.0
Pr[Cluster 08 | Fantasy]: 	 0.011402508551881414
Pr[Cluster 09 | Fantasy]: 	 0.0022805017103762824
Pr[Cluster 10 | Fantasy]: 	 0.01710376282782212
Pr[Cluster 11 | Fantasy]: 	 0.00798175598631699
Pr[Cluster 12 | Fantasy]: 	 0.02736602052451539
Pr[Cluster 13 | Fantasy]: 	 0.09007981755986316
Pr[Cluster 14 | Fantasy]: 	 0.020524515393386546
Pr[Cluster 15 | Fantasy]: 	 0.08323831242873432


In [24]:
target_genre = "Adventure"
# For each cluster ID, calculate the posterior probability given the target genre
for cluster_id,cluster_genre_pr in enumerate(per_cluster_prs):

    # combine this cluster_genre_pr, cluster_pr_df, and genre_prs_df
    ## to calculate Pr(Cluster|Genre)
    pr_cluster_given_genre = (cluster_genre_pr * cluster_pr_map[cluster_id]) / genre_pr_map["Adventure"]

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_genre), "\t", pr_cluster_given_genre)

Pr[Cluster 00 | Adventure]: 	 0.28434313137372524
Pr[Cluster 01 | Adventure]: 	 0.007198560287942413
Pr[Cluster 02 | Adventure]: 	 0.3923215356928614
Pr[Cluster 03 | Adventure]: 	 0.08098380323935213
Pr[Cluster 04 | Adventure]: 	 0.011397720455908816
Pr[Cluster 05 | Adventure]: 	 0.01259748050389922
Pr[Cluster 06 | Adventure]: 	 0.07438512297540492
Pr[Cluster 07 | Adventure]: 	 0.0
Pr[Cluster 08 | Adventure]: 	 0.00599880023995201
Pr[Cluster 09 | Adventure]: 	 0.0011997600479904016
Pr[Cluster 10 | Adventure]: 	 0.008998200359928014
Pr[Cluster 11 | Adventure]: 	 0.004199160167966407
Pr[Cluster 12 | Adventure]: 	 0.014397120575884822
Pr[Cluster 13 | Adventure]: 	 0.047390521895620874
Pr[Cluster 14 | Adventure]: 	 0.010797840431913617
Pr[Cluster 15 | Adventure]: 	 0.04379124175164967


### Sample Titles in Each Cluster

We can use the above conditional probabilities to determine the most likely cluster given a movie genre.

Here, we sample movies in the most likely cluster to get a sense of what movies are in that cluster.

In [13]:
target_cluster = 6

In [14]:
for movie_id in cluster_df[cluster_df["cluster"] == target_cluster].sample(n=10, replace=False)["movie_id"]:
    this_movie = movie_actor_map[movie_id]
    print(movie_id, this_movie["movie"], this_movie["genres"])

tt2404818 Saattai ['Drama', 'Family']
tt1186633 Against the Grain ['Drama']
tt3089838 She ['Drama']
tt0837792 Dough Boys ['Drama', 'Family']
tt0969348 The Toe Tactic ['Animation', 'Drama']
tt3501590 Ithaca ['Drama', 'War']
tt4473806 40 Nights ['Drama']
tt11278608 The Walk ['Drama']
tt12412514 Tell ['Drama']
tt0390187 Lest We Forget ['Documentary', 'Drama']


## Assess Actor-Specific Cluster Probabilities

Above, we determine the most likely cluster given a movie genre. Here, we ask the same question for a given actor.

In [15]:
# For each actor, count the number of movies
actor_counts = {}

# For each movie, get its genres and update the genre count
for movie_id in movie_actor_map.keys():
    for actor in movie_actor_map[movie_id]["actors"]:
        actor_counts[actor] = actor_counts.get(actor, 0) + 1
        
actor_prs = []
for actor,a_count in actor_counts.items():
    actor_prs.append((actor, a_count/len(movie_actor_map)))
    
actor_prs_df = pd.DataFrame(actor_prs, columns=["actor", "probability"])
actor_pr_map = {row["actor"]:row["probability"] for idx,row in actor_prs_df.iterrows()}

actor_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,actor,probability
660,nm0000616,0.009408
261,nm0000514,0.004753
1033,nm0001744,0.004704
1204,nm0001803,0.004268
128,nm0222881,0.003637
...,...,...
15304,nm2877285,0.000048
15303,nm7580109,0.000048
15302,nm12015373,0.000048
15301,nm10843335,0.000048


In [16]:
#Setting the actor we will be comparing to
# target_actor_id = 'nm1165110' # Chris Hemsworth
# target_actor_id = 'nm0413168' # Hugh Jackman
# target_actor_id = 'nm0005351' # Ryan Reynolds
# target_actor_id = "nm0000206" # Keanu Reeves
target_actor_id = 'nm0000115' # Nic Cage

In [17]:
per_cluster_prs = []
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_actor_count = sum([
        1 if target_actor_id in movie_actor_map[m]["actors"] else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability of the target actor given this cluster
    ## Pr(X=Actor | Y=Cluster <ID>)
    ...
    
    # Then add this calculated probability to `per_cluster_prs`,
    ## so we can track this conditional probability
    per_cluster_prs.append(...)

In [18]:
# For each cluster ID, calculate the posterior probability given the target actor
for cluster_id,cluster_actor_pr in enumerate(per_cluster_prs):

    # combine this cluster_actor_pr, cluster_pr_df, and actor_prs_df
    ## to calculate Pr(Cluster|Actor)
    pr_cluster_given_actor = ...

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_actor), "\t", pr_cluster_given_actor)
    

NameError: name 'target_actor' is not defined

In [None]:
target_cluster = 12

In [None]:
for movie_id in cluster_df[cluster_df["cluster"] == target_cluster].sample(n=10, replace=False)["movie_id"]:
    this_movie = movie_actor_map[movie_id]
    print(movie_id, this_movie["movie"], this_movie["genres"])