# Probabilities and k-Means Clustering

Using the IMDB data, construct a feature matrix, and apply `k-Means` to the data to extract clusters. 

We then inspect various aspects of probability associated with these clusterings.

In [1]:
%matplotlib inline

In [47]:
import json

import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD

from scipy.sparse import lil_matrix

import matplotlib.pyplot as plt

In [3]:
# Lookup tables populated while streaming the movie file:
#   actor_name_map:  actor_id -> actor name
#   movie_actor_map: imdb_id  -> {"movie": title, "actors": set of actor ids, "genres": genres}
#   actor_genre_map: actor_id -> {genre: number of that actor's movies carrying the genre}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}

# The file holds one JSON-encoded movie per line. Decode it explicitly as UTF-8
# so names with non-ASCII characters are read identically on every platform
# instead of depending on the locale's default encoding.
with open("../data/imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Tolerate blank lines rather than failing inside json.loads
        if not line.strip():
            continue

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # Single pass over the cast; each entry unpacks to (actor_id, actor_name)
        for actor_id, actor_name in this_movie["actors"]:

            # Add the actor to the id->name map
            actor_name_map[actor_id] = actor_name

            # Increment this actor's count for each of the movie's genres
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie["actors"]},
            "genres": this_movie["genres"],
        }

In [4]:
# Report how many distinct actors and movies were loaded
print(f"Known Actors: {len(actor_name_map)}")
print(f"Known Movies: {len(movie_actor_map)}")

Known Actors: 33609
Known Movies: 20620


## Generate DataFrame Using Sparse Matrices

Convert our movie-actor data into a DataFrame (one row per movie, one column per actor) that we can use for analysis.

In [10]:
# Assign every actor a column position, following the iteration order of actor_name_map
actor_id_to_index = {actor_id: position for position, actor_id in enumerate(actor_name_map)}

In [15]:
# Allocate an all-zero sparse matrix with one row per movie and one column per actor
n_movies, n_actors = len(movie_actor_map), len(actor_name_map)
matrix_sparse = lil_matrix((n_movies, n_actors), dtype=float)

# Row labels for the DataFrame: movie IDs in the same order as the matrix rows
movie_matrix_index = list(movie_actor_map)

# Fill the matrix movie by movie, flagging each actor that appears in the movie
for row, this_movie in enumerate(movie_actor_map.values()):
    for actor_id in this_movie["actors"]:
        matrix_sparse[row, actor_id_to_index[actor_id]] = 1.0

In [17]:
# Wrap the sparse matrix in a DataFrame: rows labelled by movie ID, columns by actor ID
df = pd.DataFrame.sparse.from_spmatrix(
    matrix_sparse,
    index=movie_matrix_index,
    columns=list(actor_name_map),
)
df

Unnamed: 0,nm0000212,nm0413168,nm0000630,nm0005227,nm0864851,nm0828288,nm0933983,nm0329491,nm0000417,nm0000603,...,nm3768164,nm6522322,nm9169920,nm1644256,nm10067359,nm9504284,nm10592896,nm7216750,nm0936300,nm10375007
tt0035423,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0088751,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0096056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0113092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0116391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9906278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt9906644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
tt9906844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt9907032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Sanity check: the frame should have one row per movie and one column per actor
print(f"Movies x Actors dimensionality: {df.shape}")

Movies x Actors dimensionality: (20620, 33609)


In [110]:
# Column sums give the number of movies per actor; show the ten largest (ascending)
movies_per_actor = df.sum(axis=0)
movies_per_actor.sort_values().tail(10)

nm0000115     61.0
nm0000246     61.0
nm0290556     63.0
nm0001002     63.0
nm2278431     71.0
nm0222881     75.0
nm0001803     88.0
nm0001744     97.0
nm0000514     98.0
nm0000616    194.0
dtype: float64

## Apply Dimensionality Reduction to this Sparse Matrix

The movie x actor matrix is sparse (the vast majority of cells are zero), so we want to project this matrix into a lower-dimensional space. We do this with truncated SVD (`TruncatedSVD`), which operates directly on sparse input.

In [75]:
# Project each movie's actor vector down to 32 components.
# TruncatedSVD's default solver is randomized, so pin the seed: without it the
# reduced matrix (and every cluster derived from it) changes from run to run.
reducer = TruncatedSVD(n_components=32, random_state=31337)
matrix_reduced = reducer.fit_transform(matrix_sparse)

## Apply k-Means Clustering to the Movies Data

Use k-Means to cluster movies based on their casts (the reduced movie x actor features), so we can extract probabilities around genre and related information.

In [76]:
from sklearn.cluster import MiniBatchKMeans

In [77]:
# Cluster the reduced movie vectors into 16 groups; random_state fixes the k-means seed
model = MiniBatchKMeans(
    n_clusters=16,
    n_init=128,
    max_iter=2048,
    tol=0.5,
    reassignment_ratio=0.5,
    random_state=31337,
)
model.fit(matrix_reduced)

MiniBatchKMeans(max_iter=2048, n_clusters=16, n_init=128, random_state=31337,
                reassignment_ratio=0.5, tol=0.5)

In [78]:
# Pair every movie ID with the cluster label k-means assigned to its row
movie_cluster_pairs = zip(df.index, model.labels_)
cluster_df = pd.DataFrame(movie_cluster_pairs, columns=["movie_id", "cluster"])

In [79]:
# Display the movie -> cluster assignments
cluster_df

Unnamed: 0,movie_id,cluster
0,tt0035423,1
1,tt0088751,4
2,tt0096056,14
3,tt0113092,12
4,tt0116391,8
...,...,...
20615,tt9906278,6
20616,tt9906644,2
20617,tt9906844,6
20618,tt9907032,6


In [119]:
# Persist the assignments to CSV; index=False leaves the row index out of the file
cluster_df.to_csv("movie_to_cluster.csv", index=False)

In [121]:
# Reload the assignments from movie_to_cluster.csv, replacing the in-memory cluster_df
cluster_df = pd.read_csv("movie_to_cluster.csv")

In [122]:
# Number of movies assigned to each cluster
cluster_df["cluster"].value_counts()

6     4119
2     3616
0     3507
13    1551
1     1380
3     1252
8     1072
4      845
14     750
7      638
10     553
9      483
5      318
12     295
11     194
15      47
Name: cluster, dtype: int64

In [81]:
# Empirical probability of each cluster: its movie count divided by the total number of movies
cluster_pr = cluster_df["cluster"].value_counts() / cluster_df.shape[0]

# Keep a plain dict (cluster id -> probability) for lookups, and display the Series
cluster_pr_map = dict(cluster_pr.items())
cluster_pr

6     0.199758
2     0.175364
0     0.170078
13    0.075218
1     0.066925
3     0.060718
8     0.051988
4     0.040980
14    0.036372
7     0.030941
10    0.026819
9     0.023424
5     0.015422
12    0.014306
11    0.009408
15    0.002279
Name: cluster, dtype: float64