# Probabilities and k-Means Clustering

Using the IMDB data, construct a feature matrix, and apply `k-Means` to the data to extract clusters. 

We then inspect various aspects of probability associated with these clusterings.

In [1]:
%matplotlib inline

In [2]:
import json

import pandas as pd
import numpy as np

from scipy.sparse import lil_matrix

import matplotlib.pyplot as plt

In [3]:
known_movies = set()  # Unique movie IDs seen in the ratings file (fast membership checks)

user_ratings = {} # Maps each user ID to a list of (movie ID, rating) tuples
movie_ids = []    # Unique movie IDs in first-seen order

with open("../data/user_ratings.json", "r") as in_file:
    for line in in_file:

        # Each line is parsed as one JSON rating record
        this_rating = json.loads(line)
        title_id = this_rating["title_id"]

        # known_movies and movie_ids always hold the same IDs, so test membership
        # against the set (constant time) rather than scanning the list per rating
        if title_id not in known_movies:
            known_movies.add(title_id)
            movie_ids.append(title_id)

        # Create this user's rating list on first sight, then append to it
        user_ratings.setdefault(this_rating["userId"], []).append(
            (title_id, this_rating["rating"])
        )
        
        

In [4]:
# Invert movie_ids: movie ID -> its position in the list
movie_id_to_index = dict(zip(movie_ids, range(len(movie_ids))))

In [5]:
# Report how many distinct users and movies the ratings file contained
print(f"Known Users: {len(user_ratings)}")
print(f"Known Movies: {len(known_movies)}")


Known Users: 2244
Known Movies: 4465


In [6]:
actor_id_to_name_map = {}     # Actor ID -> actor name
actor_id_to_index_map = {}    # Actor ID -> unique index among known actors
index_to_actor_ids = []       # Unique index -> actor ID (inverse of actor_id_to_index_map)

index_counter = 0    # Next unused actor index
known_actors = set()

movie_actor_map = {} # Movie ID -> {"movie": title, "actors": set of actor IDs, "genres": genres}

test_count = 0
with open("../data/imdb_recent_movies.json", "r") as in_file:
    for raw_line in in_file:

        this_movie = json.loads(raw_line)

        # Only movies that appeared in the ratings data are processed
        if this_movie["title_id"] in known_movies:

            cast = zip(this_movie['actor_ids'], this_movie['actor_names'])
            for actor_id, actor_name in cast:

                # Always record (or refresh) the name for this ID
                actor_id_to_name_map[actor_id] = actor_name

                # First time we meet this actor: register them under a fresh index
                if actor_id not in known_actors:
                    known_actors.add(actor_id)
                    actor_id_to_index_map[actor_id] = index_counter
                    index_to_actor_ids.append(actor_id)
                    index_counter += 1

            # Store the per-movie summary used by the later genre analysis
            movie_actor_map[this_movie["title_id"]] = {
                "movie": this_movie["title_name"],
                "actors": set(this_movie['actor_ids']),
                "genres": this_movie["title_genre"],
            }

In [7]:
# Report how many distinct actors and movies were loaded from the IMDB file
print(f"Known Actors: {len(known_actors)}")
print(f"Known Movies: {len(movie_actor_map)}")

Known Actors: 5224
Known Movies: 4465


## Generate DataFrame using Sparse Matrices

Convert our Movie Ratings data into a DataFrame that we can use for analysis.

In [8]:
# With sparse matrix, initialize to size of Users x Movies of 0s.
# The column count comes from movie_id_to_index, the same map that places each
# rating below, so the matrix width and the column indices cannot disagree.
matrix_sparse = lil_matrix((len(user_ratings), len(movie_id_to_index)), dtype=float)

# Update the matrix, user by user, setting non-zero values for the movies that user rated.
# Unrated cells stay 0; if a user has several ratings for one movie, the last one wins.
for row, (this_user, this_user_ratings) in enumerate(user_ratings.items()):
    for movie_id, rating in this_user_ratings:
        matrix_sparse[row, movie_id_to_index[movie_id]] = rating

In [9]:
# Label rows with user IDs (dict iteration order matches the matrix row order)
# and columns with movie IDs, then transpose so each row is a movie.
user_ids = list(user_ratings)
users_by_movies = pd.DataFrame.sparse.from_spmatrix(
    matrix_sparse,
    index=user_ids,
    columns=movie_ids,
)
df = users_by_movies.T
df

Unnamed: 0,10,37,51,126,152,263,284,448,626,706,...,162002,162073,162207,162257,162363,162420,162434,162464,162499,162537
tt0274309,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0298203,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0315733,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0337563,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0463854,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt4241904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt1666800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt6806448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0844671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [10]:
# Sanity check: rows are movies, columns are users
print(f"Movies x Users dimensionality: {df.shape}")

Movies x Users dimensionality: (4465, 2244)


## Apply k-Means Clustering to the Movies Data

Use k-Means to cluster movies based on their user ratings, so we can extract probabilities around genre and related information.

In [11]:
from sklearn.cluster import MiniBatchKMeans

In [12]:
# Cluster the movie rows of df into 16 groups; random_state is fixed so reruns
# give the same assignments.
model = MiniBatchKMeans(
    n_clusters=16,
    n_init=128,
    max_iter=2048,
    tol=0.5,
    reassignment_ratio=0.5,
    random_state=31337,
)
model.fit(df)

MiniBatchKMeans(max_iter=2048, n_clusters=16, n_init=128, random_state=31337,
                reassignment_ratio=0.5, tol=0.5)

In [13]:
# Pair every movie ID (row label of df) with the cluster label the model gave that row
movie_cluster_pairs = zip(df.index, model.labels_)
cluster_df = pd.DataFrame(movie_cluster_pairs, columns=["movie_id", "cluster"])

In [14]:
# Preview the movie-to-cluster assignments
cluster_df

Unnamed: 0,movie_id,cluster
0,tt0274309,10
1,tt0298203,10
2,tt0315733,10
3,tt0337563,10
4,tt0463854,10
...,...,...
4460,tt4241904,0
4461,tt1666800,6
4462,tt6806448,6
4463,tt0844671,0


In [15]:
# Number of movies assigned to each cluster
cluster_df["cluster"].value_counts()

0     2821
10    1269
5       80
14      72
6       70
8       49
9       35
7       18
11      18
4       16
1        6
2        5
13       3
12       1
3        1
15       1
Name: cluster, dtype: int64

In [16]:
# Pr[cluster]: share of all movies that landed in each cluster.
# Compute the proportions once, keep a dict for lookups, and display the Series.
cluster_prs = cluster_df["cluster"].value_counts() / cluster_df.shape[0]
cluster_pr_map = dict(cluster_prs.items())
cluster_prs

0     0.631803
10    0.284211
5     0.017917
14    0.016125
6     0.015677
8     0.010974
9     0.007839
7     0.004031
11    0.004031
4     0.003583
1     0.001344
2     0.001120
13    0.000672
12    0.000224
3     0.000224
15    0.000224
Name: cluster, dtype: float64

In [17]:
# For each genre, count the number of movies
genre_counts = {}

# For each movie, get its genres and update the genre count
for movie_id in df.index:
    for genre in movie_actor_map[movie_id]["genres"]:
        genre_counts[genre] = genre_counts.get(genre, 0) + 1

# Pr[genre] = movies tagged with the genre / all movies.
# A movie contributes to every genre it lists, so these values need not sum to 1.
n_movies = df.shape[0]
genre_prs = [(genre, g_count / n_movies) for genre, g_count in genre_counts.items()]

genre_prs_df = pd.DataFrame(genre_prs, columns=["genre", "probability"])

# Build the lookup straight from the (genre, probability) pairs instead of
# iterating the DataFrame row by row to recover the same values
genre_pr_map = dict(genre_prs)

genre_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,genre,probability
2,Drama,0.582755
1,Comedy,0.382307
11,Action,0.221725
7,Romance,0.190817
4,Crime,0.178275
12,Adventure,0.155655
5,Thriller,0.151848
10,Mystery,0.100784
8,Horror,0.100112
0,Biography,0.07346


In [18]:
target_genre = "Sci-Fi"

# Joint probabilities Pr[genre, cluster], appended in the order groupby yields the clusters
per_cluster_prs = []
n_clustered_movies = cluster_df.shape[0]

for cluster_id, group in cluster_df.groupby("cluster"):
    cluster_size = group.shape[0]

    # How many movies in this cluster list the target genre
    this_cluster_genre_count = sum(
        target_genre in movie_actor_map[m]["genres"] for m in group["movie_id"]
    )

    # Conditional: share of this cluster's movies that have the genre
    pr_genre_given_cluster = this_cluster_genre_count / cluster_size
    print("Pr[%s| Cluster %02d]:" % (target_genre, cluster_id), "\t", pr_genre_given_cluster)

    # Joint: Pr[genre | cluster] * Pr[cluster], with Pr[cluster] = cluster size / all movies
    joint_pr_genre_cluster = pr_genre_given_cluster * cluster_size / n_clustered_movies
    print("Pr[%s, Cluster %02d]:" % (target_genre, cluster_id), "\t", joint_pr_genre_cluster)
    per_cluster_prs.append(joint_pr_genre_cluster)

Pr[Sci-Fi| Cluster 00]: 	 0.048209854661467565
Pr[Sci-Fi, Cluster 00]: 	 0.03045912653975364
Pr[Sci-Fi| Cluster 01]: 	 0.16666666666666666
Pr[Sci-Fi, Cluster 01]: 	 0.00022396416573348266
Pr[Sci-Fi| Cluster 02]: 	 0.0
Pr[Sci-Fi, Cluster 02]: 	 0.0
Pr[Sci-Fi| Cluster 03]: 	 0.0
Pr[Sci-Fi, Cluster 03]: 	 0.0
Pr[Sci-Fi| Cluster 04]: 	 0.0
Pr[Sci-Fi, Cluster 04]: 	 0.0
Pr[Sci-Fi| Cluster 05]: 	 0.05
Pr[Sci-Fi, Cluster 05]: 	 0.0008958566629339306
Pr[Sci-Fi| Cluster 06]: 	 0.12857142857142856
Pr[Sci-Fi, Cluster 06]: 	 0.002015677491601344
Pr[Sci-Fi| Cluster 07]: 	 0.2222222222222222
Pr[Sci-Fi, Cluster 07]: 	 0.0008958566629339306
Pr[Sci-Fi| Cluster 08]: 	 0.10204081632653061
Pr[Sci-Fi, Cluster 08]: 	 0.0011198208286674132
Pr[Sci-Fi| Cluster 09]: 	 0.05714285714285714
Pr[Sci-Fi, Cluster 09]: 	 0.0004479283314669653
Pr[Sci-Fi| Cluster 10]: 	 0.11662726556343578
Pr[Sci-Fi, Cluster 10]: 	 0.03314669652855543
Pr[Sci-Fi| Cluster 11]: 	 0.2222222222222222
Pr[Sci-Fi, Cluster 11]: 	 0.00089585666293

In [19]:
# Marginalize over clusters: Pr[genre] = sum over clusters of Pr[genre, cluster]
pr_target_genre = sum(per_cluster_prs)
print(f"Probability of Target Genre: {pr_target_genre}")

Probability of Target Genre: 0.07010078387458007


In [20]:
# Bayes' rule: Pr[cluster | genre] = Pr[genre, cluster] / Pr[genre].
#
# per_cluster_prs holds one joint probability per cluster that has at least one
# movie, in ascending cluster-ID order (the order groupby("cluster") yields them).
# Pair each value with its real cluster ID rather than its list position: the two
# only coincide when every ID from 0 upward has members, and an empty cluster
# would otherwise shift every later label.
cluster_ids = sorted(cluster_df["cluster"].unique())

for cluster_id, cluster_genre_pr in zip(cluster_ids, per_cluster_prs):

    pr_cluster_given_genre = cluster_genre_pr / genre_pr_map[target_genre]

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_genre), "\t", pr_cluster_given_genre)
    

Pr[Cluster 00 | Sci-Fi]: 	 0.4345047923322684
Pr[Cluster 01 | Sci-Fi]: 	 0.0031948881789137383
Pr[Cluster 02 | Sci-Fi]: 	 0.0
Pr[Cluster 03 | Sci-Fi]: 	 0.0
Pr[Cluster 04 | Sci-Fi]: 	 0.0
Pr[Cluster 05 | Sci-Fi]: 	 0.012779552715654953
Pr[Cluster 06 | Sci-Fi]: 	 0.028753993610223644
Pr[Cluster 07 | Sci-Fi]: 	 0.012779552715654953
Pr[Cluster 08 | Sci-Fi]: 	 0.015974440894568693
Pr[Cluster 09 | Sci-Fi]: 	 0.006389776357827477
Pr[Cluster 10 | Sci-Fi]: 	 0.47284345047923326
Pr[Cluster 11 | Sci-Fi]: 	 0.012779552715654953
Pr[Cluster 12 | Sci-Fi]: 	 0.0
Pr[Cluster 13 | Sci-Fi]: 	 0.0
Pr[Cluster 14 | Sci-Fi]: 	 0.0
Pr[Cluster 15 | Sci-Fi]: 	 0.0
