Octavio Sanchez

In [1]:
import json

import pandas as pd
import numpy as np

In [2]:
cluster_df = pd.read_csv("movie_to_cluster_rating.csv")

cluster_df



Unnamed: 0,movie_id,cluster,rating,raters
0,tt0035423,8,6.4,85923
1,tt0088751,12,5.3,328
2,tt0096056,6,5.6,830
3,tt0113092,3,3.4,829
4,tt0116391,3,6.2,257
...,...,...,...,...
20615,tt9906278,10,0.0,0
20616,tt9906644,13,6.8,835
20617,tt9906844,10,0.0,0
20618,tt9907032,10,0.0,0


In [3]:
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        # Read the movie on this line and parse its json
        this_movie = json.loads(line)
                    
        # Add all actors to the id->name map
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name
            
        # For each actor, add this movie's genres to that actor's list
        for actor_id,actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            # Update the map
            actor_genre_map[actor_id] = this_actors_genres
            
        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })

In [4]:
# For each genre, count the number of movies
genre_counts = {}

# For each movie, get its genres and update the genre count
for movie_id in movie_actor_map.keys():
    for genre in movie_actor_map[movie_id]["genres"]:
        genre_counts[genre] = genre_counts.get(genre, 0) + 1
        
genre_prs = []
for genre,g_count in genre_counts.items():
    genre_prs.append((genre, g_count/len(movie_actor_map)))
    
genre_prs_df = pd.DataFrame(genre_prs, columns=["genre", "probability"])
genre_pr_map = {row["genre"]:row["probability"] for idx,row in genre_prs_df.iterrows()}

genre_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,genre,probability
5,Drama,0.49258
0,Comedy,0.291804
10,Thriller,0.19418
6,Action,0.181523
3,Horror,0.149224
8,Crime,0.134481
2,Romance,0.12226
7,Adventure,0.080844
9,Mystery,0.074442
4,Sci-Fi,0.051164


In [5]:
#calculate avg rating for each cluster
cluster_avgs = cluster_df.groupby('cluster')['rating'].mean()

cluster_avgs

cluster
0     4.552502
1     5.502326
2     4.896075
3     5.515968
4     5.425038
5     6.464643
6     5.440730
7     6.048661
8     5.481137
9     4.876823
10    5.596978
11    4.373125
12    4.088063
13    5.865103
14    6.035834
15    4.977822
Name: rating, dtype: float64

In [6]:
target_genre = "Sci-Fi"

per_cluster_prs = []
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_genre_count = sum([
        1 if target_genre in movie_actor_map[m]["genres"] else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability
    pr_genre_given_cluster = this_cluster_genre_count / group.shape[0]
    print("Pr[%s| Cluster %02d]:" % (target_genre, cluster_id), "\t", pr_genre_given_cluster)
    
    # Calculate joint probability
    joint_pr_genre_cluster = pr_genre_given_cluster * group.shape[0] / cluster_df.shape[0]
    print("Pr[%s, Cluster %02d]:" % (target_genre, cluster_id), "\t", joint_pr_genre_cluster)
    per_cluster_prs.append(joint_pr_genre_cluster)

Pr[Sci-Fi| Cluster 00]: 	 0.05908944139489829
Pr[Sci-Fi, Cluster 00]: 	 0.008874878758486906
Pr[Sci-Fi| Cluster 01]: 	 0.03125
Pr[Sci-Fi, Cluster 01]: 	 0.002085354025218235
Pr[Sci-Fi| Cluster 02]: 	 0.1430472388556221
Pr[Sci-Fi, Cluster 02]: 	 0.010426770126091174
Pr[Sci-Fi| Cluster 03]: 	 0.020967741935483872
Pr[Sci-Fi, Cluster 03]: 	 0.0012609117361784678
Pr[Sci-Fi| Cluster 04]: 	 0.08549618320610687
Pr[Sci-Fi, Cluster 04]: 	 0.0027158098933074684
Pr[Sci-Fi| Cluster 05]: 	 0.0
Pr[Sci-Fi, Cluster 05]: 	 0.0
Pr[Sci-Fi| Cluster 06]: 	 0.02045955303745672
Pr[Sci-Fi, Cluster 06]: 	 0.0031522793404461687
Pr[Sci-Fi| Cluster 07]: 	 0.0
Pr[Sci-Fi, Cluster 07]: 	 0.0
Pr[Sci-Fi| Cluster 08]: 	 0.011627906976744186
Pr[Sci-Fi, Cluster 08]: 	 0.0004364694471387003
Pr[Sci-Fi| Cluster 09]: 	 0.059895833333333336
Pr[Sci-Fi, Cluster 09]: 	 0.0011154219204655674
Pr[Sci-Fi| Cluster 10]: 	 0.001314060446780552
Pr[Sci-Fi, Cluster 10]: 	 4.8496605237633365e-05
Pr[Sci-Fi| Cluster 11]: 	 0.078125
Pr[Sci-Fi,

For this function I was findind the expected rating for a new movie based on genre. I find the probability of the genre in _pr_map dict and loop the cluster dataframe to calculate of said genre occuring in the cluster. I then multiply the probability of the genre and the rating of the movie and add it to the expected rating, to produce the expected rating for the movie. 

In [15]:

def predict_rating(genre, cluster_df, cluster_avgs, genre_pr):    
    genre_pr = genre_pr_map.get(genre, 0) 
    # expected value - sum of rating * probability for each cluster
    exp_rating = 0
    for cluster_id in cluster_df['cluster'].unique():
        pr_genre_given_cluster = genre_pr * cluster_df[cluster_df['cluster'] == cluster_id].shape[0] / cluster_df.shape[0]
        cluster_avg = cluster_avgs[cluster_id]
        #add
        exp_rating += cluster_avg * pr_genre_given_cluster
        
    return exp_rating

Prediction of Drama

In [16]:
genre = 'Drama'
exp_rating = predict_rating(genre, cluster_df, cluster_avgs, genre_pr_map)
print("Expected rating for a new %s movie: %.2f" % (genre, exp_rating))

Expected rating for a new Drama movie: 2.57


Prediction of Sci - Fi

In [17]:
genre = 'Sci-Fi'
exp_rating = predict_rating(genre, cluster_df, cluster_avgs, genre_pr_map)
print("Expected rating for a new %s movie: %.2f" % (genre, exp_rating))

Expected rating for a new Sci-Fi movie: 0.27


Prediction of Comedy

In [18]:
genre = 'Comedy'
exp_rating = predict_rating(genre, cluster_df, cluster_avgs, genre_pr_map)
print("Expected rating for a new %s movie: %.2f" % (genre, exp_rating))

Expected rating for a new Comedy movie: 1.52
