In [1]:
%matplotlib inline
import pandas as pd

import json

import matplotlib.pyplot as plt
from scipy.sparse import lil_matrix
import scipy.spatial.distance

# Exercise 1

In [2]:
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}
actor_rating_map = {}


with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
      
        this_movie = json.loads(line)
                    
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name
            
        for actor_id,actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            actor_genre_map[actor_id] = this_actors_genres
            
            if len(this_movie["rating"]) > 0:
                actor_rating_map[actor_id] = actor_rating_map.get(actor_id, 0)\
                    + this_movie["rating"]["votes"]
                       
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })

In [3]:
actor_rating_map

{'nm0000212': 269915,
 'nm0413168': 9278649,
 'nm0000630': 2356649,
 'nm0005227': 535171,
 'nm0864851': 328,
 'nm0828288': 5375,
 'nm0933983': 328,
 'nm0329491': 328,
 'nm0000417': 400455,
 'nm0000603': 356515,
 'nm0000457': 886285,
 'nm0452288': 3247,
 'nm0001002': 151401,
 'nm0001299': 829,
 'nm0923529': 35979,
 'nm0936365': 829,
 'nm0006763': 318674,
 'nm0007113': 69011,
 'nm0310173': 257,
 'nm0412917': 119998,
 'nm1142237': 24,
 'nm0321320': 24,
 'nm0001123': 47,
 'nm0898634': 3041,
 'nm0269451': 24,
 'nm0803374': 24,
 'nm1315804': 24,
 'nm0803138': 24,
 'nm0001324': 84366,
 'nm0920148': 62,
 'nm0115304': 62,
 'nm0203349': 62,
 'nm0427470': 1851,
 'nm0001293': 387085,
 'nm0001062': 67193,
 'nm0732133': 1948,
 'nm0000199': 1551049,
 'nm0001583': 4405,
 'nm0283299': 4405,
 'nm0568375': 4405,
 'nm0000849': 2993821,
 'nm0603090': 19327,
 'nm0098709': 6789,
 'nm0617799': 6701,
 'nm0089231': 4535,
 'nm0175305': 89735,
 'nm0001497': 3050291,
 'nm0000138': 13861068,
 'nm0000639': 14807,
 '

In [4]:
index = actor_genre_map.keys()
rows = [actor_genre_map[k] for k in index]
df = pd.DataFrame(rows, index=index)

# Fill NAs with zero, as NA means the actor has not starred in that genre
df = df.fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
norm_df = df.divide(df.sum(axis=1),axis=0)
norm_df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,0.250000,0.035714,0.214286,0.214286,0.035714,0.071429,0.035714,0.035714,0.071429,0.035714,...,0.000000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
nm0413168,0.081395,0.034884,0.058140,0.139535,0.058140,0.023256,0.162791,0.046512,0.069767,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
nm0000630,0.112676,0.028169,0.084507,0.197183,0.028169,0.042254,0.056338,0.070423,0.014085,0.014085,...,0.042254,0.098592,0.042254,0.014085,0.00,0.0,0.0,0.0,0.0,0.0
nm0005227,0.400000,0.040000,0.080000,0.080000,0.000000,0.040000,0.040000,0.000000,0.000000,0.000000,...,0.040000,0.000000,0.040000,0.000000,0.08,0.0,0.0,0.0,0.0,0.0
nm0864851,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.333333,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
nm10592896,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
nm7216750,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0
nm0936300,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0


In [6]:
norm_df.sum(axis=1)

nm0000212     1.0
nm0413168     1.0
nm0000630     1.0
nm0005227     1.0
nm0864851     1.0
             ... 
nm9504284     1.0
nm10592896    1.0
nm7216750     1.0
nm0936300     1.0
nm10375007    1.0
Length: 33609, dtype: float64

In [7]:
query = 'nm0413168'

In [12]:
target_actor_ratings = df.loc[query]
distances = scipy.spatial.distance.cdist(df, [target_actor_ratings], metric="euclidean")[:,0]
distances

array([22.22611077,  0.        , 18.46618531, ..., 27.27636339,
       27.33130074, 26.64582519])

In [47]:
target_actor_ratings = norm_df.loc[query]
distances = scipy.spatial.distance.cdist(norm_df, [target_actor_ratings], metric="euclidean")[:,0]
query_distances = list(zip(norm_df.index, distances))

for similar_actor_id, similar_genre_score in sorted(query_distances, key=lambda x: x[1], reverse=False)[:10]:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_genre_score, df.loc[similar_actor_id].sum())

nm0413168 Hugh Jackman 0.0 86.0
nm0089217 Orlando Bloom 18114.000001342312 49.0
nm0749263 Mark Ruffalo 194590.000000098 90.0
nm0000168 Samuel L. Jackson 278285.0000000308 155.0
nm0000354 Matt Damon 507210.00000003906 112.0
nm0005212 Ian McKellen 516593.0000000171 51.0
nm0000136 Johnny Depp 678847.0000000201 95.0
nm0424060 Scarlett Johansson 714135.0000000249 94.0
nm0262635 Chris Evans 738323.0000000081 74.0
nm0000323 Michael Caine 1426902.0000000203 70.0


# Exercise 2

In [59]:
df["Votes"] = [actor_rating_map.get(r, 0) for r in norm_df.index]
df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Documentary,Sport,News,Family,Music,Unnamed: 17,Western,Short,Reality-TV,Votes
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,269915
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9278649
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2356649
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,535171
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1756


In [50]:
df_without_votes = df.drop(columns=["Votes"])
target_actor_ratings = df_without_votes.loc[query]
distances = scipy.spatial.distance.cdist(df_without_votes, [target_actor_ratings], metric="euclidean")[:,0]
query_distances = list(zip(df.index, distances))

for similar_actor_id, similar_genre_score in sorted(query_distances, key=lambda x: x[1], reverse=False)[:10]:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_genre_score, df.loc[similar_actor_id].sum())

nm0413168 Hugh Jackman 0.0 9278735.0
nm0000375 Robert Downey Jr. 7.937253933193772 11448132.0
nm0262635 Chris Evans 8.366600265340756 10017046.0
nm1517976 Chris Pine 10.770329614269007 3857317.0
nm1475594 Channing Tatum 10.954451150103322 3838612.0
nm0185819 Daniel Craig 11.445523142259598 5844258.0
nm0000226 Will Smith 11.916375287812984 6516606.0
nm0757855 Zoe Saldana 12.328828005937952 4877179.0
nm0004937 Jamie Foxx 12.529964086141668 5086800.0
nm0000234 Charlize Theron 12.922847983320086 5191872.0


In [51]:
target_actor_ratings = df.loc[query]
distances = scipy.spatial.distance.cdist(df, [target_actor_ratings], metric="euclidean")[:,0]
query_distances = list(zip(df.index, distances))

for similar_actor_id, similar_genre_score in sorted(query_distances, key=lambda x: x[1], reverse=False)[:10]:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_genre_score, df.loc[similar_actor_id].sum())

nm0413168 Hugh Jackman 0.0 9278735.0
nm0089217 Orlando Bloom 18114.00582422342 9296812.0
nm0749263 Mark Ruffalo 194590.00078112955 9084149.0
nm0000168 Samuel L. Jackson 278285.0013098083 9557089.0
nm0000354 Matt Damon 507210.0004298022 8771551.0
nm0005212 Ian McKellen 516593.0001848651 9795293.0
nm0000136 Johnny Depp 678847.0001642491 8599897.0
nm0424060 Scarlett Johansson 714135.0002198464 9992878.0
nm0262635 Chris Evans 738323.0000474048 10017046.0
nm0000323 Michael Caine 1426902.0001100285 7851817.0


In [71]:
#Normalize by min/max
ndf = df.subtract(df.min()).divide(df.max()-df.min())
ndf

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Documentary,Sport,News,Family,Music,Unnamed: 17,Western,Short,Reality-TV,Votes
nm0000212,0.212121,0.1,0.428571,0.086957,0.05,0.031746,0.017857,0.090909,0.068966,0.076923,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.019473
nm0413168,0.212121,0.3,0.357143,0.173913,0.25,0.031746,0.250000,0.363636,0.206897,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.669404
nm0000630,0.242424,0.2,0.428571,0.202899,0.10,0.047619,0.071429,0.454545,0.034483,0.076923,...,0.21875,0.075,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.170019
nm0005227,0.303030,0.1,0.142857,0.028986,0.00,0.015873,0.017857,0.000000,0.000000,0.000000,...,0.00000,0.025,0.000000,0.076923,0.0,0.0,0.0,0.0,0.0,0.038610
nm0864851,0.030303,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.000000,0.0,0.000000,0.000000,0.00,0.015873,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000033
nm10592896,0.000000,0.0,0.000000,0.000000,0.00,0.015873,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000033
nm7216750,0.000000,0.0,0.000000,0.000000,0.00,0.015873,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000033
nm0936300,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.076923,...,0.00000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000127


In [82]:
target_actor_ratings = ndf.loc[query]
distances = scipy.spatial.distance.cdist(ndf, [target_actor_ratings], metric="euclidean")[:,0]
query_distances = list(zip(ndf.index, distances))


for similar_actor_id, similar_genre_score in sorted(query_distances, key=lambda x: x[1], reverse=False)[:10]:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_genre_score, df.loc[similar_actor_id].sum())

nm0413168 Hugh Jackman 0.0 86.66940361305493
nm0000375 Robert Downey Jr. 0.5962602575177932 75.82591449663187
nm0010736 Amy Adams 0.6174947730739638 79.45140331177944
nm0262635 Chris Evans 0.6577934358095161 74.72266956629893
nm0124930 Gerard Butler 0.7055878776461664 90.3623139284794
nm0000129 Tom Cruise 0.7208010824176451 61.52309966302741
nm0000191 Ewan McGregor 0.7584657529417943 114.376893757393
nm0000234 Charlize Theron 0.7885787241568242 95.37455822307487
nm0005351 Ryan Reynolds 0.7927022223157756 115.52260381378981
nm0000553 Liam Neeson 0.8001152435196612 120.4768258116907
