# Extract Frequent Pairs of Actors

Using the IMDB dataset of movies and actors `imdb_recent_movies.json`, we extract pairs of actors that occur frequently in the dataset. This dataset contains a sample of movies released between 2000 and 2020, their titles, genres, release years, and top-billed actors.

In [1]:
import pandas as pd

import json

In [2]:
# Relative support: the fraction of all movies that an actor (or a pair of
# actors) must exceed in order to be treated as frequent.
support_threshold = 1e-4

In [3]:
# Pass 1: stream the movie file once, counting in how many movies each actor
# appears and remembering a display name for every actor id.
actor_map = {}        # actor_id -> number of movies listing that actor
actor_name_map = {}   # actor_id -> actor name (last one seen wins)
movie_count = 0       # number of movie records read

# JSON text is UTF-8; be explicit so names decode the same on every platform.
with open("../data/imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # inside json.loads; they are not movies and are not counted.
        if not line.strip():
            continue

        this_movie = json.loads(line)

        # Support is measured in movies, so an id that is listed more than once
        # for the same movie must only contribute one to its count.
        # dict.fromkeys de-duplicates while keeping the original order.
        for actor_id in dict.fromkeys(this_movie['actor_ids']):
            actor_map[actor_id] = actor_map.get(actor_id, 0) + 1

        # Ids and names are parallel lists; zip pairs them positionally.
        actor_name_map.update(zip(this_movie['actor_ids'], this_movie['actor_names']))

        movie_count += 1

In [4]:
# Show the last record parsed by the loop above to illustrate the per-movie schema.
this_movie

{'title_id': 'tt9916730',
 'title_name': '6 Gunn',
 'title_year': 2017,
 'title_genre': ['\\N'],
 'actor_ids': ['nm6096005', 'nm0059461', 'nm13233318', 'nm4852679'],
 'actor_names': ['Devadhar Archit',
  'Sunil Barve',
  'Ganesh Vasant Patil',
  'Bhushan Pradhan']}

In [5]:
# Turn the relative support into an absolute number of movies, then report
# both the dataset size and the resulting cut-off.
actual_threshold = movie_count * support_threshold
for label, value in (
    ("Movie Count:", movie_count),
    ("Support Threshold:", actual_threshold),
):
    print(label, value)

Movie Count: 178121
Support Threshold: 17.8121


In [6]:
# Number of distinct actor ids seen across all movies.
print(len(actor_map))

258059


In [7]:
# Keep only the actors whose movie count is strictly above the absolute
# support threshold; these are the only ids that can take part in a frequent pair.
frequent_actors_set = {
    actor for actor, count in actor_map.items() if count > actual_threshold
}

# actor_map keys are unique, so the size of the set is the number of frequent actors.
frequent_actors = len(frequent_actors_set)

In [8]:
# How many actors exceeded the absolute support threshold.
print(frequent_actors)

2118


In [9]:
# Spot-check one (arbitrary) frequent actor: id, movie count and name.
# Slicing to at most one element keeps this a no-op when the set is empty.
for actor_id in list(frequent_actors_set)[:1]:
    print(actor_id, actor_map[actor_id], actor_name_map[actor_id])

nm1822659 18 Nat Wolff


In [10]:
# Pass 2: stream the movie file again and count co-occurrences, but only for
# pairs in which both actors are individually frequent.
candidate_pairs_map = {}   # (smaller_id, larger_id) -> number of shared movies

# JSON text is UTF-8; be explicit so the file decodes the same on every platform.
with open("../data/imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # inside json.loads.
        if not line.strip():
            continue

        this_movie = json.loads(line)

        # De-duplicate before pairing: an id listed twice for one movie would
        # otherwise create a self-pair (a, a) and count (a, b) twice for that
        # movie. Sorting gives every pair one canonical (smaller, larger) key.
        frequent_ids = sorted(set(this_movie['actor_ids']) & frequent_actors_set)

        # Every unordered pair exactly once: each id with the ids after it.
        for i, left_id in enumerate(frequent_ids):
            for right_id in frequent_ids[i + 1:]:
                this_pair = (left_id, right_id)
                candidate_pairs_map[this_pair] = candidate_pairs_map.get(this_pair, 0) + 1
            


In [11]:
# Number of distinct candidate pairs that were counted in the second pass.
len(candidate_pairs_map)

19273

In [12]:
# Filter the candidate pairs down to those whose shared-movie count is strictly
# above the absolute support threshold, printing each one by actor name.
frequent_pairs_set = set()
for pair, count in candidate_pairs_map.items():
    if count <= actual_threshold:
        continue
    print("Frequent Pair:", [actor_name_map[member_id] for member_id in pair])
    frequent_pairs_set.add(pair)

# Candidate keys are unique, so the set size is the number of frequent pairs.
frequent_pairs = len(frequent_pairs_set)

Frequent Pair: ['Shafqat Cheema', 'Shaan Shahid']
Frequent Pair: ['Mohammad Ali', 'Raghu Babu']
Frequent Pair: ['Mohammad Ali', 'Tanikella Bharani']
Frequent Pair: ['Mohammad Ali', 'Brahmanandam']
Frequent Pair: ['Tanikella Bharani', 'Brahmanandam']
Frequent Pair: ['Brahmanandam', 'M.S. Narayana']
Frequent Pair: ['Amit Pachori', 'Vinod Tripathi']
Frequent Pair: ['Brahmanandam', 'Raghu Babu']
Frequent Pair: ['Mohammad Ali', 'Krishna Bhagavan']
Frequent Pair: ['Brahmanandam', 'Krishna Bhagavan']
Frequent Pair: ['Kom Chauncheun', 'Kohtee Aramboy']
Frequent Pair: ['Kiran Kumar', 'Raza Murad']
Frequent Pair: ['Amit Pachori', 'Anil Nagrath']
Frequent Pair: ['Anil Nagrath', 'Vinod Tripathi']
Frequent Pair: ['Raza Murad', 'Anil Nagrath']
Frequent Pair: ['Brahmanandam', 'Chalapathi Rao']
Frequent Pair: ['Milind Gunaji', 'Mohan Joshi']
Frequent Pair: ['Tomohiro Okada', 'Seiji Nakamitsu']
Frequent Pair: ['Simon Hill', 'Kelsey Painter']
Frequent Pair: ['Justin J. Wheeler', "Paul 'Maxx' Rinehart"]


In [13]:
# Total number of pairs that exceeded the absolute support threshold.
frequent_pairs

40