In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import heapq
from sklearn.preprocessing import LabelEncoder
from pprint import pprint

# Run counter read by the actor-filtering cell further down: that cell only
# applies its top-10% filter to `actors_frequency` while this counter is 0.
# (A `global` statement has no effect at module/cell level, so none is used.)
execution_count = 0

# Uncomment to raise pandas' row-display limit when inspecting long outputs.
# pd.options.display.max_rows = 300000

In [None]:
# Load the movie table, keep only the two credit columns, and derive a
# list-valued column for each so individual names can be accessed directly.
df = pd.read_csv("data/rotten_tomatoes_movies.csv")[['directors', 'actors']]

df['directors_list'] = df['directors'].str.split(', ')

# Re-attach "Jr."/"Sr." suffixes to the preceding name first, so that the
# comma split below does not turn the suffix into an entry of its own.
df['actors'] = (
    df['actors']
    .str.replace(r', Jr\.', ' Jr.', regex=True)
    .str.replace(r', Sr\.', ' Sr.', regex=True)
)
df['actors_list'] = df['actors'].str.split(', ')

In [3]:
# Count how many movies each director / actor is credited in: flatten the
# per-movie name lists to one name per row, then tally the occurrences.
directors_frequency, actors_frequency = (
    df[list_column].explode().value_counts()
    for list_column in ('directors_list', 'actors_list')
)

In [4]:
# Show the per-director counts (value_counts orders them most frequent first).
# The series is displayed as-is rather than through head(<hard-coded length>),
# so nothing is silently cut off if the dataset gains more directors.
print("Frequency of Directors")

directors_frequency

Frequency of Directors


directors_list
Clint Eastwood      38
Woody Allen         37
Alfred Hitchcock    36
Steven Spielberg    32
Sidney Lumet        31
                    ..
Roger Melvin         1
Dror Zahavi          1
Dean Wright          1
Al Reinert           1
Jared Bush           1
Name: count, Length: 9642, dtype: int64

In [5]:
# Keep only the most frequent 10% of actors for encoding. The filter overwrites
# `actors_frequency`, so `execution_count` guards against applying it a second
# time (which would keep 10% of the already-reduced series).
# A plain if/else is used so the warning appears only on a repeated run, not
# at the end of the first one.
if execution_count >= 1:
    print("WARNING: You can only run this cell once!")
else:
    print("Frequency of Actors")
    top_percent = int(len(actors_frequency) * 0.1)
    top_percent_actors = actors_frequency.head(top_percent).index
    actors_frequency = actors_frequency[top_percent_actors]
    print(actors_frequency)
    execution_count += 1



Frequency of Actors
actors_list
Jr.                    125
Samuel L. Jackson      109
Bruce Willis            90
Robert De Niro          89
Nicolas Cage            85
                      ... 
Art Lund                 4
Clifford Severn          4
Jason Samuels Smith      4
Sam Lee                  4
Ann Tyrrell              4
Name: count, Length: 20410, dtype: int64


In [6]:
# Map every director to an integer rank that follows the order of
# `directors_frequency`: the first (most frequent) director gets 1, the last gets n.
director_encoding = {
    director: rank
    for rank, director in enumerate(directors_frequency.index, start=1)
}

pprint(director_encoding)

{'A. Dean Bell': 5446,
 'A. Edward Sutherland': 4904,
 'A..T. White': 6112,
 'A.J. Edwards': 5951,
 'A.R. Murugadoss': 9305,
 'A.T. White': 6114,
 'AJ Schnack': 8708,
 'Aamir Khan': 8408,
 'Aaron Aites': 4735,
 'Aaron B. Koontz': 5256,
 'Aaron Bear': 9435,
 'Aaron Blaise': 7005,
 'Aaron Fernandez': 4884,
 'Aaron Fisher': 8155,
 'Aaron Hancox': 6757,
 'Aaron Hann': 6716,
 'Aaron Harvey': 8146,
 'Aaron Horvath': 5887,
 'Aaron I. Naar': 3322,
 'Aaron J. Wiederspahn': 5361,
 'Aaron Katz (II)': 1336,
 'Aaron Kaufman': 4715,
 'Aaron Kopp': 8580,
 'Aaron Lipstadt': 5423,
 'Aaron Mirtes': 6868,
 'Aaron Moorhead': 1053,
 'Aaron Nee': 6628,
 'Aaron Norris': 1774,
 'Aaron Rose': 6368,
 'Aaron Schock': 6738,
 'Aaron Seelman': 5640,
 'Aaron Seltzer': 401,
 'Aaron Sorenson': 5971,
 'Aaron Sorkin': 4407,
 'Aaron Wilson': 7032,
 'Aaron Woodley': 2946,
 'Aaron Woolf': 7412,
 'Abbas Kiarostami': 904,
 'Abbe Wool': 3561,
 'Abby Epstein': 7097,
 'Abby Kohn': 8044,
 'Abdel Kechiche': 2874,
 'Abdellah Taia'

In [7]:
# Map every retained actor to an integer rank that follows the order of
# `actors_frequency`: the first (most frequent) actor gets 1, the last gets n.
actor_names = actors_frequency.index
actor_encoding = dict(zip(actor_names, range(1, len(actor_names) + 1)))

pprint(actor_encoding)

{"'Snub' Pollard": 7227,
 '50 Cent': 1241,
 'A Martinez': 11457,
 'A. Ben Astar': 11552,
 'A. Michael Baldwin': 15705,
 'A.C. Peterson': 1992,
 'A.D. Miles': 7710,
 'A.G. Zeke Mills': 14689,
 'A.J. Buckley': 6387,
 'A.J. Cook': 11671,
 'A.J. Johnson': 3017,
 'AJ Bowen': 4401,
 'AJ Trauth': 17051,
 'Aamir Khan': 8140,
 'Aaron Abrams': 4950,
 'Aaron Ashmore': 16135,
 'Aaron Au': 19448,
 'Aaron Brown': 19012,
 'Aaron Craven': 19820,
 'Aaron Douglas': 3918,
 'Aaron Eckhart': 517,
 'Aaron Hill': 13692,
 'Aaron Himelstein': 10316,
 'Aaron Jackson': 18152,
 'Aaron Jay Rome': 10562,
 'Aaron Lazar': 19584,
 'Aaron Lohr': 11278,
 'Aaron Lustig': 1770,
 'Aaron Michael Lacey': 6646,
 'Aaron Monaghan': 14847,
 'Aaron Neville': 9461,
 'Aaron Paul': 1466,
 'Aaron Pearl': 3739,
 'Aaron Pedersen': 19403,
 'Aaron Poole': 3851,
 'Aaron Stanford': 5046,
 'Aaron Taylor-Johnson': 2045,
 'Aaron Tveit': 9689,
 'Aaron V. Williamson': 14149,
 'Aaron Yoo': 3365,
 'Aasif Mandvi': 1418,
 'Abbey Lee': 11381,
 'Abbi

In [8]:
# Replacing the names of the directors with the encoded values within the lists.
# Only string entries are looked up, so re-running this cell leaves values that
# are already encoded untouched instead of turning them all into None.
def _encode_directors(directors):
    """Return `directors` with each name swapped for its `director_encoding` value.

    Cells that are not lists (e.g. NaN) are returned unchanged. Names that are
    missing from `director_encoding` become None.
    """
    if not isinstance(directors, list):
        return directors
    return [
        director_encoding.get(name) if isinstance(name, str) else name
        for name in directors
    ]


df['directors_list'] = df['directors_list'].map(_encode_directors)
print(df['directors_list'])



0                     [117]
1                     [787]
2                       [9]
3                       [5]
4                      [37]
                ...        
17707                [3133]
17708    [1542, 1659, 9642]
17709                   NaN
17710          [1867, 1870]
17711                [1032]
Name: directors_list, Length: 17712, dtype: object


In [9]:
# Replacing the names of the actors with the encoded values within the lists.
# Actors without an entry in `actor_encoding` become None. Only string entries
# are looked up, so re-running this cell keeps values that are already encoded
# instead of turning them all into None.
def _encode_actors(actors):
    """Return `actors` with each name swapped for its `actor_encoding` value.

    Cells that are not lists (e.g. NaN) are returned unchanged.
    """
    if not isinstance(actors, list):
        return actors
    return [
        actor_encoding.get(name) if isinstance(name, str) else name
        for name in actors
    ]


df['actors_list'] = df['actors_list'].map(_encode_actors)
print(df['actors_list'])

0        [1388, 2228, 2589, 4592, 328, 138, 211, 71, 13...
1                [131, 692, 42, 1183, 6568, None, 65, 554]
2        [4648, 4651, 1283, 3504, 1825, None, 14272, 45...
3        [1352, 1922, 1925, 1385, 8239, 5577, 506, 564,...
4        [892, 438, 5585, 2489, 6184, 1367, 4635, 3291,...
                               ...                        
17707    [18079, 1273, 10224, 4704, 5877, None, 12366, ...
17708    [26, 625, 153, 432, 13224, None, None, 4098, N...
17709    [739, 2486, 6619, 17301, None, None, None, 188...
17710    [10304, 4299, None, 12629, 20, 16882, None, No...
17711    [245, 604, 6858, 1218, 3856, None, 5930, 1228,...
Name: actors_list, Length: 17712, dtype: object
