In [198]:
import pandas as pd
from pprint import pprint
import json
# Run counter for the actor-filtering cell further down, which checks it
# so that the filter is applied at most once per kernel session.
execution_count = 0

# pd.options.display.max_rows = 300000

In [199]:
# Load the evaluated dataset and derive list-valued helper columns so that
# individual directors and actors can be accessed per row.
df = pd.read_csv("data/evaluated_data.csv")

df['directors_list'] = df['directors'].str.split(', ')

# Attach ", Jr." / ", Sr." suffixes to the preceding name first; otherwise the
# comma split below would turn each suffix into a separate list entry.
df['actors'] = (
    df['actors']
    .str.replace(r', Jr\.', ' Jr.', regex=True)
    .str.replace(r', Sr\.', ' Sr.', regex=True)
)
df['actors_list'] = df['actors'].str.split(', ')

# No splitting is applied to production companies; the column is copied as-is.
df['production_company_list'] = df['production_company']

In [200]:
# Count how often each director, actor and production company occurs in the dataset.
# explode() gives every list element its own row before value_counts() tallies them.
directors_frequency, actors_frequency, production_company_frequency = (
    df[column].explode().value_counts()
    for column in ('directors_list', 'actors_list', 'production_company_list')
)

In [201]:
print("Frequency of Directors")

# Keep only the directors that appear more than once in the dataset.
appears_more_than_once = directors_frequency > 1
directors_frequency = directors_frequency[appears_more_than_once]

directors_frequency.iloc[:5]

Frequency of Directors


directors_list
Steven Soderbergh    20
Woody Allen          18
Tyler Perry          16
Clint Eastwood       16
Ridley Scott         15
Name: count, dtype: int64

In [202]:
# Keep only the top 10% most frequent actors; only these will be encoded.
#
# The filter overwrites `actors_frequency`, so running it again would cut the
# series down to 10% of the already-reduced set. `execution_count` guards
# against that: the filter runs once, and any later run only prints a warning.
if execution_count >= 1:
    print("WARNING: You can only run this cell once!")
else:
    print("Frequency of Actors")
    top_percent = int(len(actors_frequency) * 0.1)
    # actors_frequency comes from value_counts(), which sorts by descending
    # count, so the head holds the most frequent actors.
    top_percent_actors = actors_frequency.head(top_percent).index
    actors_frequency = actors_frequency.loc[top_percent_actors]
    print(actors_frequency)
    execution_count += 1



Frequency of Actors
actors_list
Samuel L. Jackson    53
James Franco         42
Robert De Niro       42
Morgan Freeman       41
Liam Neeson          39
                     ..
Rihanna               5
Efren Ramirez         5
Kate Burton           5
Omar Epps             5
Liam Aiken            5
Name: count, Length: 1437, dtype: int64


In [203]:
print("Frequency of Production Companies")

# Unlike the directors, production companies are not filtered by count:
# every company is kept, including those that occur only once.
production_company_frequency.iloc[:5]

Frequency of Production Companies


production_company_list
Warner Bros. Pictures     315
20th Century Fox          291
Universal Pictures        280
IFC Films                 262
Sony Pictures Classics    221
Name: count, dtype: int64

In [204]:
# Rank-encode the directors by frequency: the most frequent director is
# encoded as 1 and the least frequent as n.
director_encoding = {
    name: rank
    for rank, name in enumerate(directors_frequency.index, start=1)
}

pprint(director_encoding)

{'Aaron Katz (II)': 1436,
 'Aaron Moorhead': 476,
 'Aaron Seltzer': 147,
 'Abbas Kiarostami': 509,
 'Abdel Kechiche': 1460,
 'Abel Ferrara': 955,
 'Adam Brooks': 1149,
 'Adam Carolla': 1367,
 'Adam Del Deo': 583,
 'Adam Green': 320,
 'Adam McKay': 60,
 'Adam Robitel': 553,
 'Adam Salky': 1379,
 'Adam Shankman': 28,
 'Adam Wingard': 182,
 'Adrian García Bogliano': 874,
 'Agnieszka Holland': 1422,
 'Agnès Varda': 554,
 'Ahron Keshales': 1551,
 'Aki Kaurismäki': 1079,
 'Akiva Schaffer': 515,
 'Alan Mak': 735,
 'Alan Taylor': 1450,
 'Alastair Fothergill': 114,
 'Albert Hughes': 672,
 'Alejandro Agresti': 1243,
 'Alejandro Amenábar': 371,
 'Alejandro Brugués': 876,
 'Alejandro González Iñárritu': 174,
 'Alejandro Monteverde': 1294,
 'Aleksandr Sokurov': 1499,
 'Alex Garland': 1058,
 'Alex Gibney': 9,
 'Alex Kendrick': 273,
 'Alex Kurtzman': 1303,
 'Alex Proyas': 325,
 'Alex Ross Perry': 1142,
 'Alex Steyermark': 1271,
 'Alex Zamm': 952,
 'Alexander Payne': 142,
 'Alexander Witt': 1360,
 'Al

In [205]:
# Rank-encode the actors by frequency: the most frequent actor is encoded
# as 1 and the least frequent as n.
actor_encoding = {
    name: rank
    for rank, name in enumerate(actors_frequency.index, start=1)
}
pprint(actor_encoding)

{'50 Cent': 654,
 'Aamir Khan': 831,
 'Aaron Eckhart': 64,
 'Aaron Paul': 670,
 'Aaron Stanford': 1263,
 'Aaron Taylor-Johnson': 507,
 'Aaron Yoo': 895,
 'Abbie Cornish': 422,
 'Abhishek Bachchan': 757,
 'Abigail Breslin': 243,
 'Adam Beach': 886,
 'Adam Brody': 304,
 'Adam DeVine': 1305,
 'Adam Driver': 396,
 'Adam Sandler': 63,
 'Adam Scott': 207,
 'Addison Timlin': 1338,
 'Adrian Grenier': 1236,
 'Adrian Lester': 1131,
 'Adriana Barraza': 999,
 'Adrien Brody': 196,
 'Agnes Bruckner': 861,
 'Aidan Gillen': 1334,
 'Aidan Quinn': 756,
 'Aishwarya Rai Bachchan': 803,
 'Ajay Devgan': 1195,
 'Akshay Kumar': 1269,
 'Al Pacino': 320,
 'Alan Alda': 769,
 'Alan Arkin': 318,
 'Alan Cumming': 777,
 'Alan Rickman': 628,
 'Alan Tudyk': 778,
 'Albert Brooks': 872,
 'Albert Finney': 1125,
 'Alden Ehrenreich': 1244,
 'Alec Baldwin': 91,
 'Alessandro Nivola': 310,
 'Alex Pettyfer': 888,
 'Alexa PenaVega': 743,
 'Alexander Ludwig': 1315,
 'Alexander Skarsgård': 594,
 'Alexandra Daddario': 952,
 'Alexa

In [206]:
# Rank-encode the production companies by frequency: the most frequent
# company is encoded as 1 and the least frequent as n.
production_company_encoding = {
    name: rank
    for rank, name in enumerate(production_company_frequency.index, start=1)
}

pprint(production_company_encoding)

{'101 Studios': 1170,
 '108 Pics': 820,
 '120 Degree Films': 665,
 '20th Century Fox': 2,
 '20th Century Fox Distribution': 240,
 '20th Century Fox Film': 707,
 '20th Century Fox Television': 736,
 '20th Century Fox/Emerging Pictures': 558,
 '20th Century Fox/Regency Films': 1124,
 '21 Laps Entertainment': 457,
 '21UNO FILM': 693,
 '408 Films': 1219,
 '42 West': 756,
 '7-57 Releasing': 1112,
 '72nd Street Productions': 908,
 '7A Productions/Variance Films': 1206,
 'A&E IndieFilms': 907,
 'A24': 64,
 'A24 Films': 66,
 'A24 and DIRECTV': 83,
 'ABC Distribution Co.': 938,
 'AFFRM': 952,
 'AMC/The Collective/BloodyDisgusting': 613,
 'ATO Pictures': 206,
 'Abarorama': 698,
 'Abramorama': 203,
 'Abramorama Entertainment': 266,
 'Adlab Films': 744,
 'Adlabs': 533,
 'Adlabs Films': 260,
 'Adlabs Films Ltd.': 588,
 'Adopt Films': 133,
 'Affirm Films': 399,
 'After Dark Films': 108,
 'AfterDark Films': 678,
 'Alamode Film': 1094,
 'Alchemy': 187,
 'Alchemy Films': 945,
 'Alliance': 355,
 'Allian

In [207]:
# Swap every director name for its integer code, editing the per-row lists in place.
# Names without an entry in director_encoding become None; non-list entries are skipped.
for row_directors in df['directors_list']:
    if not isinstance(row_directors, list):
        continue
    row_directors[:] = [director_encoding.get(name) for name in row_directors]

def list_to_string(lst):
    """Join the non-None items of a list into a ', '-separated string.

    Args:
        lst: A list of values (None entries are allowed), or any non-list
            value such as NaN, None or an already joined string.

    Returns:
        The items converted with str() and joined by ', ', or None when the
        list is empty or holds only None entries. Non-list input is returned
        unchanged when truthy and as None when falsy, so the function can be
        applied again to a column it has already converted.
    """
    if not isinstance(lst, list):
        # A falsy non-list (notably None, which this function itself returns)
        # used to fall through to the comprehension and raise TypeError.
        return lst if lst else None
    parts = [str(item) for item in lst if item is not None]
    return ', '.join(parts) if parts else None

# Collapse each row's list of director codes into a single comma-separated string.
encoded_directors = df['directors_list'].apply(list_to_string)
df['directors_list'] = encoded_directors

print(encoded_directors)

0            123
1            495
2             59
3           None
4            839
          ...   
7132    205, 265
7133         311
7134         311
7135        1019
7136    739, 684
Name: directors_list, Length: 7137, dtype: object


In [208]:
# Swap every actor name for its integer code, editing the per-row lists in place.
# Actors without an entry in actor_encoding become None; non-list entries are skipped.
for row_actors in df['actors_list']:
    if not isinstance(row_actors, list):
        continue
    row_actors[:] = [actor_encoding.get(name) for name in row_actors]

# Collapse each row's list of actor codes into a single comma-separated string.
encoded_actors = df['actors_list'].apply(list_to_string)
df['actors_list'] = encoded_actors

print(encoded_actors)

0            706, 1299, 952, 298
1              67, 303, 267, 205
2                       547, 530
3                           None
4                           None
                  ...           
7132    280, 36, 1058, 535, 1110
7133                  51, 20, 41
7134        51, 20, 41, 144, 440
7135                    467, 820
7136          106, 129, 397, 778
Name: actors_list, Length: 7137, dtype: object


In [209]:
def apply_production_company_encoding(company):
    """Look up the integer code of a production company and return it as a string.

    Args:
        company: Production company name. Any non-string value is treated
            as "no company".

    Returns:
        str(code) taken from the module-level production_company_encoding
        dict, or None when company is not a string or has no entry there.
    """
    # isinstance also accepts str subclasses, which the exact type
    # comparison used before would have rejected.
    if not isinstance(company, str):
        return None
    encoding = production_company_encoding.get(company)
    # Check for a missing key explicitly so a falsy code such as 0 is still returned.
    return None if encoding is None else str(encoding)

# Encode every row's production company; rows the encoder cannot map become None.
encoded_companies = df['production_company_list'].apply(apply_production_company_encoding)
df['production_company_list'] = encoded_companies
print(encoded_companies)

0         2
1         5
2         1
3       298
4       801
       ... 
7132     14
7133      6
7134      6
7135     41
7136    406
Name: production_company_list, Length: 7137, dtype: object


In [210]:
# Replace the raw name columns with their encoded counterparts, then save the result.
encoded_to_final_name = {
    "actors_list": "actors",
    "directors_list": "directors",
    "production_company_list": "production_company",
}
df = (
    df.drop(columns=['actors', 'directors', 'production_company'])
      .rename(columns=encoded_to_final_name)
)

df.to_csv('data/final_data.csv', index=False)

In [211]:
# One-hot encode the comma-separated code strings: every distinct code becomes
# its own indicator column, prefixed with the kind of entity it belongs to.
actors_df = df['actors'].str.get_dummies(sep=', ').add_prefix('actor')
directors_df = df['directors'].str.get_dummies(sep=', ').add_prefix('director')
production_companies_df = df['production_company'].str.get_dummies(sep=', ').add_prefix('production')

# Drop the three encoded source columns and append the indicator columns in
# the order actors, directors, production companies.
df = (
    df.drop(columns=['actors', 'directors', 'production_company'])
      .join(actors_df)
      .join(directors_df)
      .join(production_companies_df)
)

df.to_csv('data/final_data_one_hot.csv', index=False)

In [212]:
# Persist each name -> code dictionary as a JSON file.
encoding_exports = (
    ('data/actor_encoding.json', actor_encoding),
    ('data/director_encoding.json', director_encoding),
    ('data/production_company_encoding.json', production_company_encoding),
)
for json_path, mapping in encoding_exports:
    with open(json_path, 'w') as json_file:
        # indent=4 keeps the written JSON human-readable
        json.dump(mapping, json_file, indent=4)