In [613]:
import pandas as pd
from pprint import pprint
import json
# Run counter for the actor-frequency cell further down: that cell overwrites
# `actors_frequency`, so it checks this counter before doing its work.
execution_count = 0

# pd.options.display.max_rows = 300000

In [614]:
# Load the dataset and turn the comma-separated name columns into Python lists
df = pd.read_csv("data/evaluated_data.csv")

# A "Jr."/"Sr." suffix is preceded by the same ", " that separates names, so the
# comma is removed first; otherwise the split below would cut the suffix off as a name.
for suffix in ('Jr', 'Sr'):
    df['actors'] = df['actors'].str.replace(rf', {suffix}\.', f' {suffix}.', regex=True)

for source_column in ('directors', 'actors'):
    df[f'{source_column}_list'] = df[source_column].str.split(', ')

# Production companies are copied unchanged (no splitting is applied to them)
df['production_company_list'] = df['production_company']

In [615]:
# Count how many rows each director / actor / production company appears in.
# explode() gives every list element its own row before counting.
directors_frequency, actors_frequency, production_company_frequency = (
    df[list_column].explode().value_counts()
    for list_column in ('directors_list', 'actors_list', 'production_company_list')
)

In [616]:
print("Frequency of Directors")

# Keep only the directors that appear more than once in the dataset
appears_more_than_once = directors_frequency > 1
directors_frequency = directors_frequency[appears_more_than_once]

directors_frequency.head(5)

Frequency of Directors


directors_list
Steven Soderbergh    20
Woody Allen          18
Clint Eastwood       16
Tyler Perry          16
Ridley Scott         15
Name: count, dtype: int64

In [617]:
# Only the top 10% of the most frequent actors in the dataset will be encoded.
#
# This cell overwrites `actors_frequency` with that subset, so running it again
# would keep 10% of the remaining 10%. `execution_count` guards against that.
# A plain if/else is used so the warning appears only on a repeated run, and so
# any counter value >= 1 is rejected instead of only the exact value 1.
if execution_count >= 1:
    print("WARNING: You can only run this cell once!")
else:
    print("Frequency of Actors")
    top_percent = int(len(actors_frequency) * 0.1)
    top_percent_actors = actors_frequency.head(top_percent).index
    actors_frequency = actors_frequency[top_percent_actors]
    print(actors_frequency)
    execution_count += 1



Frequency of Actors
actors_list
Samuel L. Jackson    52
Robert De Niro       42
James Franco         42
Morgan Freeman       41
Mark Wahlberg        39
                     ..
Rip Torn              5
Tom Cavanagh          5
Chadwick Boseman      5
Gretchen Mol          5
Brit Marling          5
Name: count, Length: 1430, dtype: int64


In [618]:
print("Frequency of Production Companies")

# No minimum-frequency filter is applied here (unlike the directors cell), so
# companies that appear only once stay in the series.
production_company_frequency.iloc[:5]

Frequency of Production Companies


production_company_list
Warner Bros. Pictures     313
20th Century Fox          291
Universal Pictures        279
IFC Films                 261
Sony Pictures Classics    220
Name: count, dtype: int64

In [619]:
# Map each director to a frequency rank: the director that occurs most often gets 1,
# the next gets 2, and so on down to n for the least frequent.
director_encoding = {
    director: rank
    for rank, director in enumerate(directors_frequency.index, start=1)
}

pprint(director_encoding)

{'Aaron Katz (II)': 1457,
 'Aaron Moorhead': 325,
 'Aaron Seltzer': 123,
 'Abbas Kiarostami': 496,
 'Abdel Kechiche': 1253,
 'Abel Ferrara': 1101,
 'Adam Brooks': 1391,
 'Adam Carolla': 1300,
 'Adam Del Deo': 627,
 'Adam Green': 461,
 'Adam McKay': 59,
 'Adam Robitel': 629,
 'Adam Salky': 1409,
 'Adam Shankman': 26,
 'Adam Wingard': 173,
 'Adrian García Bogliano': 1054,
 'Agnieszka Holland': 1435,
 'Agnès Varda': 632,
 'Ahron Keshales': 880,
 'Aki Kaurismäki': 885,
 'Akiva Schaffer': 745,
 'Alan Mak': 776,
 'Alan Taylor': 1446,
 'Alastair Fothergill': 113,
 'Albert Hughes': 759,
 'Alejandro Agresti': 859,
 'Alejandro Amenábar': 345,
 'Alejandro Brugués': 1531,
 'Alejandro González Iñárritu': 110,
 'Alejandro Monteverde': 1286,
 'Aleksandr Sokurov': 971,
 'Alex Garland': 1062,
 'Alex Gibney': 11,
 'Alex Kendrick': 275,
 'Alex Kurtzman': 1211,
 'Alex Proyas': 416,
 'Alex Ross Perry': 953,
 'Alex Steyermark': 1151,
 'Alex Zamm': 939,
 'Alexander Payne': 146,
 'Alexander Witt': 1291,
 'Ale

In [620]:
# Map each actor to a frequency rank: the actor that occurs most often gets 1,
# the next gets 2, and so on down to n for the least frequent.
actor_encoding = {
    actor: rank
    for rank, actor in enumerate(actors_frequency.index, start=1)
}
pprint(actor_encoding)

{'50 Cent': 688,
 'Aamir Khan': 896,
 'Aaron Eckhart': 63,
 'Aaron Paul': 643,
 'Aaron Stanford': 1272,
 'Aaron Taylor-Johnson': 488,
 'Aaron Yoo': 932,
 'Abbie Cornish': 393,
 'Abhishek Bachchan': 792,
 'Abigail Breslin': 250,
 'Adam Beach': 855,
 'Adam Brody': 309,
 'Adam DeVine': 1371,
 'Adam Driver': 416,
 'Adam Garcia': 1298,
 'Adam Sandler': 67,
 'Adam Scott': 249,
 'Adewale Akinnuoye-Agbaje': 1424,
 'Adrian Grenier': 1217,
 'Adrian Lester': 1216,
 'Adriana Barraza': 1013,
 'Adrien Brody': 178,
 'Agnes Bruckner': 937,
 'Aidan Quinn': 779,
 'Aishwarya Rai Bachchan': 703,
 'Ajay Devgan': 1183,
 'Akshay Kumar': 1200,
 'Al Pacino': 305,
 'Alan Alda': 742,
 'Alan Arkin': 300,
 'Alan Cumming': 766,
 'Alan Rickman': 669,
 'Alan Tudyk': 782,
 'Albert Brooks': 835,
 'Albert Finney': 1232,
 'Alden Ehrenreich': 1188,
 'Alec Baldwin': 92,
 'Alessandro Nivola': 315,
 'Alex Pettyfer': 938,
 'Alexa PenaVega': 789,
 'Alexander Skarsgård': 584,
 'Alexandra Daddario': 903,
 'Alexandra Maria Lara':

In [621]:
# Map each production company to a frequency rank: the company that occurs most often
# gets 1, the next gets 2, and so on down to n for the least frequent.
production_company_encoding = {
    company: rank
    for rank, company in enumerate(production_company_frequency.index, start=1)
}

pprint(production_company_encoding)

{'101 Studios': 1145,
 '108 Pics': 802,
 '120 Degree Films': 643,
 '20th Century Fox': 2,
 '20th Century Fox Distribution': 240,
 '20th Century Fox Film': 516,
 '20th Century Fox Television': 715,
 '20th Century Fox/Emerging Pictures': 846,
 '20th Century Fox/Regency Films': 1103,
 '21 Laps Entertainment': 464,
 '21UNO FILM': 697,
 '408 Films': 1192,
 '42 West': 737,
 '7-57 Releasing': 1121,
 '72nd Street Productions': 884,
 '7A Productions/Variance Films': 1224,
 'A&E IndieFilms': 891,
 'A24': 63,
 'A24 Films': 62,
 'A24 and DIRECTV': 77,
 'ABC Distribution Co.': 927,
 'AFFRM': 940,
 'AMC/The Collective/BloodyDisgusting': 597,
 'ATO Pictures': 207,
 'Abarorama': 679,
 'Abramorama': 209,
 'Abramorama Entertainment': 260,
 'Adlab Films': 723,
 'Adlabs': 523,
 'Adlabs Films': 248,
 'Adlabs Films Ltd.': 577,
 'Adopt Films': 138,
 'Affirm Films': 399,
 'After Dark Films': 110,
 'AfterDark Films': 652,
 'Alamode Film': 1105,
 'Alchemy': 202,
 'Alchemy Films': 934,
 'Alliance': 348,
 'Allian

In [622]:
# Replace each director name with its encoded value, in place inside the row's list.
# Names missing from director_encoding become None; rows that are not lists are skipped.
for director_names in df['directors_list']:
    if not isinstance(director_names, list):
        continue
    for position, director in enumerate(director_names):
        director_names[position] = director_encoding.get(director)

def list_to_string(lst):
    """Join a list of encoded values into a ', '-separated string.

    Args:
        lst: A list whose items are encoded values or None, or a non-list
            cell value (for example NaN or an already-joined string).

    Returns:
        For a list: the non-None items converted with str() and joined by
        ', ', or None when no items remain. For a non-list: the value itself
        when it is truthy, otherwise None.
    """
    # None used to fall through to the comprehension and raise TypeError.
    if lst is None:
        return None
    if not isinstance(lst, list):
        # Truthy scalars (NaN, strings) pass through untouched; empty or
        # falsy values are reported as "no value".
        return lst if lst else None
    parts = [str(num) for num in lst if num is not None]
    if not parts:
        return None
    return ', '.join(parts)

# Collapse each row's list of director codes into one ', '-separated string
directors_as_text = df['directors_list'].apply(list_to_string)
df['directors_list'] = directors_as_text

print(df['directors_list'])

0            115
1            490
2             53
3           None
4            824
          ...   
7082    286, 291
7083         483
7084         483
7085        1171
7086    554, 640
Name: directors_list, Length: 7087, dtype: object


In [623]:
# Replace each actor name with its encoded value, in place inside the row's list.
# Names missing from actor_encoding become None; rows that are not lists are skipped.
for cast in df['actors_list']:
    if not isinstance(cast, list):
        continue
    cast[:] = [actor_encoding.get(member) for member in cast]

# Collapse each row's list of actor codes into one ', '-separated string
df['actors_list'] = df['actors_list'].apply(list_to_string)

print(df['actors_list'])

0            676, 1138, 903, 334
1              58, 332, 279, 201
2                       573, 514
3                           None
4                           None
                  ...           
7082    288, 49, 1029, 508, 1196
7083            46, 19, 53, 1318
7084        46, 19, 53, 147, 473
7085              486, 1416, 893
7086           91, 117, 396, 782
Name: actors_list, Length: 7087, dtype: object


In [624]:
def apply_production_company_encoding(company):
  """Look up a production company's encoded value and return it as a string.

  Returns None when `company` is not a str, or when the lookup in
  production_company_encoding yields nothing (or a falsy value).
  """
  if type(company) is str:
    code = production_company_encoding.get(company)
    if code:
      return str(code)
  return None

# Replace every production company name with its encoded value (as a string)
encoded_companies = df['production_company_list'].apply(apply_production_company_encoding)
df['production_company_list'] = encoded_companies
print(df['production_company_list'])

0          2
1          5
2          1
3        298
4       1257
        ... 
7082      13
7083       6
7084       6
7085      42
7086     406
Name: production_company_list, Length: 7087, dtype: object


In [625]:
# Drop the raw name columns and let the encoded columns take over their names
encoded_to_final_name = {
    "actors_list": "actors",
    "directors_list": "directors",
    "production_company_list": "production_company",
}
df = (
    df.drop(columns=['actors', 'directors', 'production_company'])
      .rename(columns=encoded_to_final_name)
)

df.to_csv('data/final_data.csv', index=False)

In [626]:
def average_list(lst):
  """Average the integers in a ', '-separated string, rounded to 2 decimals.

  Falsy input (None, empty string) yields ''. Any other non-str value is
  returned unchanged.
  """
  if lst and type(lst) is str:
    values = [int(token) for token in lst.split(', ')]
    return round(sum(values) / len(values), 2)
  return lst if lst else ''

# Variant 1: each multi-valued cell is reduced to the average of its codes
df1 = df.copy()

for averaged_column in ('actors', 'directors'):
    df1[averaged_column] = df1[averaged_column].apply(average_list)

df1.to_csv('data/final_data_average_lists.csv', index=False)

In [None]:
def min_list(lst):
  """Return the smallest integer in a ', '-separated string, as a string.

  Any value that is not a str is returned unchanged.
  """
  if type(lst) is str:
    return str(min(int(token) for token in lst.split(', ')))
  return lst

# Variant 2: each multi-valued cell is reduced to its smallest code
df2 = df.copy()

# Fill values for missing cells: one more than the number of entries in each encoding
max_actor = len(actor_encoding) + 1
max_director = len(director_encoding) + 1
max_production_company = len(production_company_encoding) + 1

df2['actors'] = df2['actors'].apply(min_list).fillna(max_actor)
df2['directors'] = df2['directors'].apply(min_list).fillna(max_director)
df2['production_company'] = df2['production_company'].fillna(max_production_company)

df2.to_csv('data/final_data_min_lists.csv', index=False)

In [628]:
# Variant 3: one-hot encoding. Each ', '-separated code becomes its own indicator
# column, prefixed with the kind of entity it encodes.
actors_df = df['actors'].str.get_dummies(sep=', ').add_prefix('actor')
directors_df = df['directors'].str.get_dummies(sep=', ').add_prefix('director')
production_companies_df = df['production_company'].str.get_dummies(sep=', ').add_prefix('production')

df3 = df.copy()

# Swap each encoded column for its block of indicator columns
df3 = df3.drop('actors', axis=1).join(actors_df)
df3 = df3.drop('directors', axis=1).join(directors_df)
df3 = df3.drop('production_company', axis=1).join(production_companies_df)

# Save df3 (the one-hot frame), not df1, which holds the averaged encoding
df3.to_csv('data/final_data_one_hot.csv', index=False)

In [629]:
# Persist the three name -> code dictionaries as JSON files
encoding_files = {
    'data/actor_encoding.json': actor_encoding,
    'data/director_encoding.json': director_encoding,
    'data/production_company_encoding.json': production_company_encoding,
}

for encoding_path, encoding in encoding_files.items():
    with open(encoding_path, 'w') as json_file:
        json.dump(encoding, json_file, indent=4)  # 'indent' makes the JSON readable