# Code 2: Enriching the web-scrape pickle file with
### 1. Oscar nominations <br> 2. Subgenres <br> 3. Rating reasons 

## Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_colwidth', None)

import seaborn as sns
sns.set_theme()
sns.set(rc={'figure.figsize':(12,8)})

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# read in master data file
# NOTE(review): unpickling can execute arbitrary code - only load a pickle from a trusted source.
data = pd.read_pickle('master_data_v2.pkl')
# preview the title, release-year and director columns
data[['Title','Rel_year','Dir1','Dir2','Dir3']]

In [None]:
# read in the Oscar nominations CSV file and preview its first rows
oscars_data = pd.read_csv('oscars_data.csv')
oscars_data.head()

In [None]:
# list the column labels (the cells below select award categories by column position)
oscars_data.columns

In [None]:
# convert film years to integers
# Keep only the text before the first '/' (e.g. '1927/28' -> '1927') and parse
# it as a number. astype(str) lets missing or already-numeric cells through,
# and errors='coerce' turns anything unparseable into NaN instead of raising.
oscars_data['Year'] = pd.to_numeric(
    oscars_data['Year'].astype(str).str.split('/').str[0],
    downcast="integer",
    errors='coerce',
)

In [None]:
def drop_movie(col):
    """Split every cell of an ``oscars_data`` column into nominee text and film.

    Cells are expected to read ``"<names> for <film>"`` or
    ``"<names> from <film>"`` (any capitalisation of the separator). The text
    is cut at the FIRST separator only, so a film title that itself contains
    the separator (e.g. "No Country for Old Men") is kept whole.

    Parameters
    ----------
    col : hashable
        Label of the column of the module-level ``oscars_data`` to parse.

    Returns
    -------
    tuple of (list, list)
        ``names`` and ``films``, one entry per row. A missing (non-string)
        cell yields ``np.nan`` in both lists; a cell with no separator yields
        the whole text as the name and ``np.nan`` as the film.
    """
    names = []
    films = []
    for cell in oscars_data[col]:
        if not isinstance(cell, str):
            # missing value (NaN/None): type check instead of `is np.nan`,
            # which only recognises the np.nan singleton
            name, film = np.nan, np.nan
        else:
            # ' for ' takes priority over ' from ', as in the original logic
            for separator in (r' for ', r' from '):
                parts = re.split(separator, cell, maxsplit=1, flags=re.IGNORECASE)
                if len(parts) == 2:
                    name, film = parts
                    break
            else:
                # no separator at all: keep the text, film is unknown
                name, film = cell, np.nan
        names.append(name)
        films.append(film)

    return names, films

def split_collaborators(col_lst, film_lst):
    """Map every credited person in each nominee string to the nominated film.

    A credit such as ``"A, B and C"`` becomes ``{"A": film, "B": film,
    "C": film}``. Only the text before the first ``' and '`` is treated as a
    comma-separated list; the text between the first and second ``' and '``
    is taken as the final collaborator. A credit without ``' and '`` is kept
    as a single key.

    Parameters
    ----------
    col_lst : iterable
        Nominee strings (or missing values), one per row.
    film_lst : iterable
        Film for each row, aligned with ``col_lst``.

    Returns
    -------
    list of dict
        One ``{person: film}`` dict per row, keys in order of appearance.
        A missing (non-string) credit yields ``{np.nan: film}``.
    """
    all_names = []
    for credit, film in zip(col_lst, film_lst):
        if not isinstance(credit, str):
            # missing value: keep the NaN key so later dropna() removes the row
            people = [np.nan]
        elif ' and ' in credit:
            pieces = credit.split(' and ')
            candidates = pieces[0].split(', ') + [pieces[1]]
            # strip stray spaces/commas (e.g. an Oxford comma) and drop empties
            people = [p.strip(' ,') for p in candidates]
            people = [p for p in people if p] or [credit]
        else:
            people = [credit]
        # dict.fromkeys de-duplicates while preserving order
        all_names.append(dict.fromkeys(people, film))
    return all_names

In [None]:
# Replace each cell in the columns at positions 13-71 with a {person: film} dict.
for column_label in oscars_data.columns[13:72]:
    people, movies = drop_movie(column_label)
    oscars_data[column_label] = split_collaborators(people, movies)

## Best Directors

In [None]:
# preview the columns at positions 38-44, which the next cell flattens into director nominations
oscars_data.iloc[:,38:45]

In [None]:
# Flatten the columns at positions 38-44 into [year, nominee, film] rows,
# grouped per column.
total_lst = []
for col in oscars_data.columns[38:45]:
    category_lst = []
    for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0]):
        people = list(nom.keys())
        # take one film value from the nominee dict
        film = list(set(nom.values()))[0]
        category_lst.extend([yr, person, film] for person in people)
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
dir_list = []
for category_rows in total_lst:
    dir_list.extend(category_rows)
dir_noms = (
    pd.DataFrame(dir_list, columns=['Year','Name','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
dir_noms

In [None]:
# For every film, collect the distinct earlier films for which one of its
# directors appears in dir_noms.
titles = list(data['Title'])
years = list(data['Rel_year'])
dir1 = list(data['Dir1'])
dir2 = list(data['Dir2'])
dir3 = list(data['Dir3'])

oscar_cnt = []
for t, yr, *directors in zip(titles, years, dir1, dir2, dir3):
    # only nominations from years strictly before the release year
    df_year = dir_noms[dir_noms['Year'] < yr]
    nominated_films = []
    for n, f in zip(list(df_year['Name']), list(df_year['Film'])):
        # skip nominations for a film with the same title as this one
        if t != f and any(n == director for director in directors):
            nominated_films.append(f)
    oscar_cnt.append(list(set(nominated_films)))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_dir_df = data[['Title','Rel_year','Dir1','Dir2','Dir3']].copy()
best_dir_df['Dir_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_dir_df[best_dir_df['Dir_noms'].map(len) > 0]

#### Best Picture (Directors)

In [None]:
# preview the columns at positions 1-12, which the next cell flattens into Best Picture rows
oscars_data.iloc[:,1:13]

In [None]:
# Pair every cell of the columns at positions 1-12 with its row's year.
total_lst = []
for col in oscars_data.columns[1:13]:
    category_lst = [
        [yr, nom]
        for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0])
    ]
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
best_pic_list = []
for category_rows in total_lst:
    best_pic_list.extend(category_rows)
best_pic_noms = (
    pd.DataFrame(best_pic_list, columns=['Year','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
best_pic_noms

In [None]:
# For every film, collect the Best Picture nominees that appear in the
# filmography ({film: year}) of one of its directors.
titles = list(data['Title'])
years = list(data['Rel_year'])
dir1 = list(data['Dir1_films'])
dir2 = list(data['Dir2_films'])
dir3 = list(data['Dir3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, dir1, dir2, dir3):
    # only nominations from years strictly before the release year
    df_year = best_pic_noms[best_pic_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_pic_df = data[['Title','Rel_year','Dir1','Dir1_films']].copy()
best_pic_df['Best_pic_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_pic_df[best_pic_df['Best_pic_noms'].map(len) > 0]

#### Best Foreign Film (Directors)

In [None]:
# preview the columns at positions 67-71, which the next cell flattens into foreign-film rows
oscars_data.iloc[:,67:72]

In [None]:
# Pair the film stored in each nominee dict (columns at positions 67-71)
# with its row's year.
total_lst = []
for col in oscars_data.columns[67:72]:
    category_lst = [
        [yr, list(set(nom.values()))[0]]
        for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0])
    ]
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
best_for_pic_list = []
for category_rows in total_lst:
    best_for_pic_list.extend(category_rows)
best_for_pic_noms = (
    pd.DataFrame(best_for_pic_list, columns=['Year','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
best_for_pic_noms

In [None]:
# For every film, collect the foreign-film nominees that appear in the
# filmography ({film: year}) of one of its directors.
titles = list(data['Title'])
years = list(data['Rel_year'])
dir1 = list(data['Dir1_films'])
dir2 = list(data['Dir2_films'])
dir3 = list(data['Dir3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, dir1, dir2, dir3):
    # only nominations from years strictly before the release year
    df_year = best_for_pic_noms[best_for_pic_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_for_pic_df = data[['Title','Rel_year','Dir1','Dir1_films']].copy()
best_for_pic_df['Best_for_pic_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_for_pic_df[best_for_pic_df['Best_for_pic_noms'].map(len) > 0]

#### Best Documentary (Directors)

In [None]:
# preview the columns at positions 72-76, which the next cell flattens into documentary rows
oscars_data.iloc[:,72:77]

In [None]:
# Pair every cell of the columns at positions 72-76 with its row's year.
total_lst = []
for col in oscars_data.columns[72:77]:
    category_lst = [
        [yr, nom]
        for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0])
    ]
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
best_doc_list = []
for category_rows in total_lst:
    best_doc_list.extend(category_rows)
best_doc_noms = (
    pd.DataFrame(best_doc_list, columns=['Year','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
best_doc_noms

In [None]:
# For every film, collect the documentary nominees that appear in the
# filmography ({film: year}) of one of its directors.
titles = list(data['Title'])
years = list(data['Rel_year'])
dir1 = list(data['Dir1_films'])
dir2 = list(data['Dir2_films'])
dir3 = list(data['Dir3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, dir1, dir2, dir3):
    # only nominations from years strictly before the release year
    df_year = best_doc_noms[best_doc_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_doc_df = data[['Title','Rel_year','Dir1','Dir1_films']].copy()
best_doc_df['Best_doc_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_doc_df[best_doc_df['Best_doc_noms'].map(len) > 0]

#### Best Animated Film (Directors)

In [None]:
# preview the columns from position 77 onward, which the next cell flattens into animated-film rows
oscars_data.iloc[:,77:]

In [None]:
# Pair every cell of the columns from position 77 onward with its row's year.
total_lst = []
for col in oscars_data.columns[77:]:
    category_lst = [
        [yr, nom]
        for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0])
    ]
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
best_anim_list = []
for category_rows in total_lst:
    best_anim_list.extend(category_rows)
best_anim_noms = (
    pd.DataFrame(best_anim_list, columns=['Year','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
best_anim_noms

In [None]:
# For every film, collect the animated-film nominees that appear in the
# filmography ({film: year}) of one of its directors.
titles = list(data['Title'])
years = list(data['Rel_year'])
dir1 = list(data['Dir1_films'])
dir2 = list(data['Dir2_films'])
dir3 = list(data['Dir3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, dir1, dir2, dir3):
    # only nominations from years strictly before the release year
    df_year = best_anim_noms[best_anim_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_anim_df = data[['Title','Rel_year','Dir1','Dir1_films']].copy()
best_anim_df['Best_anim_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_anim_df[best_anim_df['Best_anim_noms'].map(len) > 0]

In [None]:
# Gather the five director-related nomination lists next to each film.
# Explicit copy: the new columns must not be written into a slice of `data`.
dir_noms_df = data[['Title','Rel_year','Dir1','Dir2','Dir3']].copy()
# list(...) drops each source index so values are assigned by position
dir_noms_df['Best_dir_noms'] = list(best_dir_df['Dir_noms'])
dir_noms_df['Best_pic_noms'] = list(best_pic_df['Best_pic_noms'])
dir_noms_df['Best_for_pic_noms'] = list(best_for_pic_df['Best_for_pic_noms'])
dir_noms_df['Best_doc_noms'] = list(best_doc_df['Best_doc_noms'])
dir_noms_df['Best_anim_noms'] = list(best_anim_df['Best_anim_noms'])

In [None]:
# Count the distinct nominated films per row across the five list columns.
# Iterating the columns themselves (rather than a hard-coded range(622))
# keeps the count list the same length as the frame, whatever its size.
nom_cols = ['Best_dir_noms', 'Best_pic_noms', 'Best_for_pic_noms',
            'Best_doc_noms', 'Best_anim_noms']
cnt_lst = [
    len(set().union(*film_lists))
    for film_lists in zip(*(dir_noms_df[c] for c in nom_cols))
]

dir_noms_df['Dir_nom_cnt'] = cnt_lst

In [None]:
# show only the films whose directors have at least one counted nomination
dir_noms_df[dir_noms_df['Dir_nom_cnt']>0]

## Best Writers 

In [None]:
# preview the columns at positions 45-66, which the next cell flattens into writer nominations
oscars_data.iloc[:,45:67]

In [None]:
# Flatten the columns at positions 45-66 into [year, nominee, film] rows,
# grouped per column.
total_lst = []
for col in oscars_data.columns[45:67]:
    category_lst = []
    for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0]):
        people = list(nom.keys())
        # take one film value from the nominee dict
        film = list(set(nom.values()))[0]
        category_lst.extend([yr, person, film] for person in people)
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
writer_list = []
for category_rows in total_lst:
    writer_list.extend(category_rows)
writer_noms = (
    pd.DataFrame(writer_list, columns=['Year','Name','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
writer_noms

In [None]:
# For every film, collect the distinct earlier films for which one of its
# writers appears in writer_noms.
titles = list(data['Title'])
years = list(data['Rel_year'])
writer1 = list(data['Writer1'])
writer2 = list(data['Writer2'])
writer3 = list(data['Writer3'])

oscar_cnt = []
for t, yr, *writers in zip(titles, years, writer1, writer2, writer3):
    # only nominations from years strictly before the release year
    df_year = writer_noms[writer_noms['Year'] < yr]
    nominated_films = {
        f
        for n, f in zip(df_year['Name'], df_year['Film'])
        # skip nominations for a film with the same title as this one
        if t != f and any(n == writer for writer in writers)
    }
    oscar_cnt.append(list(nominated_films))

# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_writer_df = data[['Title','Rel_year','Writer1','Writer2','Writer3']].copy()
best_writer_df['Writer_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_writer_df[best_writer_df['Writer_noms'].map(len) > 0]

#### Best Picture (Writers)

In [None]:
# For every film, collect the Best Picture nominees that appear in the
# filmography ({film: year}) of one of its writers.
titles = list(data['Title'])
years = list(data['Rel_year'])
writer1 = list(data['Writer1_films'])
writer2 = list(data['Writer2_films'])
writer3 = list(data['Writer3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, writer1, writer2, writer3):
    # only nominations from years strictly before the release year
    df_year = best_pic_noms[best_pic_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_pic_writer_df = data[['Title','Rel_year','Writer1','Writer2','Writer3','Writer1_films']].copy()
best_pic_writer_df['Best_pic_writer_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_pic_writer_df[best_pic_writer_df['Best_pic_writer_noms'].map(len) > 0]

#### Best Foreign Film (Writers)

In [None]:
# For every film, collect the foreign-film nominees that appear in the
# filmography ({film: year}) of one of its writers.
titles = list(data['Title'])
years = list(data['Rel_year'])
writer1 = list(data['Writer1_films'])
writer2 = list(data['Writer2_films'])
writer3 = list(data['Writer3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, writer1, writer2, writer3):
    # only nominations from years strictly before the release year
    df_year = best_for_pic_noms[best_for_pic_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_for_pic_writer_df = data[['Title','Rel_year','Writer1','Writer2','Writer3','Writer1_films']].copy()
best_for_pic_writer_df['Best_for_pic_writer_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_for_pic_writer_df[best_for_pic_writer_df['Best_for_pic_writer_noms'].map(len) > 0]

#### Best Documentary (Writers)

In [None]:
# For every film, collect the documentary nominees that appear in the
# filmography ({film: year}) of one of its writers.
# `years` is reused from the earlier cell that built it from data['Rel_year'].
writer1 = list(data['Writer1_films'])
writer2 = list(data['Writer2_films'])
writer3 = list(data['Writer3_films'])

oscar_cnt = []
for yr, *filmographies in zip(years, writer1, writer2, writer3):
    # only nominations from years strictly before the release year
    df_year = best_doc_noms[best_doc_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_doc_writer_df = data[['Title','Rel_year','Writer1','Writer2','Writer3','Writer1_films']].copy()
best_doc_writer_df['Best_doc_writer_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_doc_writer_df[best_doc_writer_df['Best_doc_writer_noms'].map(len) > 0]

#### Best Animated Film (Writers)

In [None]:
# For every film, collect the animated-film nominees that appear in the
# filmography ({film: year}) of one of its writers.
# `years` and `writer1..3` are reused from the earlier cells that defined them.
oscar_cnt = []
for yr, *filmographies in zip(years, writer1, writer2, writer3):
    # only nominations from years strictly before the release year
    df_year = best_anim_noms[best_anim_noms['Year'] < yr]
    nominated_films = set()
    for f, yr_ in zip(df_year['Film'], df_year['Year']):
        for filmography in filmographies:
            # a missing filmography (None/NaN) contributes nothing
            if not isinstance(filmography, dict):
                continue
            # same title and years at most one apart count as the same film
            if f in filmography and abs(yr_ - filmography[f]) <= 1:
                nominated_films.add(f)
    oscar_cnt.append(list(nominated_films))

In [None]:
# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
best_anim_writer_df = data[['Title','Rel_year','Writer1','Writer2','Writer3','Writer1_films']].copy()
best_anim_writer_df['Best_anim_writer_noms'] = oscar_cnt
# show only films with at least one matched nomination
best_anim_writer_df[best_anim_writer_df['Best_anim_writer_noms'].map(len) > 0]

In [None]:
# Gather the five writer-related nomination lists next to each film.
# Explicit copy: the new columns must not be written into a slice of `data`.
writer_noms_df = data[['Title','Rel_year','Writer1','Writer2','Writer3']].copy()
# list(...) drops each source index so values are assigned by position
writer_noms_df['Best_writer_noms'] = list(best_writer_df['Writer_noms'])
writer_noms_df['Best_pic_noms'] = list(best_pic_writer_df['Best_pic_writer_noms'])
writer_noms_df['Best_for_pic_noms'] = list(best_for_pic_writer_df['Best_for_pic_writer_noms'])
writer_noms_df['Best_doc_noms'] = list(best_doc_writer_df['Best_doc_writer_noms'])
writer_noms_df['Best_anim_noms'] = list(best_anim_writer_df['Best_anim_writer_noms'])

In [None]:
# Count the distinct nominated films per row across the five list columns.
# Iterating the columns themselves (rather than a hard-coded range(622))
# keeps the count list the same length as the frame, whatever its size.
nom_cols = ['Best_writer_noms', 'Best_pic_noms', 'Best_for_pic_noms',
            'Best_doc_noms', 'Best_anim_noms']
cnt_lst = [
    len(set().union(*film_lists))
    for film_lists in zip(*(writer_noms_df[c] for c in nom_cols))
]

writer_noms_df['Writer_nom_cnt'] = cnt_lst

In [None]:
# show only the films whose writers have at least one counted nomination
writer_noms_df[writer_noms_df['Writer_nom_cnt']>0]

## Best Actors 

In [None]:
# preview the columns at positions 13-37, which the next cell flattens into actor nominations
oscars_data.iloc[:,13:38]

In [None]:
# Flatten the columns at positions 13-37 into [year, nominee, film] rows,
# grouped per column.
total_lst = []
for col in oscars_data.columns[13:38]:
    category_lst = []
    for nom, yr in zip(oscars_data[col], oscars_data.iloc[:, 0]):
        people = list(nom.keys())
        # take one film value from the nominee dict
        film = list(set(nom.values()))[0]
        category_lst.extend([yr, person, film] for person in people)
    total_lst.append(category_lst)

In [None]:
# Merge the per-column rows into one table, drop incomplete rows and order by year.
actor_list = []
for category_rows in total_lst:
    actor_list.extend(category_rows)
actor_noms = (
    pd.DataFrame(actor_list, columns=['Year','Name','Film'])
    .dropna()
    .sort_values(by='Year', ascending=True)
    .reset_index(drop=True)
)
actor_noms

In [None]:
# For every film, collect the earlier nominated films of its five listed
# cast members.
titles = list(data['Title'])
years = list(data['Rel_year'])
actor1 = list(data['Actor1'])
actor2 = list(data['Actor2'])
actor3 = list(data['Actor3'])
actor4 = list(data['Actor4'])
actor5 = list(data['Actor5'])

oscar_cnt = []
for t, yr, *cast in zip(titles, years, actor1, actor2, actor3, actor4, actor5):
    # only nominations from years strictly before the release year
    df_year = actor_noms[actor_noms['Year'] < yr]
    # Unlike the director/writer cells, duplicates are kept here (the
    # de-duplication step was commented out in the original), so a film
    # appears once per matching nomination row.
    nominated_films = [
        f
        for n, f in zip(df_year['Name'], df_year['Film'])
        # skip nominations for a film with the same title as this one
        if t != f and any(n == member for member in cast)
    ]
    oscar_cnt.append(nominated_films)

# Explicit copy: adding a column must not write into a slice of `data`
# (that triggers pandas' SettingWithCopyWarning, which is silenced above).
actor_noms_df = data[['Title','Rel_year','Actor1','Actor2','Actor3','Actor4','Actor5']].copy()
actor_noms_df['Actor_noms'] = oscar_cnt
# show only films with at least one matched nomination
actor_noms_df[actor_noms_df['Actor_noms'].map(len) > 0]

In [None]:
# Number of matched nomination entries per film, then the films with at least one.
actor_noms_df['Actor_nom_cnt'] = actor_noms_df['Actor_noms'].map(len)
actor_noms_df.loc[actor_noms_df['Actor_nom_cnt'] > 0]

## Add Subgenres

In [None]:
# read in master data file again as a separate frame (data2) for the sub-genre flags
# NOTE(review): unpickling can execute arbitrary code - only load a pickle from a trusted source.
data2 = pd.read_pickle('master_data_v2.pkl')

In [None]:
from ast import literal_eval

# Parse each Genre cell (a string holding a Python literal) and gather the
# distinct genre labels across all films.
genres = list({
    label
    for cell in data2['Genre']
    for label in literal_eval(cell)
})
genres

In [None]:
# Flag a film (1/0) as LGBTQ-themed when more than two of its plot keywords
# contain one of the terms below, or when it is one of two hand-picked titles.
# Iterating the columns (rather than a hard-coded range(622)) keeps the
# result the same length as the frame, whatever its size.
# NOTE(review): matching is by substring, so 'trans' also hits keywords such
# as 'transport'; only 'lgbt' is matched case-insensitively - confirm intended.
lgbt_terms = ('gay', 'lesbian', 'bisexual', 'trans', 'queer',
              'homosexual', 'homophob', 'closet')
manual_titles = ('John Was Trying to Contact Aliens', 'A Secret Love')

lgbt = []
for keywords, title in zip(data['Plot_keywords'], data['Title']):
    cnt = sum(
        1
        for k in keywords
        if 'lgbt' in k.lower() or any(term in k for term in lgbt_terms)
    )
    lgbt.append(1 if (cnt > 2) or (title in manual_titles) else 0)

In [None]:
# Store the flag on data2 and show the flagged films with their keywords.
data2['Genre_LGBTQ'] = lgbt
data_gay = data2.loc[:, ['Title','Plot_keywords','Genre_LGBTQ']]
data_gay[data_gay['Genre_LGBTQ'].eq(1)]

## Consolidate

In [None]:
# read in master data file again as a separate frame (data3) used to assemble the final table
# NOTE(review): unpickling can execute arbitrary code - only load a pickle from a trusted source.
data3 = pd.read_pickle('master_data_v2.pkl')

In [None]:
# 1 when the Lang value is exactly 'English', otherwise 0.
eng = [1 if l == 'English' else 0 for l in data3['Lang']]
data3['Lang_eng'] = eng

In [None]:
# Bucket each rating into a coarse category, one-hot encode the buckets
# and show the 'General' indicator column.
rating_groups = {
    'R': 'Restricted',
    'TV-MA': 'Restricted',
    'TV-14': 'Limited',
    'PG-13': 'Limited',
    'nan': 'Not rated',  # str() of a missing (NaN) rating
}
# any rating not listed above falls into 'General'
rating_cat = [rating_groups.get(str(r), 'General') for r in data3['Rating']]

df_ratings = pd.DataFrame(rating_cat, columns=['Rating_cat'])
df_ratings = pd.get_dummies(df_ratings, prefix=['Rating'])
df_ratings[['Rating_General']]

In [None]:
# concatenate everything
# Assemble the final table column-wise: data3's columns, taken in positional
# slices, with the engineered features inserted between them (Lang_eng,
# Genre_LGBTQ, Rating_General, Dir_nom_cnt, Writer_nom_cnt, Actor_nom_cnt).
# NOTE(review): pd.concat(axis=1) aligns on the index, so this assumes every
# piece shares data3's index - confirm (df_ratings is built with a fresh default index).
df_final_final = pd.concat([data3.iloc[:,:2],data3['Lang_eng'],data3.iloc[:,2:17],data2['Genre_LGBTQ'],\
                     data3.iloc[:,17:33],df_ratings['Rating_General'],data3.iloc[:,33:52],\
                     dir_noms_df['Dir_nom_cnt'],data3.iloc[:,52:65],writer_noms_df['Writer_nom_cnt'],\
                     data3.iloc[:,65:86],actor_noms_df['Actor_nom_cnt'],data3.iloc[:,86:91]], axis=1)

In [None]:
# list the column labels of the assembled frame to check the resulting order
df_final_final.columns

In [None]:
# save final file
# df_final_final.to_pickle('master_data_v3.pkl')