# Diversity in the film industry: data cleaning - Academy Awards

## 1. Importing libraries and dataset

The dataset *Nominees_raw* lists all Academy Awards nominees, with information on name, year, category, film and result.

In [14]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import re
from itertools import chain

In [2]:
# nominees until the year 2020
# The CSV's unnamed leading column is renamed to 'index' while loading.
nominees = (
    pd.read_csv("./data/nominees_raw.csv")
    .rename(columns={'Unnamed: 0': 'index'})
)
# Untouched snapshot of the loaded frame, kept for reference.
nominees_original = nominees.copy()

## 2. Scraping Wikipedia

The following wikipedia articles contain information on:
- [All female nominees in the Academy Awards](https://en.wikipedia.org/wiki/List_of_female_Academy_Award_winners_and_nominees_for_non-gendered_categories) (Includes only nominees for non-gendered categories. That is, all categories except *actress*, *actress in a supporting role* and *actress in a leading role*)
- [All black nominees in the Academy Awards](https://en.wikipedia.org/wiki/List_of_black_Academy_Award_winners_and_nominees)
- [All asian nominees in the Academy Awards](https://en.wikipedia.org/wiki/List_of_Asian_Academy_Award_winners_and_nominees)
- [All latin american nominees in the Academy Awards](https://en.wikipedia.org/wiki/List_of_Latin_American_Academy_Award_winners_and_nominees)
- [All hispanic nominees in the Academy Awards](https://en.wikipedia.org/wiki/List_of_Hispanic_Academy_Award_winners_and_nominees) (People of Hispanic descent born outside of Latin America)

By scraping these pages I will obtain 4 lists containing the names of female, black, Asian and Latin American nominees.

*Note: Hispanic and Latin American nominees are combined into the same list*

#### Creating the lists

In [6]:
# defining function to web scrape all the pages
    # The function returns a list with names, not cleaned

def wikipedia(name, num_tables, url):
    """Scrape the second column of the first ``num_tables`` tables at ``url``.

    Parameters
    ----------
    name : str
        Label of the list being built. Kept only for backward compatibility:
        the value is never used.
    num_tables : int
        Number of leading tables on the page to read.
    url : str
        Address of the page to parse with ``pandas.read_html``.

    Returns
    -------
    list of list
        One inner list per table, holding the values of the table's second
        column with its first data row skipped. Values are not cleaned.

    Raises
    ------
    IndexError
        If the page holds fewer than ``num_tables`` tables.
    """
    # Parse the page once; the previous version re-downloaded and re-parsed
    # it for every single table.
    tables = pd.read_html(url, header=0)
    return [list(tables[i].iloc[1:, 1]) for i in range(num_tables)]

In [4]:
# applying function: one nested list (a list per table) for each page

female = wikipedia(
    name='female',
    num_tables=23,
    url='https://en.wikipedia.org/wiki/List_of_female_Academy_Award_winners_and_nominees_for_non-gendered_categories',
)
black = wikipedia(
    name='black',
    num_tables=21,
    url='https://en.wikipedia.org/wiki/List_of_black_Academy_Award_winners_and_nominees',
)
asian = wikipedia(
    name='asian',
    num_tables=24,
    url='https://en.wikipedia.org/wiki/List_of_Asian_Academy_Award_winners_and_nominees',
)
# Wikipedia's "Latin American" list
latin_american = wikipedia(
    name='latin_american',
    num_tables=27,
    url='https://en.wikipedia.org/wiki/List_of_Latin_American_Academy_Award_winners_and_nominees',
)
# Wikipedia's "Hispanic" list, scraped separately
latin_american_us = wikipedia(
    name='latin_american_us',
    num_tables=10,
    url='https://en.wikipedia.org/wiki/List_of_Hispanic_Academy_Award_winners_and_nominees',
)

Each Wikipedia page is composed of several tables. In all but 2 of them the name is in the second column. The following code discards the values collected for those 2 tables and replaces them with the correct ones, taken from the fourth column.

In [8]:
# table 23 of 'asian'
# For this table the values collected from the second column are not the
# names; they are read from the fourth column (position 3) instead.

    # get list with correct names
url = 'https://en.wikipedia.org/wiki/List_of_Asian_Academy_Award_winners_and_nominees'
asian1 = list(pd.read_html(url, header=0)[23].iloc[1:, 3])

    # overwrite the wrong values in place (instead of `del` + `append`), so
    # that re-running this cell always targets the same table
asian[23] = asian1

In [9]:
# table 12 of 'latin_american'
# For this table the values collected from the second column are not the
# names; they are read from the fourth column (position 3) instead.

    # get list with correct names
url = 'https://en.wikipedia.org/wiki/List_of_Latin_American_Academy_Award_winners_and_nominees'
latin_american1 = list(pd.read_html(url, header=0)[12].iloc[1:, 3])

    # overwrite the wrong values in place: with `del latin_american[12]` +
    # `append`, every re-run of this cell deleted a different table's names
latin_american[12] = latin_american1

In [10]:
# Extend latin_american with the tables scraped into latin_american_us
latin_american = [*latin_american, *latin_american_us]

In [11]:
# Shallow snapshots of the nested lists, taken before they are flattened
female_original = list(female)
black_original = list(black)
asian_original = list(asian)
latin_american_original = list(latin_american)

#### Unnesting lists and making sets to remove duplicate names

In [15]:
# Flatten each list of per-table lists into one flat list of names
female = list(chain.from_iterable(female))
black = list(chain.from_iterable(black))
asian = list(chain.from_iterable(asian))
latin_american = list(chain.from_iterable(latin_american))

In [16]:
# Deduplicate every list and sort it so the outcome is reproducible.
#
# The previous version dropped "element 0" of `list(set(...))` for black and
# asian. A set has no defined order (string hashes are randomised per
# process), so that removed an arbitrary entry. The non-name entry is now
# filtered out explicitly instead.
# NOTE(review): assumes the entry that had to go was a non-string value
# (e.g. NaN from an empty table cell) - confirm against the scraped data.

# female
female = sorted(set(female))
# NOTE(review): the first value in sort order is discarded, exactly as before;
# presumably a junk entry - confirm which value this is.
female = female[1:]

# black
black = sorted({entry for entry in black if isinstance(entry, str)})

# asian
asian = sorted({entry for entry in asian if isinstance(entry, str)})

# latin_american
latin_american = sorted({entry for entry in latin_american if isinstance(entry, str)})

#### Cleaning lists

Given the format of the wikipedia table, some values in the lists created were not accurate. To solve the issue, 2 steps have been followed:
1. Splitting elements of the list where two names were appended together
2. Removing every item with fewer than 3 characters

*Splitting elements of the list where two names were appended together*



In [23]:
# function to clean lists
def clean_list(mylist):
    """Clean a list of scraped names in place and return it.

    Three steps are applied to every entry:

    1. Text enclosed in ``(...)`` or ``[...]`` is removed.
    2. Entries in which several names were fused together are split wherever
       a lowercase letter is directly followed by an uppercase one. The first
       name stays at the entry's position; the others are appended to the end
       of the list. Any number of fused names is handled (the previous
       version only handled two or three).
    3. Leading and trailing whitespace is stripped.

    NOTE(review): step 2 also splits a single name containing a
    lowercase-uppercase sequence; such names need manual correction.

    Parameters
    ----------
    mylist : list of str
        Names to clean. The list is modified in place.

    Returns
    -------
    list of str
        The same list object, cleaned.
    """
    # Raw strings: the previous non-raw literal relied on invalid escape
    # sequences such as "\(".
    bracketed = re.compile(r"[\(\[].*?[\)\]]")
    fused_boundary = re.compile(r'(?=[a-z][A-Z])')

    split_off = []
    for position, raw in enumerate(mylist):
        parts = fused_boundary.split(bracketed.sub("", raw))
        names = []
        for k, part in enumerate(parts):
            # Each split happens *before* the lowercase letter, so every part
            # after the first starts with the last letter of the previous name.
            head = part if k == 0 else part[1:]
            tail = parts[k + 1][0] if k + 1 < len(parts) else ""
            names.append(head + tail)
        mylist[position] = names[0]
        split_off.extend(names[1:])

    mylist.extend(split_off)
    mylist[:] = [entry.strip() for entry in mylist]
    return mylist

In [24]:
# Run clean_list over each of the four lists, in the same order as before
female, black, asian, latin_american = (
    clean_list(group) for group in (female, black, asian, latin_american)
)

*Removing every item with fewer than 3 characters*

In [25]:
# female: keep only entries of at least 3 characters
female = [name for name in female if len(name) >= 3]

In [26]:
# black
# 'SZAAnthony Tiffith' is two names fused without a lowercase-uppercase
# boundary, so clean_list cannot split it; it is replaced by hand.
# NOTE(review): only 'Anthony Tiffith' is added back, as before - confirm
# whether 'SZA' should be added too.
fused_entry = 'SZAAnthony Tiffith'
had_fused_entry = fused_entry in black

# Filtering instead of list.remove(): remove() raises ValueError when the
# value is absent, e.g. if the page changed or this cell is run twice.
black = [
    name for name in black
    if len(name) > 2 and name not in ('Daniel', fused_entry)
]
if had_fused_entry and 'Anthony Tiffith' not in black:
    black.append('Anthony Tiffith')

In [27]:
# asian: keep only entries of at least 3 characters
asian = [name for name in asian if len(name) >= 3]

In [28]:
# latin_american: keep only entries of at least 3 characters
latin_american = [name for name in latin_american if len(name) >= 3]

## 3. Merging lists with 'nominees' dataframe

My goal is to add a new column to the dataframe for each demographic variable (female, black, Asian, Latin American). To do so, I will create a function that returns the indexes of the *nominees* dataframe where the name of the nominee contains any of the names in the previously created lists. Afterwards, I will add new columns that contain 'True' for every index found for each demographic variable.

In [17]:
# defining function to get indexes of nominees who are female/black/asian/latin_american

def checker(variable, df=None):
    """Return the index labels of rows whose 'name' contains any given name.

    Matching is a case-sensitive, literal substring test. The previous
    version let pandas interpret each name as a regular expression, so names
    containing characters such as ``.`` or ``(`` matched the wrong rows or
    raised an error.

    Parameters
    ----------
    variable : iterable of str
        Names to look for. Empty or non-string entries are skipped.
    df : pandas.DataFrame, optional
        Frame with a 'name' column to search. Defaults to the module-level
        ``nominees`` frame, which was the only option before.

    Returns
    -------
    list
        Index labels of the matching rows, grouped per searched name in input
        order. A row matched by several names appears several times.
    """
    frame = nominees if df is None else df
    name_column = frame['name']

    matched = []
    for person in variable:
        if not isinstance(person, str) or not person:
            continue
        # na=False: rows without a name never match (and cannot break the mask)
        mask = name_column.str.contains(person, case=True, regex=False, na=False)
        matched.extend(mask[mask].index)
    return matched

In [18]:
# Look up the matching row indexes for each of the four name lists

female_indexes, black_indexes, asian_indexes, latin_american_indexes = (
    checker(group) for group in (female, black, asian, latin_american)
)

  return func(self, *args, **kwargs)


In [19]:
# creating new columns for each parameter, returning True for indexes in each list

# One vectorised membership test per column replaces four row-wise `apply`
# passes and four redefinitions of the same `artists` helper.
# The values are compared with the 'index' column, as before.
indexes_by_column = {
    'female': female_indexes,
    'black': black_indexes,
    'asian': asian_indexes,
    'latin_american': latin_american_indexes,
}
for column, matched_indexes in indexes_by_column.items():
    nominees[column] = nominees['index'].isin(matched_indexes)


#### Changing value of 'female' to 'True' for female categories
Recall that the *female* list only contained the nominees for non-gendered categories. We thus need to change *female* to *True* for each female category.

In [20]:
# find gendered categories
nominees['category'].unique()
# Categories in which every nominee is a woman
gendered_cat_women = [
    'ACTRESS',
    'ACTRESS IN A SUPPORTING ROLE',
    'ACTRESS IN A LEADING ROLE',
]

In [22]:
# Flag every row of an actress category as female in a single assignment
in_women_category = nominees['category'].isin(gendered_cat_women)
nominees.loc[in_women_category, 'female'] = True

#### Adding 'non_white' column
Add non_white column for future analysis

In [36]:
def non_white(row):
    """Return True when any of the row's 'black', 'asian' or 'latin_american' values equals True."""
    return any(
        row[group] == True
        for group in ('black', 'asian', 'latin_american')
    )

# Evaluate non_white row by row and store the result as a new column
nominees['non_white'] = nominees.apply(non_white, axis=1)

#### Adding 'gendered_cat' column

Add a column that returns *True* for gendered categories and *False* for non-gendered categories. 

In [23]:
# Award categories that are restricted to one gender
gendered_cat = [
    'ACTRESS',
    'ACTRESS IN A SUPPORTING ROLE',
    'ACTRESS IN A LEADING ROLE',
    'ACTOR',
    'ACTOR IN A SUPPORTING ROLE',
    'ACTOR IN A LEADING ROLE',
]

def gendered_categories(row):
    """Return True when the row's 'category' is listed in ``gendered_cat``."""
    category = row['category']
    return category in gendered_cat
        
# Evaluate gendered_categories row by row and store the result as a new column
nominees['gendered_cat'] = nominees.apply(gendered_categories, axis=1)

#### Removing first column (index)

In [24]:
# Remove the helper 'index' column from the frame in place
nominees.drop(columns='index', inplace=True)

#### Exporting clean dataset

In [43]:
# index=True is the default, spelled out here: the row index is written as an
# unnamed first column of the CSV.
nominees.to_csv('./data/nominees_clean.csv', index=True)