In [266]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re

## Acquire and Prepare

I am going to pull in each award show dataset to clean up to then combine dataframes.

### The Academy Awards

In [103]:
df = pd.read_csv('data/academy_awards.csv')

In [85]:
df.head()

Unnamed: 0,year,category,winner,entity
0,1927,ACTOR,False,Richard Barthelmess
1,1927,ACTOR,True,Emil Jannings
2,1927,ACTRESS,False,Louise Dresser
3,1927,ACTRESS,True,Janet Gaynor
4,1927,ACTRESS,False,Gloria Swanson


In [4]:
df.tail()

Unnamed: 0,year,category,winner,entity
11053,2017,SCIENTIFIC AND TECHNICAL AWARD (Scientific and...,True,"To LEONARD CHAPMAN for the overall concept, de..."
11054,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To JASON SMITH and JEFF WHITE for the original...
11055,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,"To JOE MANCEWICZ, MATT DERKSEN and HANS RIJPKE..."
11056,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To ALEX POWELL for his contribution to the des...
11057,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To ROB JENSEN for the foundational design and ...


In [5]:
df.shape

(11058, 4)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11058 entries, 0 to 11057
Data columns (total 4 columns):
year        11058 non-null int64
category    11058 non-null object
winner      11058 non-null bool
entity      11058 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 270.1+ KB


In [7]:
df.year.unique()

array([1927, 1928, 1929, 1930, 1931, 1932, 1934, 1935, 1936, 1937, 1938,
       1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949,
       1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017])

In [8]:
df.category.unique()

array(['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY',
       'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)',
       'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE',
       'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)',
       'WRITING (Original Story)', 'WRITING (Title Writing)',
       'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION',
       'SOUND RECORDING', 'SCIENTIFIC OR TECHNICAL AWARD (Class I)',
       'SCIENTIFIC OR TECHNICAL AWARD (Class II)',
       'SCIENTIFIC OR TECHNICAL AWARD (Class III)',
       'SHORT SUBJECT (Cartoon)', 'SHORT SUBJECT (Comedy)',
       'SHORT SUBJECT (Novelty)', 'ASSISTANT DIRECTOR', 'FILM EDITING',
       'MUSIC (Scoring)', 'MUSIC (Song)', 'DANCE DIRECTION',
       'WRITING (Screenplay)', 'ACTOR IN A SUPPORTING ROLE',
       'ACTRESS IN A SUPPORTING ROLE', 'SHORT SUBJECT (Color)',
       'SHORT SUBJECT (One-reel)', 'SHORT SUBJECT (Two-reel)',
       'IRVING G. THALBERG MEMORIAL AWARD', 'MUSIC (Original Scor

### To Do: 
- **Pull out only the categories I want to use**
    - Best Film
    - Best Actor/Actress
    - Supporting Actor/Actress

- **Normalize the categories throughout the years.**
- **Normalize the text**
- **Change title of 'winner' category**
- **Add in the years 2018, 2019, 2020**

In [None]:
# categories I need to pull out are:
# actress, actor, actress in a supporting role, actor in a supporting role, actor in a leading role,
# actress in a leading role, best motion picture, outstanding picture, outstanding motion picture

In [74]:
# lowercase letters
df['entity'].str.lower()

0                                      richard barthelmess
1                                            emil jannings
2                                           louise dresser
3                                             janet gaynor
4                                           gloria swanson
                               ...                        
11053    to leonard chapman for the overall concept, de...
11054    to jason smith and jeff white for the original...
11055    to joe mancewicz, matt derksen and hans rijpke...
11056    to alex powell for his contribution to the des...
11057    to rob jensen for the foundational design and ...
Name: entity, Length: 11058, dtype: object

In [38]:
df[df.category == 'ACTOR'].replace('ACTOR', 'lead_actor')

Unnamed: 0,year,category,winner,entity
0,1927,lead_actor,False,Richard Barthelmess
1,1927,lead_actor,True,Emil Jannings
35,1928,lead_actor,False,George Bancroft
36,1928,lead_actor,True,Warner Baxter
37,1928,lead_actor,False,Chester Morris
...,...,...,...,...
5560,1975,lead_actor,False,Walter Matthau
5561,1975,lead_actor,True,Jack Nicholson
5562,1975,lead_actor,False,Al Pacino
5563,1975,lead_actor,False,Maximilian Schell


In [39]:
df[df.category == 'ACTOR IN A LEADING ROLE'].replace('ACTOR IN A LEADING ROLE', 'lead_actor')

Unnamed: 0,year,category,winner,entity
5676,1976,lead_actor,False,Robert De Niro
5677,1976,lead_actor,True,Peter Finch
5678,1976,lead_actor,False,Giancarlo Giannini
5679,1976,lead_actor,False,William Holden
5680,1976,lead_actor,False,Sylvester Stallone
...,...,...,...,...
10920,2017,lead_actor,False,Timothée Chalamet
10921,2017,lead_actor,False,Daniel Day-Lewis
10922,2017,lead_actor,False,Daniel Kaluuya
10923,2017,lead_actor,True,Gary Oldman


In [41]:
df[df.category == 'ACTRESS'].replace('ACTRESS', 'lead_actress')

Unnamed: 0,year,category,winner,entity
2,1927,lead_actress,False,Louise Dresser
3,1927,lead_actress,True,Janet Gaynor
4,1927,lead_actress,False,Gloria Swanson
40,1928,lead_actress,False,Ruth Chatterton
41,1928,lead_actress,False,Betty Compson
...,...,...,...,...
5570,1975,lead_actress,False,Isabelle Adjani
5571,1975,lead_actress,False,Ann-Margret
5572,1975,lead_actress,True,Louise Fletcher
5573,1975,lead_actress,False,Glenda Jackson


In [42]:
df[df.category == 'ACTRESS IN A LEADING ROLE'].replace("ACTRESS IN A LEADING ROLE", 'lead_actress')

Unnamed: 0,year,category,winner,entity
5686,1976,lead_actress,False,Marie-Christine Barrault
5687,1976,lead_actress,True,Faye Dunaway
5688,1976,lead_actress,False,Talia Shire
5689,1976,lead_actress,False,Sissy Spacek
5690,1976,lead_actress,False,Liv Ullmann
...,...,...,...,...
10930,2017,lead_actress,False,Sally Hawkins
10931,2017,lead_actress,True,Frances McDormand
10932,2017,lead_actress,False,Margot Robbie
10933,2017,lead_actress,False,Saoirse Ronan


In [43]:
df[df.category == 'ACTRESS IN A SUPPORTING ROLE'].replace("ACTRESS IN A SUPPORTING ROLE", 'supporting_actress')

Unnamed: 0,year,category,winner,entity
453,1936,supporting_actress,False,Beulah Bondi
454,1936,supporting_actress,False,Alice Brady
455,1936,supporting_actress,False,Bonita Granville
456,1936,supporting_actress,False,Maria Ouspenskaya
457,1936,supporting_actress,True,Gale Sondergaard
...,...,...,...,...
10935,2017,supporting_actress,False,Mary J. Blige
10936,2017,supporting_actress,True,Allison Janney
10937,2017,supporting_actress,False,Lesley Manville
10938,2017,supporting_actress,False,Laurie Metcalf


In [44]:
df[df.category == 'ACTOR IN A SUPPORTING ROLE'].replace("ACTOR IN A SUPPORTING ROLE", 'supporting_actor')

Unnamed: 0,year,category,winner,entity
443,1936,supporting_actor,False,Mischa Auer
444,1936,supporting_actor,True,Walter Brennan
445,1936,supporting_actor,False,Stuart Erwin
446,1936,supporting_actor,False,Basil Rathbone
447,1936,supporting_actor,False,Akim Tamiroff
...,...,...,...,...
10925,2017,supporting_actor,False,Willem Dafoe
10926,2017,supporting_actor,False,Woody Harrelson
10927,2017,supporting_actor,False,Richard Jenkins
10928,2017,supporting_actor,False,Christopher Plummer


In [55]:
df[df.category == 'OUTSTANDING PICTURE']

Unnamed: 0,year,category,winner,entity
19,1927,OUTSTANDING PICTURE,False,The Caddo Company
20,1927,OUTSTANDING PICTURE,False,Fox
21,1927,OUTSTANDING PICTURE,True,Paramount Famous Lasky
62,1928,OUTSTANDING PICTURE,False,Feature Productions
63,1928,OUTSTANDING PICTURE,False,Fox
64,1928,OUTSTANDING PICTURE,True,Metro-Goldwyn-Mayer
65,1928,OUTSTANDING PICTURE,False,Metro-Goldwyn-Mayer
66,1928,OUTSTANDING PICTURE,False,Paramount Famous Lasky


In [63]:
df[df.category == 'OUTSTANDING PRODUCTION']

Unnamed: 0,year,category,winner,entity
100,1929,OUTSTANDING PRODUCTION,True,All Quiet on the Western Front
101,1929,OUTSTANDING PRODUCTION,False,The Big House
102,1929,OUTSTANDING PRODUCTION,False,Disraeli
103,1929,OUTSTANDING PRODUCTION,False,The Divorcee
104,1929,OUTSTANDING PRODUCTION,False,The Love Parade
...,...,...,...,...
1081,1940,OUTSTANDING PRODUCTION,False,The Letter
1082,1940,OUTSTANDING PRODUCTION,False,The Long Voyage Home
1083,1940,OUTSTANDING PRODUCTION,False,Our Town
1084,1940,OUTSTANDING PRODUCTION,False,The Philadelphia Story


In [53]:
df[df.category == 'OUTSTANDING MOTION PICTURE']

Unnamed: 0,year,category,winner,entity
1251,1941,OUTSTANDING MOTION PICTURE,False,Blossoms in the Dust
1252,1941,OUTSTANDING MOTION PICTURE,False,Citizen Kane
1253,1941,OUTSTANDING MOTION PICTURE,False,Here Comes Mr. Jordan
1254,1941,OUTSTANDING MOTION PICTURE,False,Hold Back the Dawn
1255,1941,OUTSTANDING MOTION PICTURE,True,How Green Was My Valley
1256,1941,OUTSTANDING MOTION PICTURE,False,The Little Foxes
1257,1941,OUTSTANDING MOTION PICTURE,False,The Maltese Falcon
1258,1941,OUTSTANDING MOTION PICTURE,False,One Foot in Heaven
1259,1941,OUTSTANDING MOTION PICTURE,False,Sergeant York
1260,1941,OUTSTANDING MOTION PICTURE,False,Suspicion


In [57]:
df[df.category == 'BEST MOTION PICTURE']

Unnamed: 0,year,category,winner,entity
1821,1944,BEST MOTION PICTURE,False,Double Indemnity
1822,1944,BEST MOTION PICTURE,False,Gaslight
1823,1944,BEST MOTION PICTURE,True,Going My Way
1824,1944,BEST MOTION PICTURE,False,Since You Went Away
1825,1944,BEST MOTION PICTURE,False,Wilson
...,...,...,...,...
3980,1961,BEST MOTION PICTURE,False,Fanny
3981,1961,BEST MOTION PICTURE,False,The Guns of Navarone
3982,1961,BEST MOTION PICTURE,False,The Hustler
3983,1961,BEST MOTION PICTURE,False,Judgment at Nuremberg


In [56]:
df[df.category == 'BEST PICTURE']

Unnamed: 0,year,category,winner,entity
4107,1962,BEST PICTURE,True,Lawrence of Arabia
4108,1962,BEST PICTURE,False,The Longest Day
4109,1962,BEST PICTURE,False,Meredith Willson's The Music Man
4110,1962,BEST PICTURE,False,Mutiny on the Bounty
4111,1962,BEST PICTURE,False,To Kill a Mockingbird
...,...,...,...,...
10997,2017,BEST PICTURE,False,Lady Bird
10998,2017,BEST PICTURE,False,Phantom Thread
10999,2017,BEST PICTURE,False,The Post
11000,2017,BEST PICTURE,True,The Shape of Water


In [75]:
# rename category strings needed
df.replace({'category': {'ACTOR': 'lead_actor', 'ACTOR IN A LEADING ROLE': 'lead_actor',
                         'ACTRESS': 'lead_actress', 'ACTRESS IN A SUPPORTING ROLE': 'lead_actress',
                         'ACTRESS IN A SUPPORTING ROLE': 'supporting_actress', 
                         'ACTOR IN A SUPPORTING ROLE': 'supporting_actor',
                         'OUTSTANDING PICTURE': 'best_picture','OUTSTANDING PRODUCTION': 'best_picture', 
                         'OUTSTANDING MOTION PICTURE': 'best_picture', 'BEST MOTION PICTURE': 'best_picture', 
                         'BEST PICTURE': 'bset_picture'}})

Unnamed: 0,year,category,winner,entity
0,1927,lead_actor,False,Richard Barthelmess
1,1927,lead_actor,True,Emil Jannings
2,1927,lead_actress,False,Louise Dresser
3,1927,lead_actress,True,Janet Gaynor
4,1927,lead_actress,False,Gloria Swanson
...,...,...,...,...
11053,2017,SCIENTIFIC AND TECHNICAL AWARD (Scientific and...,True,"To LEONARD CHAPMAN for the overall concept, de..."
11054,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To JASON SMITH and JEFF WHITE for the original...
11055,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,"To JOE MANCEWICZ, MATT DERKSEN and HANS RIJPKE..."
11056,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To ALEX POWELL for his contribution to the des...


In [278]:
def normalize_oscars():
    df = pd.read_csv('data/academy_awards.csv')
    df = df.replace({'category': {'ACTOR': 'lead_actor', 'ACTOR IN A LEADING ROLE': 'lead_actor',
                         'ACTRESS': 'lead_actress', 'ACTRESS IN A SUPPORTING ROLE': 'lead_actress',
                         'ACTRESS IN A SUPPORTING ROLE': 'supporting_actress', 
                         'ACTOR IN A SUPPORTING ROLE': 'supporting_actor',
                         'OUTSTANDING PICTURE': 'best_picture','OUTSTANDING PRODUCTION': 'best_picture', 
                         'OUTSTANDING MOTION PICTURE': 'best_picture', 'BEST MOTION PICTURE': 'best_picture', 
                         'BEST PICTURE': 'bset_picture'}})
    df1 = df[df.category == 'best_picture']
    df2 = df[df.category == 'lead_actor']
    df3 = df[df.category == 'lead_actress']
    df4 = df[df.category == 'supporting_actor']
    df5 = df[df.category == 'supporting_actress']
    df = pd.concat([df1, df2, df3, df4, df5])
    
    newlist = [[2018, 'best_picture', True, 'The Shape of Water'],
               [2018, 'best_picture', False, 'Call Me By Your Name'],
               [2018, 'best_picture', False, 'Darkest Hour'],
               [2018, 'best_picture', False, 'Dunkirk'],
               [2018, 'best_picture', False, 'Get Out'],
               [2018, 'best_picture', False, 'Lady Bird'],
               [2018, 'best_picture', False, 'Phantom Thread'],
               [2018, 'best_picture', False, 'The Post'],
               [2018, 'best_picture', False, 'Three Billbaords Outside of Ebbing, Missouri'],
               [2018, 'lead_actor', True, 'Gary Oldman'], 
               [2018, 'lead_actor', False, 'Timothee Chalamet'],
               [2018, 'lead_actor', False, 'Daniel Day-Lewis'],
               [2018, 'lead_actor', False, 'Daniel Kaluuya'],
               [2018, 'lead_actor', False, 'Danzel Washington'],
               [2018, 'lead_actress', True, 'Frances McDormand'], 
               [2018, 'lead_actress', False, 'Sally Hawkins'],
               [2018, 'lead_actress', False, 'Margot Robbie'],
               [2018, 'lead_actress', False, 'Saoirse Ronan'],
               [2018, 'lead_actress', False, 'Meryl Streep'],
               [2018, 'supporting_actor', True, 'Gary Oldman'], 
               [2018, 'supporting_actor', False, 'Timothee Chalamet'],
               [2018, 'supporting_actor', False, 'Daniel Day-Lewis'],
               [2018, 'supporting_actor', False, 'Daniel Kaluuya'],
               [2018, 'supporting_actor', False, 'Danzel Washington'],
               [2018, 'supporting_actress', True, 'Frances McDormand'], 
               [2018, 'supporting_actress', False, 'Sally Hawkins'],
               [2018, 'supporting_actress', False, 'Margot Robbie'],
               [2018, 'supporting_actress', False, 'Saoirse Ronan'],
               [2018, 'supporting_actress', False, 'Meryl Streep'],
               [2019, 'best_picture', True, 'Green Book'],
               [2019, 'best_picture', False, 'Black Panther'],
               [2019, 'best_picture', False, 'Blackkklansman'],
               [2019, 'best_picture', False, 'Bohemian Rhaspody'],
               [2019, 'best_picture', False, 'The Favourite'],
               [2019, 'best_picture', False, 'Roma'],
               [2019, 'best_picture', False, 'A Star Is Born'],
               [2019, 'best_picture', False, 'Vice'],
               [2019, 'lead_actor', True, 'Rami Malek'], 
               [2019, 'lead_actor', False, 'Christian Bale'],
               [2019, 'lead_actor', False, 'Bradley Cooper'],
               [2019, 'lead_actor', False, 'Willem Dafoe'],
               [2019, 'lead_actor', False, 'Viggo Mortensen'],
               [2019, 'lead_actress', True, 'Olivia Coleman'], 
               [2019, 'lead_actress', False, 'Yalitza Aparicio'],
               [2019, 'lead_actress', False, 'Glenn Close'],
               [2019, 'lead_actress', False, 'Lady Gaga'],
               [2019, 'lead_actress', False, 'Melissa McCarthy'],
               [2019, 'supporting_actor', True, 'Mahershala Ali'], 
               [2019, 'supporting_actor', False, 'Adam Driver'],
               [2019, 'supporting_actor', False, 'Sam Elliott'],
               [2019, 'supporting_actor', False, 'Richard E. Grant'],
               [2019, 'supporting_actor', False, 'Sam Rockwell'],
               [2019, 'supporting_actress', True, 'Regina King'], 
               [2019, 'supporting_actress', False, 'Amy Adams'],
               [2019, 'supporting_actress', False, 'Marina De Tavira'],
               [2019, 'supporting_actress', False, 'Emma Stone'],
               [2019, 'supporting_actress', False, 'Rachel Weisz'],
               [2020, 'best_picture', False, 'Ford V Ferrari'],
               [2020, 'best_picture', False, 'The Irishman'],
               [2020, 'best_picture', False, 'Jojo Rabbit'],
               [2020, 'best_picture', False, 'Joker'],
               [2020, 'best_picture', False, 'Little Woman'],
               [2020, 'best_picture', False, 'Marriage Story'],
               [2020, 'best_picture', False, '1917'],
               [2020, 'best_picture', False, 'Once Upon A Time In Hollywood'],
               [2020, 'best_picture', False, 'Parasite'],
               [2020, 'lead_actor', False, 'Antonio Banderas'], 
               [2020, 'lead_actor', False, 'Leonardo Dicaprio'],
               [2020, 'lead_actor', False, 'Adam Driver'],
               [2020, 'lead_actor', False, 'Joaquin Phoenix'],
               [2020, 'lead_actor', False, 'Jonathan Pryce'],
               [2020, 'lead_actress', False, 'Cynthia Erivo'], 
               [2020, 'lead_actress', False, 'Scarlett Johansson'],
               [2020, 'lead_actress', False, 'Saoirse Ronan'],
               [2020, 'lead_actress', False, 'Charlize Theron'],
               [2020, 'lead_actress', False, 'Renee Zellweger'],
               [2020, 'supporting_actor', False, 'Tom Hanks'], 
               [2020, 'supporting_actor', False, 'Anthony Hopkins'],
               [2020, 'supporting_actor', False, 'Al Pacino'],
               [2020, 'supporting_actor', False, 'Joe Pesci'],
               [2020, 'supporting_actor', False, 'Brad Pitt'],
               [2020, 'supporting_actress', False, 'Kathy Bates'], 
               [2020, 'supporting_actress', False, 'Laura Dern'],
               [2020, 'supporting_actress', False, 'Scarlett Johansson'],
               [2020, 'supporting_actress', False, 'Florence Pugh'],
               [2020, 'supporting_actress', False, 'Margot Robbie']]
    addition = pd.DataFrame(newlist, columns = ['year', 'category', 'winner', 'entity'])
    df = df.append(addition)
    
    df.entity = df.entity.str.lower()
    df = df.reset_index().drop(columns='index')
    df = df.rename(columns = {'winner':'oscar_wins', 'entity': 'nominee'})
    df.year = df.year.astype(object)
    return df

In [279]:
test = normalize_oscars()

In [280]:
test

Unnamed: 0,year,category,oscar_wins,nominee
0,1927,best_picture,False,the caddo company
1,1927,best_picture,False,fox
2,1927,best_picture,True,paramount famous lasky
3,1928,best_picture,False,feature productions
4,1928,best_picture,False,fox
...,...,...,...,...
1809,2020,supporting_actress,False,kathy bates
1810,2020,supporting_actress,False,laura dern
1811,2020,supporting_actress,False,scarlett johansson
1812,2020,supporting_actress,False,florence pugh


In [240]:
test.shape

(1814, 4)

In [282]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1814 entries, 0 to 1813
Data columns (total 4 columns):
year          1814 non-null object
category      1814 non-null object
oscar_wins    1814 non-null bool
nominee       1814 non-null object
dtypes: bool(1), object(3)
memory usage: 44.4+ KB


In [281]:
test.category.unique()

array(['best_picture', 'lead_actor', 'lead_actress', 'supporting_actor',
       'supporting_actress'], dtype=object)

In [243]:
test.year.unique()

array([1927, 1928, 1929, 1930, 1931, 1932, 1934, 1935, 1936, 1937, 1938,
       1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949,
       1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020], dtype=object)

### The British Academy Film Awards

In [283]:
df = pd.read_csv('data/bafta_awards.csv')

In [284]:
df.head()

Unnamed: 0,year,category,nominee,workers,winner
0,1949,Film | British Film in 1949,The Fallen Idol,,True
1,1949,Film | British Film in 1949,Hamlet,,False
2,1949,Film | British Film in 1949,Oliver Twist,,False
3,1949,Film | British Film in 1949,Once A Jolly Swagman,,False
4,1949,Film | British Film in 1949,The Red Shoes,,False


In [261]:
df.tail()

Unnamed: 0,year,category,nominee,workers,winner
4171,2020,Film | Supporting Actress in 2020,Marriage Story,Laura Dern,True
4172,2020,Film | Supporting Actress in 2020,Jojo Rabbit,Scarlett Johansson,False
4173,2020,Film | Supporting Actress in 2020,Little Women,Florence Pugh,False
4174,2020,Film | Supporting Actress in 2020,Bombshell,Margot Robbie,False
4175,2020,Film | Supporting Actress in 2020,Once Upon a Time.. in Hollywood,Margot Robbie,False


In [262]:
df.shape

(4176, 5)

In [263]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 5 columns):
year        4176 non-null int64
category    4176 non-null object
nominee     4176 non-null object
workers     3402 non-null object
winner      4176 non-null bool
dtypes: bool(1), int64(1), object(3)
memory usage: 134.7+ KB


In [264]:
df.year.unique()

array([1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
       1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 2001, 2016, 2017, 2018, 2019, 2020])

In [265]:
df.category.unique()

array(['Film | British Film in 1949', 'Film | Documentary in 1949',
       'Film | Film From Any Source in 1949',
       'Film | Special Award in 1949',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1949',
       'Film | British Film in 1950', 'Film | Documentary in 1950',
       'Film | Film From Any Source in 1950',
       'Film | Special Award in 1950',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1950',
       'Film | British Film in 1951', 'Film | Documentary in 1951',
       'Film | Film From Any Source in 1951',
       'Film | Special Award in 1951',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1951',
       'Film | British Film in 1952', 'Film | Documentary in 1952',
       'Film | Film From Any Source in 1952',
       'Film | Spec

### To Do:
- **Pull out unique categories wanted**
    - Best Film
    - Best Actor/Actress
    - Supporting Actor/Actress
- **Normalize text**
- **Drop 'workers' column**
- **Change 'winner' column to reflect award**
- **Clean you category names**

In [285]:
df.year = df.year.astype(object)

In [286]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 5 columns):
year        4176 non-null object
category    4176 non-null object
nominee     4176 non-null object
workers     3402 non-null object
winner      4176 non-null bool
dtypes: bool(1), object(4)
memory usage: 134.7+ KB


In [287]:
df.nominee.str.lower()

0                       the fallen idol
1                                hamlet
2                          oliver twist
3                  once a jolly swagman
4                         the red shoes
                     ...               
4171                     marriage story
4172                        jojo rabbit
4173                       little women
4174                          bombshell
4175    once upon a time.. in hollywood
Name: nominee, Length: 4176, dtype: object

In [289]:
df.drop(columns=('workers'))

Unnamed: 0,year,category,nominee,winner
0,1949,Film | British Film in 1949,The Fallen Idol,True
1,1949,Film | British Film in 1949,Hamlet,False
2,1949,Film | British Film in 1949,Oliver Twist,False
3,1949,Film | British Film in 1949,Once A Jolly Swagman,False
4,1949,Film | British Film in 1949,The Red Shoes,False
...,...,...,...,...
4171,2020,Film | Supporting Actress in 2020,Marriage Story,True
4172,2020,Film | Supporting Actress in 2020,Jojo Rabbit,False
4173,2020,Film | Supporting Actress in 2020,Little Women,False
4174,2020,Film | Supporting Actress in 2020,Bombshell,False


In [290]:
### find which categories to use
# supporting actress
# supporting actor
# leading actress
# leading actor
# best film
# actor
# actress
# film
df[df.category == ]

array(['Film | British Film in 1949', 'Film | Documentary in 1949',
       'Film | Film From Any Source in 1949',
       'Film | Special Award in 1949',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1949',
       'Film | British Film in 1950', 'Film | Documentary in 1950',
       'Film | Film From Any Source in 1950',
       'Film | Special Award in 1950',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1950',
       'Film | British Film in 1951', 'Film | Documentary in 1951',
       'Film | Film From Any Source in 1951',
       'Film | Special Award in 1951',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1951',
       'Film | British Film in 1952', 'Film | Documentary in 1952',
       'Film | Film From Any Source in 1952',
       'Film | Spec

In [None]:
def normalize_baftas():
    df.year = df.year.astype(object)
    df = df.nominee.str.lower()
    df = df.drop(columns=('workers'))
    

### Screen Actors Guild Awards

In [16]:
df = pd.read_csv('data/sag_awards.csv')

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,category,year,win,entity
0,0,0,best_picture_cast,2020,True,['Bombshell']
1,1,1,best_picture_cast,2020,False,['The Irishman']
2,2,2,best_picture_cast,2020,False,['Jojo Rabbit']
3,3,3,best_picture_cast,2020,False,['Once Upon a Time...in Hollywood']
4,4,4,best_picture_cast,2020,False,['Parasite']


In [18]:
df.tail()

Unnamed: 0.1,Unnamed: 0,index,category,year,win,entity
655,20,0,support_actor,1995,True,['Martin Landau']
656,21,1,support_actor,1995,False,['Samuel L. Jackson']
657,22,2,support_actor,1995,False,['Chazz Palminteri']
658,23,3,support_actor,1995,False,['Gary Sinise']
659,24,4,support_actor,1995,False,['JOHN TURTURRO']


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 6 columns):
Unnamed: 0    660 non-null int64
index         660 non-null int64
category      660 non-null object
year          660 non-null int64
win           660 non-null bool
entity        650 non-null object
dtypes: bool(1), int64(3), object(2)
memory usage: 26.6+ KB


In [20]:
df.year.unique()

array([2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010,
       2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999,
       1998, 1997, 1996, 1995])

In [21]:
df.category.unique()

array(['best_picture_cast', 'best_actress', 'best_actor',
       'support_actress', 'support_actor'], dtype=object)

#### To Do:
- **Drop index column, unnamed column**
- **Normalize text in entity column**
- **Rename 'win' column**

### Golden Globes

In [22]:
df = pd.read_csv('data/golden_globe_awards.csv')

In [23]:
df.head()

Unnamed: 0,year_film,year_award,ceremony,category,nominee,film,win
0,1943,1944,1,Best Performance by an Actress in a Supporting...,Katina Paxinou,For Whom The Bell Tolls,True
1,1943,1944,1,Best Performance by an Actor in a Supporting R...,Akim Tamiroff,For Whom The Bell Tolls,True
2,1943,1944,1,Best Director - Motion Picture,Henry King,The Song Of Bernadette,True
3,1943,1944,1,Picture,The Song Of Bernadette,,True
4,1943,1944,1,Actress In A Leading Role,Jennifer Jones,The Song Of Bernadette,True


In [24]:
df.tail()

Unnamed: 0,year_film,year_award,ceremony,category,nominee,film,win
7986,2019,2020,77,Best Performance by an Actor in a Supporting R...,Kieran Culkin,Succession,False
7987,2019,2020,77,Best Performance by an Actor in a Supporting R...,Andrew Scott,Fleabag,False
7988,2019,2020,77,Best Performance by an Actor in a Supporting R...,Henry Winkler,Barry,False
7989,2019,2020,77,Cecil B. deMille Award,Tom Hanks,,False
7990,2019,2020,77,Carol Burnett Award,Ellen DeGeneres,,False


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7991 entries, 0 to 7990
Data columns (total 7 columns):
year_film     7991 non-null int64
year_award    7991 non-null int64
ceremony      7991 non-null int64
category      7991 non-null object
nominee       7991 non-null object
film          6191 non-null object
win           7991 non-null bool
dtypes: bool(1), int64(3), object(3)
memory usage: 382.5+ KB


In [26]:
df.year_award.unique()

array([1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954,
       1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965,
       1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976,
       1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
       1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
       1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [27]:
df.category.unique()

array(['Best Performance by an Actress in a Supporting Role in any Motion Picture',
       'Best Performance by an Actor in a Supporting Role in any Motion Picture',
       'Best Director - Motion Picture', 'Picture',
       'Actress In A Leading Role', 'Actor In A Leading Role',
       'Promoting International Understanding',
       'Special Achievement Award', 'Best Screenplay - Motion Picture',
       'Best Original Score - Motion Picture',
       'New Star Of The Year - Actress', 'New Star Of The Year - Actor',
       'Juvenile Performance', 'Cinematography',
       'Foreign Film - English Language',
       'Best Motion Picture - Foreign Language',
       'Outstanding Use Of Color',
       'Best Performance by an Actress in a Motion Picture - Drama',
       'Best Performance by an Actor in a Motion Picture - Drama',
       'Best Performance by an Actor in a Motion Picture - Musical or Comedy',
       'New Star Of The Year',
       'Actress In A Leading Role - Musical Or Comedy',
  

### To Do:
- **Use only categories needed**
    - Best Film Comedy/Drama
    - Best Actor/Actress Comedy/Drama
    - Supporting Actor/Actress
 
- **Drop unwanted columns**
    - year_film
    - ceremony
    - film
- **Normalize text**
- **Change column titles**