In [1]:
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface; the top-level
# `matplotlib` package does not provide plot()/subplots()/show().
import matplotlib.pyplot as plt
import seaborn as sns

## Acquire and Prepare

I am going to load each award show dataset, clean it up, and then combine the dataframes.

### The Academy Awards

In [2]:
# Keep the Academy Awards data under its own name: `df` is rebound to other
# datasets further down, which would otherwise discard this frame before the
# dataframes are combined.
academy_raw = pd.read_csv('data/academy_awards.csv')
df = academy_raw  # alias used by the inspection cells in this section

In [3]:
# Preview the first five rows of the Academy Awards data
df.head(n=5)

Unnamed: 0,year,category,winner,entity
0,1927,ACTOR,False,Richard Barthelmess
1,1927,ACTOR,True,Emil Jannings
2,1927,ACTRESS,False,Louise Dresser
3,1927,ACTRESS,True,Janet Gaynor
4,1927,ACTRESS,False,Gloria Swanson


In [4]:
# Preview the last five rows to see how the most recent entries look
df.tail(n=5)

Unnamed: 0,year,category,winner,entity
11053,2017,SCIENTIFIC AND TECHNICAL AWARD (Scientific and...,True,"To LEONARD CHAPMAN for the overall concept, de..."
11054,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To JASON SMITH and JEFF WHITE for the original...
11055,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,"To JOE MANCEWICZ, MATT DERKSEN and HANS RIJPKE..."
11056,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To ALEX POWELL for his contribution to the des...
11057,2017,SCIENTIFIC AND TECHNICAL AWARD (Technical Achi...,True,To ROB JENSEN for the foundational design and ...


In [5]:
# (rows, columns) of the frame currently bound to `df`
df.shape

(11058, 4)

In [6]:
# Print column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11058 entries, 0 to 11057
Data columns (total 4 columns):
year        11058 non-null int64
category    11058 non-null object
winner      11058 non-null bool
entity      11058 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 270.1+ KB


In [7]:
# Distinct ceremony years present in the data (bracket access avoids
# attribute-name clashes with DataFrame methods)
df['year'].unique()

array([1927, 1928, 1929, 1930, 1931, 1932, 1934, 1935, 1936, 1937, 1938,
       1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949,
       1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017])

In [8]:
# Distinct award category labels, to decide which ones to keep
df['category'].unique()

array(['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY',
       'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)',
       'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE',
       'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)',
       'WRITING (Original Story)', 'WRITING (Title Writing)',
       'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION',
       'SOUND RECORDING', 'SCIENTIFIC OR TECHNICAL AWARD (Class I)',
       'SCIENTIFIC OR TECHNICAL AWARD (Class II)',
       'SCIENTIFIC OR TECHNICAL AWARD (Class III)',
       'SHORT SUBJECT (Cartoon)', 'SHORT SUBJECT (Comedy)',
       'SHORT SUBJECT (Novelty)', 'ASSISTANT DIRECTOR', 'FILM EDITING',
       'MUSIC (Scoring)', 'MUSIC (Song)', 'DANCE DIRECTION',
       'WRITING (Screenplay)', 'ACTOR IN A SUPPORTING ROLE',
       'ACTRESS IN A SUPPORTING ROLE', 'SHORT SUBJECT (Color)',
       'SHORT SUBJECT (One-reel)', 'SHORT SUBJECT (Two-reel)',
       'IRVING G. THALBERG MEMORIAL AWARD', 'MUSIC (Original Scor

### To Do: 
- **Pull out only the categories I want to use**
    - Best Film
    - Best Actor/Actress
    - Supporting Actor/Actress

- **Normalize the categories throughout the years.**
- **Normalize the text**
- **Rename the 'winner' column**
- **Add in the years 2018, 2019, 2020**

### The British Academy Film Awards

In [9]:
# Keep the BAFTA data under its own name so it is not lost when `df` is
# rebound to the next dataset; the frames are meant to be combined later.
bafta_raw = pd.read_csv('data/bafta_awards.csv')
df = bafta_raw  # alias used by the inspection cells in this section

In [10]:
# Preview the first five rows of the BAFTA data
df.head(n=5)

Unnamed: 0,year,category,nominee,workers,winner
0,1949,Film | British Film in 1949,The Fallen Idol,,True
1,1949,Film | British Film in 1949,Hamlet,,False
2,1949,Film | British Film in 1949,Oliver Twist,,False
3,1949,Film | British Film in 1949,Once A Jolly Swagman,,False
4,1949,Film | British Film in 1949,The Red Shoes,,False


In [11]:
# Preview the last five rows of the BAFTA data
df.tail(n=5)

Unnamed: 0,year,category,nominee,workers,winner
4171,2020,Film | Supporting Actress in 2020,Marriage Story,Laura Dern,True
4172,2020,Film | Supporting Actress in 2020,Jojo Rabbit,Scarlett Johansson,False
4173,2020,Film | Supporting Actress in 2020,Little Women,Florence Pugh,False
4174,2020,Film | Supporting Actress in 2020,Bombshell,Margot Robbie,False
4175,2020,Film | Supporting Actress in 2020,Once Upon a Time.. in Hollywood,Margot Robbie,False


In [12]:
# (rows, columns) of the frame currently bound to `df`
df.shape

(4176, 5)

In [13]:
# Print column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 5 columns):
year        4176 non-null int64
category    4176 non-null object
nominee     4176 non-null object
workers     3402 non-null object
winner      4176 non-null bool
dtypes: bool(1), int64(1), object(3)
memory usage: 134.7+ KB


In [14]:
# Distinct years covered; useful for spotting gaps in the year range
df['year'].unique()

array([1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
       1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 2001, 2016, 2017, 2018, 2019, 2020])

In [15]:
# Distinct category labels, to see how they are formatted before cleaning
df['category'].unique()

array(['Film | British Film in 1949', 'Film | Documentary in 1949',
       'Film | Film From Any Source in 1949',
       'Film | Special Award in 1949',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1949',
       'Film | British Film in 1950', 'Film | Documentary in 1950',
       'Film | Film From Any Source in 1950',
       'Film | Special Award in 1950',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1950',
       'Film | British Film in 1951', 'Film | Documentary in 1951',
       'Film | Film From Any Source in 1951',
       'Film | Special Award in 1951',
       'Film | United Nations Award - for the best Film embodying one or more of the principles of the United Nations Charter in 1951',
       'Film | British Film in 1952', 'Film | Documentary in 1952',
       'Film | Film From Any Source in 1952',
       'Film | Spec

### To Do:
- **Pull out unique categories wanted**
    - Best Film
    - Best Actor/Actress
    - Supporting Actor/Actress
- **Normalize text**
- **Drop 'workers' column**
- **Change 'winner' column to reflect award**
- **Clean up category names**

### Screen Actors Guild Awards

In [16]:
# Keep the SAG data under its own name so it is not lost when `df` is
# rebound to the next dataset; the frames are meant to be combined later.
sag_raw = pd.read_csv('data/sag_awards.csv')
df = sag_raw  # alias used by the inspection cells in this section

In [17]:
# Preview the first five rows of the SAG data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,category,year,win,entity
0,0,0,best_picture_cast,2020,True,['Bombshell']
1,1,1,best_picture_cast,2020,False,['The Irishman']
2,2,2,best_picture_cast,2020,False,['Jojo Rabbit']
3,3,3,best_picture_cast,2020,False,['Once Upon a Time...in Hollywood']
4,4,4,best_picture_cast,2020,False,['Parasite']


In [18]:
# Preview the last five rows of the SAG data
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,index,category,year,win,entity
655,20,0,support_actor,1995,True,['Martin Landau']
656,21,1,support_actor,1995,False,['Samuel L. Jackson']
657,22,2,support_actor,1995,False,['Chazz Palminteri']
658,23,3,support_actor,1995,False,['Gary Sinise']
659,24,4,support_actor,1995,False,['JOHN TURTURRO']


In [19]:
# Print column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 6 columns):
Unnamed: 0    660 non-null int64
index         660 non-null int64
category      660 non-null object
year          660 non-null int64
win           660 non-null bool
entity        650 non-null object
dtypes: bool(1), int64(3), object(2)
memory usage: 26.6+ KB


In [20]:
# Distinct ceremony years present in the SAG data
df['year'].unique()

array([2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010,
       2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999,
       1998, 1997, 1996, 1995])

In [21]:
# Distinct category labels in the SAG data
df['category'].unique()

array(['best_picture_cast', 'best_actress', 'best_actor',
       'support_actress', 'support_actor'], dtype=object)

### To Do:
- **Drop index column, unnamed column**
- **Normalize text in entity column**
- **Rename 'win' column**

### Golden Globes

In [22]:
# Keep the Golden Globes data under its own name so it stays available when
# the award datasets are combined, independent of what `df` points to.
golden_globes_raw = pd.read_csv('data/golden_globe_awards.csv')
df = golden_globes_raw  # alias used by the inspection cells in this section

In [23]:
# Preview the first five rows of the Golden Globes data
df.head(n=5)

Unnamed: 0,year_film,year_award,ceremony,category,nominee,film,win
0,1943,1944,1,Best Performance by an Actress in a Supporting...,Katina Paxinou,For Whom The Bell Tolls,True
1,1943,1944,1,Best Performance by an Actor in a Supporting R...,Akim Tamiroff,For Whom The Bell Tolls,True
2,1943,1944,1,Best Director - Motion Picture,Henry King,The Song Of Bernadette,True
3,1943,1944,1,Picture,The Song Of Bernadette,,True
4,1943,1944,1,Actress In A Leading Role,Jennifer Jones,The Song Of Bernadette,True


In [24]:
# Preview the last five rows of the Golden Globes data
df.tail(n=5)

Unnamed: 0,year_film,year_award,ceremony,category,nominee,film,win
7986,2019,2020,77,Best Performance by an Actor in a Supporting R...,Kieran Culkin,Succession,False
7987,2019,2020,77,Best Performance by an Actor in a Supporting R...,Andrew Scott,Fleabag,False
7988,2019,2020,77,Best Performance by an Actor in a Supporting R...,Henry Winkler,Barry,False
7989,2019,2020,77,Cecil B. deMille Award,Tom Hanks,,False
7990,2019,2020,77,Carol Burnett Award,Ellen DeGeneres,,False


In [25]:
# Print column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7991 entries, 0 to 7990
Data columns (total 7 columns):
year_film     7991 non-null int64
year_award    7991 non-null int64
ceremony      7991 non-null int64
category      7991 non-null object
nominee       7991 non-null object
film          6191 non-null object
win           7991 non-null bool
dtypes: bool(1), int64(3), object(3)
memory usage: 382.5+ KB


In [26]:
# Distinct award years (the ceremony year, as opposed to `year_film`)
df['year_award'].unique()

array([1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954,
       1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965,
       1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976,
       1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
       1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
       1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [27]:
# Distinct category labels, to pick out the film and acting awards
df['category'].unique()

array(['Best Performance by an Actress in a Supporting Role in any Motion Picture',
       'Best Performance by an Actor in a Supporting Role in any Motion Picture',
       'Best Director - Motion Picture', 'Picture',
       'Actress In A Leading Role', 'Actor In A Leading Role',
       'Promoting International Understanding',
       'Special Achievement Award', 'Best Screenplay - Motion Picture',
       'Best Original Score - Motion Picture',
       'New Star Of The Year - Actress', 'New Star Of The Year - Actor',
       'Juvenile Performance', 'Cinematography',
       'Foreign Film - English Language',
       'Best Motion Picture - Foreign Language',
       'Outstanding Use Of Color',
       'Best Performance by an Actress in a Motion Picture - Drama',
       'Best Performance by an Actor in a Motion Picture - Drama',
       'Best Performance by an Actor in a Motion Picture - Musical or Comedy',
       'New Star Of The Year',
       'Actress In A Leading Role - Musical Or Comedy',
  

### To Do:
- **Use only categories needed**
    - Best Film Comedy/Drama
    - Best Actor/Actress Comedy/Drama
    - Supporting Actor/Actress
 
- **Drop unwanted columns**
    - year_film
    - ceremony
    - film
- **Normalize text**
- **Change column titles**