# Oscar Award Dataset exploration
## 1. Which films have won the most awards
## 2. Which actors & actresses have won the most awards
## 3. At each year, which films have the most nominations and awards?


In [3]:
import numpy as np
import seaborn as sns
import pandas as pd

In [8]:
oscar_df = pd.read_csv('../src/datasets/the_oscar_award.csv')

In [9]:
oscar_df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10390,2019,2020,92,WRITING (Original Screenplay),Parasite,Parasite,True
10391,2019,2020,92,JEAN HERSHOLT HUMANITARIAN AWARD,Geena Davis,,True
10392,2019,2020,92,HONORARY AWARD,David Lynch,,True
10393,2019,2020,92,HONORARY AWARD,Wes Studi,,True


In [10]:
# taking a peek at unique award categories
oscar_df.category.unique()

array(['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY',
       'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)',
       'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE',
       'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)',
       'WRITING (Original Story)', 'WRITING (Title Writing)',
       'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION',
       'SOUND RECORDING', 'SHORT SUBJECT (Cartoon)',
       'SHORT SUBJECT (Comedy)', 'SHORT SUBJECT (Novelty)',
       'ASSISTANT DIRECTOR', 'FILM EDITING', 'MUSIC (Scoring)',
       'MUSIC (Song)', 'DANCE DIRECTION', 'WRITING (Screenplay)',
       'ACTOR IN A SUPPORTING ROLE', 'ACTRESS IN A SUPPORTING ROLE',
       'SHORT SUBJECT (Color)', 'SHORT SUBJECT (One-reel)',
       'SHORT SUBJECT (Two-reel)', 'IRVING G. THALBERG MEMORIAL AWARD',
       'MUSIC (Original Score)', 'CINEMATOGRAPHY (Black-and-White)',
       'CINEMATOGRAPHY (Color)', 'SPECIAL EFFECTS',
       'ART DIRECTION (Black-and-White)', 'ART DIRECT

In [11]:
films_df = oscar_df[(oscar_df.category == 'OUTSTANDING PICTURE')]
oscar_df['film'].value_counts()

A Star Is Born                                                            25
Titanic                                                                   16
Mutiny on the Bounty                                                      15
Moulin Rouge                                                              15
Cleopatra                                                                 14
                                                                          ..
The Preacher's Wife                                                        1
Forbidden Planet                                                           1
Conquer by the Clock                                                       1
The Children of Soong Ching Ling                                           1
Eyes on the Prize: America's Civil Rights Years/Bridge to Freedom 1965     1
Name: film, Length: 4833, dtype: int64

In [12]:
win_df = oscar_df[(oscar_df.winner == True)]
win_df['film'].value_counts()

Titanic                                          12
The Lord of the Rings: The Return of the King    11
Ben-Hur                                          11
West Side Story                                  10
The English Patient                               9
                                                 ..
Interstellar                                      1
The Milagro Beanfield War                         1
San Francisco                                     1
The Red Violin                                    1
Knighty Knight Bugs                               1
Name: film, Length: 1273, dtype: int64

In [13]:
oscar_df['film'].value_counts()

A Star Is Born                                                            25
Titanic                                                                   16
Mutiny on the Bounty                                                      15
Moulin Rouge                                                              15
Cleopatra                                                                 14
                                                                          ..
The Preacher's Wife                                                        1
Forbidden Planet                                                           1
Conquer by the Clock                                                       1
The Children of Soong Ching Ling                                           1
Eyes on the Prize: America's Civil Rights Years/Bridge to Freedom 1965     1
Name: film, Length: 4833, dtype: int64

In [14]:
oscar_df['film'][(oscar_df.winner == True)].value_counts()

Titanic                                          12
The Lord of the Rings: The Return of the King    11
Ben-Hur                                          11
West Side Story                                  10
The English Patient                               9
                                                 ..
Interstellar                                      1
The Milagro Beanfield War                         1
San Francisco                                     1
The Red Violin                                    1
Knighty Knight Bugs                               1
Name: film, Length: 1273, dtype: int64

In [15]:
films_df = pd.DataFrame({'count':oscar_df['film'].value_counts()})

In [16]:
nominated_films = pd.DataFrame(oscar_df.film.value_counts().reset_index())
nominated_films.columns = ['film', 'nominated']

In [17]:
nominated_films

Unnamed: 0,film,nominated
0,A Star Is Born,25
1,Titanic,16
2,Mutiny on the Bounty,15
3,Moulin Rouge,15
4,Cleopatra,14
...,...,...
4828,The Preacher's Wife,1
4829,Forbidden Planet,1
4830,Conquer by the Clock,1
4831,The Children of Soong Ching Ling,1


In [18]:
awarded_films = pd.DataFrame(oscar_df.film[(oscar_df.winner == True)].value_counts().reset_index())
awarded_films.columns = ['film', 'won']

In [34]:
awarded_films['won'].fillna(0, inplace=True)

Unnamed: 0,film,won
0,Titanic,12
1,The Lord of the Rings: The Return of the King,11
2,Ben-Hur,11
3,West Side Story,10
4,The English Patient,9
...,...,...
1268,Interstellar,1
1269,The Milagro Beanfield War,1
1270,San Francisco,1
1271,The Red Violin,1


In [35]:
awarded_films.to_csv("../src/datasets/awarded_only_films.csv")

In [20]:
merged_df = pd.merge(nominated_films, awarded_films, on='film',how = 'outer')

In [21]:
merged_df['won'].fillna(0, inplace=True)
merged_df['won']=pd.to_numeric(merged_df['won'], downcast='integer')
merged_df['nominated']=pd.to_numeric(merged_df['nominated'], downcast='integer')
merged_df

Unnamed: 0,film,nominated,won
0,A Star Is Born,25,3
1,Titanic,16,12
2,Mutiny on the Bounty,15,1
3,Moulin Rouge,15,4
4,Cleopatra,14,5
...,...,...,...
4828,The Preacher's Wife,1,0
4829,Forbidden Planet,1,0
4830,Conquer by the Clock,1,0
4831,The Children of Soong Ching Ling,1,0


In [114]:
merged_df.to_csv("awarded_films.csv")

In [22]:
# just double check if there are any NA values
df1 = merged_df[merged_df.isna().any(axis=1)]
df1

Unnamed: 0,film,nominated,won


In [26]:
actors_df = pd.DataFrame(oscar_df.name[(oscar_df.category == "ACTOR")].value_counts().reset_index())
actors_df

Unnamed: 0,index,name
0,Spencer Tracy,9
1,Laurence Olivier,7
2,Marlon Brando,7
3,Paul Muni,6
4,Richard Burton,5
...,...,...
118,Tony Curtis,1
119,Charlton Heston,1
120,Sir Laurence Olivier,1
121,James Whitmore,1


In [27]:
actress_df = pd.DataFrame(oscar_df.name[(oscar_df.category == "ACTRESS")].value_counts().reset_index())
actress_df

Unnamed: 0,index,name
0,Bette Davis,11
1,Katharine Hepburn,11
2,Greer Garson,7
3,Deborah Kerr,6
4,Norma Shearer,6
...,...,...
116,Debbie Reynolds,1
117,Miriam Hopkins,1
118,Corinne Griffith,1
119,Shirley Booth,1


In [33]:
lotr_df = pd.DataFrame(oscar_df.film[(oscar_df.name == "The Lord of the Rings ")].value_counts().reset_index())
lotr_df

Unnamed: 0,index,film
