In [3]:
import pandas as pd

# Reading in the full list of all nominees (films and people) associated with the Academy Awards from 1927 to 2017

In [4]:
# Load the raw nominations table; the relative path is resolved against the
# kernel's current working directory.
oscars = pd.read_csv(filepath_or_buffer='data/oscars_1927_to_2017.csv')

In [5]:
# Preview the first ten rows to check the columns and the kind of values they hold.
oscars.head(n=10)

Unnamed: 0,year,category,winner,entity
0,1927,ACTOR,False,Richard Barthelmess
1,1927,ACTOR,True,Emil Jannings
2,1927,ACTRESS,False,Louise Dresser
3,1927,ACTRESS,True,Janet Gaynor
4,1927,ACTRESS,False,Gloria Swanson
5,1927,ART DIRECTION,False,Rochus Gliese
6,1927,ART DIRECTION,True,William Cameron Menzies
7,1927,ART DIRECTION,False,Harry Oliver
8,1927,CINEMATOGRAPHY,False,George Barnes
9,1927,CINEMATOGRAPHY,True,Charles Rosher


# Establishing a list of the category names associated with the Best Picture award and filtering the data frame down to those categories

In [6]:
# Category labels treated as the Best Picture award; matched exactly
# (case-sensitive) against the `category` column.
pic_names = [
    'BEST PICTURE',
    'OUTSTANDING PICTURE',
    'OUTSTANDING PRODUCTION',
    'OUTSTANDING MOTION PICTURE',
    'BEST MOTION PICTURE',
]

In [7]:
# Keep only the rows whose category is one of the labels in pic_names.
# The original row labels from `oscars` are retained on the result.
best_pic_list = oscars[oscars['category'].isin(pic_names)]

In [8]:
# Show the first five rows of the filtered frame as a quick sanity check.
best_pic_list.head(n=5)

Unnamed: 0,year,category,winner,entity
19,1927,OUTSTANDING PICTURE,False,The Racket
20,1927,OUTSTANDING PICTURE,False,7th Heaven
21,1927,OUTSTANDING PICTURE,True,Wings
62,1928,OUTSTANDING PICTURE,False,Alibi
63,1928,OUTSTANDING PICTURE,False,In Old Arizona


# Converting the data frame into a .CSV so that it can be used in the scrape workbook

In [9]:
# Write the filtered frame to disk; index=False leaves the row labels out of the file.
best_pic_list.to_csv(path_or_buf='data/best_pic_list.csv', index=False)

In [10]:
# Rebind best_pic_list to the contents of the CSV file. A file that was written
# without an index column comes back with a fresh default integer index.
best_pic_list = pd.read_csv(filepath_or_buffer='data/best_pic_list.csv')

# Counting the occurrences of each unique entity value to see which movies, actors/actresses, etc. show up multiple times through the years

In [11]:
# Count how many rows each entity appears in, most frequent first.
# The Series method is used because the top-level pd.value_counts() function is
# deprecated as of pandas 2.1; the result is the same.
oscars['entity'].value_counts()

Meryl Streep                                                                                                                                                                                                                                                                                        21
Titanic                                                                                                                                                                                                                                                                                             14
Cleopatra                                                                                                                                                                                                                                                                                           13
A Star Is Born                                                                                                     

# Removing all rows whose category contains the word "AWARD", as those awards are not applicable to this project

In [12]:
# Drop every row whose category contains the substring "AWARD" (the match is
# case-sensitive), then report the (rows, columns) shape of what remains.
oscars_clean = oscars.loc[~oscars['category'].str.contains("AWARD")]
oscars_clean.shape

(9898, 4)

# Creating a new data frame containing only the rows for ACTOR or ACTRESS awards

In [13]:
# Rows whose category contains the upper-case substring "ACT"
# (for example ACTOR and ACTRESS); the match is case-sensitive.
acting = oscars_clean[oscars_clean['category'].str.contains('ACT')]

In [14]:
# Preview the first ten rows instead of rendering the whole frame as cell output.
acting.head(10)

Unnamed: 0,year,category,winner,entity
0,1927,ACTOR,False,Richard Barthelmess
1,1927,ACTOR,True,Emil Jannings
2,1927,ACTRESS,False,Louise Dresser
3,1927,ACTRESS,True,Janet Gaynor
4,1927,ACTRESS,False,Gloria Swanson
35,1928,ACTOR,False,George Bancroft
36,1928,ACTOR,True,Warner Baxter
37,1928,ACTOR,False,Chester Morris
38,1928,ACTOR,False,Paul Muni
39,1928,ACTOR,False,Lewis Stone


In [15]:
# Save the acting subset to disk; index=False leaves the row labels out of the file.
acting.to_csv(path_or_buf='data/acting.csv', index=False)

# Creating a new data frame for all of the categories that are NOT associated with acting

In [16]:
# Complement of the "ACT" filter: keep the rows of oscars_clean whose category
# does not contain the upper-case substring "ACT".
oscars_films = oscars_clean.loc[~oscars_clean['category'].str.contains("ACT")]
# Preview the first ten rows instead of rendering the whole frame as cell output.
oscars_films.head(10)

Unnamed: 0,year,category,winner,entity
5,1927,ART DIRECTION,False,Rochus Gliese
6,1927,ART DIRECTION,True,William Cameron Menzies
7,1927,ART DIRECTION,False,Harry Oliver
8,1927,CINEMATOGRAPHY,False,George Barnes
9,1927,CINEMATOGRAPHY,True,Charles Rosher
10,1927,CINEMATOGRAPHY,True,Karl Struss
11,1927,DIRECTING (Comedy Picture),True,Lewis Milestone
12,1927,DIRECTING (Comedy Picture),False,Ted Wilde
13,1927,DIRECTING (Dramatic Picture),True,Frank Borzage
14,1927,DIRECTING (Dramatic Picture),False,Herbert Brenon


In [17]:
# Save the non-acting subset to disk; index=False leaves the row labels out of the file.
oscars_films.to_csv(path_or_buf='data/oscars_films.csv', index=False)