In [1]:
from bs4 import BeautifulSoup 
import urllib.request
import pandas as pd
import numpy as np

### Oscars - French Wikipedia

In [2]:
# Download the French Wikipedia page once and parse it with Beautiful Soup.
# The `with` block closes the HTTP response as soon as the body is read;
# no second request is opened just to be closed.
with urllib.request.urlopen("https://fr.wikipedia.org/wiki/Liste_des_films_ayant_obtenu_un_ou_des_Oscars") as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Keep the first table on the page whose class attribute is "wikitable sortable"
# (the table containing all Oscars)
matching_tables = soup.find_all("table", attrs={"class": "wikitable sortable"})
tab = matching_tables[0]

In [4]:
# <th> cells provide the column labels; every <tr> after the first one is kept
# as a table line (the first <tr> is skipped)
labels_html = tab.find_all("th")
all_rows = tab.find_all("tr")
items_html = all_rows[1:]

In [5]:
# Column labels: the text of each header cell with newline characters removed
labels = [header.text.replace('\n', '') for header in labels_html]

In [6]:
# Build one [title, year, oscars, nominations] record per table line
items = []
for row in items_html:
    cells = row.find_all("td")

    # Title: when the link inside the <i> element reads '(en)', the title is the
    # <i> text with its last four characters cut off; otherwise it is the link text
    title_tag = cells[0].i
    if title_tag.a.text == '(en)':
        title = title_tag.text[:-4]
    else:
        title = title_tag.a.text

    year = cells[1].a.text
    oscars_won = cells[2].text
    nominations = cells[3].text.replace('\n', '')

    items.append([title, year, oscars_won, nominations])


In [7]:
# Assemble the scraped records into a data frame (columns named after the
# scraped labels) and write it to disk as CSV
oscars = pd.DataFrame(items, columns=labels)
oscars.to_csv(path_or_buf="oscars_french.csv")

In [8]:
# Sanity check: reload the CSV (first column is the saved row index) and
# display it indexed by film title
oscars = pd.read_csv("oscars_french.csv", index_col=0)
oscars.set_index(keys='Films')

Unnamed: 0_level_0,Années,Oscars,Nominations
Films,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
007 Spectre,2015,1,1
20.000 lieues sous les mers,1954,2,3
"2001, l'Odyssée de l'espace",1968,1,4
49e Parallèle,1942,1,3
8 Mile,2002,1,1
...,...,...,...
Young at Heart,1987,1,1
Z,1969,2,5
Zero Dark Thirty,2012,1,5
Zootopie,2016,1,1


### Oscars - English Wikipedia

In [9]:
# Download the English Wikipedia page once and parse it with Beautiful Soup.
# The `with` block closes the HTTP response as soon as the body is read;
# no second request is opened just to be closed.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films") as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

In [10]:
# Keep the first table on the page whose class attribute is "wikitable sortable"
# (the table containing all Oscars)
matching_tables = soup.find_all("table", attrs={"class": "wikitable sortable"})
tab = matching_tables[0]

In [11]:
# <th> cells provide the column labels; every <tr> after the first one is kept
# as a table line (the first <tr> is skipped)
labels_html = tab.find_all("th")
all_rows = tab.find_all("tr")
items_html = all_rows[1:]

In [12]:
# Column labels: the text of each header cell with newline characters removed
labels = [header.text.replace('\n', '') for header in labels_html]

In [13]:
# Build one [film, year, awards, nominations] record per table line
items = []
for row in items_html:
    cells = row.find_all("td")

    # Film name: if the first cell holds exactly one descendant tag, use the text
    # of its <i> element; otherwise use the text of its first link
    descendant_names = [child.name for child in cells[0].find_all()]
    if len(descendant_names) == 1:
        film = cells[0].i.text
    else:
        film = cells[0].a.text

    # Year: text of the link in the second cell
    year = cells[1].a.text

    # Number of oscars: only the part of the cell text before the first space
    awards = cells[2].text.split(' ')[0]

    # Number of nominations: cell text without newline characters
    nominations = cells[3].text.replace('\n', '')

    items.append([film, year, awards, nominations])

In [14]:
# Assemble the scraped records into a data frame (columns named after the
# scraped labels) and write it to disk as CSV
oscars = pd.DataFrame(items, columns=labels)
oscars.to_csv(path_or_buf="oscars_english.csv")

In [15]:
# Sanity check: reload the CSV (first column is the saved row index) and
# display it indexed by film title
oscars = pd.read_csv("oscars_english.csv", index_col=0)
oscars.set_index(keys="Film")

Unnamed: 0_level_0,Year,Awards,Nominations
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Green Book,2018,3,5
Bohemian Rhapsody,2018,4,5
Roma,2018,3,10
Black Panther,2018,3,7
The Favourite,2018,1,10
...,...,...,...
The Yankee Doodle Mouse,1943,1,1
The Yearling,1946,2,7
"Yesterday, Today and Tomorrow",1964,1,1
You Can't Take It with You,1938,2,7


### Oscars - Oscars Website

In [16]:
# Settings shared by the oscars.org scraping cells
# Base address; a year string is appended to it to form each page url
root_url = "https://www.oscars.org/oscars/ceremonies/"
# Years 1929 through 2019, as strings
years = list(map(str, range(1929, 2020)))
# Column names of the data frame
labels = ['Films', 'Year', 'Oscars', 'Nominations']
# Empty data frame with those columns
oscars = pd.DataFrame([], columns=labels)

In [17]:
# Scrape every yearly page and record one row per nomination:
# [film, year, 1 if it won else 0, 1].
# Rows are collected in a plain list and turned into a data frame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
nomination_rows = []

for year in years:

    # Url of the page for this year
    url = root_url + year

    # Fetch the page once; the `with` block closes the response after reading
    # (no second request is opened just to be closed)
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Second "view-content" div: the table with all oscars info
    tab = soup.find_all("div", {"class": "view-content"})[1]
    # One "view-grouping" div per oscars category
    categories = tab.find_all("div", {"class": "view-grouping"})

    for category in categories:

        # Name of the oscar's category
        cat_name = category.find_all("div", {"class": "view-grouping-header"})[0]
        category_title = cat_name.h2.text

        # Don't consider special Awards, only oscars
        if "Award" in category_title:
            continue

        if "Actor" in category_title or "Actress" in category_title:
            # Category awards an actor/actress: the film title is read from the
            # <span> inside the "views-field-title" div
            field_class, title_tag = "views-field views-field-title", "span"
        else:
            # Category awards a film: the title is read from the <h4> inside the
            # "views-field-field-actor-name" div
            field_class, title_tag = "views-field views-field-field-actor-name", "h4"

        entries = category.find_all("div", {"class": field_class})
        for position, item in enumerate(entries):
            film = item.find(title_tag).text.replace('\n', '')
            # The first entry of a category is counted as the winner,
            # every following entry as a nomination only
            won = 1 if position == 0 else 0
            nomination_rows.append([film, year, won, 1])

# Build the data frame in one go; rebuilding it from scratch also means that
# re-running this cell does not duplicate rows
oscars = pd.DataFrame(nomination_rows, columns=labels)
           

In [19]:
# Collapse the per-nomination rows into one row per film.
# The original expression was missing the dot before `reset_index` (a syntax
# error), and `reset_index(..., inplace=True)` returns None, so `final_oscars`
# could never have held a data frame.
clean_oscars = oscars.copy()
grouped = clean_oscars.groupby(['Films'])
final_oscars = grouped.agg(
    # Most frequent year for the film; on a tie the first value returned by
    # mode() is kept so that every cell holds a single scalar
    Year=('Year', lambda film_years: film_years.mode().iloc[0]),
    # Total number of oscars won
    Oscars=('Oscars', 'sum'),
    # Total number of nominations
    Nominations=('Nominations', 'sum'),
).reset_index()  # turn the 'Films' group key back into a regular column

In [30]:
# Write the per-film summary to disk as CSV, leaving the row index out of the file
final_oscars.to_csv(path_or_buf="oscars_website.csv", index=False)

### Golden Globes - Golden Globes Website

In [None]:
# Settings shared by the goldenglobes.com scraping cells
# Base address; a year string is appended to it to form each page url
root_url = "https://www.goldenglobes.com/winners-nominees/"
# Years 1944 through 2019, as strings
years = list(map(str, range(1944, 2020)))
# Column names of the data frame
labels = ['Films', 'Années', 'Golden Globes', 'Nominations']
# Empty data frame with those columns
golden_globes = pd.DataFrame([], columns=labels)

In [None]:
# Grab all nominations for each year: one row [film, year, 0, 1] per nominee
# whose link points to a film.
# Rows are collected in a plain list and turned into a data frame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
nominee_rows = []

for year in years:

    # Url of the page for this year
    url = root_url + year

    # Fetch the page once; the `with` block closes the response after reading
    # (no second request is opened just to be closed)
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # First "pane-content" div: the table with all Golden Globes info
    tab = soup.find_all("div", {"class": "pane-content"})[0]

    # All nominees (films, and films nominated through persons)
    films_nominees = tab.find_all("div", {"class": "primary-nominee"})
    person_nominees = tab.find_all("div", {"class": "secondary-nominee"})

    # Film nominees first, then person nominees; only entries whose link
    # address contains "film" are kept
    for nominees in (films_nominees, person_nominees):
        for tag in nominees:
            if "film" in tag.a['href']:
                nominee_rows.append([tag.a.text, year, 0, 1])

# Build the data frame in one go; rebuilding it from scratch also means that
# re-running this cell does not duplicate rows
golden_globes = pd.DataFrame(nominee_rows, columns=labels)

In [None]:
# TODO: generalise to all pages, add the winner content, and clean the resulting data frame
# NOTE: there may be a small problem with the titles: on the wiki page some titles
# are in the original language and others are not
# Number of nomination rows recorded for each film
golden_globes.groupby(by='Films')['Nominations'].count()