In [1]:
from bs4 import BeautifulSoup 
import urllib.request
import pandas as pd
import numpy as np

### Oscars  - Wikipedia 

In [2]:
# Download the Wikipedia page listing Oscar-winning films and parse it with Beautiful Soup.
# The `with` block closes the HTTP response as soon as the body has been read.
# (A separate `urlopen(...).close()` call would open a *second* connection and close
# that one, leaving the response that was actually read unclosed.)
with urllib.request.urlopen("https://fr.wikipedia.org/wiki/Liste_des_films_ayant_obtenu_un_ou_des_Oscars") as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Keep the first table whose class attribute is "wikitable sortable":
# this is the table containing all the Oscars entries.
tab = soup.find_all("table", attrs={"class": "wikitable sortable"})[0]

In [4]:
# Header cells (<th>) carry the column labels.
labels_html = tab.find_all("th")
# Every <tr> except the first one is kept as a data row
# (the first row is skipped -- presumably the header row).
items_html = tab.find_all("tr")[1:]

In [5]:
# Column labels: the text of each header cell with newline characters removed.
labels = [header.text.replace('\n', '') for header in labels_html]

In [6]:
# Build one record per table row from its first four <td> cells.
items = []
for row in items_html:
    cells = row.findAll("td")

    # The title lives inside the <i> element of the first cell.
    title_tag = cells[0].i
    first_link_text = title_tag.a.text
    if first_link_text == '(en)':
        # The first link is only the "(en)" marker, so take the full italic
        # text and drop its last 4 characters (the length of "(en)").
        title = title_tag.text[:-4]
    else:
        # Otherwise the first link's text is used as the title.
        title = first_link_text

    items.append([
        title,
        cells[1].a.text,                     # text of the link in the second cell
        cells[2].text,                       # raw text of the third cell
        cells[3].text.replace('\n', ''),     # fourth cell, newlines removed
    ])


In [7]:
# Assemble the scraped records into a DataFrame (one column per scraped label)
# and persist it to disk as CSV.
oscars = pd.DataFrame(items, columns=labels)
oscars.to_csv("oscars.csv")

In [8]:
# Round-trip check: reload the CSV written above, using its first column as the index.
oscars=pd.read_csv("oscars.csv",index_col=0)
# Display the frame indexed by film title. set_index returns a new frame and the
# result is not assigned, so `oscars` itself keeps its original index.
oscars.set_index('Films')

Unnamed: 0_level_0,Années,Oscars,Nominations
Films,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
007 Spectre,2015,1,1
20.000 lieues sous les mers,1954,2,3
"2001, l'Odyssée de l'espace",1968,1,4
49e Parallèle,1942,1,3
8 Mile,2002,1,1
Abyss,1989,1,4
À chacun son destin,1946,1,2
The Accountant,2001,1,1
Les Accusés,1988,1,1
Adaptation.,2002,1,4


#### Looking at the Oscars Database

In [9]:
# Total number of cells in the frame (rows x columns), not the number of films.
oscars.size

4940

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
oscars.describe()

Unnamed: 0,Oscars,Nominations
count,1235.0,1235.0
mean,1.624291,3.663158
std,1.402611,3.127993
min,1.0,1.0
25%,1.0,1.0
50%,1.0,2.0
75%,2.0,6.0
max,11.0,14.0


### Golden Globes - Golden Globes Website

In [11]:
# Shared settings for the Golden Globes scrape:
#   root_url      -- base address; a year is concatenated to it to get that year's page
#   years         -- the years 1944..2019 as strings (range end is exclusive)
#   labels        -- column names of the resulting data frame
#   golden_globes -- empty frame with those columns, to be filled by the scrape
root_url = "https://www.goldenglobes.com/winners-nominees/"
years = list(map(str, range(1944, 2020)))
labels = ['Films', 'Années', 'Golden Globes', 'Nominations']
golden_globes = pd.DataFrame([], columns=labels)

In [12]:
# Scrape the Golden Globes page of every year in `years`.
# Each nominee entry whose link points to a film becomes one row
# [title, year, 0, 1], i.e. 0 in 'Golden Globes' and 1 in 'Nominations'.
nomination_rows = []
for year in years:

    url = root_url + year   # URL of this year's page

    # Download the page exactly once; the `with` block closes the response.
    # (Calling `urlopen(url).close()` afterwards would issue a second request
    # per year and still leave the first response open.)
    with urllib.request.urlopen(url) as response:
        html = response.read()
    # Parse the html content
    soup = BeautifulSoup(html, 'html.parser')

    # First "pane-content" div: the container holding the Golden Globes entries
    tab = soup.findAll("div", {"class": "pane-content"})[0]

    # All nominees (films, and films nominated through persons)
    films_nominees = tab.findAll("div", {"class": "primary-nominee"})
    person_nominees = tab.findAll("div", {"class": "secondary-nominee"})

    # Keep only entries whose link targets a film page. Primary entries are
    # processed before secondary ones, so row order within a year is unchanged.
    for nominees in (films_nominees, person_nominees):
        for tag in nominees:
            if "film" in tag.a['href']:
                nomination_rows.append([tag.a.text, year, 0, 1])

# Build the frame once from the collected rows: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic. Rebuilding from
# scratch also makes this cell idempotent (re-running it does not duplicate rows).
golden_globes = pd.DataFrame(data=nomination_rows, columns=labels)

In [14]:
# TODO: generalise to all pages + add winner content + clean the resulting data frame
# NOTE: possible minor issue with titles -- on the wiki page some titles are in the
# original language, others are not
# Count the rows recorded for each film title (one row per scraped nomination).
golden_globes.groupby('Films')['Nominations'].count()

Films
 Remains of the Day, The                             3
'Round Midnight                                      1
'night, Mother                                       1
(500) Days of Summer                                 1
10                                                   1
12 Angry Men (1957)                                  2
12 Years a Slave                                     2
127 Hours                                            1
1776                                                 1
20th Century Women                                   1
50/50                                                1
A Beautiful Mind                                     2
A History of Violence                                2
A Man And A Woman                                    1
A Medal For Benny                                    1
A Place In The Sun                                   2
A Single Man                                         1
A Streetcar Named Desire (1951)                      2
A Th