## Data scraping part 1: Oscar winners and nominees (1990-2019)

Here, we use Selenium to open the Oscars website and scrape all the information about Oscar-winning and Oscar-nominated films from **1990** to **2019**.

In [None]:
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

PATH = "/Applications/chromedriver" # path to the chromedriver executable
chromedriver=webdriver.Chrome(PATH)


In [None]:
# Load a page in the Selenium-controlled browser and wait two seconds for it to render.
# NOTE(review): the cells below look up elements such as id="basicsearch" and
# id="btnbasicsearch", which presumably live on the Oscars awards search page
# rather than on google.com — confirm the intended URL.
chromedriver.get("http://google.com")
time.sleep(2) 

In [None]:
# Open the first year dropdown: the button whose class contains "yearsfrom".
year_button1 = chromedriver.find_element_by_xpath(
    '//button[contains(@class, "yearsfrom")]'
)

year_button1.click()

# Tick the checkbox in the 32nd list item of that dropdown.
# NOTE(review): presumably this is the start year (1990) — the position-based
# XPath breaks if the list changes; confirm against the live page.
year_button2=chromedriver.find_element_by_xpath(
    '//*[@id="basicsearch"]/div/div[2]/div[2]/div/div[1]/span/div/ul/li[32]/a/label/input'
    )

year_button2.click()


In [None]:
# Open the second dropdown of the basic search form.
year_button3=chromedriver.find_element_by_xpath(
    '//*[@id="basicsearch"]/div/div[2]/div[2]/div/div[2]/span/div/button'
)

year_button3.click()


# Tick the checkbox in the 3rd list item of the second dropdown.
# NOTE(review): presumably this is the end year (2019) — confirm against the live page.
year_button4=chromedriver.find_element_by_xpath(
    '//*[@id="basicsearch"]/div/div[2]/div[2]/div/div[2]/span/div/ul/li[3]/a/label/input'
)

year_button4.click()


In [None]:
# Submit the search by clicking the element with id "btnbasicsearch".
search_button=chromedriver.find_element_by_xpath(
    '//*[@id="btnbasicsearch"]'
)

search_button.click()



In [None]:
# Parse the HTML currently rendered in the browser (the search results) for scraping.
soup = BeautifulSoup(chromedriver.page_source)



Here, we write a web-scraping function that collects all the relevant data and puts it in a list: the award category, the year of release, the movie name, the nominated individuals, and whether or not the nomination ended up winning.

In [None]:
def cateogries_by_year(soup,full_list):
    """Append four strings per nominee found in ``soup`` to ``full_list``.

    For every ``div.result-details`` nested inside a ``div.result-subgroup``
    inside a ``div.awards-result-chron``, the following are appended in order:
    the subgroup header link text, the group header text, the details text,
    and ``'True'``/``'False'`` depending on whether the details contain a
    ``span.glyphicon``.

    Returns the same ``full_list`` object that was passed in.
    """
    for year_block in soup.find_all('div', class_='awards-result-chron'):
        for category_block in year_block.find_all('div', class_='result-subgroup'):
            for nominee in category_block.find_all('div', class_='result-details'):
                category = category_block.find('div', class_='result-subgroup-header').find('a').text
                year = year_block.find('div', class_='result-group-header').text
                has_icon = str(bool(nominee.find('span', class_='glyphicon')))
                full_list.extend([category, year, nominee.text, has_icon])
    return full_list


## Data cleaning part 1:
Data scraped from the Oscars website is cleaned and converted to a DataFrame.

Let's first clean the scraped data by creating a list and getting rid of some special characters, items in brackets, and distracting line breaks.

In [None]:
import re

# Collect the raw scraped strings (four per nominee) into a fresh list.
full_list = cateogries_by_year(soup, [])

# Raw-string patterns: the original non-raw literals relied on invalid escape
# sequences ('\{', '\s', ...), which newer Python versions warn about.
_curly_span = re.compile(r'\{.*\}')        # anything wrapped in curly braces
_paren_suffix = re.compile(r'\s\(\d\d\w\w\)')  # e.g. " (92nd)": two digits + two word chars

# Per string: drop line breaks, drop the two patterns above, then split on '--'
# so every item becomes a list of one or more parts.
full_list = [
    _paren_suffix.sub('', _curly_span.sub('', text.replace('\n', ''))).split('--')
    for text in full_list
]


Now, let's make a dictionary of all the list items.

In [None]:
category_dict={}


def lists_maker(full_list,catgory_dict):
    """Group the flat scraped list into one record per nominee.

    ``full_list`` holds consecutive groups of four items (each item a list of
    strings): category, year, details, win flag. A group is kept only when its
    details item has exactly two parts; the stored record is
    ``[category, year, details[0], details[1], win_flag]`` under consecutive
    integer keys starting at 0.

    The records are written into the ``catgory_dict`` argument (the original
    silently wrote to a module-level global instead), and that same dict is
    returned. A trailing incomplete group is ignored instead of raising
    ``IndexError``.
    """
    record_number = 0
    # Stop before any group that does not have all four items.
    for start in range(0, len(full_list) - 3, 4):
        category, year, details, won = full_list[start:start + 4]
        if len(details) == 2:
            catgory_dict[record_number] = [category[0], year[0], details[0], details[1], won[0]]
            record_number += 1
    return catgory_dict

lists_maker(full_list,category_dict)


In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows", 100)

# One row per scraped record; the positional columns get descriptive names.
movies_df = pd.DataFrame(category_dict).T
movies_df.rename(
    columns={0: 'category', 1: 'year', 2: 'movie', 3: 'names', 4: 'oscar_win'},
    inplace=True,
)

# For the four acting categories the two detail fields are stored the other way
# round, so swap 'movie' and 'names' on those rows.
acting_rows = movies_df['category'].isin([
    'ACTOR IN A LEADING ROLE',
    'ACTOR IN A SUPPORTING ROLE',
    'ACTRESS IN A LEADING ROLE',
    'ACTRESS IN A SUPPORTING ROLE',
])
movies_df.loc[acting_rows, ['movie', 'names']] = movies_df.loc[acting_rows, ['names', 'movie']].values


In [None]:
def movie_title_clean(movies_df):
    """Normalise the 'movie', 'names' and 'oscar_win' columns in place.

    * 'movie': remove any double-quoted span that is directly followed by four
      word characters, strip surrounding whitespace, and lowercase.
    * 'names': strip surrounding whitespace.
    * 'oscar_win': the string ``'False'`` becomes ``0``; anything else ``1``.

    The original called ``.strip()`` without storing the result and wrote
    through chained indexing (``df[col][i] = ...``), which does not reliably
    update the frame; whole columns are assigned here instead.

    Returns the same DataFrame object that was passed in.
    """
    quoted_prefix = re.compile(r'".*"\w\w\w\w')
    movies_df['movie'] = [
        quoted_prefix.sub('', title).strip().lower() for title in movies_df['movie']
    ]
    movies_df['names'] = [name.strip() for name in movies_df['names']]
    movies_df['oscar_win'] = [0 if flag == 'False' else 1 for flag in movies_df['oscar_win']]
    return movies_df
movie_title_clean(movies_df)

# Drop categories that are left out of the analysis.
excluded_categories = [
    'INTERNATIONAL FEATURE FILM',
    'FOREIGN LANGUAGE FILM',
    'SCIENTIFIC AND TECHNICAL AWARD (Scientific and Engineering Award)',
    'DOCUMENTARY (Feature)',
    'DOCUMENTARY (Short Subject)',
]
# .copy() makes the result an independent frame, so adding columns to it later
# does not operate on a slice of the original.
movies_df = movies_df.loc[~movies_df['category'].isin(excluded_categories)].copy()

Let's count how many times each movie was nominated and how many oscars it won

In [None]:
# Allow up to 3500 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 3500)
# For every row, count how many rows share its movie title (nominations per movie).
movies_df['times_nominated']=movies_df.groupby('movie')['movie'].transform('count')
# Collapse to one row per (movie, times_nominated) with 'oscar_win' summed over that movie's rows.
new_df=movies_df.groupby(['movie','times_nominated'],as_index=False)[['oscar_win']].sum()


Great! Now let's visualize what that looks like with a bar graph. Hmm, it seems like it's a linear relationship for the most part: the more categories a movie is nominated in, the more Oscars it is going to win.

In [None]:
import matplotlib.pyplot as plt
# Bar chart of Oscars won (y) against number of nominations (x).
plt.xlabel('times nominated')
plt.ylabel('Oscars won')
plt.bar(new_df['times_nominated'], new_df['oscar_win'])


## Data collection part 2: boxoffice mojo
For this section, I am going to scrape Box Office Mojo for each movie's release month (it's important; more details to follow), budget, domestic box office, MPAA rating, genres, and runtime. This is done by creating a Selenium loop and several helper functions.


In [None]:
# First, create empty (empty-string) columns that the scrapers fill in later.
for column_name in (
    'domestic_boxoffice',
    'distributor',
    'budget',
    'release_month',
    'MPAA_rating',
    'running_time',
    'genre',
    'tropes',
):
    new_df[column_name] = ''



In [None]:
import re
def get_movie_value(soup, field_name):
    '''Return the text of the element that follows a label in the parsed page.

    Searches ``soup`` for the first text node matching the regular expression
    ``field_name`` and returns the ``.text`` of the next element after it.
    Returns None when no such text node, or no following element, exists.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None

    # The value sits in the element right after the label for most fields.
    value_element = label.findNext()
    return value_element.text if value_element else None




In [None]:
# Scratch cell: try the helpers against whatever page the browser currently shows.
soup = BeautifulSoup(chromedriver.page_source)
genre=get_movie_value(soup, 'Genres')
companystr=get_movie_value(soup, 'Domestic Distributor')

box=get_movie_value(soup, 'Budget')

rating=get_movie_value(soup, 'MPAA')
print(rating)


# domestic box office:
# text of the first <span class="money"> on the page
soup.find('span',class_='money').text


In [None]:
import dateutil.parser
from datetime import datetime as dt

# parse the genre:
def genre_list(genre):
    """Split a scraped genre string into a list of genres.

    Each newline together with the whitespace that follows it is treated as a
    separator (as are literal commas). Returns None when ``genre`` is not a
    string, e.g. when the field was missing from the page.
    """
    if not isinstance(genre, str):
        return None
    return re.sub(r'\n\s*', ',', genre).split(',')

# parse the company:
def production_company(companystr):
    """Return the part of ``companystr`` before the first 'See'.

    Returns None when ``companystr`` is not a string (e.g. None because the
    field was missing).
    """
    try:
        return companystr.split('See')[0]
    except (AttributeError, TypeError):
        # not a str: nothing to parse
        return None
    
# parse any money amount:
def money_to_int(moneystring):
    """Convert a string such as '$1,234,567' to the int 1234567.

    Dollar signs and thousands separators are removed before conversion.
    Returns None when the input is not a string or the remainder is not an
    integer.
    """
    try:
        return int(moneystring.replace('$', '').replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        return None
    
# parse any runtime:
def runtime_to_minutes(runtimestring):
    """Convert a runtime such as '1 hr 54 min' to total minutes (114).

    The string is read as whitespace-separated ``<number> <unit>`` pairs; a
    unit starting with 'h' counts as hours and one starting with 'm' as
    minutes. Unlike the original, hour-only ('2 hr') and minute-only
    ('45 min') runtimes are handled too.

    Returns None when the input is not a string or does not follow that shape.
    """
    try:
        tokens = runtimestring.split()
    except AttributeError:
        return None
    # Need at least one pair, and every number must have a unit.
    if not tokens or len(tokens) % 2:
        return None

    total_minutes = 0
    for amount, unit in zip(tokens[::2], tokens[1::2]):
        unit = unit.lower()
        if unit.startswith('h'):
            factor = 60
        elif unit.startswith('m'):
            factor = 1
        else:
            return None
        try:
            total_minutes += int(amount) * factor
        except ValueError:
            return None
    return total_minutes

# searching for boxoffice:
def soup_find_money(soup):
    """Return the text of the first ``<span class="money">`` in ``soup``.

    Returns None when ``soup`` is None or no such span exists.
    """
    try:
        return soup.find('span', class_='money').text
    except AttributeError:
        # soup is None, or find() returned None
        return None
    
# parse date to its month: 
def to_date(datestr):
    """Return the month number (1-12) of a date like 'November 5, 1999'.

    Only the first line of ``datestr`` is parsed (format '%B %d, %Y');
    surrounding whitespace on that line is ignored. Returns None when the
    input is not a string or the first line is not in that format.
    """
    try:
        first_line = datestr.split('\n')[0].strip()
        return dt.strptime(first_line, '%B %d, %Y').month
    except (AttributeError, TypeError, ValueError):
        return None



In [None]:
#a scrape bot that will search each of the movies in our list on boxofficemojo and return all the info we need 
#info is then stored in the dataframe.

pd.set_option("display.max_rows", 100)
def boxoffice_mojo_scrape_bot(new_df):
    """Fill the Box Office Mojo columns of ``new_df`` by searching each movie.

    For every row (labels ``0 .. len(new_df)-1``) the movie title is typed into
    the search box of the page currently open in ``chromedriver``. When the
    results page reports 'No results' the scraped columns are set to '';
    otherwise the first result is opened and its release month, genres,
    distributor, MPAA rating, domestic box office, budget and running time are
    stored.

    Values are written with ``DataFrame.at`` rather than chained indexing
    (``df[col][i] = ...``), which can silently fail to update the frame.

    NOTE(review): assumes the browser is already on a page that contains the
    ``mojo-search-text-input`` search box — this function never navigates there.

    Returns the same DataFrame that was passed in.
    """
    search_box_xpath = '//*[@id="mojo-search-text-input"]'
    first_result_xpath = '//*[@id="a-page"]/main/div/div/div/div/div/div[2]/a'
    scraped_columns = (
        'release_month', 'genre', 'distributor', 'MPAA_rating',
        'domestic_boxoffice', 'budget', 'running_time',
    )

    for i in range(len(new_df)):
        chromedriver.find_elements_by_xpath(search_box_xpath)[0].send_keys(new_df['movie'][i])
        chromedriver.find_elements_by_xpath(search_box_xpath)[0].send_keys(Keys.ENTER)
        soup = BeautifulSoup(chromedriver.page_source)

        if 'No results' in soup.find('div','mojo-gutter').text:
            # nothing found: leave every scraped field blank for this row
            for column in scraped_columns:
                new_df.at[i, column] = ''
            continue

        chromedriver.find_elements_by_xpath(first_result_xpath)[0].click()
        soup = BeautifulSoup(chromedriver.page_source)
        new_df.at[i, 'release_month'] = to_date(get_movie_value(soup, 'Earliest Release Date'))
        new_df.at[i, 'genre'] = genre_list(get_movie_value(soup, 'Genres'))
        new_df.at[i, 'distributor'] = production_company(get_movie_value(soup, 'Domestic Distributor'))
        new_df.at[i, 'MPAA_rating'] = get_movie_value(soup, 'MPAA')
        new_df.at[i, 'domestic_boxoffice'] = money_to_int(soup_find_money(soup))
        new_df.at[i, 'budget'] = money_to_int(get_movie_value(soup, 'Budget'))
        new_df.at[i, 'running_time'] = runtime_to_minutes(get_movie_value(soup, 'Running Time'))
    return new_df
#uncomment below to run
#boxoffice_mojo_scrape_bot(new_df)


In [None]:
#Let's only select non-empty rows
# Keeps rows whose 'domestic_boxoffice' is not the empty string. reset_index()
# is called without drop=True, so the previous index is kept as a new 'index' column.

new_df=new_df.loc[(new_df['domestic_boxoffice']!='')].reset_index()



In [None]:
#now lets make a separate dataframe with only the oscar winners
# Titles of rows with a non-zero 'oscar_win'; reset_index() on the selected
# 'movie' Series yields a DataFrame holding the old index and the title.
oscars_winner=new_df.loc[new_df['oscar_win']!=0]['movie'].reset_index()



## Data collection part 3: TV tropes

A great study done by data scientists over at DW shows that there are certain 'tropes', or common motifs, found among Oscar-winning and Oscar-nominated movies. A list of the top 200 tropes is provided here: https://dw-data.github.io/movie-tropes/. This section involves creating another Selenium web-scraping bot that will scrape TV Tropes for each of the movies on our list, looking for those 200 tropes. The total number of a movie's tropes that match those 200 is displayed in the column named 'tropes'.

In [None]:
# First, scrape the DW website for the top 200 tropes.
# If you are using this code you will have to manually change the table display
# setting on the page so that all 200 tropes are shown.
chromedriver.get("https://dw-data.github.io/movie-tropes/")

soup = BeautifulSoup(chromedriver.page_source)
table = soup.find('table')
# Every link inside the table; each trope is stored as its link text split on newlines.
rows = list(table.find_all('a'))
tropes = [row.text.split('\n') for row in rows]



Next, let's clean the data a bit by changing the format of some of the strings


In [None]:
# Replace three badly formatted trope names with their clean spelling.
trope_fixes = {
    '\\The Reason You Suck\\" Speech"': '"The Reason You Suck" Speech',
    'Heel\x96Face Turn': 'Heel-Face Turn',
    'Big \\NO!\\""': 'Big "NO!"',
}
for position, entry in enumerate(tropes):
    # only single-element entries can match one of the names above
    if len(entry) == 1 and entry[0] in trope_fixes:
        tropes[position] = [trope_fixes[entry[0]]]
        

Now that we have the list of the top 200 tropes, it is time to scrape tvtropes.org.
Here is a Selenium bot that will help us automate that process.

In [None]:
#tv tropes data scrape bot
def tropes_identifier(new_df):
    """Count, per movie, how many links on its TV Tropes page are in ``tropes``.

    For each row a Google search for '<movie>(film) tv tropes' is run, the
    first result heading is clicked, and every ``a.twikilink`` on the resulting
    page is compared (link text split on newlines) against the module-level
    ``tropes`` list. The number of matches is stored in the 'tropes' column.

    The known tropes are put in a set once, instead of scanning the list for
    every link, and the count is written with ``DataFrame.at`` rather than
    chained indexing.

    Returns the same DataFrame that was passed in.
    """
    known_tropes = {tuple(trope) for trope in tropes}

    for i in range(len(new_df)):
        chromedriver.get("http://google.com")
        inputElems=chromedriver.find_elements_by_css_selector('input[name=q]')
        for inputElem in inputElems:
            inputElem.send_keys(new_df['movie'][i]+'(film)'+' tv tropes')
            inputElem.send_keys(Keys.ENTER)

        results=chromedriver.find_elements_by_xpath('//*[@id="rso"]/div[1]/div/div[1]/a/h3')
        for heading in results:
            heading.click()

        soup = BeautifulSoup(chromedriver.page_source)
        links = soup.find_all('a',class_='twikilink')
        new_df.at[i, 'tropes'] = sum(
            tuple(link.text.split('\n')) in known_tropes for link in links
        )
    return(new_df)
# uncomment below to run the function
# tropes_identifier(new_df)

In [1]:
# Bar chart of Oscars won against the number of matched tropes, on a 20x10 inch figure.
# Relies on `new_df` and `plt` from earlier cells; the recorded output below
# (NameError) comes from running this cell before those cells.
from matplotlib.pyplot import figure
figure(figsize=(20,10))
x=new_df['tropes']
y=new_df['oscar_win']
plt.bar(x,y)


NameError: name 'new_df' is not defined

<Figure size 1440x720 with 0 Axes>

In [None]:
#let's do a save of our current data!
new_df.to_csv('movie_data.csv')

## Data collection part 4: ratings(IMDB,rotten tomatoes)
For this section, I am going to scrape Google to find each movie's ratings from IMDb and Rotten Tomatoes. IMDb ratings are more user-based, whereas Rotten Tomatoes ratings are more critic-based.



In [None]:
#new_df['rt_score']=''


In [None]:
def rating_rottentomatoes(soup):
    """Return the text of the first ``div.meta-value`` up to the first '('.

    The result is stripped of surrounding whitespace. Returns None when
    ``soup`` is None or the element is missing.
    """
    try:
        return soup.find("div",class_="meta-value").text.split('(')[0].strip()
    except AttributeError:
        return None

def score_rottentomatoes(soup):
    """Return the percentage in ``span.mop-ratings-wrap__percentage`` as an int.

    Returns None when ``soup`` is None, the element is missing, or its text is
    not a number followed by '%'.
    """
    try:
        text = soup.find("span",class_="mop-ratings-wrap__percentage").text
        return int(text.strip().replace('%',''))
    except (AttributeError, ValueError):
        return None

def score_from_google_search(soup):
    """Extract the percentage score from a Google results page.

    Reads the text of the first ``div`` with class "dhIWPd f" in ``soup`` and
    returns the integer found between the first ':' and the first '%'.

    The original ignored ``soup`` and parsed a module-level ``rating`` string
    left over from a scratch cell, so every call produced the same value.

    Returns None when ``soup`` is None, the element is missing, or the text
    does not have that shape.
    """
    try:
        snippet = soup.find('div',class_="dhIWPd f").text
        return int(snippet.split('%')[0].split(':')[1].strip())
    except (AttributeError, IndexError, ValueError):
        return None


In [None]:
def rotten_tomatoes(new_df):
    """Unfinished stub: iterates over the 'rt_score' entries and does nothing.

    The original loop had no body, which is a syntax error.
    """
    for i in range(len(new_df['rt_score'])):
        # TODO: the per-row logic was never written; `pass` keeps the cell valid Python
        pass
        
    

In [None]:
# Scratch cell: run the Rotten Tomatoes extraction steps by hand on the page
# currently open in the browser (same logic as score_rottentomatoes and
# rating_rottentomatoes, but without the error handling).
soup = BeautifulSoup(chromedriver.page_source)
score=soup.find("span",class_="mop-ratings-wrap__percentage").text
score=int(score.strip().replace('%',''))


rating=(soup.find("div",class_="meta-value").text).split('(')
rating=rating[0].strip()

In [None]:
print(soup.find("div",class_="meta-value").text)


In [None]:
join_df=movies_df[['year','movie']]

# Year of the first listed row for each movie title. The original recovered
# this by converting a Series to its printed form and cutting the text up,
# which depends on how pandas happens to format its output.
year_by_movie = join_df.drop_duplicates(subset='movie').set_index('movie')['year']

# Stored as a string because later cells concatenate it into search queries;
# titles without a match get ''.
new_df['year'] = [str(year_by_movie.get(title, '')).strip() for title in new_df['movie']]


    

In [None]:
#Rotten Tomatoes data scrape bot (v1)
def rotten_tomatoes_scrapev1(new_df):
    """Fill 'rt_score' (and missing 'MPAA_rating') from each movie's Rotten Tomatoes page.

    For each row a Google search for '<movie>(film) <year> rotten tomatoes' is
    run and the first result heading is clicked. The page's score is stored in
    'rt_score'; when the row's 'MPAA_rating' is None it is filled from the
    same page.

    Values are written with ``DataFrame.at`` instead of chained indexing, and
    None is tested with ``is``.

    NOTE(review): the 'rt_score' column is only created in a commented-out
    cell — make sure it exists before running.

    Returns the same DataFrame that was passed in.
    """
    for i in range(len(new_df)):
        chromedriver.get("http://google.com")
        inputElems=chromedriver.find_elements_by_css_selector('input[name=q]')
        for inputElem in inputElems:
            inputElem.send_keys(new_df['movie'][i]+'(film) '+new_df['year'][i]+' rotten tomatoes')
            inputElem.send_keys(Keys.ENTER)

        results=chromedriver.find_elements_by_xpath('//*[@id="rso"]/div[1]/div/div[1]/a/h3')
        for heading in results:
            heading.click()

        soup = BeautifulSoup(chromedriver.page_source)
        new_df.at[i, 'rt_score'] = score_rottentomatoes(soup)
        if new_df['MPAA_rating'][i] is None:
            new_df.at[i, 'MPAA_rating'] = rating_rottentomatoes(soup)

    return(new_df)


In [None]:
#The far superior version compared to v1, but might get flagged by google as a bot if running for too long

def rotten_tomatoes_scrapev2(new_df):
    """Fill empty 'rt_score' entries from the Google results page itself.

    Rows whose 'rt_score' is falsy (e.g. '', None or 0) trigger a Google search
    for '<movie>(film) <year> rotten tomatoes'; the score is then parsed
    straight from the results page with ``score_from_google_search``, without
    opening Rotten Tomatoes.

    Values are written with ``DataFrame.at`` instead of chained indexing.

    Returns the same DataFrame that was passed in.
    """
    for i in range(len(new_df)):
        if new_df['rt_score'][i]:
            continue  # already has a score

        chromedriver.get("http://google.com")
        inputElems=chromedriver.find_elements_by_css_selector('input[name=q]')
        for inputElem in inputElems:
            inputElem.send_keys(new_df['movie'][i]+'(film) '+new_df['year'][i]+' rotten tomatoes')
            inputElem.send_keys(Keys.ENTER)
        soup = BeautifulSoup(chromedriver.page_source)
        new_df.at[i, 'rt_score'] = score_from_google_search(soup)

    return(new_df)


In [None]:
# uncomment below to run the function
#rotten_tomatoes_scrapev1(new_df)
#rotten_tomatoes_scrapev2(new_df)

In [None]:
new_df.to_csv('movie_data.csv')


In [None]:
# Scratch cell: run the Google search for one hard-coded row (label 247) and
# keep the parsed results page in `soup` for inspection in the next cells.
chromedriver.get("http://google.com")
inputElems=chromedriver.find_elements_by_css_selector('input[name=q]')
for inputElem in inputElems:
    inputElem.send_keys(new_df['movie'][247]+'(film) '+new_df['year'][247]+' rotten tomatoes')
    inputElem.send_keys(Keys.ENTER)
soup = BeautifulSoup(chromedriver.page_source)

In [None]:
rating=soup.find('div',class_="dhIWPd f").text

In [None]:
bool(new_df['rt_score'][7])==False

In [None]:
new_df

## Data collection part 5: original story?
For this section, I am going to see if a movie's source material has anything to do with its Oscar chances.
For example, movies like 'Fight Club', '12 Years a Slave', and many more are all based on novels or other forms
of written work.


In [None]:
def find_table(soup):
    """Return a list of all ``<tr>`` elements of the first ``<table>`` in ``soup``.

    Returns None when ``soup`` is None or contains no table.
    """
    try:
        return list(soup.find('table').find_all('tr'))
    except AttributeError:
        return None
    

In [None]:
new_df['book_based']=''
def based_on(new_df):
    """Flag movies whose opened page has a 'Based on' table header.

    For each row '<movie>(film) <year>' is typed into the element with id
    ``searchInput`` of the page currently open in ``chromedriver``, the first
    search result (if any) is clicked, and after one second the page's ``<th>``
    elements are checked for the text 'Based on'. 'book_based' is set to 1
    when found, otherwise 0.

    Values are written with ``DataFrame.at`` instead of chained indexing.

    NOTE(review): assumes the browser is already on a site with a
    ``searchInput`` box (presumably Wikipedia) — this function never navigates there.

    Returns the same DataFrame that was passed in.
    """
    search_box_xpath = '//*[@id="searchInput"]'
    first_result_xpath = '//*[@id="mw-content-text"]/div[3]/ul/li[1]/div[1]/a/span[1]'

    for i in range(len(new_df)):
        chromedriver.find_elements_by_xpath(search_box_xpath)[0].send_keys(new_df['movie'][i]+'(film) '+new_df['year'][i])
        chromedriver.find_elements_by_xpath(search_box_xpath)[0].send_keys(Keys.ENTER)
        for result in chromedriver.find_elements_by_xpath(first_result_xpath):
            result.click()
        time.sleep(1)  # give the article time to load

        soup = BeautifulSoup(chromedriver.page_source)
        headers = soup.find_all('th')
        new_df.at[i, 'book_based'] = 1 if 'Based on' in str(headers) else 0
    return new_df
        
    

In [None]:
based_on(new_df)


In [None]:
# Scratch cell: show the table rows of the page currently open in the browser.
soup1 = BeautifulSoup(chromedriver.page_source)
# Keep the result: the original discarded it and printed a stale `rows` instead.
rows = find_table(soup1)
print(rows)


In [None]:
https://en.wikipedia.org/wiki/12_Years_a_Slave_(film)
    

## Data collection part 6: supplement budget and boxoffice info
The data previously collected from Box Office Mojo seems to be incomplete. Now, I am going to try to supplement the dataset by scraping the data found on The Numbers website: https://www.the-numbers.com/movie/budgets

In [None]:
PATH = "/Applications/chromedriver" # path to the chromedriver executable
chromedriver=webdriver.Chrome(PATH)


In [None]:
# Scrape every budget listing page of The Numbers (offsets 1, 101, ..., 5901)
# and collect the text of all table cells into one flat list.
data = []
for i in range(1, 6001, 100):
    chromedriver.get("https://www.the-numbers.com/movie/budgets/all/"+str(i))
    soup = BeautifulSoup(chromedriver.page_source)
    table = soup.find('table')
    rows = table.find_all('td')
    data.extend(row.text for row in rows)

# Each table row contributes six cells. Starting at cell 2 and stepping by six:
# that cell (lowercased) is the key, and the two cells after it are parsed as
# dollar amounts and stored as a two-element list.
the_numbers = {}
for i in range(2, len(data), 6):
    amounts = [
        int(data[j].split('$')[1].replace('$', '').replace(',', ''))
        for j in (i + 1, i + 2)
    ]
    the_numbers[data[i].lower()] = amounts

In [None]:
#helper functions that help us match the movies from our dataset with the dataset scraped from The Numbers website
from fuzzywuzzy import process
def movie_budget(the_numbers, title=None):
    """Return the budget stored in ``the_numbers`` for the best-matching title.

    ``the_numbers`` maps a lowercased title to a two-element list; the first
    element is returned (the original returned the whole list).

    ``title`` is the movie to look up. When omitted, the title of row 365 of
    the module-level ``new_df`` is used, which is what the original always
    did; pass the title explicitly to look up any other movie.

    Returns None when the row, the match or the dictionary entry is missing.

    NOTE(review): relies on a module-level ``movie_list_names`` that is not
    defined in the visible notebook — presumably ``list(the_numbers)``; confirm.
    """
    try:
        if title is None:
            title = new_df['movie'][365]
        best_title = get_matches(title, movie_list_names)[0][0]
        return the_numbers[best_title][0]
    except (KeyError, IndexError, TypeError):
        return None
        


def box_office(the_numbers, title=None):
    """Return the box office stored in ``the_numbers`` for the best-matching title.

    ``the_numbers`` maps a lowercased title to a two-element list; the second
    element is returned (the original returned the whole list).

    ``title`` is the movie to look up. When omitted, the title of row 365 of
    the module-level ``new_df`` is used, which is what the original always
    did; pass the title explicitly to look up any other movie.

    Returns None when the row, the match or the dictionary entry is missing.

    NOTE(review): relies on a module-level ``movie_list_names`` that is not
    defined in the visible notebook — presumably ``list(the_numbers)``; confirm.
    """
    try:
        if title is None:
            title = new_df['movie'][365]
        best_title = get_matches(title, movie_list_names)[0][0]
        return the_numbers[best_title][1]
    except (KeyError, IndexError, TypeError):
        return None
    
def get_matches(query,choice,limit=1):
    """Return the ``limit`` best fuzzy matches for ``query`` among ``choice``.

    Thin wrapper around ``fuzzywuzzy.process.extract``; the result is passed
    through unchanged.
    """
    return process.extract(query, choice, limit=limit)

In [None]:
#this loop supplements the missing values from our current dataset with budget and boxoffice data from the new dataset

# NOTE(review): `movie_list_names` is not defined in the visible notebook —
# presumably list(the_numbers); confirm before running.
for i in range(len(new_df)):
    needs_budget = new_df['budget'][i] is None
    needs_boxoffice = new_df['domestic_boxoffice'][i] is None
    if not (needs_budget or needs_boxoffice):
        continue

    # Best fuzzy match for THIS row's title. The original helpers always
    # looked up row 365, so every row received the same movie's numbers.
    matched_title, score = get_matches(new_df['movie'][i], movie_list_names)[0][:2]
    if score <= 95:
        continue
    figures = the_numbers.get(matched_title)
    if figures is None:
        continue

    budget, boxoffice = figures
    # Two independent checks so a row missing both values gets both filled.
    if needs_budget:
        new_df.at[i, 'budget'] = budget
    if needs_boxoffice:
        new_df.at[i, 'domestic_boxoffice'] = boxoffice
        


Nice! Around 300 missing values were filled using the data from the new website.

In [None]:
new_df.count()


In [None]:
# Keep only the columns used for analysis, in a fixed order. .copy() makes this
# an independent frame so columns added to it later do not act on a slice of new_df.
sorted_movie_df=new_df[['movie','year','oscar_win','times_nominated','domestic_boxoffice','budget','release_month','MPAA_rating','running_time','genre','tropes','rt_score','book_based']].copy()

In [None]:
sorted_movie_df


In [None]:
soup = BeautifulSoup(chromedriver.page_source)
links=soup.find_all('a')


In [None]:
# Text of every link on the page, each split on newlines (so each entry is a list).
list_of_movies = [link.text.split('\n') for link in links]
    

In [None]:
sorted_movie_df['true_story_based']=''

def based_on_true_story(sorted_movie_df):
    """Set 'true_story_based' to 1 where a title fuzzy-matches ``list_of_movies``.

    For each row the best match score of the movie title against the
    module-level ``list_of_movies`` is computed with ``get_matches``; a score
    above 95 stores 1, anything else 0.

    Values are written with ``DataFrame.at`` instead of chained indexing.

    Returns the same DataFrame that was passed in.
    """
    for i in range(len(sorted_movie_df)):
        best_score = get_matches(sorted_movie_df['movie'][i], list_of_movies)[0][1]
        sorted_movie_df.at[i, 'true_story_based'] = 1 if best_score > 95 else 0
    return sorted_movie_df
    


In [None]:
based_on_true_story(sorted_movie_df)


In [None]:
sorted_movie_df.to_csv('sorted_movie_data')


In [None]:
sorted_movie_df.describe()


In [None]:
new_df.to_csv('sorted_movie_data2')
