<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
import requests
import re
import dateutil.parser
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pickle
import sys

In [None]:
# Download the Box Office Mojo yearly index page and parse it with the lxml parser.
# `url` and `soup` are notebook globals that later cells read again.
url = 'https://www.boxofficemojo.com/yearly/'
source_code = requests.get(url)
soup = BeautifulSoup(source_code.text, 'lxml')
# Bare expression: in a notebook this displays the HTTP status code as the cell output.
source_code.status_code

In [None]:
#collect urls for the yearly box office charts
# href=True limits the search to anchors that actually carry an href attribute,
# so anchors without one no longer raise KeyError.
yearly_chart_pages = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if 'chart/?' in anchor['href']
]

In [None]:
# Keep ten chart links, skipping the first entry of the list.
# NOTE(review): assumes the links are ordered newest-first and that entry 0 is the
# current, still-incomplete year -- confirm against the live page.
project_years = yearly_chart_pages[1:11] # use most recent 10 complete years

In [None]:
#collect soup files for first page results for each year
# One HTTP request per selected year; each chart href is appended to the yearly index `url`.
# NOTE: the loop rebinds the globals `source` and `soup` on every pass.
project_year_soups = []
for year in project_years:
    source = requests.get(url+year)
    soup = BeautifulSoup(source.text,'lxml')
    project_year_soups.append(soup)

In [None]:
# Gather the href of every movie link found inside <div id="body"> on each yearly
# chart page (expected to be the top 100 for each of the 10 most recent complete years).
movie_links = []
for soup in project_year_soups:
    for body_div in soup.find_all('div', id='body'):
        for anchor in body_div.find_all('a'):
            href = anchor.attrs.get('href')
            # keep only anchors that have an href pointing at a movie page
            if href is not None and '/movies/?' in href:
                movie_links.append(href)

In [None]:
#collect soup files for each movie
# One HTTP request per collected movie link; the parsed pages are kept in memory.
# NOTE(review): the link is appended to a base that already ends in '/'; if the hrefs
# start with '/' the resulting URL contains a double slash -- confirm the site accepts it.
movie_soups = []
for link in movie_links:
    source = requests.get('https://www.boxofficemojo.com/'+link)
    soup = BeautifulSoup(source.text,'lxml')
    movie_soups.append(soup)

In [None]:
#save movie_soups
# Cache the parsed movie pages in movie_soups.pkl (written to the current working
# directory) so the scraping cells do not have to be re-run.
with open('movie_soups.pkl','wb') as picklefile:
    pickle.dump(movie_soups,picklefile)

In [None]:
#load movie_soups
# Restore the cached list of parsed movie pages from movie_soups.pkl.
# NOTE: pickle.load executes whatever the file encodes; only load a file produced by
# the save cell of this notebook, never one from an untrusted source.
with open("movie_soups.pkl", 'rb') as picklefile: 
    movie_soups = pickle.load(picklefile)

In [None]:
def get_movie_value(soup, field_name):
    '''Look up a labelled value on a Box Office Mojo movie page.

    Finds the first text node matching ``field_name`` (used as a regular
    expression) and returns the ``.text`` of the element that follows it as a
    sibling. Returns None when the label is not found or has no next sibling.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None

    # for most fields the value sits in the sibling right after the label
    value_node = label.findNextSibling()
    return value_node.text if value_node else None

In [None]:
def money_to_int(moneystring): #converts dollar values from strings to int
    '''Drop every "$" and "," from a dollar string such as "$1,234" and return it as an int.'''
    digits_only = moneystring.translate(str.maketrans('', '', '$,'))
    return int(digits_only)

def runtime_to_minutes(runtimestring): #converts runtime from hrs and min to min
    '''Convert a runtime string to a total number of minutes.

    Accepts the "<h> hrs. <m> min." layout (first token = hours, third token =
    minutes) and also a lone "<n> hrs." or "<n> min." pair, which previously
    always produced None. Returns None when the string cannot be parsed
    (e.g. "N/A" or an empty string).
    '''
    runtime = runtimestring.split()
    try:
        if len(runtime) == 2:
            # only one quantity present: decide from its unit
            amount, unit = int(runtime[0]), runtime[1].lower()
            if unit.startswith('min'):
                return amount
            if unit.startswith('hr'):
                return amount * 60
            return None
        return int(runtime[0])*60 + int(runtime[2])
    except (ValueError, IndexError):
        # non-numeric token or too few tokens: treat as unknown runtime
        return None

def to_date(datestring): #converts date from string to datetime
    '''Parse a date string with dateutil and return the resulting datetime.'''
    return dateutil.parser.parse(datestring)

def budget_to_int(budgetstring): #converts budget from string to float
    '''Convert a budget string to a float number of dollars.

    Handles three layouts:
      * 'N/A'            -> np.nan
      * '$1,500,000'     -> 1500000.0
      * '$150 million'   -> 150000000.0 (the first token is scaled by one million)

    The "$" is removed explicitly rather than by dropping the first character,
    so a "million" amount written without a "$" keeps its leading digit, and
    thousands separators are accepted in both layouts.
    '''
    if budgetstring == 'N/A':
        return np.nan
    if 'million' not in budgetstring:
        return float(budgetstring.strip().replace('$','').replace(',',''))
    # "<amount> million": only the first whitespace-separated token is the number
    amount = budgetstring.split()[0].replace('$','').replace(',','')
    return float(amount) * 1000000

In [None]:
def _release_cells(soup):
    '''Return the <td> cells of the page's second "mp_box_content" box, or [] if there is none.'''
    try:
        return soup.find_all('div', class_='mp_box_content')[1].find_all('td')
    except IndexError:
        return []


def _first_player(soup, role):
    '''Return (name, href) of the first linked person in the row matching role in "The Players" box.

    Raises (typically AttributeError) when the box, the row or the link is missing.
    '''
    players_tab = soup.find('div', class_='mp_box_tab', text=re.compile('The Players'))
    person = (players_tab.find_parent().find('td', text=re.compile(role))
              .findNextSibling().find('a'))
    return person.text.strip(), person.get('href')


def get_movie_data(soup):
    '''Grab all relevant data points from a movie page on Box Office Mojo.

    Every field is extracted on a best-effort basis: when an extraction fails
    the field gets a default (np.nan, 'Unknown' or '') and ``[title, label]``
    is appended to the error log. A missing franchise section is not logged.

    Returns a 3-tuple ``(movie_dict, error_log, links_dict)``:
      * movie_dict  -- field name -> value for the movie
      * error_log   -- list of ``[title, label]`` pairs for fields that failed
      * links_dict  -- hrefs for 'Franchise' (a list) and 'Director', 'Writer',
                       'Actor', 'Producer' (a string, '' when not found)

    The page must contain a <title> element; it is read without a fallback
    because the title is needed to label every error entry.
    '''
    error_log = []
    title = soup.find('title').text.split(' (')[0]

    def attempt(label, default, extract):
        # Run one extraction; on failure record it under label and fall back to default.
        # Exception (not a bare except) so Ctrl-C still interrupts a long scrape.
        try:
            return extract()
        except Exception:
            error_log.append([title, label])
            return default

    domestic_total_gross = attempt(
        'gross', np.nan, lambda: money_to_int(get_movie_value(soup, 'Domestic Total Gross')))
    runtime = attempt(
        'runtime', np.nan, lambda: runtime_to_minutes(get_movie_value(soup, 'Runtime')))
    budget = attempt(
        'budget', np.nan, lambda: budget_to_int(get_movie_value(soup, 'Budget')))
    release_date = attempt(
        'release date', np.nan, lambda: to_date(get_movie_value(soup, 'Release Date')))
    genre = attempt('genre', 'Unknown', lambda: get_movie_value(soup, 'Genre:'))
    mpaa = attempt('mpaa', 'Unknown', lambda: get_movie_value(soup, 'MPAA'))
    distributor = attempt('distributor', 'Unknown', lambda: get_movie_value(soup, 'Distributor'))

    # Release statistics are read by position from the end of the box's cells.
    # When the second-to-last cell is the 'In Release:' label, the last cell is
    # the days-in-release value and the other values sit two cells further back.
    # A page without the box yields no cells, so the lookups below fail and are
    # logged instead of crashing the whole scrape.
    cells = _release_cells(soup)
    has_days_row = len(cells) >= 2 and cells[-2].text == 'In Release:'
    if has_days_row:
        widest_idx, close_idx = -5, -3
    else:
        widest_idx, close_idx = -3, -1

    def cell_text(idx):
        # cell text with non-breaking spaces removed
        return cells[idx].text.replace('\xa0', '')

    widest_release = attempt(
        'widest release', np.nan,
        lambda: int(cell_text(widest_idx).split(' ')[0].replace(',', '')))
    close_date = attempt('close date', np.nan, lambda: to_date(cell_text(close_idx)))
    if has_days_row:
        days_in_release = attempt(
            'in release', np.nan, lambda: int(cell_text(-1).split(' ')[0]))
    else:
        days_in_release = np.nan

    franchises = []
    franchise_links = []
    try:
        franchise_table = (soup.find('th', text=re.compile('Franchise'))
                           .find_parent('tr').find_parent())
        for f in franchise_table.find_all('a', text=re.compile('Series')):
            franchise_link = f.get('href')
            # link text looks like '<prefix>: <name>' on its first line; keep the name
            franchise = f.text.split('\n')[0].split(':')[1].strip()
            franchises.append(franchise)
            franchise_links.append(franchise_link)
    except Exception:
        # a page without a (well-formed) franchise section is not an error;
        # whatever was collected before the failure is kept
        pass

    # first linked person for each role in "The Players" box
    director, director_link = attempt(
        'director', ('Unknown', ''), lambda: _first_player(soup, 'Director'))
    writer, writer_link = attempt(
        'writer', ('Unknown', ''), lambda: _first_player(soup, 'Writer'))
    actor, actor_link = attempt(
        'actor', ('Unknown', ''), lambda: _first_player(soup, 'Actor'))
    producer, producer_link = attempt(
        'producer', ('Unknown', ''), lambda: _first_player(soup, 'Producer'))

    headers = (['Title','Domestic Total Gross','Runtime(min)','Budget','Release Date',
                'Genre','MPAA','Distributor','Widest Release','Close Date','Days in Release',
                'Franchise(s)','Director','Writer','Actor','Producer'])
    values = ([title,domestic_total_gross,runtime,budget,release_date,genre,mpaa,distributor,
              widest_release,close_date,days_in_release,franchises,director,writer,actor,producer])
    movie_dict = dict(zip(headers,values))
    links_dict = (dict(zip(['Franchise','Director','Writer','Actor','Producer'],
                           [franchise_links,director_link,writer_link,actor_link,producer_link])))
    return movie_dict, error_log,links_dict


In [None]:
#save output from get_movie_data function in 3 separate lists
movie_data = []
errors = []
links = []
# enumerate supplies each soup's own position. The previous list.index() call
# rescanned the list on every pass and returns the first *equal* element, so a
# page that appears twice would have been logged under the wrong index.
for index, soup in enumerate(movie_soups):
    movie_dict, error_log, links_dict = get_movie_data(soup)
    movie_data.append(movie_dict)
    if len(links_dict) > 0: #only add links_dict to links list if links were extracted from movie page
        links.append(links_dict)
    if len(error_log) > 0: #only add error_log to errors list if there were errors extracting info from the movie page
        # prefix the log with the movie's position in movie_soups
        error_log.insert(0, index)
        errors.append(error_log)

In [None]:
def get_field_links(links_dict,field):
    '''Return a flat list of the links stored under field in every entry of links_dict.

    links_dict is an iterable of per-movie dicts (the function now reads this
    argument instead of the global ``links``). A field may hold a list of links
    or a single link string; a string is added as one link rather than being
    split into characters, and empty strings are skipped.
    '''
    field_links_list = []
    for entry in links_dict:
        value = entry[field]
        if isinstance(value, str):
            if value:
                field_links_list.append(value)
        else:
            field_links_list.extend(value)
    return field_links_list

In [None]:
# unique franchise page links across all movies; set() drops the duplicates
# (the resulting order is arbitrary)
franchise_links = list(set(get_field_links(links, 'Franchise')))

In [None]:
#collect names and sizes of franchise from page on boxofficemojo.com
# NOTE: rebinds the global `url` to the site root for the requests below.
url = 'https://www.boxofficemojo.com/'
f_names = []
f_sizes = []
for f in franchise_links:
    franchise_source = requests.get(url + f)
    franchise_soup = BeautifulSoup(franchise_source.text,'lxml')
    # the page's first <h1> is taken as the franchise name
    franchise_name = franchise_soup.find('h1').text
    f_names.append(franchise_name)
    # size = number of <tr> rows under the grandparent of the cell matching "Rank", minus 3
    # NOTE(review): the 3 presumably discounts header/summary rows -- confirm against the page
    franchise_size = (len(franchise_soup.find('td',text = re.compile('Rank')).find_parent()
                          .find_parent().find_all('tr')) - 3)
    f_sizes.append(franchise_size)

# map franchise name -> size; if a name occurs twice the later entry wins
franchise_dict = dict(zip(f_names,f_sizes))

In [None]:
# One row per scraped movie; the columns are the keys of the per-movie dicts in movie_data.
movie_df = pd.DataFrame(movie_data) #create dataframe of movie data

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os

# Machine-specific location of the chromedriver binary; adjust for the local install.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
# Also publish the path through the process environment.
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
def get_imdb_budget(row):
    '''Return the row's budget, looking it up on IMDB when Box Office Mojo had none.

    row is a mapping (e.g. a DataFrame row) with 'Budget' and 'Title' entries.
    When row['Budget'] is not NaN it is returned unchanged and no browser is
    started. Otherwise the title is searched on IMDB with a Chrome session, the
    first result's page is fetched, and the figure after the 'Box Office'
    heading is converted with budget_to_int. If any step fails the original
    (NaN) budget is returned.

    The browser is always shut down, including when it failed part-way, and a
    3 second pause follows every lookup attempt to space out the requests.
    '''
    budget = row['Budget']
    if not np.isnan(budget):
        return budget

    imdb = "https://www.imdb.com/"
    driver = None  # stays None if Chrome itself cannot be started
    try:
        driver = webdriver.Chrome(chromedriver)
        driver.get(imdb)

        # NOTE(review): find_element_by_xpath only exists in Selenium releases
        # before 4.3 -- confirm the installed version.
        search_box = driver.find_element_by_xpath("//input[@type='text']")

        # clear the current search, type the title and submit it
        search_box.clear()
        search_box.send_keys(row['Title'])
        search_box.send_keys(Keys.RETURN)

        soup = BeautifulSoup(driver.page_source,'html.parser')

        # first search result is taken to be the movie
        movie_page_link = soup.find('td',class_ = 'result_text').find('a').get('href')

        movie_page = requests.get(imdb + movie_page_link).text
        movie_soup = BeautifulSoup(movie_page,'lxml')
        budget_text = (movie_soup.find('h3',text = re.compile('Box Office'))
                       .findNextSibling().text.split(':')[1].split('\n')[0])
        budget = budget_to_int(budget_text)
    except Exception:
        # best effort: any failure leaves the budget as it was
        budget = row['Budget']
    finally:
        if driver is not None:
            # quit() ends the whole session (browser and driver process)
            driver.quit()
        time.sleep(3)
    return budget

In [None]:
#add budget info from IMDB to dataframe
# Row-wise apply: get_imdb_budget is called once per movie and its result stored in a
# new 'Budget Adj' column; the original 'Budget' column is left untouched.
movie_df['Budget Adj'] = movie_df.apply(get_imdb_budget,axis = 1)

In [None]:
#save dataframe for modeling
# Pickle the finished dataframe to movie_df.pkl in the current working directory.
with open('movie_df.pkl','wb') as picklefile:
    pickle.dump(movie_df,picklefile)