In [4]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']  # or svg
%matplotlib inline

sns.set()

Grab Links of Movies From 2000 to 2019

In [1]:
url = 'https://www.boxofficemojo.com/year/{}/?ref_=bo_yl_table_3'
urls = []
for year in range(2000,2019):
    urls.append(url.format(str(year)))
print(urls)

['https://www.boxofficemojo.com/year/2000/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2001/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2002/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2003/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2004/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2005/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2006/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2007/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2008/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2009/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2010/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2011/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2012/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2013/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2014/?ref_=bo_yl_table_3', 'https://www.boxofficemojo.com/year/2015/?ref_=bo_yl_t

Scrapping Domestic Movies for year 2000 - 2019

In [5]:
def get_all_rows(urls):
    rows = []
    for url in urls:
        response = requests.get(url)
        page = response.text
        Soup = BeautifulSoup(page,'lxml')
        table = Soup.find('table')
        for row in table.find_all('tr')[1:]:
            rows.append(row)
    return rows
    
rows = get_all_rows(urls)

In [6]:
len(rows)

3800

Get Table From Page / Scrape Table

In [7]:
def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def generate_movies_dict(rows):
    movies = {}
    for row in rows:
        items = row.find_all('td')
        link = items[1].find('a')
        title, url = link.text, link['href']
        
        # avoid repeted titles 
        if title in movies.keys():
            continue
        else:
            domestic_gross = items[5].text
            domestic_gross = money_to_int(domestic_gross)
            distributor = items[9].text[:-2]
            movies[title] = [url,domestic_gross, distributor]
    return movies


movies = generate_movies_dict(rows)
movies_df = pd.DataFrame(movies).T  #transpose
movies_df.columns =['link', 
                    'domestic_gross','distributor']

movies_df.head()

Unnamed: 0,link,domestic_gross,distributor
How the Grinch Stole Christmas,/release/rl3059189249/?ref_=bo_yld_table_1,251628705,Universal Pictures
Mission: Impossible II,/release/rl1600292353/?ref_=bo_yld_table_2,215409889,Paramount Pictures
Gladiator,/release/rl2136245761/?ref_=bo_yld_table_3,186610052,DreamWorks Distribution
The Perfect Storm,/release/rl661161473/?ref_=bo_yld_table_4,182618434,Warner Bros.
Meet the Parents,/release/rl677545473/?ref_=bo_yld_table_5,161146255,Universal Pictures


Adding Helper Functions to Scrapp the data

In [9]:
import dateutil.parser
import re

def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
        
    if next_element:
        if field_name is 'IMDbPro':
            return next_element.find_all('a')[0].get('href')
        else:
            return next_element.text 
    else:
        return None
    


def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None


def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic_gross
        - total gross
        - budget
        - runtime 
        - Genres
        - MPAA rating
        -distributor
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_gross', 'total_gross','budget', 
               'runtime_minutes', 'genres','rating', 'imd_url','distributor']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()

    #Get World Wide gross
    head = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')
    
    #Domestic Gross
    #print(len(head))
    raw_domestic_gross = (head[0].text)
    domestic_gross = money_to_int(raw_domestic_gross)
    
    #Gross Total
    if len(head) == 3:
        raw_worldwide_total_gross = (head[2].text
                               )
        total_gross = money_to_int(raw_worldwide_total_gross)
    elif (len(head) == 2) and (head[0].txt == head[1].txt):
        raw_worldwide_total_gross = (head[1].txt)
        total_gross = money_to_int(raw_worldwide_total_gross)
    else:
        total_gross = None
        
    
    # Get Budget
    raw_budget = get_movie_value(soup, 'Budget')
    if raw_budget is not None:
        budget = money_to_int(raw_budget)
    else:
        budget = raw_budget
    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime)
    # Considering the first Genres
    genres = get_movie_value(soup, 'Genres')
    genres = genres.split('\n')[0]
    #Get rating
    rating = get_movie_value(soup,'MPAA')
    imd_url = get_movie_value(soup, 'IMDbPro')
    #print(f'print link {next_link}')
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title, domestic_gross, total_gross, budget, runtime, genres, rating, imd_url]))

    return movie_dict

def scrape_imdbpro(url):
    response = requests.get(url)
    page = response.text
    imd_soup = BeautifulSoup(page,"lxml")
    
    head = imd_soup.find_all('div', id='const_page_summary_section')
    imd_dict = {}
    spans = head[0].find_all('span')
    for i, span in enumerate(spans):
        #print(f'{i}  {span.text}')
        if 'Director' in span.text:
            if 'director' not in imd_dict:
                imd_dict['director'] = spans[i+1].text
        if 'Cinematographer' in span.text:
             imd_dict['cinematographer'] = spans[i+1].text
        if 'Producer' in span.text:
            imd_dict['producers'] = spans[i+1].text        
    return imd_dict

def scrape_opening(url):
    response = requests.get(url)
    page = response.text
    imd_soup = BeautifulSoup(page,"lxml")
    head = imd_soup.find_all('span', class_='a-color-secondary')
    return head
    


In [None]:
movies_info_list = []
for i, link in enumerate(movies_df.link):
    try:
        movie_dict = get_movie_dict(link)
        #print(movie_dict)
        imd_dict = scrape_imdbpro(movie_dict['imd_url'])
        imd_dict['distributor'] = movies_df.iloc[i,2]
        #print(imd_dict)
        movies_info_list.append( dict(movie_dict, **imd_dict))
    except:
        print(f'Error in {link}')

# Save scrapped data in data directory

In [None]:
movies_info_list[0]
scrapped_data = pd.DataFrame(movies_info_list)
scrapped_data.to_csv(r'.\data\scrapped_data.csv', index=False)