# Box Office Mojo - Scraping 

Importing packages and opening chromedriver

In [2]:
from bs4 import BeautifulSoup
import requests
import time, os
import re
import pandas as pd
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

### Getting table data for foreign language movies from main page 

Getting list of foreign language movies main page urls

In [7]:
url_list = []
list_ranges = [100*i for i in range(25)]
for range_ in list_ranges:
    url_ = "https://www.boxofficemojo.com/genre/sg4208980225/?offset={}".format(range_)
    url_list.append(url_)

Getting table data from each page 

In [8]:
movies = {}
for url in url_list: 
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    #finding the main table and its rows 
    table = soup.find('table') #finding the main table 
    rows = table.find_all('tr') #finding the rows - tr tag is for rows
    
    #grabbing data for 100 movies with a loop and adding to a dictionary
    for row in rows[1:]:
        items = row.find_all('td')
        link = items[1].find('a')
        title, url = link.text, link['href']
        if title not in movies.keys(): #some movies appear twice 
            movies[title] = [url] + [i.text for i in items] 
        else:
            movies[title+"_2"] = [url] + [i.text for i in items]

Putting table data for foreign language movies in a dataframe 

In [9]:
fl_movies = pd.DataFrame(movies).T  #transpose
fl_movies.columns = ['link_stub', 'gross_rank', 'title',
                    'lifetime_gross', 'max_theaters', 'domestic_opening', 
                     'num_opening_theaters', 'release_date', 'domestic_distributor']

Looking at duplicate movie titles

In [10]:
for movie in movies.keys():
    if movie[-2:] == "_2":
        print(movie)

The Promise_2
Fireworks_2
Brothers_2
Stalingrad_2
The Big Picture_2
Three_2
Django_2
Clash_2
Gloria_2
Come Undone_2
Marius_2
Fanny_2
Gabrielle_2
The Family_2
Iceman_2


### Getting table data for foreign language movies from individual movie pages 

Function to grab values

In [11]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

Helper functions to parse strings into appropriate data types

In [12]:
import dateutil.parser

def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

Creating a function to get features from individual pages and put them in a dictionary 

In [20]:
def get_movie_dict(link):
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    for i in range(5): 
        try:
            response = requests.get(url)
        except:
            time.sleep(10) 
            continue
        else:
            break
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['title', 
               'domestic_total_gross', 
               'international_total_gross',
               'domestic_distributor',
               'domestic_opening',
                'budget',
                'release_date',
                 'release_location',
                 'rating',
                'runtime', 
               'genres',
                'original_release_markets_num',
                 'crew',
                  'cast',
              'earliest_release_location_opening_gross',
               'earliest_release_location_original_gross']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('- Box Office Mojo')[0].strip()

    #Get domestic gross
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
        domestic_total_gross = money_to_int(raw_domestic_total_gross)
    except:
        domestic_total_gross = None 
    
    #Get international gross
    try:
        raw_international_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[1]
                                    .text
                               )
        international_total_gross = money_to_int(raw_international_total_gross)
    except:
        international_total_gross = None
    
    #Get domestic distributor
    try:
        domestic_distributor = get_movie_value(soup,'Domestic Distributor')
        domestic_distributor = domestic_distributor.replace('See full company information', '') # Removing extra descrip
        domestic_distributor = domestic_distributor.strip()
    except:
        domestic_distributor = None 

    #Get domestic opening revenue 
    try:
        raw_domestic_opening = get_movie_value(soup,'Domestic Opening')
        domestic_opening = money_to_int(raw_domestic_opening)
    except:
        domestic_opening = None 
    
    #Get budget
    try:
        raw_budget = get_movie_value(soup,'Budget')
        budget = money_to_int(raw_budget)
    except:
        budget = None 

    #Get release date
    try:
        raw_release_date = get_movie_value(soup,'Release Date').split('\n')[0]
        release_date = to_date(raw_release_date)
    except:
        release_date = None 
    
    #Get earliest release location
    try:
        release_location = get_movie_value(soup,'Release Date').split('\n')[1].strip().strip(')').strip('(')
    except:
        release_location = None 
        
    #Get rating
    try:
        rating = get_movie_value(soup,'MPAA')
    except:
        rating = None
    
    #Get runtime
    try:
        raw_runtime = get_movie_value(soup,'Running')
        runtime = runtime_to_minutes(raw_runtime)
    except:
        runtime = None 
    
    #Get genres
    try:
        genres = get_movie_value(soup,'Genres')
        genres = genres.replace('\n', '')
        _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
        genres = _RE_COMBINE_WHITESPACE.sub(" ", genres).strip()
    except: 
        genres = None 
    
    #Get original release number of markets 
    try:
        original_release_markets_num = soup.find(class_='a-bordered a-horizontal-stripes a-size-base-plus').find_all('td', class_='a-align-center')[2].text
        original_release_markets_num = original_release_markets_num.replace('markets', '').strip()
    except:
        original_release_markets_num = None 
       
    #OPENING CHROME DRIVER 
    driver = webdriver.Chrome(chromedriver)
    for i in range(5): #Try 5 times to reach page
        try:
            driver.get(url)
        except:
            time.sleep(10) # wait 10 seconds and then make http request again
            continue
        else:
            break
    
    #CLICKING ON CAST AND CREW TAB      
    for i in range(5): #Try 5 times to click tab
        try:
            cast_and_crew = driver.find_element_by_xpath('//*[@id="tabs"]/div/a[2]')
            cast_and_crew.click()
        except:
            time.sleep(5) # wait 5 seconds and then make http request again
            driver.refresh() # refresh page
            continue
        else:
            break
    
    #Get crew
    try:
        raw_crew = driver.find_element_by_xpath('//*[@id="principalCrew"]/tbody')
        crew = raw_crew.text.replace('Crew Member Role\n', '').replace('\n', ', ')
    except: 
        crew = None
    
    #Get cast
    try: 
        raw_cast = driver.find_element_by_xpath('//*[@id="principalCast"]')
        string_cast = raw_cast.text
        list_cast = string_cast.split('\n')[1::2]
        cast = ",".join(list_cast).replace(',',', ')
    except:
        cast = None 
    
    #CLICKING ON RELEASE GROUP DROPDOWN AND SELECTING ORIGINAL
    for i in range(5): #Try 5 times to click dropdown
        try:
            all_releases_dropdown = driver.find_element_by_xpath('//*[@id="releasegroup-picker-navSelector"]')
            all_releases_dropdown.send_keys("Original Release")
        except:
            time.sleep(5) # wait 5 seconds and then make request again
            driver.refresh() # refresh page
            continue
        else:
            break
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    table_dates = []
    tables = soup.find_all('table', class_="a-bordered a-horizontal-stripes mojo-table releases-by-region")
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            data = row.find_all('td')
            if len(data) != 0:
                table_dates.append(data)
    new_list = []
    for list_ in table_dates:
        list_row = []
        for element in list_:
            if element.text == '\n        –\n    ':
                list_row.append('missing')
            elif element.text == '':
                list_row.append('missing')
            else: 
                list_row.append(element.text)
        new_list.append(list_row)

    original_release_df = pd.DataFrame(new_list, columns=['country','release_date','opening_revenue','gross_revenue'])
    original_release_df['release_date'].replace({"missing": "Jan 1, 2022"}, inplace=True)
    original_release_df['release_date'] = pd.to_datetime(original_release_df['release_date'], format='%b %d, %Y')
    original_release_df = original_release_df.sort_values(by="release_date")
    original_release_df.reset_index(drop=True)
    
    #Get earliest release location opening revenue
    try:
        raw_earliest_release_location_opening = original_release_df.opening_revenue[0]
        earliest_release_location_opening_gross = money_to_int(raw_earliest_release_location_opening)
    except:
        earliest_release_location_opening_gross = None 
    
    #Get earliest release location gross revenue
    try:
        raw_earliest_release_location_total_gross = original_release_df.gross_revenue[0]
        earliest_release_location_original_gross = money_to_int(raw_earliest_release_location_total_gross)
    except: 
        earliest_release_location_original_gross = None 
    
    #QUITTING CHROME DRIVER
    driver.quit()


    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title,
                                domestic_total_gross, 
                                international_total_gross,
                                domestic_distributor,
                                domestic_opening,
                                budget,
                                release_date,
                                release_location,
                                rating,
                                runtime, 
                                genres,
                                original_release_markets_num,
                                crew,
                                cast,
                                earliest_release_location_opening_gross,
                                earliest_release_location_original_gross]))

    return movie_dict

Passing each link stub into the function 

In [9]:
fl_movies_page_info_list = [] #saving the info to a dictionary OUTSIDE the function

for link in fl_movies.link_stub:
    fl_movies_page_info_list.append(get_movie_dict(link))
    
fl_movies_page_info = pd.DataFrame(fl_movies_page_info_list)  #converting list of dict to df
fl_movies_page_info.set_index('title', inplace=True) 

import pickle #immediately pickling dataframe
fl_movies_page_info.to_pickle('fl_movies_page_info.pickle')

In [13]:
fl_movies_page_info = pd.read_pickle('fl_movies_page_info.pickle')

Converting list of dictionaries into a dataframe

In [14]:
fl_movies_page_info.shape

(2499, 15)

Replacing titles where movies share the same title  

In [15]:
fl_movies_page_info.index = fl_movies_page_info.index.where(~fl_movies_page_info.index.duplicated(), fl_movies_page_info.index + '_2')

In [16]:
fl_movies.head()

Unnamed: 0,link_stub,gross_rank,title,lifetime_gross,max_theaters,domestic_opening,num_opening_theaters,release_date,domestic_distributor
"Crouching Tiger, Hidden Dragon",/title/tt0190332/?ref_=bo_ge_table_1,1,"Crouching Tiger, Hidden Dragon","$128,078,872",2027,"$663,205",16,"Dec 8, 2000",Sony Pictures Classics\n\n
Life Is Beautiful,/title/tt0118799/?ref_=bo_ge_table_2,2,Life Is Beautiful,"$57,247,384",1136,"$118,920",6,"Oct 23, 1998",Miramax\n\n
Hero,/title/tt0299977/?ref_=bo_ge_table_3,3,Hero,"$53,710,019",2175,"$17,800,000",2031,"Aug 27, 2004",Miramax\n\n
Parasite,/title/tt6751668/?ref_=bo_ge_table_4,4,Parasite,"$53,369,749",2001,"$393,216",3,"Oct 11, 2019",Neon\n\n
Instructions Not Included,/title/tt2378281/?ref_=bo_ge_table_5,5,Instructions Not Included,"$44,467,206",978,"$7,846,426",348,"Aug 30, 2013",Lionsgate\n\n


In [17]:
fl_movies_page_info.shape

(2499, 15)

Finding different indices for the two dataframes created

In [18]:
different_indices = fl_movies[~fl_movies.index.isin(fl_movies_page_info.index)]
different_indices

Unnamed: 0,link_stub,gross_rank,title,lifetime_gross,max_theaters,domestic_opening,num_opening_theaters,release_date,domestic_distributor
Padmaavat,/title/tt5935704/?ref_=bo_ge_table_23,23,Padmaavat,"$11,846,060",354,"$4,493,384",326,"Jan 25, 2018",Viva Pictures\n\n
Huevos: Little Rooster's Egg-cellent Adventure,/title/tt4643580/?ref_=bo_ge_table_38,38,Huevos: Little Rooster's Egg-cellent Adventure,"$9,080,818",616,"$3,424,702",395,"Sep 4, 2015",Lionsgate\n\n
Central Station,/title/tt0140888/?ref_=bo_ge_table_69,69,Central Station,"$5,969,553",144,"$35,708",2,"Nov 20, 1998",Sony Pictures Classics\n\n
Bad Education,/title/tt0275491/?ref_=bo_ge_table_85,85,Bad Education,"$5,211,842",106,"$147,370",3,"Nov 19, 2004",Sony Pictures Classics\n\n
Ong-Bak: The Thai Warrior,/title/tt0368909/?ref_=bo_ge_table_99,99,Ong-Bak: The Thai Warrior,"$4,563,167",387,"$1,334,869",387,"Feb 11, 2005",Magnolia Pictures\n\n
Entre Nous,/title/tt0085370/?ref_=bo_ge_table_20,120,Entre Nous,"$3,974,975",-,-,-,"Jan 25, 1984",Metro-Goldwyn-Mayer (MGM)\n\n
La veuve de Saint-Pierre,/title/tt0191636/?ref_=bo_ge_table_53,153,La veuve de Saint-Pierre,"$3,193,889",83,"$31,702",1,"Mar 2, 2001",Lionsgate\n\n
Veer-Zaara,/title/tt0420332/?ref_=bo_ge_table_68,168,Veer-Zaara,"$2,938,532",88,"$843,010",88,"Nov 11, 2004",Yash Raj Films\n\n
Dans une galaxie près de chez vous - Le film,/title/tt0385635/?ref_=bo_ge_table_27,227,Dans une galaxie près de chez vous - Le film,"$2,259,688",74,-,-,"Apr 9, 2004",-
The Chorus,/title/tt0372824/?ref_=bo_ge_table_53,253,The Chorus,"$2,087,128",34,-,-,"Oct 1, 2004",Alliance Atlantis Vivafilm\n\n


In [21]:
fl_movies_page_info_list_2 = [] #saving the info to a dictionary OUTSIDE the function

for link in different_indices.link_stub:
    fl_movies_page_info_list_2.append(get_movie_dict(link))
    
fl_movies_page_info_2 = pd.DataFrame(fl_movies_page_info_list_2)  #converting list of dict to df
fl_movies_page_info_2.set_index('title', inplace=True) 

In [22]:
fl_movies_page_info_2.head()

Unnamed: 0_level_0,domestic_total_gross,international_total_gross,domestic_distributor,domestic_opening,budget,release_date,release_location,rating,runtime,genres,original_release_markets_num,crew,cast,earliest_release_location_opening_gross,earliest_release_location_original_gross
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Padmaavat,11846060,48662720,Viva Pictures,4493384.0,,2018-01-25,5 markets,,164.0,Drama History Romance War,,"Sanjay Leela Bhansali Director, Prakash Kapadi...","Deepika Padukone, Ranveer Singh, Shahid Kapoor...",4493384.0,11846060
Huevos: Little Rooster's Egg-cellent Adventure,9080818,16811743,Lionsgate,3424702.0,5300000.0,2015-08-20,LATAM,PG-13,98.0,Adventure Animation Comedy Family Fantasy,,"Gabriel Riva Palacio Alatriste Director, Rodol...","Bruno Bichir, Carlos Espejel, Angélica Vale, O...",3424702.0,9080818
Central Station,5969553,1520,Sony Pictures Classics,35708.0,,1998-11-20,Domestic,R,110.0,Drama,Domestic,"Walter Salles Director, Marcos Bernstein Write...","Fernanda Montenegro, Vinícius de Oliveira, Mar...",35708.0,5969553
Bad Education,5284284,35138995,Sony Pictures Classics,147370.0,5000000.0,2004-03-19,Spain,NC-17,106.0,Crime Drama,35,"Pedro Almodóvar Director, Pedro Almodóvar Writ...","Gael García Bernal, Fele Martínez, Javier Cáma...",147370.0,5211842
Ong-Bak: The Thai Warrior,4563167,15672259,Magnolia Pictures,1334869.0,,2003-08-13,Taiwan,R,108.0,Action Crime Thriller,,"Prachya Pinkaew Director, Panna Rittikrai Writ...","Tony Jaa, Petchtai Wongkamlao, Pumwaree Yodkam...",1334869.0,4563167


In [23]:
different_indices2 = fl_movies_page_info[~fl_movies_page_info.index.isin(fl_movies.index)]
different_indices2.index

Index(['Server Error', 'Huevos: Little Rooster's Egg', 'Server Error_2',
       'Server Error_2', 'Ong', 'Server Error_2', 'La veuve de Saint', 'Veer',
       'Dans une galaxie près de chez vous', 'Server Error_2', 'Hum Saath',
       'Salaam', 'Kon', 'Server Error_2', 'Server Error_2',
       'Xiu Xiu: The Sent', 'Server Error_2', 'The 100 Year', 'Delhi',
       'The Ex', 'Server Error_2', 'Non', 'Mid', 'Jaan', 'Teo',
       'Server Error_2', 'Server Error_2', 'Head', 'Baise', 'Server Error_2',
       'Ju', 'Anonyma', '3', 'Angel', 'Live', 'Lilya 4', 'Server Error_2',
       'Mughal', '3_2', 'Luck_2', 'Shortkut', 'Tazza: One',
       'I Married an Anti', 'Hara', 'Cet amour', 'Server Error_2',
       'Server Error_2', 'Server Error_2', 'Britt', 'Server Error_2',
       'All About Lily Chou', 'Brodeuses', 'Haikara', 'Ong_2',
       'Server Error_2', 'Server Error_2', 'Server Error_2',
       'Arabian Nights: Volume 3', 'Demi'],
      dtype='object', name='title')

In [24]:
fl_movies_page_info.drop(different_indices2.index, inplace=True)

In [25]:
fl_movies_page_info.shape

(2440, 15)

In [26]:
fl_movies_page_info_2.index = fl_movies_page_info_2.index.where(~fl_movies_page_info_2.index.duplicated(), fl_movies_page_info_2.index + '_2')

In [27]:
fl_movies_page_info = pd.concat((fl_movies_page_info, fl_movies_page_info_2), axis=0)

In [28]:
fl_movies_page_info.shape

(2499, 15)

In [29]:
fl_movies_page_info.index = fl_movies_page_info.index.where(~fl_movies_page_info.index.duplicated(), fl_movies_page_info.index + '_2')

In [30]:
fl_movies = fl_movies.merge(fl_movies_page_info, left_index=True, right_index=True)
fl_movies

Unnamed: 0,link_stub,gross_rank,title,lifetime_gross,max_theaters,domestic_opening_x,num_opening_theaters,release_date_x,domestic_distributor_x,domestic_total_gross,...,release_date_y,release_location,rating,runtime,genres,original_release_markets_num,crew,cast,earliest_release_location_opening_gross,earliest_release_location_original_gross
"Crouching Tiger, Hidden Dragon",/title/tt0190332/?ref_=bo_ge_table_1,1,"Crouching Tiger, Hidden Dragon","$128,078,872",2027,"$663,205",16,"Dec 8, 2000",Sony Pictures Classics\n\n,128078872.0,...,2000-12-08,Domestic,PG-13,,Action Adventure Fantasy Romance,9,"Ang Lee Director, Hui-Ling Wang Writer, James ...","Chow Yun-Fat, Michelle Yeoh, Ziyi Zhang, Chen ...",663205.0,128078872.0
Life Is Beautiful,/title/tt0118799/?ref_=bo_ge_table_2,2,Life Is Beautiful,"$57,247,384",1136,"$118,920",6,"Oct 23, 1998",Miramax\n\n,57563264.0,...,1998-10-23,Domestic,PG-13,116.0,Comedy Drama Romance War,"Domestic, China","Roberto Benigni Director, Vincenzo Cerami Writ...","Roberto Benigni, Nicoletta Braschi, Giorgio Ca...",118920.0,57247384.0
Hero,/title/tt0299977/?ref_=bo_ge_table_3,3,Hero,"$53,710,019",2175,"$17,800,000",2031,"Aug 27, 2004",Miramax\n\n,53710019.0,...,2002-12-19,"APAC, China",PG-13,,Action Adventure History,,"Yimou Zhang Director, Feng Li Writer, Yimou Zh...","Jet Li, Tony Chiu-Wai Leung, Maggie Cheung, Zi...",17800000.0,53710019.0
Parasite,/title/tt6751668/?ref_=bo_ge_table_4,4,Parasite,"$53,369,749",2001,"$393,216",3,"Oct 11, 2019",Neon\n\n,53369749.0,...,2019-05-30,South Korea,R,132.0,Comedy Drama Thriller,34,"Bong Joon Ho Director, Bong Joon Ho Writer, Bo...","Kang-ho Song, Sun-kyun Lee, Yeo-jeong Cho, Woo...",393216.0,53369749.0
Instructions Not Included,/title/tt2378281/?ref_=bo_ge_table_5,5,Instructions Not Included,"$44,467,206",978,"$7,846,426",348,"Aug 30, 2013",Lionsgate\n\n,44467206.0,...,2013-08-30,Domestic,PG-13,115.0,Comedy Drama,15,"Eugenio Derbez Director, Guillermo Ríos Writer...","Eugenio Derbez, Karla Souza, Jessica Lindsey, ...",7846426.0,44467206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Falling,/title/tt0832347/?ref_=bo_ge_table_95,2495,Falling,$509,1,$509,1,"Jun 29, 2007",Kino International\n\n,509.0,...,2007-06-29,Domestic,,88.0,Drama,,"Barbara Albert Director, Barbara Albert Writer...","Nina Proll, Birgit Minichmayr, Kathrin Resetar...",509.0,509.0
Canary,/title/tt0468777/?ref_=bo_ge_table_96,2496,Canary,$504,1,$504,1,"Jul 25, 2008",ImaginAsian Pictures\n\n,504.0,...,2008-07-25,Domestic,,132.0,Drama,,"Akihiko Shiota Director, Akihiko Shiota Writer...","Hôshi Ishida, Mitsuki Tanimura, Hidetoshi Nish...",504.0,504.0
Tasuma,/title/tt0423325/?ref_=bo_ge_table_97,2497,Tasuma,$479,1,$281,1,"Jul 30, 2004",ArtMattan Productions\n\n,479.0,...,2004-07-30,Domestic,,90.0,Drama,,"Kollo Sanou Director, Cheick Tidiane Seck Comp...","Serge Henri, Besani Raoul Khalil",281.0,479.0
News from Planet Mars,/title/tt5038358/?ref_=bo_ge_table_98,2498,News from Planet Mars,$310,1,$24,1,"Jul 22, 2016",Kino Lorber\n\n,310.0,...,2016-03-09,Belgium,,101.0,Comedy Drama,3,"Dominik Moll Director, Dominik Moll Writer, Gi...","François Damiens, Vincent Macaigne, Veerle Bae...",24.0,310.0


In [31]:
#immediately pickling dataframe
fl_movies.to_pickle('fl_movies.pickle')