In [1]:
from bs4 import BeautifulSoup 
import requests
import numpy as np
import pandas as pd
import re

In [2]:
# scrape one page of the Box Office Mojo PG-13 lifetime-gross chart into the `movies` dict
def pg13_info(url):
    '''
    Scrape one page of the Box Office Mojo PG-13 lifetime-gross chart.

    Downloads `url`, takes the first <table> on the page and, for every row
    after the header row, stores the movie's attributes in the module-level
    `movies` dict (it must exist before this function is called):

        movies[title] = [link, title, mpaa_rank, lifetime_gross,
                         overall_rank, release_year]

    The chart is paginated, so the function is meant to be called once per
    page URL.

    Parameters
    ----------
    url : str
        Address of one chart page.

    Returns
    -------
    None
        Results are written into the global `movies` dict.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no <table>, or a lifetime-gross cell cannot be
        converted to an int.
    '''
    # fail fast on a hung connection or an HTTP error page instead of
    # trying to parse something that is not the chart
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # parse the page's HTML code using Beautiful Soup with the lxml parser
    soup = BeautifulSoup(response.text, 'lxml')

    # all the rows are nested inside the first <table>
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> found at ' + url)

    # index 0 holds the table headers, so the data starts at index 1
    for row in table.find_all('tr')[1:]:

        # look the cells up once per row; every field below is read by position
        cells = row.find_all('td')

        # first <td>: the movie title, wrapped in an <a> whose href is the link to the movie page
        title = cells[0].text
        link = cells[0].find('a').get('href')

        # second <td>: rank amongst all PG-13 movies (kept as text)
        mpaa_rank = cells[1].text

        # third <td>: lifetime gross, e.g. '$936,662,225' -> 936662225
        lifetime_gross = int(cells[2].text.replace('$', '').replace(',', ''))

        # fourth <td>: rank amongst all movies regardless of MPAA rating (kept as text)
        overall_rank = cells[3].text

        # fifth <td>: release year (kept as text)
        release_year = cells[4].text

        # NOTE: the title is the dict key, so a later row with the same title
        # replaces the earlier one
        movies[title] = [link,
                         title,
                         mpaa_rank,
                         lifetime_gross,
                         overall_rank,
                         release_year]

In [3]:
# pg13_info() writes into this dict, so it has to exist (and be empty) before the loop runs
movies = {}

# Every chart page shares the same address and differs only in its query-string suffix.
# NOTE(review): the trailing '%22' on the first page's suffix looks like a stray
# URL-encoded quote from a copy/paste; it is kept unchanged here -- confirm before removing.
chart_url = 'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13'
url_list = [chart_url + '&ref_=bo_cso_ac%22']
url_list += [chart_url + '&offset=' + str(offset) for offset in (200, 400, 600, 800)]

# scrape the pages one after another
for url in url_list:
    pg13_info(url)

In [4]:
# Put the scraped chart rows into a DataFrame: one row per movie, indexed by the dict key (the title).
# from_dict(orient='index') builds the rows directly, so each column gets its own dtype
# (lifetime_gross stays numeric) instead of the all-object frame that building the
# frame column-wise and transposing it produces; it also works when `movies` is empty.
pg13_movies = pd.DataFrame.from_dict(
    movies,
    orient = 'index',
    columns = ['link_stub', 'title', 'mpaa_rank', 'lifetime_gross', 'overall_rank', 'year'],
)
pg13_movies

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,1,936662225,1,2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,2,858373000,2,2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,Avatar,3,760507625,3,2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,4,700426566,4,2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,5,678815482,5,2018
...,...,...,...,...,...,...
Black Knight,/title/tt0265087/?ref_=bo_cso_table_196,Black Knight,996,33426971,2557,2001
Street Fighter,/title/tt0111301/?ref_=bo_cso_table_197,Street Fighter,997,33423521,2558,1994
Blue Jasmine,/title/tt2334873/?ref_=bo_cso_table_198,Blue Jasmine,998,33405481,2559,2013
Jojo Rabbit,/title/tt2584384/?ref_=bo_cso_table_199,Jojo Rabbit,999,33370906,2561,2019


In [5]:
def _money_to_int(text):
    '''Convert a money string such as '$1,234' to the int 1234.'''
    return int(text.replace('$', '').replace(',', ''))


def _text_after_label(soup, label):
    '''
    Return the text of the element that follows the first text node matching `label`.

    `label` is either a plain string (exact match) or a compiled regex.
    Raises AttributeError when the label, or an element after it, is not found.
    '''
    return soup.find(string = label).find_next().text


def indiv_movie_info(df):
    '''
    Get detailed individual movie info.

    For every row of `df`, the link stub is read from column position 0 and
    the movie title from column position 1. The movie page at
    'https://www.boxofficemojo.com' + link stub is downloaded and parsed, and
    the result is stored in the module-level `indiv_movie` dict (it must exist
    before this function is called):

        indiv_movie[title] = [domestic_distributor, domestic_opening, budget,
                              earliest_release_date, MPAA, run_time, genres]

    Scraping is best effort: a field that cannot be located or parsed on a
    page is stored as None.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds link stubs and whose second column
        holds titles.

    Returns
    -------
    None
        Results are written into the global `indiv_movie` dict.
    '''
    url_temp = 'https://www.boxofficemojo.com'

    # the label patterns never change, so compile them once instead of once per movie
    earliest_release_label = re.compile('Earliest Release')
    mpaa_label = re.compile('MPAA')
    run_time_label = re.compile('Run')

    # The only errors that mean "this field is not on the page": a lookup that
    # returned None (AttributeError), a missing positional element (IndexError)
    # or text that is not a number (ValueError). Anything else -- including
    # KeyboardInterrupt -- is allowed to propagate instead of being swallowed.
    missing = (AttributeError, IndexError, ValueError)

    # walk the frame row by row: position 0 is the link stub, position 1 the title
    for link_stub, movie_title in zip(df.iloc[:, 0], df.iloc[:, 1]):

        # connect to the movie page and parse its HTML with the lxml parser
        response = requests.get(url_temp + link_stub)
        soup = BeautifulSoup(response.text, 'lxml')

        # the div with class mojo-summary-values holds the summary fields read by position below
        div = soup.find('div', class_ = 'mojo-summary-values')

        # domestic distributor: second <span> of the div; the same span also carries a
        # 'See full company information' link text, which is cut off
        try:
            domestic_distributor = div.find_all('span')[1].text.split('See')[0]
        except missing:
            domestic_distributor = None

        # money values inside the div, looked up once and then read by position
        # NOTE(review): positional -- the first money span is always stored as the
        # domestic opening and the second as the budget, whatever the page actually lists
        money_spans = div.find_all('span', class_ = 'money') if div is not None else []

        # domestic opening: first money span, '$' and commas removed, as int
        try:
            domestic_opening = _money_to_int(money_spans[0].text)
        except missing:
            domestic_opening = None

        # budget: second money span, '$' and commas removed, as int
        try:
            budget = _money_to_int(money_spans[1].text)
        except missing:
            budget = None

        # earliest release date: first line of the text following the label
        try:
            earliest_release_date = _text_after_label(soup, earliest_release_label).split('\n')[0]
        except missing:
            earliest_release_date = None

        # rating
        try:
            MPAA = _text_after_label(soup, mpaa_label)
        except missing:
            MPAA = None

        # run time
        try:
            run_time = _text_after_label(soup, run_time_label)
        except missing:
            run_time = None

        # genres: the label must match 'Genres' exactly; the value is split into a list of words
        try:
            genres = _text_after_label(soup, 'Genres').replace('\n', '').split()
        except missing:
            genres = None

        # add each individual movie to indiv_movie dict
        indiv_movie[movie_title] = [domestic_distributor,
                                    domestic_opening,
                                    budget,
                                    earliest_release_date,
                                    MPAA,
                                    run_time,
                                    genres]

In [6]:
# indiv_movie_info() fills this dict in place (one entry per movie title),
# so start from an empty one each time the cell is run
indiv_movie = dict()
indiv_movie_info(df = pg13_movies)

In [7]:
# column labels, in the same order as the list stored per movie by indiv_movie_info()
detail_columns = ['domestic_distributor',
                  'domestic_opening',
                  'budget',
                  'earliest_release_date',
                  'MPAA',
                  'run_time',
                  'genres']

# the dict keys (titles) start out as columns; transposing turns them into the row index
pg13_movies_det = pd.DataFrame(data = indiv_movie).T
pg13_movies_det.columns = detail_columns
pg13_movies_det

Unnamed: 0,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Star Wars: Episode VII - The Force Awakens,Walt Disney Studios Motion Pictures,247966675,245000000,"December 16, 2015",PG-13,2 hr 18 min,"[Action, Adventure, Sci-Fi]"
Avengers: Endgame,Walt Disney Studios Motion Pictures,357115007,356000000,"April 24, 2019",PG-13,3 hr 1 min,"[Action, Adventure, Drama, Sci-Fi]"
Avatar,Twentieth Century Fox,77025481,237000000,"December 16, 2009",PG-13,2 hr 42 min,"[Action, Adventure, Fantasy, Sci-Fi]"
Black Panther,Walt Disney Studios Motion Pictures,202003951,,"February 13, 2018",PG-13,2 hr 14 min,"[Action, Adventure, Sci-Fi]"
Avengers: Infinity War,Walt Disney Studios Motion Pictures,257698183,,"April 25, 2018",PG-13,2 hr 29 min,"[Action, Adventure, Sci-Fi]"
...,...,...,...,...,...,...,...
Black Knight,Twentieth Century Fox,11102948,50000000,"November 21, 2001",PG-13,1 hr 35 min,"[Adventure, Comedy, Fantasy]"
Street Fighter,Universal Pictures,6859495,35000000,"December 23, 1994",PG-13,1 hr 42 min,"[Action, Adventure, Comedy, Thriller]"
Blue Jasmine,Sony Pictures Classics,612064,,"July 26, 2013",PG-13,1 hr 38 min,"[Comedy, Drama]"
Jojo Rabbit,Fox Searchlight Pictures,349555,14000000,"October 18, 2019",PG-13,1 hr 48 min,"[Comedy, Drama, War]"


In [8]:
# join the chart columns and the per-movie detail columns; both frames are indexed by
# movie title, so rows are matched on the index and only titles present in both are kept
pg13_movies_full = pd.merge(
    pg13_movies,
    pg13_movies_det,
    how = 'inner',
    left_index = True,
    right_index = True,
)
pg13_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,1,936662225,1,2015,Walt Disney Studios Motion Pictures,247966675,245000000,"December 16, 2015",PG-13,2 hr 18 min,"[Action, Adventure, Sci-Fi]"
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,2,858373000,2,2019,Walt Disney Studios Motion Pictures,357115007,356000000,"April 24, 2019",PG-13,3 hr 1 min,"[Action, Adventure, Drama, Sci-Fi]"
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,Avatar,3,760507625,3,2009,Twentieth Century Fox,77025481,237000000,"December 16, 2009",PG-13,2 hr 42 min,"[Action, Adventure, Fantasy, Sci-Fi]"
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,4,700426566,4,2018,Walt Disney Studios Motion Pictures,202003951,,"February 13, 2018",PG-13,2 hr 14 min,"[Action, Adventure, Sci-Fi]"
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,5,678815482,5,2018,Walt Disney Studios Motion Pictures,257698183,,"April 25, 2018",PG-13,2 hr 29 min,"[Action, Adventure, Sci-Fi]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Knight,/title/tt0265087/?ref_=bo_cso_table_196,Black Knight,996,33426971,2557,2001,Twentieth Century Fox,11102948,50000000,"November 21, 2001",PG-13,1 hr 35 min,"[Adventure, Comedy, Fantasy]"
Street Fighter,/title/tt0111301/?ref_=bo_cso_table_197,Street Fighter,997,33423521,2558,1994,Universal Pictures,6859495,35000000,"December 23, 1994",PG-13,1 hr 42 min,"[Action, Adventure, Comedy, Thriller]"
Blue Jasmine,/title/tt2334873/?ref_=bo_cso_table_198,Blue Jasmine,998,33405481,2559,2013,Sony Pictures Classics,612064,,"July 26, 2013",PG-13,1 hr 38 min,"[Comedy, Drama]"
Jojo Rabbit,/title/tt2584384/?ref_=bo_cso_table_199,Jojo Rabbit,999,33370906,2561,2019,Fox Searchlight Pictures,349555,14000000,"October 18, 2019",PG-13,1 hr 48 min,"[Comedy, Drama, War]"


In [9]:
# list the distinct rating values that were scraped (None marks pages where no rating was found)
pg13_movies_full.loc[:, 'MPAA'].unique()

array(['PG-13', None], dtype=object)

In [10]:
# Every title was taken from the by_mpaa=PG-13 chart, so a missing rating is filled with 'PG-13'.
# The filled column is assigned back rather than calling fillna(inplace=True) on the
# selected column: that form acts on an intermediate object and, under pandas
# Copy-on-Write, leaves pg13_movies_full itself unchanged.
pg13_movies_full['MPAA'] = pg13_movies_full['MPAA'].fillna('PG-13')

In [11]:
# show the merged frame again after the missing MPAA values were filled
pg13_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,1,936662225,1,2015,Walt Disney Studios Motion Pictures,247966675,245000000,"December 16, 2015",PG-13,2 hr 18 min,"[Action, Adventure, Sci-Fi]"
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,2,858373000,2,2019,Walt Disney Studios Motion Pictures,357115007,356000000,"April 24, 2019",PG-13,3 hr 1 min,"[Action, Adventure, Drama, Sci-Fi]"
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,Avatar,3,760507625,3,2009,Twentieth Century Fox,77025481,237000000,"December 16, 2009",PG-13,2 hr 42 min,"[Action, Adventure, Fantasy, Sci-Fi]"
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,4,700426566,4,2018,Walt Disney Studios Motion Pictures,202003951,,"February 13, 2018",PG-13,2 hr 14 min,"[Action, Adventure, Sci-Fi]"
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,5,678815482,5,2018,Walt Disney Studios Motion Pictures,257698183,,"April 25, 2018",PG-13,2 hr 29 min,"[Action, Adventure, Sci-Fi]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Knight,/title/tt0265087/?ref_=bo_cso_table_196,Black Knight,996,33426971,2557,2001,Twentieth Century Fox,11102948,50000000,"November 21, 2001",PG-13,1 hr 35 min,"[Adventure, Comedy, Fantasy]"
Street Fighter,/title/tt0111301/?ref_=bo_cso_table_197,Street Fighter,997,33423521,2558,1994,Universal Pictures,6859495,35000000,"December 23, 1994",PG-13,1 hr 42 min,"[Action, Adventure, Comedy, Thriller]"
Blue Jasmine,/title/tt2334873/?ref_=bo_cso_table_198,Blue Jasmine,998,33405481,2559,2013,Sony Pictures Classics,612064,,"July 26, 2013",PG-13,1 hr 38 min,"[Comedy, Drama]"
Jojo Rabbit,/title/tt2584384/?ref_=bo_cso_table_199,Jojo Rabbit,999,33370906,2561,2019,Fox Searchlight Pictures,349555,14000000,"October 18, 2019",PG-13,1 hr 48 min,"[Comedy, Drama, War]"


In [12]:
# save the merged data to the working directory; the title index is written as the first column
pg13_movies_full.to_csv(path_or_buf = 'pg13_movies_data.csv')