In [1]:
from bs4 import BeautifulSoup 
import requests
import numpy as np
import pandas as pd
import re

In [2]:
# Define a function that scrapes one page of a Box Office Mojo lifetime-gross chart
def pg_info(url, timeout=30):
    '''
    Scrape one page of a Box Office Mojo lifetime-gross chart into ``movies``.

    The first <table> on the page is read row by row (the header row is
    skipped) and one entry per movie is written to the module-level ``movies``
    dict, which must already exist when this function is called.

    Parameters
    ----------
    url : str
        Address of the chart page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    None
        Results are stored as ``movies[title] = [link, title, mpaa_rank,
        lifetime_gross, overall_rank, release_year]``.  Because the title is
        the key, a later row with the same title overwrites an earlier one.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no <table>, or a gross value is not numeric.
    '''
    # connect to the url and fail loudly on an error status instead of
    # trying to parse an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # parse the page's HTML code using Beautiful Soup with the lxml parser
    soup = BeautifulSoup(response.text, 'lxml')

    # all the rows are nested inside the first <table>
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> found at ' + url)

    # each row is a <tr>; index 0 contains the table headers, so start at 1
    for row in table.find_all('tr')[1:]:

        # look the cells up once per row instead of once per field
        cells = row.find_all('td')

        # the first cell holds the title text and, inside its <a>, the link stub
        title = cells[0].text
        link = cells[0].find('a').get('href')

        # rank within the chart being scraped
        mpaa_rank = cells[1].text

        # lifetime gross: strip '$' and thousands separators, then convert to int
        lifetime_gross = int(cells[2].text.replace('$', '').replace(',', ''))

        # rank amongst all movies regardless of MPAA rating
        overall_rank = cells[3].text

        # release year
        release_year = cells[4].text

        # keyed by title; the value is the list of all attributes
        movies[title] = [link,
                         title,
                         mpaa_rank,
                         lifetime_gross,
                         overall_rank,
                         release_year]

In [3]:
# The five chart pages share one address and differ only in the offset suffix;
# the first page carries no offset at all.
url_list = ['https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG' + page_suffix
            for page_suffix in ('', '&offset=200', '&offset=400', '&offset=600', '&offset=800')]

# pg_info() writes its results into this module-level dict, so create it first.
movies = dict()
for url in url_list:
    pg_info(url)

In [4]:
# Each dict key becomes a column, so flip the frame to get one row per movie.
pg_movies = pd.DataFrame(movies).transpose()
pg_movies.columns = [
    'link_stub',
    'title',
    'mpaa_rank',
    'lifetime_gross',
    'overall_rank',
    'year',
]
pg_movies

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year
Incredibles 2,/title/tt3606756/?ref_=bo_cso_table_1,Incredibles 2,1,608581744,10,2018
The Lion King,/title/tt6105098/?ref_=bo_cso_table_2,The Lion King,2,543638043,11,2019
Beauty and the Beast,/title/tt2771200/?ref_=bo_cso_table_3,Beauty and the Beast,3,504481165,15,2017
Finding Dory,/title/tt2277860/?ref_=bo_cso_table_4,Finding Dory,4,486295561,16,2016
Frozen II,/title/tt4520988/?ref_=bo_cso_table_5,Frozen II,5,477373578,17,2019
...,...,...,...,...,...,...
Far from Home: The Adventures of Yellow Dog,/title/tt0113028/?ref_=bo_cso_table_196,Far from Home: The Adventures of Yellow Dog,996,11642946,4619,1995
The Promise,/title/tt0079756/?ref_=bo_cso_table_197,The Promise,997,11606005,4623,1979
Police Academy 6: City Under Siege,/title/tt0098105/?ref_=bo_cso_table_198,Police Academy 6: City Under Siege,998,11567217,4631,1989
The Other Side of the Mountain: Part II,/title/tt0078044/?ref_=bo_cso_table_199,The Other Side of the Mountain: Part II,999,11565678,4632,1978


In [5]:
# Errors expected when a page simply lacks a field: a lookup returned None
# (AttributeError / TypeError), a result list was shorter than expected
# (IndexError), or the text was not a number (ValueError).  Anything else,
# including KeyboardInterrupt, is allowed to propagate.
_MISSING_FIELD_ERRORS = (AttributeError, IndexError, TypeError, ValueError)

# Label patterns, compiled once rather than on every loop iteration.
_EARLIEST_RELEASE_LABEL = re.compile('Earliest Release')
_MPAA_LABEL = re.compile('MPAA')
_RUN_TIME_LABEL = re.compile('Run')


def _money_to_int(money_spans, position):
    '''Return the money span at `position` as an int, or None if absent or non-numeric.'''
    try:
        return int(money_spans[position].text.replace('$', '').replace(',', ''))
    except _MISSING_FIELD_ERRORS:
        return None


def _text_after_label(soup, label):
    '''Return the text of the element following the first string matching `label`, or None.'''
    try:
        return soup.find(string=label).find_next().text
    except _MISSING_FIELD_ERRORS:
        return None


def indiv_movie_info(df, timeout=30):
    '''
    Get detailed individual movie info.

    For every row of `df` the movie's Box Office Mojo page is downloaded and
    its summary fields are stored in the module-level ``indiv_movie`` dict,
    which must already exist when this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 must hold the link stub and column 1 the movie title
        (columns are read by position, not by name).
    timeout : float, optional
        Seconds to wait for each page before giving up.

    Returns
    -------
    None
        Results are stored as ``indiv_movie[title] = [domestic_distributor,
        domestic_opening, budget, earliest_release_date, MPAA, run_time,
        genres]``.  A field that cannot be found on the page is stored as
        None.  Rows sharing a title overwrite one another.
    '''
    url_temp = 'https://www.boxofficemojo.com'

    # one request per row of the df passed in
    for link_stub, movie_title in zip(df.iloc[:, 0], df.iloc[:, 1]):

        # connect to the movie page and parse it with the lxml parser
        response = requests.get(url_temp + link_stub, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')

        # the div with class mojo-summary-values contains the detailed info
        div = soup.find('div', class_='mojo-summary-values')

        # domestic distributor is the second <span> (index 1); the same span
        # also carries a 'See full company information' link text, which is
        # cut off by splitting on 'See'
        try:
            domestic_distributor = div.find_all('span')[1].text.split('See')[0]
        except _MISSING_FIELD_ERRORS:
            domestic_distributor = None

        # domestic opening and budget are taken as the first and second
        # money-class <span> inside the div
        # NOTE(review): this is purely positional - if a page lists a budget
        # but no opening, the budget would presumably land in
        # domestic_opening; confirm against such a page.
        try:
            money_spans = div.find_all('span', class_='money')
        except _MISSING_FIELD_ERRORS:
            money_spans = []
        domestic_opening = _money_to_int(money_spans, 0)
        budget = _money_to_int(money_spans, 1)

        # earliest release date: keep only the first line of the value
        earliest_release_date = _text_after_label(soup, _EARLIEST_RELEASE_LABEL)
        if earliest_release_date is not None:
            earliest_release_date = earliest_release_date.split('\n')[0]

        # rating
        MPAA = _text_after_label(soup, _MPAA_LABEL)

        # run time
        run_time = _text_after_label(soup, _RUN_TIME_LABEL)

        # genres: the label must match 'Genres' exactly; the value is split
        # on whitespace into a list
        genres = _text_after_label(soup, 'Genres')
        if genres is not None:
            genres = genres.replace('\n', '').split()

        # add each individual movie to indiv_movie dict
        indiv_movie[movie_title] = [domestic_distributor,
                                    domestic_opening,
                                    budget,
                                    earliest_release_date,
                                    MPAA,
                                    run_time,
                                    genres]

In [6]:
# indiv_movie_info() writes into this module-level dict, so it has to exist
# before the scrape starts.
indiv_movie = dict()
indiv_movie_info(df=pg_movies)

In [7]:
# indiv_movie maps title -> list of fields; flip the frame so each title is a row.
pg_movies_det = pd.DataFrame(indiv_movie).T
pg_movies_det.columns = ['domestic_distributor', 'domestic_opening', 'budget',
                         'earliest_release_date', 'MPAA', 'run_time', 'genres']
pg_movies_det

Unnamed: 0,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Incredibles 2,Walt Disney Studios Motion Pictures,182687905,,"June 14, 2018",PG,1 hr 58 min,"[Action, Adventure, Animation, Comedy, Family,..."
The Lion King,Walt Disney Studios Motion Pictures,191770759,260000000,"July 11, 2019",PG,1 hr 58 min,"[Adventure, Animation, Drama, Family, Musical]"
Beauty and the Beast,Walt Disney Studios Motion Pictures,174750616,160000000,"March 16, 2017",PG,2 hr 9 min,"[Family, Fantasy, Musical, Romance]"
Finding Dory,Walt Disney Studios Motion Pictures,135060273,,"June 15, 2016",PG,1 hr 37 min,"[Adventure, Animation, Comedy, Family]"
Frozen II,Walt Disney Studios Motion Pictures,130263358,150000000,"November 20, 2019",PG,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy..."
...,...,...,...,...,...,...,...
Far from Home: The Adventures of Yellow Dog,Twentieth Century Fox,4231917,,"January 13, 1995",PG,1 hr 21 min,"[Adventure, Family]"
The Promise,Universal Pictures,,,"January 1, 1979",,1 hr 37 min,"[Drama, Romance]"
Police Academy 6: City Under Siege,Warner Bros.,4032480,,"March 10, 1989",,1 hr 24 min,"[Comedy, Crime]"
The Other Side of the Mountain: Part II,Universal Pictures,,,"January 1, 1978",,1 hr 40 min,"[Biography, Drama, Romance]"


In [8]:
# Both frames are indexed by movie title, so line them up on the index.
pg_movies_full = pd.merge(pg_movies, pg_movies_det, left_index=True, right_index=True)
pg_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Incredibles 2,/title/tt3606756/?ref_=bo_cso_table_1,Incredibles 2,1,608581744,10,2018,Walt Disney Studios Motion Pictures,182687905,,"June 14, 2018",PG,1 hr 58 min,"[Action, Adventure, Animation, Comedy, Family,..."
The Lion King,/title/tt6105098/?ref_=bo_cso_table_2,The Lion King,2,543638043,11,2019,Walt Disney Studios Motion Pictures,191770759,260000000,"July 11, 2019",PG,1 hr 58 min,"[Adventure, Animation, Drama, Family, Musical]"
Beauty and the Beast,/title/tt2771200/?ref_=bo_cso_table_3,Beauty and the Beast,3,504481165,15,2017,Walt Disney Studios Motion Pictures,174750616,160000000,"March 16, 2017",PG,2 hr 9 min,"[Family, Fantasy, Musical, Romance]"
Finding Dory,/title/tt2277860/?ref_=bo_cso_table_4,Finding Dory,4,486295561,16,2016,Walt Disney Studios Motion Pictures,135060273,,"June 15, 2016",PG,1 hr 37 min,"[Adventure, Animation, Comedy, Family]"
Frozen II,/title/tt4520988/?ref_=bo_cso_table_5,Frozen II,5,477373578,17,2019,Walt Disney Studios Motion Pictures,130263358,150000000,"November 20, 2019",PG,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Far from Home: The Adventures of Yellow Dog,/title/tt0113028/?ref_=bo_cso_table_196,Far from Home: The Adventures of Yellow Dog,996,11642946,4619,1995,Twentieth Century Fox,4231917,,"January 13, 1995",PG,1 hr 21 min,"[Adventure, Family]"
The Promise,/title/tt0079756/?ref_=bo_cso_table_197,The Promise,997,11606005,4623,1979,Universal Pictures,,,"January 1, 1979",,1 hr 37 min,"[Drama, Romance]"
Police Academy 6: City Under Siege,/title/tt0098105/?ref_=bo_cso_table_198,Police Academy 6: City Under Siege,998,11567217,4631,1989,Warner Bros.,4032480,,"March 10, 1989",,1 hr 24 min,"[Comedy, Crime]"
The Other Side of the Mountain: Part II,/title/tt0078044/?ref_=bo_cso_table_199,The Other Side of the Mountain: Part II,999,11565678,4632,1978,Universal Pictures,,,"January 1, 1978",,1 hr 40 min,"[Biography, Drama, Romance]"


In [9]:
# List the distinct values found in the scraped MPAA column.
pg_movies_full.loc[:, 'MPAA'].unique()

array(['PG', None], dtype=object)

In [10]:
# Every title came from the by_mpaa=PG chart, so a missing rating is filled
# with 'PG'.  The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and leaves the parent frame unchanged under pandas Copy-on-Write.
pg_movies_full['MPAA'] = pg_movies_full['MPAA'].fillna('PG')

In [11]:
# Display the frame again to check the MPAA column after the fill.
pg_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Incredibles 2,/title/tt3606756/?ref_=bo_cso_table_1,Incredibles 2,1,608581744,10,2018,Walt Disney Studios Motion Pictures,182687905,,"June 14, 2018",PG,1 hr 58 min,"[Action, Adventure, Animation, Comedy, Family,..."
The Lion King,/title/tt6105098/?ref_=bo_cso_table_2,The Lion King,2,543638043,11,2019,Walt Disney Studios Motion Pictures,191770759,260000000,"July 11, 2019",PG,1 hr 58 min,"[Adventure, Animation, Drama, Family, Musical]"
Beauty and the Beast,/title/tt2771200/?ref_=bo_cso_table_3,Beauty and the Beast,3,504481165,15,2017,Walt Disney Studios Motion Pictures,174750616,160000000,"March 16, 2017",PG,2 hr 9 min,"[Family, Fantasy, Musical, Romance]"
Finding Dory,/title/tt2277860/?ref_=bo_cso_table_4,Finding Dory,4,486295561,16,2016,Walt Disney Studios Motion Pictures,135060273,,"June 15, 2016",PG,1 hr 37 min,"[Adventure, Animation, Comedy, Family]"
Frozen II,/title/tt4520988/?ref_=bo_cso_table_5,Frozen II,5,477373578,17,2019,Walt Disney Studios Motion Pictures,130263358,150000000,"November 20, 2019",PG,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Far from Home: The Adventures of Yellow Dog,/title/tt0113028/?ref_=bo_cso_table_196,Far from Home: The Adventures of Yellow Dog,996,11642946,4619,1995,Twentieth Century Fox,4231917,,"January 13, 1995",PG,1 hr 21 min,"[Adventure, Family]"
The Promise,/title/tt0079756/?ref_=bo_cso_table_197,The Promise,997,11606005,4623,1979,Universal Pictures,,,"January 1, 1979",PG,1 hr 37 min,"[Drama, Romance]"
Police Academy 6: City Under Siege,/title/tt0098105/?ref_=bo_cso_table_198,Police Academy 6: City Under Siege,998,11567217,4631,1989,Warner Bros.,4032480,,"March 10, 1989",PG,1 hr 24 min,"[Comedy, Crime]"
The Other Side of the Mountain: Part II,/title/tt0078044/?ref_=bo_cso_table_199,The Other Side of the Mountain: Part II,999,11565678,4632,1978,Universal Pictures,,,"January 1, 1978",PG,1 hr 40 min,"[Biography, Drama, Romance]"


In [12]:
# Save the merged table; the movie-title index is written as the first column.
pg_movies_full.to_csv(path_or_buf='pg_movies_data.csv')