In [1]:
from bs4 import BeautifulSoup 
import requests
import numpy as np
import pandas as pd
import re

In [2]:
# create a function to scrape 
def g_info(url):
    '''
    Scrape one Box Office Mojo chart page and record every movie row it lists.

    The page at `url` is downloaded, its first <table> is located, and each data
    row is stored in the module-level `movies` dict as

        movies[title] = [link, title, mpaa_rank, lifetime_gross, overall_rank, release_year]

    Call it once per chart page; results from successive calls accumulate in `movies`.

    Parameters
    ----------
    url : str
        Address of the chart page to scrape.

    Returns
    -------
    None
        Results are written into the global `movies` dict, which must already
        exist when this function is called.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If the page contains no <table>, or a lifetime-gross cell cannot be
        converted to an integer.
    '''
    # connect to the url and fail loudly on an error response instead of
    # trying to parse an error page
    response = requests.get(url)
    response.raise_for_status()

    # parse the page's HTML code with Beautiful Soup using the lxml parser
    soup = BeautifulSoup(response.text, 'lxml')

    # all the rows are nested inside <table>, and each row is wrapped in <tr>
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> found at ' + url)

    # start at index 1 since index 0 contains the table headers
    for row in table.find_all('tr')[1:]:

        # look the cells up once per row instead of once per column
        cells = row.find_all('td')

        # five <td> cells are read below; skip any row that does not have them
        if len(cells) < 5:
            continue

        # index 0: the title text, wrapped in an <a> whose href is the movie's link stub
        title = cells[0].text
        link = cells[0].find('a').get('href')

        # index 1: rank within this MPAA-rating chart
        mpaa_rank = cells[1].text

        # index 2: lifetime gross; strip $ and commas so it can be stored as an int
        lifetime_gross = int(cells[2].text.replace('$', '').replace(',', ''))

        # index 3: rank amongst all movies regardless of MPAA rating
        overall_rank = cells[3].text

        # index 4: release year
        release_year = cells[4].text

        # NOTE(review): the dict is keyed by title, so two movies sharing a title
        # overwrite each other and only the one scraped last is kept
        movies[title] = [link,
                         title,
                         mpaa_rank,
                         lifetime_gross,
                         overall_rank,
                         release_year]

In [3]:
# The G-rated lifetime-gross chart is paginated; the second address requests
# the page that starts at offset 200.
url_list = [
    'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G',
    'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=G&offset=200',
]

# g_info() stores its rows in this module-level dict, so it has to exist
# before the first call.
movies = dict()
for url in url_list:
    g_info(url)

In [4]:
# `movies` maps title -> list of attributes, so the raw frame has one column
# per movie; transposing turns that into one row per movie.
g_movies = pd.DataFrame(movies).transpose()
g_movies.columns = [
    'link_stub',
    'title',
    'mpaa_rank',
    'lifetime_gross',
    'overall_rank',
    'year',
]
g_movies

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year
Toy Story 4,/title/tt1979376/?ref_=bo_cso_table_1,Toy Story 4,1,434038008,24,2019
The Lion King,/title/tt0110357/?ref_=bo_cso_table_2,The Lion King,2,422783777,28,1994
Toy Story 3,/title/tt0435761/?ref_=bo_cso_table_3,Toy Story 3,3,415004880,30,2010
Finding Nemo,/title/tt0266543/?ref_=bo_cso_table_4,Finding Nemo,4,380843261,43,2003
"Monsters, Inc.",/title/tt0198781/?ref_=bo_cso_table_5,"Monsters, Inc.",5,290642256,100,2001
...,...,...,...,...,...,...
Wa-shoku Dream: Beyond Sushi,/title/tt3846402/?ref_=bo_cso_table_161,Wa-shoku Dream: Beyond Sushi,361,12240,15550,2015
Mr. Chibbs,/title/tt4633340/?ref_=bo_cso_table_162,Mr. Chibbs,362,8302,16094,2017
Boom Bust Boom,/title/tt3332308/?ref_=bo_cso_table_163,Boom Bust Boom,363,6896,16333,2016
School of Life,/title/tt6330246/?ref_=bo_cso_table_164,School of Life,364,6300,16426,2018


In [5]:
# Errors that mean "this field is not on the page": a lookup returned None
# (AttributeError), a positional index ran past the end of a list (IndexError),
# or a money string was not numeric (ValueError).
_MISSING_FIELD_ERRORS = (AttributeError, IndexError, ValueError)


def _money_at(div, position):
    '''Return the `position`-th money-class <span> inside `div` as an int, or None if unavailable.'''
    try:
        money_text = div.find_all('span', class_ = 'money')[position].text
        # remove $ and commas so the amount can be turned into an int
        return int(money_text.replace('$', '').replace(',', ''))
    except _MISSING_FIELD_ERRORS:
        return None


def _text_after_label(soup, label):
    '''Return the text of the element following the first string matching `label`, or None if not found.'''
    try:
        return soup.find(string = label).find_next().text
    except AttributeError:
        return None


def indiv_movie_info(df):
    '''
    Get detailed individual movie info.

    For every row of `df` the movie's own Box Office Mojo page is downloaded and
    the following fields are read from it: domestic distributor, domestic
    opening, budget, earliest release date, MPAA rating, run time and genres.
    A field that cannot be found on the page is recorded as None.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (position 0) holds the link stub that is
        appended to 'https://www.boxofficemojo.com', and whose second column
        (position 1) holds the movie title.

    Returns
    -------
    None
        Results are written into the module-level `indiv_movie` dict, which
        must already exist, as

            indiv_movie[title] = [domestic_distributor, domestic_opening, budget,
                                  earliest_release_date, MPAA, run_time, genres]
    '''
    url_temp = 'https://www.boxofficemojo.com'

    # label patterns are the same for every movie, so build them once
    earliest_release_regex = re.compile('Earliest Release')
    mpaa_regex = re.compile('MPAA')
    runtime_regex = re.compile('Run')

    # walk the link-stub and title columns together, one movie per iteration
    for link_stub, movie_title in zip(df.iloc[:, 0], df.iloc[:, 1]):

        # connect to the movie's page and parse its HTML with the lxml parser
        # NOTE(review): the HTTP status is not checked, so an error page yields
        # a row of None values rather than an exception
        response = requests.get(url_temp + link_stub)
        soup = BeautifulSoup(response.text, 'lxml')

        # the div with the class mojo-summary-values contains the detailed info
        div = soup.find('div', class_ = 'mojo-summary-values')

        # domestic distributor: second <span> of the div (index 1); the text
        # after 'See' (the company-information link) is cut off
        # NOTE(review): a distributor name containing 'See' is truncated as well
        try:
            domestic_distributor = div.find_all('span')[1].text.split('See')[0]
        except _MISSING_FIELD_ERRORS:
            domestic_distributor = None

        # domestic opening and budget: first and second money-class <span>
        # NOTE(review): these lookups are positional, so if a page lists a
        # budget but no opening the budget presumably lands in
        # domestic_opening - verify against such a page
        domestic_opening = _money_at(div, 0)
        budget = _money_at(div, 1)

        # earliest release date: keep only the first line of the value
        earliest_release_string = _text_after_label(soup, earliest_release_regex)
        if earliest_release_string is None:
            earliest_release_date = None
        else:
            earliest_release_date = earliest_release_string.split('\n')[0]

        # rating
        MPAA = _text_after_label(soup, mpaa_regex)

        # run time
        run_time = _text_after_label(soup, runtime_regex)

        # genres: drop newlines, then split on whitespace into a list
        genres_string = _text_after_label(soup, 'Genres')
        if genres_string is None:
            genres = None
        else:
            genres = genres_string.replace('\n', '').split()

        # add each individual movie to the indiv_movie dict
        # NOTE(review): keyed by title, so movies sharing a title overwrite each other
        indiv_movie[movie_title] = [domestic_distributor,
                                    domestic_opening,
                                    budget,
                                    earliest_release_date,
                                    MPAA,
                                    run_time,
                                    genres]

In [6]:
# indiv_movie_info() fills this module-level dict (title -> list of details),
# so start from an empty one before scraping every movie in g_movies.
indiv_movie = dict()
indiv_movie_info(g_movies)

In [8]:
# `indiv_movie` maps title -> list of details, so the raw frame has one column
# per movie; transposing gives one row per movie, indexed by title.
g_movies_det = pd.DataFrame(indiv_movie).T
g_movies_det.columns = [
    'domestic_distributor',
    'domestic_opening',
    'budget',
    'earliest_release_date',
    'MPAA',
    'run_time',
    'genres',
]
g_movies_det

Unnamed: 0,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Toy Story 4,Walt Disney Studios Motion Pictures,120908065,200000000,"June 20, 2019",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family, Fantasy]"
The Lion King,Walt Disney Studios Motion Pictures,1586753,45000000,"June 15, 1994",G,1 hr 28 min,"[Adventure, Animation, Drama, Family, Musical]"
Toy Story 3,Walt Disney Studios Motion Pictures,110307189,200000000,"June 16, 2010",,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy]"
Finding Nemo,Walt Disney Studios Motion Pictures,70251710,94000000,"May 30, 2003",,1 hr 40 min,"[Adventure, Animation, Comedy, Family]"
"Monsters, Inc.",Walt Disney Studios Motion Pictures,62577067,115000000,"November 2, 2001",G,1 hr 32 min,"[Adventure, Animation, Comedy, Family, Fantasy]"
...,...,...,...,...,...,...,...
Wa-shoku Dream: Beyond Sushi,Under The Milky Way,5582,,"April 10, 2015",,1 hr 47 min,"[Biography, Documentary, Family, History]"
Mr. Chibbs,Abramorama,3163,,"May 3, 2017",,1 hr 26 min,[Documentary]
Boom Bust Boom,Brainstorm Media,,,"March 11, 2016",,1 hr 14 min,"[Documentary, History]"
School of Life,Distrib Films,,,"October 11, 2017",,1 hr 56 min,"[Comedy, Drama, Family]"


In [9]:
# Both frames are indexed by movie title, so join them index-to-index.
g_movies_full = pd.merge(
    g_movies,
    g_movies_det,
    left_index = True,
    right_index = True,
)
g_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Toy Story 4,/title/tt1979376/?ref_=bo_cso_table_1,Toy Story 4,1,434038008,24,2019,Walt Disney Studios Motion Pictures,120908065,200000000,"June 20, 2019",G,1 hr 40 min,"[Adventure, Animation, Comedy, Family, Fantasy]"
The Lion King,/title/tt0110357/?ref_=bo_cso_table_2,The Lion King,2,422783777,28,1994,Walt Disney Studios Motion Pictures,1586753,45000000,"June 15, 1994",G,1 hr 28 min,"[Adventure, Animation, Drama, Family, Musical]"
Toy Story 3,/title/tt0435761/?ref_=bo_cso_table_3,Toy Story 3,3,415004880,30,2010,Walt Disney Studios Motion Pictures,110307189,200000000,"June 16, 2010",,1 hr 43 min,"[Adventure, Animation, Comedy, Family, Fantasy]"
Finding Nemo,/title/tt0266543/?ref_=bo_cso_table_4,Finding Nemo,4,380843261,43,2003,Walt Disney Studios Motion Pictures,70251710,94000000,"May 30, 2003",,1 hr 40 min,"[Adventure, Animation, Comedy, Family]"
"Monsters, Inc.",/title/tt0198781/?ref_=bo_cso_table_5,"Monsters, Inc.",5,290642256,100,2001,Walt Disney Studios Motion Pictures,62577067,115000000,"November 2, 2001",G,1 hr 32 min,"[Adventure, Animation, Comedy, Family, Fantasy]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wa-shoku Dream: Beyond Sushi,/title/tt3846402/?ref_=bo_cso_table_161,Wa-shoku Dream: Beyond Sushi,361,12240,15550,2015,Under The Milky Way,5582,,"April 10, 2015",,1 hr 47 min,"[Biography, Documentary, Family, History]"
Mr. Chibbs,/title/tt4633340/?ref_=bo_cso_table_162,Mr. Chibbs,362,8302,16094,2017,Abramorama,3163,,"May 3, 2017",,1 hr 26 min,[Documentary]
Boom Bust Boom,/title/tt3332308/?ref_=bo_cso_table_163,Boom Bust Boom,363,6896,16333,2016,Brainstorm Media,,,"March 11, 2016",,1 hr 14 min,"[Documentary, History]"
School of Life,/title/tt6330246/?ref_=bo_cso_table_164,School of Life,364,6300,16426,2018,Distrib Films,,,"October 11, 2017",,1 hr 56 min,"[Comedy, Drama, Family]"


In [11]:
# List the distinct rating values that were scraped, missing values included.
g_movies_full.loc[:, 'MPAA'].unique()

array(['G', None, 'Approved', 'PG'], dtype=object)

In [12]:
# Missing ratings become 'G'. The result is assigned back to the column:
# calling fillna(..., inplace=True) on the Series returned by
# g_movies_full['MPAA'] is chained assignment, which pandas does not guarantee
# will update g_movies_full (with Copy-on-Write enabled it never does).
g_movies_full['MPAA'] = g_movies_full['MPAA'].fillna('G')

In [13]:
# Relabel 'Approved' as 'G'. The result is assigned back to the column:
# replace(..., inplace=True) on the Series returned by g_movies_full['MPAA'] is
# chained assignment, which pandas does not guarantee will update
# g_movies_full (with Copy-on-Write enabled it never does).
g_movies_full['MPAA'] = g_movies_full['MPAA'].replace('Approved', 'G')

In [14]:
# Relabel 'PG' as 'G'. The result is assigned back to the column:
# replace(..., inplace=True) on the Series returned by g_movies_full['MPAA'] is
# chained assignment, which pandas does not guarantee will update
# g_movies_full (with Copy-on-Write enabled it never does).
g_movies_full['MPAA'] = g_movies_full['MPAA'].replace('PG', 'G')

In [17]:
# Persist the merged table (index included) to a CSV file in the working directory.
g_movies_full.to_csv(path_or_buf = 'g_movies_data.csv')