In [1]:
from bs4 import BeautifulSoup 
import requests
import numpy as np
import pandas as pd
import re

In [2]:
# create a function to scrape 
def r_info(url, store=None):
    '''
    Scrape one chart page and record every movie row found in its table.

    The first <table> on the page is read row by row. Each data row is
    expected to hold at least five <td> cells, in this order: title (which
    also wraps the link to the movie page), rank within the MPAA rating,
    lifetime gross, overall rank, and release year.

    Parameters
    ----------
    url : str
        Address of the chart page to download.
    store : dict, optional
        Dictionary that receives one entry per movie, keyed by title.
        When omitted, the notebook-level ``movies`` dictionary is used,
        so that dictionary must exist before the call.

    Returns
    -------
    dict
        The dictionary that was filled. Each value is the list
        [link, title, mpaa_rank, lifetime_gross, overall_rank, release_year];
        lifetime_gross is an int, the other fields are the raw cell text.
        Because the key is the title, two movies with the same title
        overwrite one another.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page contains no <table>.
    '''
    # fall back to the notebook-level dictionary so existing calls keep working
    if store is None:
        store = movies

    # a timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, timeout=30)
    # fail here with a clear HTTP error instead of a confusing parse error later
    response.raise_for_status()

    # parse the page's HTML code using Beautiful Soup and the lxml parser
    soup = BeautifulSoup(response.text, 'lxml')

    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> found at ' + url)

    # index 0 holds the table headers, so the data starts at index 1
    for row in table.find_all('tr')[1:]:

        # look the cells up once per row instead of once per field
        cells = row.find_all('td')

        # rows without the five expected cells carry no movie data
        if len(cells) < 5:
            continue

        title_cell = cells[0]
        title = title_cell.text

        # the link is the href of the <a> wrapped inside the title cell
        link = title_cell.find('a').get('href')

        mpaa_rank = cells[1].text

        # drop '$' and thousands separators so the gross can become an int
        lifetime_gross = int(cells[2].text.replace('$', '').replace(',', ''))

        overall_rank = cells[3].text
        release_year = cells[4].text

        store[title] = [link,
                        title,
                        mpaa_rank,
                        lifetime_gross,
                        overall_rank,
                        release_year]

    return store

In [3]:
# the five chart pages share one address and differ only in the offset suffix;
# the first page carries no offset at all
url_list = ['https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=R' + suffix
            for suffix in ('', '&offset=200', '&offset=400', '&offset=600', '&offset=800')]

# r_info() fills this notebook-level dictionary, one entry per movie title
movies = dict()
for url in url_list:
    r_info(url)

In [4]:
# a dict of lists yields one column per movie, so flip it to get one row per movie
r_movies = pd.DataFrame(data=movies).transpose()

# label the positional columns in the order r_info() stored the fields
r_movies.columns = ['link_stub',
                    'title',
                    'mpaa_rank',
                    'lifetime_gross',
                    'overall_rank',
                    'year']
r_movies

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year
The Passion of the Christ,/title/tt0335345/?ref_=bo_cso_table_1,The Passion of the Christ,1,370782930,47,2004
Deadpool,/title/tt1431045/?ref_=bo_cso_table_2,Deadpool,2,363070709,51,2016
American Sniper,/title/tt2179136/?ref_=bo_cso_table_3,American Sniper,3,350126372,56,2014
Joker,/title/tt7286456/?ref_=bo_cso_table_4,Joker,4,335451311,62,2019
It,/title/tt1396484/?ref_=bo_cso_table_5,It,5,328828874,69,2017
...,...,...,...,...,...,...
Sinister 2,/title/tt2752772/?ref_=bo_cso_table_196,Sinister 2,996,27740955,2929,2015
Internal Affairs,/title/tt0099850/?ref_=bo_cso_table_197,Internal Affairs,997,27734391,2930,1990
Oculus,/title/tt2388715/?ref_=bo_cso_table_198,Oculus,998,27695246,2931,2014
Saw VI,/title/tt1233227/?ref_=bo_cso_table_199,Saw VI,999,27693292,2932,2009


In [5]:
def _money_to_int(text):
    '''Turn a dollar string such as '$1,234' into the integer 1234.'''
    return int(text.replace('$', '').replace(',', ''))


def _text_after(soup, label):
    '''Return the text of the element that follows the first string matching label.'''
    return soup.find(string=label).find_next().text


def indiv_movie_info(df, store=None):
    '''
    Download the detail page of every movie in df and record its summary fields.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per movie. Column 0 must hold the link stub that is appended
        to 'https://www.boxofficemojo.com', column 1 the movie title.
    store : dict, optional
        Dictionary that receives one entry per movie, keyed by title.
        When omitted, the notebook-level ``indiv_movie`` dictionary is used,
        so that dictionary must exist before the call.

    Returns
    -------
    dict
        The dictionary that was filled. Each value is the list
        [domestic_distributor, domestic_opening, budget,
         earliest_release_date, MPAA, run_time, genres].
        A field that cannot be located or parsed on the page is None.

    Notes
    -----
    Network errors raised by requests are not caught and stop the loop;
    entries stored before the failure remain in the dictionary.
    '''
    # fall back to the notebook-level dictionary so existing calls keep working
    if store is None:
        store = indiv_movie

    url_temp = 'https://www.boxofficemojo.com'

    # the label patterns never change, so compile them once, outside the loop
    release_regex = re.compile('Earliest Release')
    mpaa_regex = re.compile('MPAA')
    runtime_regex = re.compile('Run')

    # what a missing element (None / short list) or a non-numeric amount raises;
    # anything else, KeyboardInterrupt included, is allowed to propagate
    parse_errors = (AttributeError, IndexError, ValueError)

    # one request per row of the frame that was passed in
    for i in range(df.shape[0]):

        movie_title = df.iloc[i, 1]
        link_stub = df.iloc[i, 0]
        url = url_temp + link_stub

        # a timeout keeps one stalled connection from hanging the whole scrape
        response = requests.get(url, timeout=30)

        # parse the page's HTML code using Beautiful Soup and the lxml parser
        soup = BeautifulSoup(response.text, 'lxml')

        # the div with class mojo-summary-values is where the summary fields are read from
        div = soup.find('div', class_ = 'mojo-summary-values')

        # the distributor is the second <span>; the same span also contains
        # 'See full company information', which is cut off by the split
        try:
            domestic_distributor = div.find_all('span')[1].text.split('See')[0]
        except parse_errors:
            domestic_distributor = None

        # NOTE(review): opening and budget are taken by position among the
        # 'money' spans; if a page lists a budget but no opening, the budget
        # would land in domestic_opening -- confirm against the live markup
        try:
            domestic_opening = _money_to_int(div.find_all('span', class_ = 'money')[0].text)
        except parse_errors:
            domestic_opening = None

        try:
            budget = _money_to_int(div.find_all('span', class_ = 'money')[1].text)
        except parse_errors:
            budget = None

        # only the first line of the release text is kept
        try:
            earliest_release_date = _text_after(soup, release_regex).split('\n')[0]
        except parse_errors:
            earliest_release_date = None

        try:
            MPAA = _text_after(soup, mpaa_regex)
        except parse_errors:
            MPAA = None

        try:
            run_time = _text_after(soup, runtime_regex)
        except parse_errors:
            run_time = None

        # split() already treats newlines as separators; deleting them first
        # would glue together genres separated by nothing but a newline
        try:
            genres = _text_after(soup, 'Genres').split()
        except parse_errors:
            genres = None

        store[movie_title] = [domestic_distributor,
                              domestic_opening,
                              budget,
                              earliest_release_date,
                              MPAA,
                              run_time,
                              genres]

    return store

In [6]:
# indiv_movie_info() writes its results into this notebook-level dictionary,
# so it has to exist (and start empty) before the call
indiv_movie = dict()
indiv_movie_info(df=r_movies)

In [7]:
# one column per movie comes out of the dict, so flip it to one row per movie
r_movies_det = pd.DataFrame(data=indiv_movie).T

# same order in which indiv_movie_info() stored the fields
r_movies_det.columns = ['domestic_distributor', 'domestic_opening', 'budget',
                        'earliest_release_date', 'MPAA', 'run_time', 'genres']
r_movies_det

Unnamed: 0,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
The Passion of the Christ,Newmarket Films,83848082,30000000,"February 25, 2004",R,2 hr 7 min,[Drama]
Deadpool,Twentieth Century Fox,132434639,58000000,"February 9, 2016",R,1 hr 48 min,"[Action, Adventure, Comedy, Sci-Fi]"
American Sniper,Warner Bros.,633456,58800000,"December 25, 2014",R,2 hr 13 min,"[Action, Biography, Drama, War]"
Joker,Warner Bros.,96202337,55000000,"October 2, 2019",R,2 hr 2 min,"[Crime, Drama, Thriller]"
It,Warner Bros.,123403419,35000000,"September 6, 2017",R,2 hr 15 min,[Horror]
...,...,...,...,...,...,...,...
Sinister 2,Focus Features,10542116,10000000,"August 19, 2015",R,1 hr 37 min,"[Horror, Mystery, Thriller]"
Internal Affairs,Paramount Pictures,5043516,,"January 12, 1990",,1 hr 55 min,"[Crime, Drama, Thriller]"
Oculus,Relativity Media,12005402,5000000,"April 9, 2014",R,1 hr 44 min,"[Horror, Mystery]"
Saw VI,Lionsgate,14118444,11000000,"October 22, 2009",R,1 hr 30 min,"[Horror, Mystery, Thriller]"


In [8]:
# both frames are indexed by movie title, so line them up on the index
r_movies_full = pd.merge(left = r_movies,
                         right = r_movies_det,
                         left_index = True,
                         right_index = True)
r_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
The Passion of the Christ,/title/tt0335345/?ref_=bo_cso_table_1,The Passion of the Christ,1,370782930,47,2004,Newmarket Films,83848082,30000000,"February 25, 2004",R,2 hr 7 min,[Drama]
Deadpool,/title/tt1431045/?ref_=bo_cso_table_2,Deadpool,2,363070709,51,2016,Twentieth Century Fox,132434639,58000000,"February 9, 2016",R,1 hr 48 min,"[Action, Adventure, Comedy, Sci-Fi]"
American Sniper,/title/tt2179136/?ref_=bo_cso_table_3,American Sniper,3,350126372,56,2014,Warner Bros.,633456,58800000,"December 25, 2014",R,2 hr 13 min,"[Action, Biography, Drama, War]"
Joker,/title/tt7286456/?ref_=bo_cso_table_4,Joker,4,335451311,62,2019,Warner Bros.,96202337,55000000,"October 2, 2019",R,2 hr 2 min,"[Crime, Drama, Thriller]"
It,/title/tt1396484/?ref_=bo_cso_table_5,It,5,328828874,69,2017,Warner Bros.,123403419,35000000,"September 6, 2017",R,2 hr 15 min,[Horror]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sinister 2,/title/tt2752772/?ref_=bo_cso_table_196,Sinister 2,996,27740955,2929,2015,Focus Features,10542116,10000000,"August 19, 2015",R,1 hr 37 min,"[Horror, Mystery, Thriller]"
Internal Affairs,/title/tt0099850/?ref_=bo_cso_table_197,Internal Affairs,997,27734391,2930,1990,Paramount Pictures,5043516,,"January 12, 1990",,1 hr 55 min,"[Crime, Drama, Thriller]"
Oculus,/title/tt2388715/?ref_=bo_cso_table_198,Oculus,998,27695246,2931,2014,Relativity Media,12005402,5000000,"April 9, 2014",R,1 hr 44 min,"[Horror, Mystery]"
Saw VI,/title/tt1233227/?ref_=bo_cso_table_199,Saw VI,999,27693292,2932,2009,Lionsgate,14118444,11000000,"October 22, 2009",R,1 hr 30 min,"[Horror, Mystery, Thriller]"


In [60]:
# list the distinct ratings that were scraped
# NOTE(review): the execution counts suggest this cell was last run after the
# fillna cell below; on a top-to-bottom run it executes before the fill
r_movies_full.loc[:, 'MPAA'].unique()

array(['R'], dtype=object)

In [55]:
# Fill ratings that could not be scraped with 'R'; every chart URL requested above filters on by_mpaa=R.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on an intermediate object
# and leaves the frame unchanged when pandas Copy-on-Write is active.
r_movies_full['MPAA'] = r_movies_full['MPAA'].fillna('R')

In [58]:
# display the merged frame again to inspect the MPAA column after the fill
r_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
The Passion of the Christ,/title/tt0335345/?ref_=bo_cso_table_1,The Passion of the Christ,1,370782930,47,2004,Newmarket Films,83848082,30000000,"February 25, 2004",R,2 hr 7 min,[Drama]
Deadpool,/title/tt1431045/?ref_=bo_cso_table_2,Deadpool,2,363070709,51,2016,Twentieth Century Fox,132434639,58000000,"February 9, 2016",R,1 hr 48 min,"[Action, Adventure, Comedy, Sci-Fi]"
American Sniper,/title/tt2179136/?ref_=bo_cso_table_3,American Sniper,3,350126372,56,2014,Warner Bros.,633456,58800000,"December 25, 2014",R,2 hr 13 min,"[Action, Biography, Drama, War]"
Joker,/title/tt7286456/?ref_=bo_cso_table_4,Joker,4,335451311,62,2019,Warner Bros.,96202337,55000000,"October 2, 2019",R,2 hr 2 min,"[Crime, Drama, Thriller]"
It,/title/tt1396484/?ref_=bo_cso_table_5,It,5,328828874,69,2017,Warner Bros.,123403419,35000000,"September 6, 2017",R,2 hr 15 min,[Horror]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sinister 2,/title/tt2752772/?ref_=bo_cso_table_196,Sinister 2,996,27740955,2929,2015,Focus Features,10542116,10000000,"August 19, 2015",R,1 hr 37 min,"[Horror, Mystery, Thriller]"
Internal Affairs,/title/tt0099850/?ref_=bo_cso_table_197,Internal Affairs,997,27734391,2930,1990,Paramount Pictures,5043516,,"January 12, 1990",R,1 hr 55 min,"[Crime, Drama, Thriller]"
Oculus,/title/tt2388715/?ref_=bo_cso_table_198,Oculus,998,27695246,2931,2014,Relativity Media,12005402,5000000,"April 9, 2014",R,1 hr 44 min,"[Horror, Mystery]"
Saw VI,/title/tt1233227/?ref_=bo_cso_table_199,Saw VI,999,27693292,2932,2009,Lionsgate,14118444,11000000,"October 22, 2009",R,1 hr 30 min,"[Horror, Mystery, Thriller]"


In [59]:
# write the merged frame to a CSV file in the working directory; the title index is written as the first column
r_movies_full.to_csv(path_or_buf = 'r_movies_data.csv')