In [5]:
from bs4 import BeautifulSoup 
import requests
import numpy as np
import pandas as pd
import re

In [6]:
# create a function to scrape 
def nc17_info(url):
    '''
    Scrape the first <table> of a chart page into the global ``movies`` dict.

    Each data row is expected to hold, in order: title (wrapped in a link),
    rank within the MPAA rating, lifetime gross, overall rank and release year.
    Every row is stored as
        movies[title] = [link, title, mpaa_rank, lifetime_gross,
                         overall_rank, release_year]

    Parameters
    ----------
    url : str
        Address of the chart page to scrape.

    Returns
    -------
    None
        Results are written into the module-level ``movies`` dict, which
        must already exist when this function is called.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no <table>, or a lifetime gross cannot be
        converted to an int.

    Notes
    -----
    Rows are keyed by title, so two movies that share a title overwrite
    each other; the one lower on the page wins.
    '''
    # connect to the url; the timeout keeps a stalled connection from hanging the notebook
    response = requests.get(url, timeout=30)

    # fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()

    # parse the page's HTML code with Beautiful Soup, using the lxml parser
    soup = BeautifulSoup(response.text, 'lxml')

    # all the rows live inside <table>...</table>, one <tr> per row
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> found at ' + url)

    # index 0 is the header row, so the data starts at index 1
    for row in table.find_all('tr')[1:]:

        # look the cells up once per row instead of once per field
        cells = row.find_all('td')

        # first cell: the title text, and the link stored in the href of its <a> tag
        title = cells[0].text
        link = cells[0].find('a').get('href')

        # second cell: rank amongst the movies sharing this MPAA rating
        mpaa_rank = cells[1].text

        # third cell: lifetime gross; strip $ and commas so it can be stored as an int
        lifetime_gross = int(cells[2].text.replace('$', '').replace(',', ''))

        # fourth cell: rank amongst all movies regardless of MPAA rating
        overall_rank = cells[3].text

        # fifth cell: release year
        release_year = cells[4].text

        # store the row under the movie title
        movies[title] = [link,
                         title,
                         mpaa_rank,
                         lifetime_gross,
                         overall_rank,
                         release_year]

In [9]:
# chart pages to scrape; only one URL is listed here (the chart filtered with by_mpaa=NC-17)
url_list = ['https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=NC-17']
# nc17_info() writes its results into this module-level dict, so it has to exist before the loop;
# re-running this cell starts again from an empty dict
movies = {}
for url in url_list:
    nc17_info(url)

In [10]:
# Build one row per movie straight from the scraped dict: the keys (titles) become the index
# and each value list becomes a row. Unlike pd.DataFrame(movies).T this does not pass through
# a one-column-per-movie frame, so columns keep their own dtype (lifetime_gross stays an
# integer column) and the column names can still be applied when nothing was scraped.
nc17_movies = pd.DataFrame.from_dict(
    movies,
    orient='index',
    columns=['link_stub', 'title', 'mpaa_rank', 'lifetime_gross', 'overall_rank', 'year'],
)
nc17_movies

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year
Last Tango in Paris,/title/tt0070849/?ref_=bo_cso_table_1,Last Tango in Paris,1,36144000,2395,1973
Showgirls,/title/tt0114436/?ref_=bo_cso_table_2,Showgirls,2,20350754,3536,1995
Henry & June,/title/tt0099762/?ref_=bo_cso_table_3,Henry & June,3,11567449,4630,1990
"The Cook, the Thief, His Wife & Her Lover",/title/tt0097108/?ref_=bo_cso_table_4,"The Cook, the Thief, His Wife & Her Lover",4,7724701,5250,1990
Kids,/title/tt0113540/?ref_=bo_cso_table_5,Kids,5,7412216,5314,1995
Bad Education,/title/tt0275491/?ref_=bo_cso_table_6,Bad Education,6,5284284,5836,2004
"Lust, Caution",/title/tt0808357/?ref_=bo_cso_table_7,"Lust, Caution",7,4604982,6007,2007
Tie Me Up! Tie Me Down!,/title/tt0101026/?ref_=bo_cso_table_8,Tie Me Up! Tie Me Down!,8,4087361,6183,1990
Shame,/title/tt1723811/?ref_=bo_cso_table_9,Shame,9,3909002,6250,2011
Crash,/title/tt0115964/?ref_=bo_cso_table_10,Crash,10,2664812,6757,1996


In [11]:
# Errors that mean "this field is not on the page": a lookup returned None (AttributeError),
# an expected <span> is absent (IndexError), or the text is not a number (ValueError).
_FIELD_MISSING = (AttributeError, IndexError, ValueError)


def _money_to_int(text):
    '''Convert a dollar string such as "$1,250,000" to the int 1250000.'''
    return int(text.replace('$', '').replace(',', ''))


def _text_after(soup, label):
    '''
    Return the text of the element that follows the first page string matching label.

    label is passed to Beautiful Soup's ``string`` filter, so it may be a plain
    string or a compiled regex. Raises AttributeError when no string matches or
    nothing follows the match.
    '''
    return soup.find(string=label).find_next().text


def indiv_movie_info(df):
    '''
    Get detailed individual movie info.

    For every row of ``df`` the movie's own page is downloaded and the result is
    stored in the module-level ``indiv_movie`` dict as
        indiv_movie[title] = [domestic_distributor, domestic_opening, budget,
                              earliest_release_date, MPAA, run_time, genres]
    A field that cannot be found on the page is stored as None.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 must hold the link stub (appended to https://www.boxofficemojo.com)
        and column 1 the movie title; the columns are read by position.

    Returns
    -------
    None
        Results are written into the global ``indiv_movie`` dict, which must
        already exist when this function is called.

    Raises
    ------
    requests.RequestException
        Network failures and timeouts are not caught here. Entries stored before
        the failure stay in ``indiv_movie``.
    '''
    url_temp = 'https://www.boxofficemojo.com'

    # the label patterns are the same for every movie, so build them once
    earliest_release_label = re.compile('Earliest Release')
    mpaa_label = re.compile('MPAA')
    run_time_label = re.compile('Run')

    # one request per row of the df that was passed in
    for i in range(df.shape[0]):

        # getting movie link
        movie_title = df.iloc[i, 1]
        link_stub = df.iloc[i, 0]

        # connect to the url; the timeout keeps one stalled page from hanging the whole loop
        response = requests.get(url_temp + link_stub, timeout=30)

        # parse the page's HTML code with Beautiful Soup, using the lxml parser
        soup = BeautifulSoup(response.text, 'lxml')

        # the div with class mojo-summary-values holds the detailed info we want
        div = soup.find('div', class_='mojo-summary-values')

        # domestic distributor: second <span> inside the div (index 1); the same span also
        # carries a "See ..." company-information link text, which is cut off here
        # NOTE(review): splitting on 'See' also truncates a distributor whose own name
        # contains "See" - confirm the exact link text before tightening this
        try:
            domestic_distributor = div.find_all('span')[1].text.split('See')[0]
        except _FIELD_MISSING:
            domestic_distributor = None

        # the dollar amounts are the <span class="money"> elements inside the div
        # NOTE(review): this assumes the opening is the first money span and the budget the
        # second; a page that lists a budget but no opening would be misread - confirm
        money_spans = div.find_all('span', class_='money') if div is not None else []

        # domestic opening: first money span, converted to an int
        try:
            domestic_opening = _money_to_int(money_spans[0].text)
        except _FIELD_MISSING:
            domestic_opening = None

        # budget: second money span, converted to an int
        try:
            budget = _money_to_int(money_spans[1].text)
        except _FIELD_MISSING:
            budget = None

        # earliest release date: first line of the element after the "Earliest Release" label
        try:
            earliest_release_date = _text_after(soup, earliest_release_label).split('\n')[0]
        except _FIELD_MISSING:
            earliest_release_date = None

        # rating: element after the first string containing "MPAA"
        try:
            mpaa = _text_after(soup, mpaa_label)
        except _FIELD_MISSING:
            mpaa = None

        # run time: element after the first string containing "Run"
        try:
            run_time = _text_after(soup, run_time_label)
        except _FIELD_MISSING:
            run_time = None

        # genres: element after the string that is exactly "Genres"; split() breaks on any
        # whitespace, newlines included, so genres separated only by a newline are not glued
        try:
            genres = _text_after(soup, 'Genres').split()
        except _FIELD_MISSING:
            genres = None

        # add each individual movie to indiv_movie dict
        indiv_movie[movie_title] = [domestic_distributor,
                                    domestic_opening,
                                    budget,
                                    earliest_release_date,
                                    mpaa,
                                    run_time,
                                    genres]

In [12]:
# indiv_movie_info() writes one entry per movie title into this module-level dict,
# so it has to exist (and is reset to empty) before the call
indiv_movie = {}
# downloads one page per row of nc17_movies
indiv_movie_info(nc17_movies)

In [13]:
# indiv_movie maps title -> list of fields, so the raw frame has one column per movie;
# flipping it gives one row per movie, indexed by title
nc17_movies_det = pd.DataFrame(indiv_movie).T

# name the columns in the same order the fields were stored in each list
nc17_movies_det.columns = ['domestic_distributor', 'domestic_opening', 'budget',
                           'earliest_release_date', 'MPAA', 'run_time', 'genres']
nc17_movies_det

Unnamed: 0,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Last Tango in Paris,United Artists,1250000.0,,"January 27, 1973",NC-17,2 hr 9 min,"[Drama, Romance]"
Showgirls,Metro-Goldwyn-Mayer (MGM),8112627.0,45000000.0,"September 22, 1995",NC-17,2 hr 8 min,[Drama]
Henry & June,Universal Pictures,1032942.0,,"October 5, 1990",,2 hr 16 min,"[Biography, Drama]"
"The Cook, the Thief, His Wife & Her Lover",Miramax,252223.0,,"April 6, 1990",,2 hr 4 min,"[Crime, Drama]"
Kids,Miramax,85709.0,1500000.0,"July 21, 1995",,1 hr 31 min,[Drama]
Bad Education,Sony Pictures Classics,147370.0,5000000.0,"March 19, 2004",NC-17,1 hr 46 min,"[Crime, Drama]"
"Lust, Caution",Focus Features,63918.0,15000000.0,"September 24, 2007",NC-17,2 hr 37 min,"[Drama, History, Romance, Thriller, War]"
Tie Me Up! Tie Me Down!,Miramax,65299.0,,"May 4, 1990",NC-17,1 hr 41 min,"[Comedy, Crime, Drama, Romance]"
Shame,Fox Searchlight Pictures,361000.0,,"December 2, 2011",NC-17,1 hr 41 min,[Drama]
Crash,Fine Line Features,738339.0,,"October 4, 1996",NC-17,1 hr 40 min,[Drama]


In [14]:
# both frames are indexed by movie title, so line them up on the index
nc17_movies_full = pd.merge(nc17_movies, nc17_movies_det,
                            left_index=True, right_index=True)
nc17_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Last Tango in Paris,/title/tt0070849/?ref_=bo_cso_table_1,Last Tango in Paris,1,36144000,2395,1973,United Artists,1250000.0,,"January 27, 1973",NC-17,2 hr 9 min,"[Drama, Romance]"
Showgirls,/title/tt0114436/?ref_=bo_cso_table_2,Showgirls,2,20350754,3536,1995,Metro-Goldwyn-Mayer (MGM),8112627.0,45000000.0,"September 22, 1995",NC-17,2 hr 8 min,[Drama]
Henry & June,/title/tt0099762/?ref_=bo_cso_table_3,Henry & June,3,11567449,4630,1990,Universal Pictures,1032942.0,,"October 5, 1990",,2 hr 16 min,"[Biography, Drama]"
"The Cook, the Thief, His Wife & Her Lover",/title/tt0097108/?ref_=bo_cso_table_4,"The Cook, the Thief, His Wife & Her Lover",4,7724701,5250,1990,Miramax,252223.0,,"April 6, 1990",,2 hr 4 min,"[Crime, Drama]"
Kids,/title/tt0113540/?ref_=bo_cso_table_5,Kids,5,7412216,5314,1995,Miramax,85709.0,1500000.0,"July 21, 1995",,1 hr 31 min,[Drama]
Bad Education,/title/tt0275491/?ref_=bo_cso_table_6,Bad Education,6,5284284,5836,2004,Sony Pictures Classics,147370.0,5000000.0,"March 19, 2004",NC-17,1 hr 46 min,"[Crime, Drama]"
"Lust, Caution",/title/tt0808357/?ref_=bo_cso_table_7,"Lust, Caution",7,4604982,6007,2007,Focus Features,63918.0,15000000.0,"September 24, 2007",NC-17,2 hr 37 min,"[Drama, History, Romance, Thriller, War]"
Tie Me Up! Tie Me Down!,/title/tt0101026/?ref_=bo_cso_table_8,Tie Me Up! Tie Me Down!,8,4087361,6183,1990,Miramax,65299.0,,"May 4, 1990",NC-17,1 hr 41 min,"[Comedy, Crime, Drama, Romance]"
Shame,/title/tt1723811/?ref_=bo_cso_table_9,Shame,9,3909002,6250,2011,Fox Searchlight Pictures,361000.0,,"December 2, 2011",NC-17,1 hr 41 min,[Drama]
Crash,/title/tt0115964/?ref_=bo_cso_table_10,Crash,10,2664812,6757,1996,Fine Line Features,738339.0,,"October 4, 1996",NC-17,1 hr 40 min,[Drama]


In [17]:
# list the distinct values in the MPAA column to see whether any movie is missing a rating
nc17_movies_full['MPAA'].unique()

array(['NC-17', None], dtype=object)

In [18]:
# Every movie here came from the chart requested with by_mpaa=NC-17, so a missing rating is
# filled in as NC-17. The result is assigned back to the column: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which pandas warns
# about and which leaves the frame unchanged once Copy-on-Write is active.
nc17_movies_full['MPAA'] = nc17_movies_full['MPAA'].fillna('NC-17')

In [19]:
# display the merged table again, now that the MPAA column has been filled
nc17_movies_full

Unnamed: 0,link_stub,title,mpaa_rank,lifetime_gross,overall_rank,year,domestic_distributor,domestic_opening,budget,earliest_release_date,MPAA,run_time,genres
Last Tango in Paris,/title/tt0070849/?ref_=bo_cso_table_1,Last Tango in Paris,1,36144000,2395,1973,United Artists,1250000.0,,"January 27, 1973",NC-17,2 hr 9 min,"[Drama, Romance]"
Showgirls,/title/tt0114436/?ref_=bo_cso_table_2,Showgirls,2,20350754,3536,1995,Metro-Goldwyn-Mayer (MGM),8112627.0,45000000.0,"September 22, 1995",NC-17,2 hr 8 min,[Drama]
Henry & June,/title/tt0099762/?ref_=bo_cso_table_3,Henry & June,3,11567449,4630,1990,Universal Pictures,1032942.0,,"October 5, 1990",NC-17,2 hr 16 min,"[Biography, Drama]"
"The Cook, the Thief, His Wife & Her Lover",/title/tt0097108/?ref_=bo_cso_table_4,"The Cook, the Thief, His Wife & Her Lover",4,7724701,5250,1990,Miramax,252223.0,,"April 6, 1990",NC-17,2 hr 4 min,"[Crime, Drama]"
Kids,/title/tt0113540/?ref_=bo_cso_table_5,Kids,5,7412216,5314,1995,Miramax,85709.0,1500000.0,"July 21, 1995",NC-17,1 hr 31 min,[Drama]
Bad Education,/title/tt0275491/?ref_=bo_cso_table_6,Bad Education,6,5284284,5836,2004,Sony Pictures Classics,147370.0,5000000.0,"March 19, 2004",NC-17,1 hr 46 min,"[Crime, Drama]"
"Lust, Caution",/title/tt0808357/?ref_=bo_cso_table_7,"Lust, Caution",7,4604982,6007,2007,Focus Features,63918.0,15000000.0,"September 24, 2007",NC-17,2 hr 37 min,"[Drama, History, Romance, Thriller, War]"
Tie Me Up! Tie Me Down!,/title/tt0101026/?ref_=bo_cso_table_8,Tie Me Up! Tie Me Down!,8,4087361,6183,1990,Miramax,65299.0,,"May 4, 1990",NC-17,1 hr 41 min,"[Comedy, Crime, Drama, Romance]"
Shame,/title/tt1723811/?ref_=bo_cso_table_9,Shame,9,3909002,6250,2011,Fox Searchlight Pictures,361000.0,,"December 2, 2011",NC-17,1 hr 41 min,[Drama]
Crash,/title/tt0115964/?ref_=bo_cso_table_10,Crash,10,2664812,6757,1996,Fine Line Features,738339.0,,"October 4, 1996",NC-17,1 hr 40 min,[Drama]


In [20]:
# save the merged table; the index (movie title) is written as the first, unnamed column
nc17_movies_full.to_csv('nc17_movies_data.csv')