# Web Scraping

This notebook scrapes movie data from Box Office Mojo. Each field required a different approach because of the site's HTML structure.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [3]:
### Initial setup for Beautiful Soup for web scraping ###

# First page (offset=0) of the adjusted lifetime gross chart.
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=0'

# requests applies no timeout by default, so set one to avoid hanging the kernel,
# and fail loudly on an HTTP error instead of silently parsing an error page.
html_page = requests.get(url, timeout=30)
html_page.raise_for_status()
soup = BeautifulSoup(html_page.content, 'html.parser')

In [16]:
### Getting the headers of the table to be used as column names later ###

### Headers are in th tags; trailing newlines are stripped from each header's text ###
# find_all is the current spelling of the deprecated findAll alias.
header_list = [record.text.rstrip('\n') for record in soup.find_all('th')]

header_list

['Rank',
 'Title',
 'Adj. Lifetime Gross',
 'Lifetime Gross',
 'Est. Num Tickets',
 'Year']

In [17]:
### Get the table values from all of the pages (5 in total) ###

movie_list = []

### Get details of 200 movies per page, with 1000 total movies ###
for offset in range(0, 1000, 200):
    url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=' + str(offset)

    # A timeout keeps a stalled connection from hanging the kernel; raising on a bad
    # status stops an error page from being parsed as if it were the chart.
    html_page = requests.get(url, timeout=30)
    html_page.raise_for_status()
    soup = BeautifulSoup(html_page.content, 'html.parser')

    ### The values are in all td tags under tr tags ###
    for record in soup.find_all('tr'):
        row_list = [data.text for data in record.find_all('td')]

        # Rows without any td cells (e.g. the header row, which uses th) are skipped
        if row_list:
            movie_list.append(row_list)

### Prints the list to check completeness ###
movie_list

[['1',
  'Gone with the Wind',
  '$1,895,421,694',
  '$200,852,579',
  '202,286,200',
  '1939'],
 ['2',
  'Star Wars: Episode IV - A New Hope',
  '$1,668,979,715',
  '$460,998,507',
  '178,119,500',
  '1977'],
 ['3',
  'The Sound of Music',
  '$1,335,086,324',
  '$159,287,539',
  '142,485,200',
  '1965'],
 ['4',
  'E.T. the Extra-Terrestrial',
  '$1,329,174,791',
  '$435,110,554',
  '141,854,300',
  '1982'],
 ['5', 'Titanic', '$1,270,101,626', '$659,363,944', '135,549,800', '1997'],
 ['6',
  'The Ten Commandments',
  '$1,227,470,000',
  '$65,500,000',
  '131,000,000',
  '1956'],
 ['7', 'Jaws', '$1,200,098,356', '$260,000,000', '128,078,800', '1975'],
 ['8',
  'Doctor Zhivago',
  '$1,163,149,635',
  '$111,721,910',
  '124,135,500',
  '1965'],
 ['9',
  'The Exorcist',
  '$1,036,314,504',
  '$232,906,145',
  '110,599,200',
  '1973'],
 ['10',
  'Snow White and the Seven Dwarfs',
  '$1,021,330,000',
  '$184,925,486',
  '109,000,000',
  '1937'],
 ['11',
  'Star Wars: Episode VII - The Force 

In [18]:
### Build the main dataframe: one row per scraped movie, columns named after the table headers ###

df = pd.DataFrame(data=movie_list, columns=header_list)
df

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997
...,...,...,...,...,...,...
995,996,Spy Kids 2: Island of Lost Dreams,"$138,438,939","$85,846,429",14774700,2002
996,997,Mystic River,"$138,398,648","$90,135,191",14770400,2003
997,998,Sea of Love,"$138,240,295","$58,571,513",14753500,1989
998,999,Scrooged,"$138,234,673","$60,328,558",14752900,1988


In [39]:
### Get the href of each movie in the main table to get to the details page ###

link_list = []

### Go through each page to get the href of each movie ###
for offset in range(0, 1000, 200):

    ### Individual soup setup for each page ###
    url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=' + str(offset)

    # A timeout keeps a stalled connection from hanging the kernel; raising on a bad
    # status stops an error page from being parsed as if it were the chart.
    html_page = requests.get(url, timeout=30)
    html_page.raise_for_status()
    soup = BeautifulSoup(html_page.content, 'html.parser')

    ### Find all hrefs within the table ###
    for record in soup.find_all('tr'):
        for data in record.find_all('td'):
            for link in data.find_all('a', href=True):

                ### Exclude the links from the year each movie came out ###
                if "year" not in link['href']:
                    link_list.append(link['href'])

link_list


['/title/tt0031381/?ref_=bo_cso_table_1',
 '/title/tt0076759/?ref_=bo_cso_table_2',
 '/title/tt0059742/?ref_=bo_cso_table_3',
 '/title/tt0083866/?ref_=bo_cso_table_4',
 '/title/tt0120338/?ref_=bo_cso_table_5',
 '/title/tt0049833/?ref_=bo_cso_table_6',
 '/title/tt0073195/?ref_=bo_cso_table_7',
 '/title/tt0059113/?ref_=bo_cso_table_8',
 '/title/tt0070047/?ref_=bo_cso_table_9',
 '/title/tt0029583/?ref_=bo_cso_table_10',
 '/title/tt2488496/?ref_=bo_cso_table_11',
 '/title/tt0055254/?ref_=bo_cso_table_12',
 '/title/tt0080684/?ref_=bo_cso_table_13',
 '/title/tt0052618/?ref_=bo_cso_table_14',
 '/title/tt0499549/?ref_=bo_cso_table_15',
 '/title/tt4154796/?ref_=bo_cso_table_16',
 '/title/tt0086190/?ref_=bo_cso_table_17',
 '/title/tt0107290/?ref_=bo_cso_table_18',
 '/title/tt0120915/?ref_=bo_cso_table_19',
 '/title/tt0110357/?ref_=bo_cso_table_20',
 '/title/tt0070735/?ref_=bo_cso_table_21',
 '/title/tt0082971/?ref_=bo_cso_table_22',
 '/title/tt0061722/?ref_=bo_cso_table_23',
 '/title/tt0032455/?

In [22]:
### Double-check completeness: one href is expected per movie across the 5 chart pages (1000 in total) ###

len(link_list)

1000

In [40]:
### Turn each relative href into an absolute URL by prefixing the site root ###

link_list_final = ["https://www.boxofficemojo.com" + link for link in link_list]

link_list_final

['https://www.boxofficemojo.com/title/tt0031381/?ref_=bo_cso_table_1',
 'https://www.boxofficemojo.com/title/tt0076759/?ref_=bo_cso_table_2',
 'https://www.boxofficemojo.com/title/tt0059742/?ref_=bo_cso_table_3',
 'https://www.boxofficemojo.com/title/tt0083866/?ref_=bo_cso_table_4',
 'https://www.boxofficemojo.com/title/tt0120338/?ref_=bo_cso_table_5',
 'https://www.boxofficemojo.com/title/tt0049833/?ref_=bo_cso_table_6',
 'https://www.boxofficemojo.com/title/tt0073195/?ref_=bo_cso_table_7',
 'https://www.boxofficemojo.com/title/tt0059113/?ref_=bo_cso_table_8',
 'https://www.boxofficemojo.com/title/tt0070047/?ref_=bo_cso_table_9',
 'https://www.boxofficemojo.com/title/tt0029583/?ref_=bo_cso_table_10',
 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_11',
 'https://www.boxofficemojo.com/title/tt0055254/?ref_=bo_cso_table_12',
 'https://www.boxofficemojo.com/title/tt0080684/?ref_=bo_cso_table_13',
 'https://www.boxofficemojo.com/title/tt0052618/?ref_=bo_cso_table_14',
 

In [261]:
### Enter each movie's detail page then get the budget ###

# Rows whose page has no summary block keep this empty-string placeholder.
df['Budget'] = ''

### Go through each of the links built in the above cell ###
# enumerate supplies the row position directly. Looking it up with
# link_list_final.index(link) rescans the list for every movie and always returns
# the FIRST match, so a repeated URL would write to the wrong row.
# Assumes df still has its default 0..n-1 index in the same order as the links.
for row_idx, link in enumerate(link_list_final):

    # A timeout keeps one stalled request from hanging the whole scrape.
    html_page_movie = requests.get(link, timeout=30)
    soup_movie = BeautifulSoup(html_page_movie.content, 'html.parser')

    ### Find all div with corresponding class to pinpoint where the budget is located ###
    for div in soup_movie.find_all("div", {"class": "mojo-summary-values"}):

        ### Budget is located in a span with money as a class ###
        budget = [span.text for span in div.find_all("span", {"class": "money"})]

        ### If budget exists, get the last value because another value has the same class but budget is the last to be appended ###
        ### Set the budget to 0 if it does not exist; for cleaning purposes ###
        # NOTE(review): if a page lists another money value but no budget, the last
        # money span would be taken as the budget - confirm against the site layout.
        df.at[row_idx, 'Budget'] = budget[-1] if budget else 0

    # Log progress once per page fetched, whether or not a summary block was found.
    print(link)



https://www.boxofficemojo.com//title/tt0031381/?ref_=bo_cso_table_1
https://www.boxofficemojo.com//title/tt0076759/?ref_=bo_cso_table_2
https://www.boxofficemojo.com//title/tt0059742/?ref_=bo_cso_table_3
https://www.boxofficemojo.com//title/tt0083866/?ref_=bo_cso_table_4
https://www.boxofficemojo.com//title/tt0120338/?ref_=bo_cso_table_5
https://www.boxofficemojo.com//title/tt0049833/?ref_=bo_cso_table_6
https://www.boxofficemojo.com//title/tt0073195/?ref_=bo_cso_table_7
https://www.boxofficemojo.com//title/tt0059113/?ref_=bo_cso_table_8
https://www.boxofficemojo.com//title/tt0070047/?ref_=bo_cso_table_9
https://www.boxofficemojo.com//title/tt0029583/?ref_=bo_cso_table_10
https://www.boxofficemojo.com//title/tt2488496/?ref_=bo_cso_table_11
https://www.boxofficemojo.com//title/tt0055254/?ref_=bo_cso_table_12
https://www.boxofficemojo.com//title/tt0080684/?ref_=bo_cso_table_13
https://www.boxofficemojo.com//title/tt0052618/?ref_=bo_cso_table_14
https://www.boxofficemojo.com//title/tt0499

https://www.boxofficemojo.com//title/tt1201607/?ref_=bo_cso_table_120
https://www.boxofficemojo.com//title/tt1979376/?ref_=bo_cso_table_121
https://www.boxofficemojo.com//title/tt0034167/?ref_=bo_cso_table_122
https://www.boxofficemojo.com//title/tt0170016/?ref_=bo_cso_table_123
https://www.boxofficemojo.com//title/tt0092099/?ref_=bo_cso_table_124
https://www.boxofficemojo.com//title/tt0114709/?ref_=bo_cso_table_125
https://www.boxofficemojo.com//title/tt0120363/?ref_=bo_cso_table_126
https://www.boxofficemojo.com//title/tt4154664/?ref_=bo_cso_table_127
https://www.boxofficemojo.com//title/tt0126029/?ref_=bo_cso_table_128
https://www.boxofficemojo.com//title/tt0413267/?ref_=bo_cso_table_129
https://www.boxofficemojo.com//title/tt1690953/?ref_=bo_cso_table_130
https://www.boxofficemojo.com//title/tt3498820/?ref_=bo_cso_table_131
https://www.boxofficemojo.com//title/tt0234215/?ref_=bo_cso_table_132
https://www.boxofficemojo.com//title/tt0418279/?ref_=bo_cso_table_133
https://www.boxoffic

https://www.boxofficemojo.com//title/tt1324999/?ref_=bo_cso_table_38
https://www.boxofficemojo.com//title/tt0062512/?ref_=bo_cso_table_39
https://www.boxofficemojo.com//title/tt7286456/?ref_=bo_cso_table_40
https://www.boxofficemojo.com//title/tt0376994/?ref_=bo_cso_table_41
https://www.boxofficemojo.com//title/tt0209163/?ref_=bo_cso_table_42
https://www.boxofficemojo.com//title/tt0290334/?ref_=bo_cso_table_43
https://www.boxofficemojo.com//title/tt0091763/?ref_=bo_cso_table_44
https://www.boxofficemojo.com//title/tt0089927/?ref_=bo_cso_table_45
https://www.boxofficemojo.com//title/tt0770828/?ref_=bo_cso_table_46
https://www.boxofficemojo.com//title/tt0213149/?ref_=bo_cso_table_47
https://www.boxofficemojo.com//title/tt0111503/?ref_=bo_cso_table_48
https://www.boxofficemojo.com//title/tt0077663/?ref_=bo_cso_table_49
https://www.boxofficemojo.com//title/tt0104714/?ref_=bo_cso_table_50
https://www.boxofficemojo.com//title/tt0097778/?ref_=bo_cso_table_51
https://www.boxofficemojo.com//tit

https://www.boxofficemojo.com//title/tt0109444/?ref_=bo_cso_table_156
https://www.boxofficemojo.com//title/tt0120903/?ref_=bo_cso_table_157
https://www.boxofficemojo.com//title/tt0212985/?ref_=bo_cso_table_158
https://www.boxofficemojo.com//title/tt1587310/?ref_=bo_cso_table_159
https://www.boxofficemojo.com//title/tt0117218/?ref_=bo_cso_table_160
https://www.boxofficemojo.com//title/tt0175142/?ref_=bo_cso_table_161
https://www.boxofficemojo.com//title/tt0356910/?ref_=bo_cso_table_162
https://www.boxofficemojo.com//title/tt0120685/?ref_=bo_cso_table_163
https://www.boxofficemojo.com//title/tt0111257/?ref_=bo_cso_table_164
https://www.boxofficemojo.com//title/tt0092007/?ref_=bo_cso_table_165
https://www.boxofficemojo.com//title/tt0099810/?ref_=bo_cso_table_166
https://www.boxofficemojo.com//title/tt0161081/?ref_=bo_cso_table_167
https://www.boxofficemojo.com//title/tt0319343/?ref_=bo_cso_table_168
https://www.boxofficemojo.com//title/tt0086465/?ref_=bo_cso_table_169
https://www.boxoffic

https://www.boxofficemojo.com//title/tt0113189/?ref_=bo_cso_table_75
https://www.boxofficemojo.com//title/tt0105695/?ref_=bo_cso_table_76
https://www.boxofficemojo.com//title/tt0072890/?ref_=bo_cso_table_77
https://www.boxofficemojo.com//title/tt1872181/?ref_=bo_cso_table_78
https://www.boxofficemojo.com//title/tt0454921/?ref_=bo_cso_table_79
https://www.boxofficemojo.com//title/tt0110478/?ref_=bo_cso_table_80
https://www.boxofficemojo.com//title/tt1661199/?ref_=bo_cso_table_81
https://www.boxofficemojo.com//title/tt0107798/?ref_=bo_cso_table_82
https://www.boxofficemojo.com//title/tt0087928/?ref_=bo_cso_table_83
https://www.boxofficemojo.com//title/tt0097165/?ref_=bo_cso_table_84
https://www.boxofficemojo.com//title/tt0120667/?ref_=bo_cso_table_85
https://www.boxofficemojo.com//title/tt0102945/?ref_=bo_cso_table_86
https://www.boxofficemojo.com//title/tt0458525/?ref_=bo_cso_table_87
https://www.boxofficemojo.com//title/tt0831387/?ref_=bo_cso_table_88
https://www.boxofficemojo.com//tit

https://www.boxofficemojo.com//title/tt0134067/?ref_=bo_cso_table_193
https://www.boxofficemojo.com//title/tt0078163/?ref_=bo_cso_table_194
https://www.boxofficemojo.com//title/tt0071206/?ref_=bo_cso_table_195
https://www.boxofficemojo.com//title/tt0371606/?ref_=bo_cso_table_196
https://www.boxofficemojo.com//title/tt4633694/?ref_=bo_cso_table_197
https://www.boxofficemojo.com//title/tt0322259/?ref_=bo_cso_table_198
https://www.boxofficemojo.com//title/tt1502397/?ref_=bo_cso_table_199
https://www.boxofficemojo.com//title/tt0098621/?ref_=bo_cso_table_200
https://www.boxofficemojo.com//title/tt0122933/?ref_=bo_cso_table_1
https://www.boxofficemojo.com//title/tt0187393/?ref_=bo_cso_table_2
https://www.boxofficemojo.com//title/tt1478338/?ref_=bo_cso_table_3
https://www.boxofficemojo.com//title/tt1375670/?ref_=bo_cso_table_4
https://www.boxofficemojo.com//title/tt0112740/?ref_=bo_cso_table_5
https://www.boxofficemojo.com//title/tt0076752/?ref_=bo_cso_table_6
https://www.boxofficemojo.com//t

https://www.boxofficemojo.com//title/tt0486576/?ref_=bo_cso_table_112
https://www.boxofficemojo.com//title/tt0141369/?ref_=bo_cso_table_113
https://www.boxofficemojo.com//title/tt0247638/?ref_=bo_cso_table_114
https://www.boxofficemojo.com//title/tt0212346/?ref_=bo_cso_table_115
https://www.boxofficemojo.com//title/tt0099044/?ref_=bo_cso_table_116
https://www.boxofficemojo.com//title/tt1723121/?ref_=bo_cso_table_117
https://www.boxofficemojo.com//title/tt4196776/?ref_=bo_cso_table_118
https://www.boxofficemojo.com//title/tt5028340/?ref_=bo_cso_table_119
https://www.boxofficemojo.com//title/tt0448694/?ref_=bo_cso_table_120
https://www.boxofficemojo.com//title/tt0458352/?ref_=bo_cso_table_121
https://www.boxofficemojo.com//title/tt0094889/?ref_=bo_cso_table_122
https://www.boxofficemojo.com//title/tt0086393/?ref_=bo_cso_table_123
https://www.boxofficemojo.com//title/tt0073026/?ref_=bo_cso_table_124
https://www.boxofficemojo.com//title/tt1010048/?ref_=bo_cso_table_125
https://www.boxoffic

https://www.boxofficemojo.com//title/tt0076538/?ref_=bo_cso_table_30
https://www.boxofficemojo.com//title/tt0079116/?ref_=bo_cso_table_31
https://www.boxofficemojo.com//title/tt0107048/?ref_=bo_cso_table_32
https://www.boxofficemojo.com//title/tt0111282/?ref_=bo_cso_table_33
https://www.boxofficemojo.com//title/tt0120888/?ref_=bo_cso_table_34
https://www.boxofficemojo.com//title/tt3783958/?ref_=bo_cso_table_35
https://www.boxofficemojo.com//title/tt3606752/?ref_=bo_cso_table_36
https://www.boxofficemojo.com//title/tt0938283/?ref_=bo_cso_table_37
https://www.boxofficemojo.com//title/tt0405422/?ref_=bo_cso_table_38
https://www.boxofficemojo.com//title/tt0230600/?ref_=bo_cso_table_39
https://www.boxofficemojo.com//title/tt0250494/?ref_=bo_cso_table_40
https://www.boxofficemojo.com//title/tt0079510/?ref_=bo_cso_table_41
https://www.boxofficemojo.com//title/tt2191701/?ref_=bo_cso_table_42
https://www.boxofficemojo.com//title/tt0496806/?ref_=bo_cso_table_43
https://www.boxofficemojo.com//tit

https://www.boxofficemojo.com//title/tt0111742/?ref_=bo_cso_table_149
https://www.boxofficemojo.com//title/tt0084809/?ref_=bo_cso_table_150
https://www.boxofficemojo.com//title/tt0311113/?ref_=bo_cso_table_151
https://www.boxofficemojo.com//title/tt1136608/?ref_=bo_cso_table_152
https://www.boxofficemojo.com//title/tt0080520/?ref_=bo_cso_table_153
https://www.boxofficemojo.com//title/tt1386588/?ref_=bo_cso_table_154
https://www.boxofficemojo.com//title/tt0159365/?ref_=bo_cso_table_155
https://www.boxofficemojo.com//title/tt0134119/?ref_=bo_cso_table_156
https://www.boxofficemojo.com//title/tt0942385/?ref_=bo_cso_table_157
https://www.boxofficemojo.com//title/tt0113277/?ref_=bo_cso_table_158
https://www.boxofficemojo.com//title/tt0944835/?ref_=bo_cso_table_159
https://www.boxofficemojo.com//title/tt0368933/?ref_=bo_cso_table_160
https://www.boxofficemojo.com//title/tt0110322/?ref_=bo_cso_table_161
https://www.boxofficemojo.com//title/tt0106673/?ref_=bo_cso_table_162
https://www.boxoffic

In [263]:
### Preview the first 20 rows to spot-check the scraped Budget column ###
df.head(20)

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year,Budget
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939,0
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977,"$11,000,000"
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965,"$8,200,000"
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982,"$10,500,000"
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997,"$200,000,000"
5,6,The Ten Commandments,"$1,227,470,000","$65,500,000",131000000,1956,0
6,7,Jaws,"$1,200,098,356","$260,000,000",128078800,1975,"$7,000,000"
7,8,Doctor Zhivago,"$1,163,149,635","$111,721,910",124135500,1965,0
8,9,The Exorcist,"$1,036,314,504","$232,906,145",110599200,1973,"$11,000,000"
9,10,Snow White and the Seven Dwarfs,"$1,021,330,000","$184,925,486",109000000,1937,"$1,499,000"


In [41]:
### Derive each movie's Cast & Crew tab URL from its details-page link ###

sep = '?'

### Keep everything before the query string, then append the credits path ###
main_page = [link.partition(sep)[0] + 'credits/?ref_=bo_tt_tab#tabs' for link in link_list_final]

main_page



['https://www.boxofficemojo.com/title/tt0031381/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0076759/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0059742/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0083866/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0120338/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0049833/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0073195/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0059113/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0070047/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0029583/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0055254/credits/?ref_=bo_tt_tab#tabs',
 'https://www.boxofficemojo.com/title/tt0080684/cred

In [269]:
### Number of credits-page URLs built; should equal the number of links in link_list_final ###
len(main_page)

1000

In [283]:
### Enter Cast & Crew tab of each movie's detail page and scrape director names ###

df['Director'] = ''

### Go through all of the new links built ###
# enumerate supplies the row position directly. main_page URLs have had their query
# string stripped, so the same URL can appear more than once; main_page.index(link)
# would then always point at the first occurrence and leave later rows empty.
# Assumes df still has its default 0..n-1 index in the same order as the links.
for row_idx, link in enumerate(main_page):

    # A timeout keeps one stalled request from hanging the whole scrape.
    html_page_movie = requests.get(link, timeout=30)
    soup_movie = BeautifulSoup(html_page_movie.content, 'html.parser')

    ### Find all td with Director as text, then go to its left to get the director names ###
    director = []
    # string= is the current name of the deprecated text= argument.
    for record in soup_movie.find_all('td', string="Director"):
        name_cell = record.find_previous_sibling("td")

        # Skip a "Director" cell that has no td before it instead of failing on None.
        if name_cell is not None:
            # rstrip takes a set of characters, so '\n' strips every trailing newline.
            director.append(name_cell.text.rstrip('\n'))

    # Every row gets a list of names (empty when none were found).
    df.at[row_idx, 'Director'] = director
    print(link)


https://www.boxofficemojo.com/title/tt0031381/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0076759/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0059742/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0083866/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120338/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0049833/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0073195/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0059113/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070047/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0029583/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0055254/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080684/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0045888/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0047396/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0119567/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097576/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0198781/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2294629/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0413300/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1300854/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0103064/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1201607/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1979376/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0034167/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0170016/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0081562/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0075265/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0129387/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2250912/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0317219/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0926084/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1119646/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1477834/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097733/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0477347/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0480249/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0295178/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0407304/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0070511/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0319262/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0366548/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0382932/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0441773/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2709692/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0163187/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1217209/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120647/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1298650/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120812/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0078721/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0438097/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0381061/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070903/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0110912/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0086567/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4630562/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097239/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0398286/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098067/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0110148/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0275847/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0479952/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0816711/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080549/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0349205/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt5848272/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1318514/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0458339/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0298130/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1409024/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0109446/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118880/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0305669/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118884/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1041829/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0181865/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0286716/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0349903/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0082158/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt6644200/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2279373/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0358082/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070328/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120746/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0240462/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0329575/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0227538/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0103874/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt5052448/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4116284/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0452594/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2126355/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1067106/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0111280/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2004420/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0102713/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0171363/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0094721/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0257044/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0072653/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0472181/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1392190/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098536/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0096969/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118883/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0134084/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0265029/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0090056/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0072271/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0102059/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0239395/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0844471/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4779682/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0081505/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt3450958/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0112579/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt1327773/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1499658/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0082186/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080761/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0071877/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0287717/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0327056/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098273/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0096061/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1489889/credits/?ref_=bo_tt_tab#tabs


In [284]:
### Display the dataframe to inspect the newly added Director column ###
df

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year,Budget,Director
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939,0,"[Victor Fleming, George Cukor, Sam Wood]"
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977,"$11,000,000",[George Lucas]
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965,"$8,200,000",[Robert Wise]
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982,"$10,500,000",[Steven Spielberg]
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997,"$200,000,000",[James Cameron]
...,...,...,...,...,...,...,...,...
995,996,Spy Kids 2: Island of Lost Dreams,"$138,438,939","$85,846,429",14774700,2002,"$38,000,000",[Robert Rodriguez]
996,997,Mystic River,"$138,398,648","$90,135,191",14770400,2003,"$25,000,000",[Clint Eastwood]
997,998,Sea of Love,"$138,240,295","$58,571,513",14753500,1989,"$10,017,840",[Harold Becker]
998,999,Scrooged,"$138,234,673","$60,328,558",14752900,1988,"$13,027,842",[Richard Donner]


In [335]:
### Enter Cast & Crew tab of each movie's detail page and scrape actors ###

# Rows whose page has no principal-cast table keep this empty-string placeholder.
df['Actor'] = ''

### Go through each of the links again ###
# enumerate supplies the row position directly. main_page URLs have had their query
# string stripped, so the same URL can appear more than once; main_page.index(link)
# would then always point at the first occurrence and leave later rows empty.
# Assumes df still has its default 0..n-1 index in the same order as the links.
for row_idx, link in enumerate(main_page):

    # A timeout keeps one stalled request from hanging the whole scrape.
    html_page_movie = requests.get(link, timeout=30)
    soup_movie = BeautifulSoup(html_page_movie.content, 'html.parser')

    ### Find the table with a unique ID to pinpoint location where actors are displayed ###
    for record in soup_movie.find_all('table', id="principalCast"):

        ### Since all actor names are links, this gets the values of all actor names ###
        ### Clean the results to exclude empty links and the see more links after each actor ###
        actor = [
            anchor.text.replace('\n\n', '')
            for anchor in record.find_all('a', href=True)
            if anchor.text != '' and 'See more' not in anchor.text
        ]
        df.at[row_idx, 'Actor'] = actor

    # Log progress once per page fetched, whether or not a cast table was found.
    print(link)
    


https://www.boxofficemojo.com/title/tt0031381/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0076759/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0059742/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0083866/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120338/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0049833/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0073195/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0059113/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070047/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0029583/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0055254/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080684/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0045888/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0047396/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0119567/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097576/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0198781/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2294629/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0413300/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1300854/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0103064/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1201607/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1979376/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0034167/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0170016/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0081562/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0075265/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0129387/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2250912/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0317219/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0926084/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1119646/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1477834/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097733/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0477347/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0480249/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0295178/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0407304/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0070511/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0319262/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0366548/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0382932/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0441773/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2709692/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0163187/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1217209/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120647/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1298650/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120812/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0078721/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0438097/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0381061/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070903/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0110912/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0086567/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4630562/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097239/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0398286/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098067/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0110148/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0275847/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0479952/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0816711/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080549/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0349205/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt5848272/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1318514/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0458339/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0298130/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1409024/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0109446/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118880/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0305669/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118884/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1041829/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0181865/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0286716/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0349903/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0082158/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt6644200/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2279373/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0358082/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070328/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120746/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0240462/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0329575/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0227538/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0103874/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt5052448/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4116284/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0452594/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2126355/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1067106/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0111280/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2004420/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0102713/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0171363/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0094721/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0257044/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0072653/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0472181/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1392190/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098536/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0096969/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118883/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0134084/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0265029/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0090056/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0072271/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0102059/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0239395/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0844471/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4779682/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0081505/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt3450958/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0112579/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt1327773/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1499658/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0082186/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080761/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0071877/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0287717/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0327056/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098273/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0096061/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1489889/credits/?ref_=bo_tt_tab#tabs


In [336]:
### Display df, which now includes the Actor column ###
df

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year,Budget,Director,Actor
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939,0,"[Victor Fleming, George Cukor, Sam Wood]","[Clark Gable, Vivien Leigh, Thomas Mitchell, B..."
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977,"$11,000,000",[George Lucas],"[Mark Hamill, Harrison Ford, Carrie Fisher, Al..."
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965,"$8,200,000",[Robert Wise],"[Julie Andrews, Christopher Plummer, Eleanor P..."
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982,"$10,500,000",[Steven Spielberg],"[Henry Thomas, Drew Barrymore, Peter Coyote, D..."
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997,"$200,000,000",[James Cameron],"[Leonardo DiCaprio, Kate Winslet, Billy Zane, ..."
...,...,...,...,...,...,...,...,...,...
995,996,Spy Kids 2: Island of Lost Dreams,"$138,438,939","$85,846,429",14774700,2002,"$38,000,000",[Robert Rodriguez],"[Alexa PenaVega, Daryl Sabara, Antonio Bandera..."
996,997,Mystic River,"$138,398,648","$90,135,191",14770400,2003,"$25,000,000",[Clint Eastwood],"[Sean Penn, Tim Robbins, Kevin Bacon, Emmy Ros..."
997,998,Sea of Love,"$138,240,295","$58,571,513",14753500,1989,"$10,017,840",[Harold Becker],"[Al Pacino, Ellen Barkin, John Goodman, Michae..."
998,999,Scrooged,"$138,234,673","$60,328,558",14752900,1988,"$13,027,842",[Richard Donner],"[Bill Murray, Karen Allen, John Forsythe, John..."


In [388]:
### Enter the details page of each movie to get genres ###
###
### For every link in main_page: download the page, look inside each div with
### class "mojo-summary-values" for a span whose text is "Genres", and store
### the whitespace-split text of the span next to it as a list in df['Genre'].
### Rows where no such span is found keep the '' default set below.
###
### NOTE(review): assumes main_page[i] belongs to the df row labelled i
### (default RangeIndex) -- the previous positional lookup relied on the same
### pairing. ###

df['Genre'] = ''

### Go through each movie link ###
### enumerate() replaces main_page.index(link), which rescanned the list for
### every movie and returned the first match for a repeated link. ###
for row_idx, link in enumerate(main_page):

    # A timeout keeps one stalled connection from hanging the whole scrape.
    html_page_movie = requests.get(link, timeout=30)  # Make a get request to retrieve the page
    soup_movie = BeautifulSoup(html_page_movie.content, 'html.parser')  # Parse the page contents

    ### Pinpoint that genres are located in a div with the class name below ###
    for div in soup_movie.findAll("div", {"class": "mojo-summary-values"}):
        for record in div.findAll('span', text="Genres"):

            ### Genres are displayed in the span beside the one labelled "Genres" ###
            value_span = record.find_next_sibling('span')
            if value_span is None:
                # Label present but no value span: leave the '' default
                # instead of failing on None.get_text().
                continue

            df.at[row_idx, 'Genre'] = value_span.get_text().split()
            print(link)
      



https://www.boxofficemojo.com/title/tt0031381/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0076759/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0059742/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0083866/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120338/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0049833/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0073195/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0059113/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070047/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0029583/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0055254/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080684/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0045888/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0047396/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0119567/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097576/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0198781/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2294629/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0413300/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1300854/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0103064/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1201607/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1979376/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0034167/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0170016/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0081562/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0075265/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0129387/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2250912/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0317219/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0926084/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1119646/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1477834/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097733/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0477347/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0480249/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0295178/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0407304/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0070511/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0319262/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0366548/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0382932/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0441773/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2709692/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0163187/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1217209/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120647/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1298650/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120812/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0078721/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0438097/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0381061/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070903/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0110912/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0086567/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4630562/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0097239/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0398286/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098067/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0110148/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0275847/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0479952/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0816711/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080549/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0349205/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt5848272/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1318514/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0458339/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0298130/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1409024/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0109446/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118880/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0305669/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118884/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1041829/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0181865/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0286716/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0349903/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0082158/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt6644200/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2279373/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0358082/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0070328/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0120746/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0240462/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0329575/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0227538/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0103874/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt5052448/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4116284/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0452594/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2126355/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1067106/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0111280/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt2004420/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0102713/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0171363/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0094721/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0257044/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0072653/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0472181/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1392190/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098536/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt0096969/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0118883/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0134084/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0265029/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0090056/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0072271/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0102059/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0239395/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0844471/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt4779682/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0081505/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt3450958/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0112579/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo

https://www.boxofficemojo.com/title/tt1327773/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1499658/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0082186/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0080761/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0071877/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0287717/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0327056/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0098273/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt0096061/credits/?ref_=bo_tt_tab#tabs
https://www.boxofficemojo.com/title/tt1489889/credits/?ref_=bo_tt_tab#tabs


In [390]:
### Checkpoint df to movies_raw_data.csv (header row kept, index column omitted) so progress is not lost ###

df.to_csv('movies_raw_data.csv', header=True, index=False)

In [36]:
### Read movies_raw_data.csv back into df ###

import pandas as pd

# NOTE(review): DataFrame.to_csv writes UTF-8 by default, so decoding the file
# as latin-1 may garble non-ASCII names -- confirm how the file on disk is
# actually encoded before changing this.
df = pd.read_csv(
    'movies_raw_data.csv',
    encoding='latin-1',
)

In [37]:
### Display the reloaded df ###
df

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year,Budget,Director,Actor,Genre
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939,0,"['Victor Fleming', 'George Cukor', 'Sam Wood']","['Clark Gable', 'Vivien Leigh', 'Thomas Mitche...","['Drama', 'History', 'Romance', 'War']"
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977,"$11,000,000",['George Lucas'],"['Mark Hamill', 'Harrison Ford', 'Carrie Fishe...","['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965,"$8,200,000",['Robert Wise'],"['Julie Andrews', 'Christopher Plummer', 'Elea...","['Biography', 'Drama', 'Family', 'Musical', 'R..."
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982,"$10,500,000",['Steven Spielberg'],"['Henry Thomas', 'Drew Barrymore', 'Peter Coyo...","['Family', 'Sci-Fi']"
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997,"$200,000,000",['James Cameron'],"['Leonardo DiCaprio', 'Kate Winslet', 'Billy Z...","['Drama', 'Romance']"
...,...,...,...,...,...,...,...,...,...,...
995,996,Spy Kids 2: Island of Lost Dreams,"$138,438,939","$85,846,429",14774700,2002,"$38,000,000",['Robert Rodriguez'],"['Alexa PenaVega', 'Daryl Sabara', 'Antonio Ba...","['Action', 'Adventure', 'Comedy', 'Family', 'S..."
996,997,Mystic River,"$138,398,648","$90,135,191",14770400,2003,"$25,000,000",['Clint Eastwood'],"['Sean Penn', 'Tim Robbins', 'Kevin Bacon', 'E...","['Crime', 'Drama', 'Mystery', 'Thriller']"
997,998,Sea of Love,"$138,240,295","$58,571,513",14753500,1989,"$10,017,840",['Harold Becker'],"['Al Pacino', 'Ellen Barkin', 'John Goodman', ...","['Crime', 'Drama', 'Mystery', 'Romance', 'Thri..."
998,999,Scrooged,"$138,234,673","$60,328,558",14752900,1988,"$13,027,842",['Richard Donner'],"['Bill Murray', 'Karen Allen', 'John Forsythe'...","['Comedy', 'Drama', 'Fantasy', 'Romance']"


In [42]:
### Get the IMDB ID of each movie for merging later ###
###
### Each link in main_page has the form ".../title/<IMDB ID>/..."; the ID is
### written to df['IMDB ID'] at the row matching the link's position.
###
### NOTE(review): assumes main_page[i] belongs to the df row labelled i
### (default RangeIndex) -- the previous positional lookup relied on the same
### pairing. ###

import re

### Capture the whole "tt<digits>" token after /title/. The old fixed slice
### link[36:45] kept exactly nine characters and would cut off any ID with
### more than seven digits. ###
imdb_id_pattern = re.compile(r'/title/(tt\d+)')

for row_idx, link in enumerate(main_page):
    id_match = imdb_id_pattern.search(link)

    # Fall back to the original fixed-position slice if the URL has an unexpected shape.
    df.at[row_idx, 'IMDB ID'] = id_match.group(1) if id_match else link[36:45]

    ### Progress log. The previous print(count) used a name never defined in
    ### this cell and raised NameError on a fresh kernel. ###
    print(row_idx)



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [43]:
### Display df, which now includes the IMDB ID column ###
df

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year,Budget,Director,Actor,Genre,IMDB ID
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939,0,"['Victor Fleming', 'George Cukor', 'Sam Wood']","['Clark Gable', 'Vivien Leigh', 'Thomas Mitche...","['Drama', 'History', 'Romance', 'War']",tt0031381
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977,"$11,000,000",['George Lucas'],"['Mark Hamill', 'Harrison Ford', 'Carrie Fishe...","['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",tt0076759
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965,"$8,200,000",['Robert Wise'],"['Julie Andrews', 'Christopher Plummer', 'Elea...","['Biography', 'Drama', 'Family', 'Musical', 'R...",tt0059742
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982,"$10,500,000",['Steven Spielberg'],"['Henry Thomas', 'Drew Barrymore', 'Peter Coyo...","['Family', 'Sci-Fi']",tt0083866
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997,"$200,000,000",['James Cameron'],"['Leonardo DiCaprio', 'Kate Winslet', 'Billy Z...","['Drama', 'Romance']",tt0120338
...,...,...,...,...,...,...,...,...,...,...,...
995,996,Spy Kids 2: Island of Lost Dreams,"$138,438,939","$85,846,429",14774700,2002,"$38,000,000",['Robert Rodriguez'],"['Alexa PenaVega', 'Daryl Sabara', 'Antonio Ba...","['Action', 'Adventure', 'Comedy', 'Family', 'S...",tt0287717
996,997,Mystic River,"$138,398,648","$90,135,191",14770400,2003,"$25,000,000",['Clint Eastwood'],"['Sean Penn', 'Tim Robbins', 'Kevin Bacon', 'E...","['Crime', 'Drama', 'Mystery', 'Thriller']",tt0327056
997,998,Sea of Love,"$138,240,295","$58,571,513",14753500,1989,"$10,017,840",['Harold Becker'],"['Al Pacino', 'Ellen Barkin', 'John Goodman', ...","['Crime', 'Drama', 'Mystery', 'Romance', 'Thri...",tt0098273
998,999,Scrooged,"$138,234,673","$60,328,558",14752900,1988,"$13,027,842",['Richard Donner'],"['Bill Murray', 'Karen Allen', 'John Forsythe'...","['Comedy', 'Drama', 'Fantasy', 'Romance']",tt0096061


In [27]:
### Save progress: overwrite movies_raw_data.csv with the current df (header row kept, index column omitted) ###

df.to_csv('movies_raw_data.csv', header=True, index=False)

In [44]:
### Get values from API calls to TMDB (api.themoviedb.org) ###
###
### For every row of df, look the movie up by its IMDB ID through the TMDB
### "find" endpoint and copy vote_count, vote_average and popularity of the
### first movie result into the 'Vote Count', 'Vote Average' and 'Popularity'
### columns. Rows with no TMDB match get NaN in all three columns.
###
### Raises requests.HTTPError if TMDB answers with an error status (for
### example when the API key is rejected). ###

import getpass
import os

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### The API key is a credential: read it from the TMDB_API_KEY environment
### variable (or prompt for it) instead of storing it in the notebook. ###
tmdb_api_key = os.environ.get('TMDB_API_KEY') or getpass.getpass('TMDB API key: ')

### Iterate over the IMDB ID column only; the frame itself is modified inside
### the loop, so we avoid iterating over its rows while adding columns. ###
for index, imdb_id in df['IMDB ID'].items():

    ### Unique API call URL; the query string is built by requests from params ###
    link = "https://api.themoviedb.org/3/find/" + imdb_id

    response = requests.get(
        link,
        params={
            'api_key': tmdb_api_key,
            'language': 'en-US',
            'external_source': 'imdb_id',
        },
        timeout=30,  # do not let one stalled call hang the whole loop
    )
    # Surface auth / rate-limit failures instead of silently recording NaN.
    response.raise_for_status()
    data = response.json()

    ### Take the first movie match, or an empty dict when TMDB has none.
    ### (Indexing [0] unconditionally raised on an empty result list.) ###
    movie_results = data.get('movie_results') or []
    movie = movie_results[0] if movie_results else {}

    ### Save the values to the df; NaN marks "no data" ###
    df.at[index, 'Vote Count'] = movie.get('vote_count', np.nan)
    df.at[index, 'Vote Average'] = movie.get('vote_average', np.nan)
    df.at[index, 'Popularity'] = movie.get('popularity', np.nan)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [6]:
### Create a list of all IMDB IDs in our dataframe ###
### (the 'IMDB ID' column as a plain Python list, in row order) ###

imdb_list = df['IMDB ID'].tolist()


In [7]:
### Display the collected IMDB IDs ###
imdb_list

['tt0031381',
 'tt0076759',
 'tt0059742',
 'tt0083866',
 'tt0120338',
 'tt0049833',
 'tt0073195',
 'tt0059113',
 'tt0070047',
 'tt0029583',
 'tt2488496',
 'tt0055254',
 'tt0080684',
 'tt0052618',
 'tt0499549',
 'tt4154796',
 'tt0086190',
 'tt0107290',
 'tt0120915',
 'tt0110357',
 'tt0070735',
 'tt0082971',
 'tt0061722',
 'tt0032455',
 'tt0068646',
 'tt0109830',
 'tt0058331',
 'tt0077631',
 'tt0848228',
 'tt0369610',
 'tt1825683',
 'tt0059800',
 'tt0468569',
 'tt0061852',
 'tt0053285',
 'tt4154756',
 'tt0087332',
 'tt0298148',
 'tt0145487',
 'tt0064115',
 'tt0066011',
 'tt0116629',
 'tt0099785',
 'tt2527336',
 'tt0032910',
 'tt0056937',
 'tt0086960',
 'tt0058150',
 'tt3606756',
 'tt0065377',
 'tt0069704',
 'tt0046247',
 'tt0383574',
 'tt0048960',
 'tt0034492',
 'tt0071230',
 'tt0096895',
 'tt0037536',
 'tt0167260',
 'tt0266543',
 'tt0072308',
 'tt3748528',
 'tt6105098',
 'tt0042332',
 'tt0316654',
 'tt0058385',
 'tt0044672',
 'tt0077975',
 'tt0335345',
 'tt0121766',
 'tt0088763',
 'tt01

In [45]:
### Display df, which now includes the Vote Count, Vote Average and Popularity columns ###
df

Unnamed: 0,Rank,Title,Adj. Lifetime Gross,Lifetime Gross,Est. Num Tickets,Year,Budget,Director,Actor,Genre,IMDB ID,Vote Count,Vote Average,Popularity
0,1,Gone with the Wind,"$1,895,421,694","$200,852,579",202286200,1939,0,"['Victor Fleming', 'George Cukor', 'Sam Wood']","['Clark Gable', 'Vivien Leigh', 'Thomas Mitche...","['Drama', 'History', 'Romance', 'War']",tt0031381,2053.0,7.9,18.795
1,2,Star Wars: Episode IV - A New Hope,"$1,668,979,715","$460,998,507",178119500,1977,"$11,000,000",['George Lucas'],"['Mark Hamill', 'Harrison Ford', 'Carrie Fishe...","['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",tt0076759,12970.0,8.2,41.110
2,3,The Sound of Music,"$1,335,086,324","$159,287,539",142485200,1965,"$8,200,000",['Robert Wise'],"['Julie Andrews', 'Christopher Plummer', 'Elea...","['Biography', 'Drama', 'Family', 'Musical', 'R...",tt0059742,1889.0,7.7,14.111
3,4,E.T. the Extra-Terrestrial,"$1,329,174,791","$435,110,554",141854300,1982,"$10,500,000",['Steven Spielberg'],"['Henry Thomas', 'Drew Barrymore', 'Peter Coyo...","['Family', 'Sci-Fi']",tt0083866,7280.0,7.5,17.959
4,5,Titanic,"$1,270,101,626","$659,363,944",135549800,1997,"$200,000,000",['James Cameron'],"['Leonardo DiCaprio', 'Kate Winslet', 'Billy Z...","['Drama', 'Romance']",tt0120338,16022.0,7.8,26.449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Spy Kids 2: Island of Lost Dreams,"$138,438,939","$85,846,429",14774700,2002,"$38,000,000",['Robert Rodriguez'],"['Alexa PenaVega', 'Daryl Sabara', 'Antonio Ba...","['Action', 'Adventure', 'Comedy', 'Family', 'S...",tt0287717,1297.0,5.5,10.703
996,997,Mystic River,"$138,398,648","$90,135,191",14770400,2003,"$25,000,000",['Clint Eastwood'],"['Sean Penn', 'Tim Robbins', 'Kevin Bacon', 'E...","['Crime', 'Drama', 'Mystery', 'Thriller']",tt0327056,3617.0,7.7,13.087
997,998,Sea of Love,"$138,240,295","$58,571,513",14753500,1989,"$10,017,840",['Harold Becker'],"['Al Pacino', 'Ellen Barkin', 'John Goodman', ...","['Crime', 'Drama', 'Mystery', 'Romance', 'Thri...",tt0098273,292.0,6.7,10.014
998,999,Scrooged,"$138,234,673","$60,328,558",14752900,1988,"$13,027,842",['Richard Donner'],"['Bill Murray', 'Karen Allen', 'John Forsythe'...","['Comedy', 'Drama', 'Fantasy', 'Romance']",tt0096061,800.0,6.9,13.511


In [1]:
### Save raw data: write df to merged_raw_data.csv (header row kept, index column omitted) ###

df.to_csv('merged_raw_data.csv', header=True, index=False)

NameError: name 'df' is not defined