In [16]:
import requests
import time
from bs4 import BeautifulSoup

In [2]:
def get_page_urls_array(link, maxpage):
    """Build the list of paginated URLs for one listing.

    The first entry is ``link`` itself. Each further entry is ``link``
    followed by ``/<offset>``, where the offsets run 101, 201, 301, ... and
    stop at the last one that does not exceed ``maxpage``.

    Args:
        link: URL of the first page of the listing.
        maxpage: Integer upper bound (inclusive) for the numeric offsets.

    Returns:
        list of str: ``link`` followed by one URL per offset.
    """
    offsets = range(101, maxpage + 1, 100)
    return [link] + [link + '/' + str(offset) for offset in offsets]

In [3]:
def get_html(page_url, timeout=30):
    """Download a page and return its body as text.

    Args:
        page_url: URL to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without a timeout a single
            stalled connection would block the scraping loop indefinitely.

    Returns:
        str: The decoded response body (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # The HTTP status code is not checked here, so the body of an error
    # page is returned to the caller like any other page.
    response = requests.get(page_url, timeout=timeout)
    return response.text

In [4]:
def get_BoxOffice_ranking(urls, column_num):
    """Scrape the ranking table from each page and return its rows as dicts.

    Every URL is downloaded with ``get_html`` and parsed; the rows matched by
    ``center table tr`` are read. The ``th`` cells of the first row supply
    the dictionary keys, and each row with enough ``td`` cells becomes one
    dictionary. The third column is stored twice: its text under the header
    name, and the ``href`` of its first link under ``"<header>_href"``
    (``None`` when the cell has no link).

    Args:
        urls: Iterable of page URLs to scrape.
        column_num: Number of columns in the table. Six columns are always
            read; a value greater than 6 also reads the seventh.

    Returns:
        list of dict: One dictionary per data row, in page order.

    Raises:
        IndexError: If the first row of a page has fewer ``th`` cells than
            the number of columns requested.
    """
    # Same rule as the original code: six columns, or seven when more than
    # six are requested.
    column_count = 7 if column_num > 6 else 6
    link_column = 2  # the column whose cell also carries the movie link

    results = []
    for url in urls:
        parsed_html = BeautifulSoup(get_html(url), "html.parser")
        rows = parsed_html.select('center table tr')
        if not rows:
            # No table on this page (for instance an offset past the last
            # page). Indexing rows[0] would raise and discard everything
            # scraped so far, so the page is skipped instead.
            continue

        ths = rows[0].select('th')
        headers = [ths[index].get_text() for index in range(column_count)]
        href_key = headers[link_column] + "_href"

        for row in rows:
            tds = row.select('td')
            if len(tds) < column_count:
                # Header rows have no td cells, and rows with too few cells
                # cannot be mapped onto the headers.
                continue
            result = {}
            for index, header in enumerate(headers):
                result[header] = tds[index].get_text()
                if index == link_column:
                    # find() returns None when the cell contains no link.
                    anchor = tds[index].find("a")
                    result[href_key] = anchor.get('href') if anchor is not None else None
            results.append(result)
    return results

In [5]:
def get_csv_header(dict_list):
    """Collect every key used by a list of dicts, in first-seen order.

    Args:
        dict_list: List of dictionaries (the scraped records). May be empty.

    Returns:
        list: Each distinct key exactly once, ordered by first appearance
        while walking the records in order. An empty list is returned for
        an empty input instead of raising ``IndexError``.
    """
    header = []
    seen = set()  # constant-time membership test; `header` keeps the order
    for item in dict_list:
        for key in item:
            if key not in seen:
                seen.add(key)
                header.append(key)
    return header

In [6]:
import csv

In [7]:
def create_csv(filename, dict_list, header):
    """Write a list of dicts to a UTF-8 encoded CSV file.

    Args:
        filename: Path of the file to create (an existing file is overwritten).
        dict_list: Records to write, one CSV row per dictionary. Keys missing
            from a record are written as empty fields.
        header: Column names, in output order; written as the first row.

    Raises:
        ValueError: If a record contains a key that is not in ``header``
            (the ``csv.DictWriter`` default).
    """
    # newline='' is what the csv module requires for files it writes to: it
    # prevents the platform from translating the writer's own line endings
    # (which doubles the "\r" on Windows) and keeps newlines embedded in
    # quoted fields intact.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        writer.writerows(dict_list)

Extract Boxoffice Data from the web

In [8]:
# Build the paginated box-office ranking URLs with get_page_urls_array(link, maxpage).
# The result is a list of page links that the scraping cell consumes.
Boxoffice_first_link = "https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/all-time"
# Passed as maxpage: the upper bound for the numeric URL suffix (101, 201, ...), despite the "total_pages" name.
total_pages_BO = 16901
all_Boxoffice_links = get_page_urls_array(Boxoffice_first_link,total_pages_BO)

In [9]:
# Scrape the box-office table from every page with get_BoxOffice_ranking(urls, column_num).
# The result is a list of dicts, one per table row, keyed by the table's column headers.
column_num = 7  # this table is read with seven header columns
boxOffice_result = get_BoxOffice_ranking(all_Boxoffice_links, column_num)

In [10]:
# Derive the CSV header for the box-office file with get_csv_header(dict_list).
boxoffice_csv_header=get_csv_header(boxOffice_result)

# Save the box-office data to a CSV file with create_csv(filename, dict_list, header).
# Columns: Rank, Year, Movie, Movie_href, Distributor, DomesticBox Office, InternationalBox Office, WorldwideBox Office
create_csv("Boxoffice_Metadata.csv", boxOffice_result, boxoffice_csv_header)


Extract Movie Budgets Data from the web

In [11]:
# Build the paginated budget-listing URLs with get_page_urls_array(link, maxpage).
# The result is a list of page links that the scraping cell consumes.
Budget_first_link = "https://www.the-numbers.com/movie/budgets/all"
# Passed as maxpage: the upper bound for the numeric URL suffix (101, 201, ...), despite the "total_pages" name.
total_pages_BG = 6065
all_Budget_links = get_page_urls_array(Budget_first_link,total_pages_BG)

In [12]:
# Scrape the budget table from every page with get_BoxOffice_ranking(urls, column_num).
# The result is a list of dicts, one per table row, keyed by the table's column headers.
# Note: this rebinds the notebook-level column_num that the box-office cell set to 7.
column_num = 6  # this table is read with six header columns
budget_result = get_BoxOffice_ranking(all_Budget_links, column_num)

In [13]:
# Derive the CSV header for the budget file with get_csv_header(dict_list).
budget_csv_header=get_csv_header(budget_result)
# Save the budget data to a CSV file with create_csv(filename, dict_list, header).
# Columns: Rank, ReleaseDate, Movie, Movie_href, ProductionBudget, DomesticGross, WorldwideGross
create_csv("Budget_Data.csv", budget_result, budget_csv_header)

Filter movies released in 2000 or later from boxOffice_result

In [20]:
# Keep only the records whose "Year" field, read as an integer, is 2000 or later.
boxOffice_after2000 = [
    record for record in boxOffice_result if int(record["Year"]) >= 2000
]

In [21]:
# Save the filtered records (Year >= 2000) to their own CSV file.
boxOffice_after2000_csv_header=get_csv_header(boxOffice_after2000)

create_csv("boxOffice_after2000.csv", boxOffice_after2000, boxOffice_after2000_csv_header)

In [22]:
def get_genres_box_links(boxOffice_result):
    """Turn each record's site-relative movie link into an absolute URL.

    Args:
        boxOffice_result: List of dicts, each with a ``"Movie_href"`` entry
            holding a path relative to the site root.

    Returns:
        list of str: ``"https://www.the-numbers.com"`` prepended to every
        record's ``"Movie_href"``, in input order.
    """
    site_root = "https://www.the-numbers.com"
    return [site_root + record["Movie_href"] for record in boxOffice_result]

In [33]:
def get_Genres_time_language(genres_links):
    """Scrape the label/value rows of each movie page's summary tables.

    Each URL is downloaded with ``get_html`` and parsed. Among the tables
    matched by ``#summary table``, every table with more than 10 rows yields
    one dictionary: each row made of exactly two ``td`` cells adds an entry
    mapping the first cell's text to the second cell's text, and ``"href"``
    holds the URL with the site prefix removed.

    Args:
        genres_links: Iterable of absolute movie-page URLs.

    Returns:
        list of dict: One dictionary per qualifying table, in input order.
    """
    site_root = "https://www.the-numbers.com"
    records = []
    for page_url in genres_links:
        soup = BeautifulSoup(get_html(page_url), "html.parser")
        for summary_table in soup.select('#summary table'):
            table_rows = summary_table.select('tr')
            if len(table_rows) <= 10:
                continue
            record = {}
            for table_row in table_rows:
                cells = table_row.find_all("td")
                if len(cells) != 2:
                    continue
                # "href" is written alongside every label/value pair, so a
                # table without any such pair yields an empty dict.
                record["href"] = page_url.replace(site_root, "")
                record[cells[0].get_text()] = cells[1].get_text()
            records.append(record)
    return records

In [34]:
# Absolute movie-page URLs for the filtered (Year >= 2000) records.
genres_links = get_genres_box_links(boxOffice_after2000)

In [38]:
# Trial run: genres_links[0:1] scrapes only the first movie page; widen the slice for a full run.
Genres_time_language = get_Genres_time_language(genres_links[0:1])
Genres_time_language

[{'href': '/movie/Star-Wars-Ep-VII-The-Force-Awakens#tab=summary',
  'Domestic Releases:': 'December 18th, 2015 (IMAX) by Walt DisneyDecember 18th, 2015 (Wide) by Walt Disney',
  'International Releases:': 'December 16th, 2015 (Wide) (Belgium)\nDecember 16th, 2015 (Wide) (Denmark)\nDecember 16th, 2015 (Wide) (Egypt)\nDecember 16th, 2015 (Wide) (Finland)\nDecember 16th, 2015 (Wide) (France)\n... Show all releases\n',
  'Video\xa0Release:': 'April 1st, 2016 by Disney/Lucasfilm',
  'MPAA\xa0Rating:': 'PG-13 for sci-fi action violence.(Rating bulletin 2401 (Cert #50155), 11/25/2015)',
  'Running Time:': '136 minutes',
  'Franchise:': 'Star Wars',
  'Comparisons:': 'All-Time Top-Grossing FilmsBlockbusters, 2015vs. AvatarCreate your own comparison chart…',
  'Keywords:': 'Space Opera, Good vs. Evil, Delayed Sequel, Intertitle, Robot, Sequels With Returning Stars, Dysfunctional Family, Gratuitous Cameos, Motion Capture Performance, 3-D, 3-D - Post-production Conversion, IMAX: DMR, Filmed in E

In [36]:
# Write the scraped summary records to CSV; the header collects the keys found across all records.
Genres_time_language_csv_header=get_csv_header(Genres_time_language)

create_csv("Genres_time_language.csv", Genres_time_language, Genres_time_language_csv_header)

In [28]:
def get_internal_box_links(boxOffice_result):
    """Build the "international" tab URL for every movie record.

    Each record's ``"Movie_href"`` (for example ``/movie/Avatar#tab=summary``)
    is made absolute and its fragment is replaced by ``#tab=international``.
    Only the fragment is touched: the previous implementation called
    ``str.replace`` with the last seven characters of the link, which also
    rewrote any earlier occurrence of that text (a slug containing
    "summary") and mangled links that did not end in ``summary``.

    Args:
        boxOffice_result: List of dicts, each with a ``"Movie_href"`` entry
            holding a path relative to the site root.

    Returns:
        list of str: One absolute URL per record, in input order, each
        ending in ``#tab=international``.
    """
    international_box_links = []
    for infor in boxOffice_result:
        movie_link = "https://www.the-numbers.com" + infor["Movie_href"]
        # Drop whatever fragment the link carries (if any), then add ours.
        page_part = movie_link.partition("#")[0]
        international_box_links.append(page_part + "#tab=international")
    return international_box_links

In [191]:
def get_international_box(international_box_links):
    """Scrape the per-country figures from each movie's international section.

    Each URL is downloaded with ``get_html`` and parsed. The table rows of
    the first element matched by ``#international div`` are read; every row
    with at least seven ``td`` cells adds an entry mapping the text of its
    first cell (the country name) to the text of its seventh cell. Download
    and parse durations are printed for every URL as progress output.

    Args:
        international_box_links: Iterable of absolute movie-page URLs.

    Returns:
        list of dict: One dictionary per URL, in input order. Every
        dictionary has ``"href"`` (the URL with the site prefix removed);
        pages without an international section or without data rows
        contribute a dictionary holding only that key.
    """
    results = []
    for url in international_box_links:
        # Time the download separately from the parsing.
        begin = time.time()
        raw_html = get_html(url)
        end = time.time()
        print(end-begin)
        print("----")

        begin = time.time()
        parsed_html = BeautifulSoup(raw_html, "html.parser")
        divs = parsed_html.select('#international div')
        # A page without an international section used to raise IndexError
        # on divs[0] and abort the whole run; treat it as "no rows".
        rows = divs[0].find_all("tr") if divs else []

        # Set "href" up front so every record can be traced to its page.
        result = {"href": url.replace("https://www.the-numbers.com","")}
        for row in rows:
            tds = row.select('td')
            # The seventh cell is read, so require seven cells; the old
            # guard (more than three) let rows with 4-6 cells raise IndexError.
            if len(tds) > 6:
                country_name = tds[0].get_text()
                result[country_name] = tds[6].get_text()
        results.append(result)
        end = time.time()
        print(end-begin)

    return results

In [29]:
# International-tab URLs for the filtered (Year >= 2000) records.
international_box_links = get_internal_box_links(boxOffice_after2000)

In [32]:
# Trial run on nine links.
# NOTE(review): the slice starts at index 1, so the first link is skipped - confirm that is intended.
international_box = get_international_box(international_box_links[1:10])

2.3132030963897705
----
0.301192045211792
2.2793920040130615
----
0.5374338626861572
2.3101141452789307
----
0.41162896156311035
2.1100871562957764
----
0.2905118465423584
2.325961112976074
----
0.42122387886047363
2.196120262145996
----
0.3143618106842041
2.314481019973755
----
0.35820698738098145
2.1183269023895264
----
0.30638599395751953
2.062474012374878
----
0.327089786529541


In [None]:
# Write the scraped per-country records to CSV; the header collects the keys found across all records.
international_box_csv_header=get_csv_header(international_box)

create_csv("international_box.csv", international_box, international_box_csv_header)

In [37]:
# Display the scraped per-country records for inspection.
international_box

[{'href': '/movie/Avengers-Endgame-(2019)#tab=international',
  'Argentina': '$17,723,546',
  'Australia*': '$59,101,640',
  'Brazil': '$85,660,664',
  'Bulgaria': '$1,469,132',
  'China': '$629,100,000',
  'Czech Republic': '$6,694,694',
  'France': '$62,270,584',
  'Germany': '$64,094,836',
  'Hong Kong': '$26,500,000',
  'India': '$61,300,000',
  'Indonesia': '$34,674,656',
  'Italy': '$34,025,304',
  'Japan': '$54,661,640',
  'Lithuania': '$627,881',
  'Malaysia': '$19,100,000',
  'Mexico': '$77,600,000',
  'Netherlands': '$15,055,533',
  'New Zealand': '$9,323,924',
  'Philippines': '$32,400,000',
  'Poland': '$10,557,531',
  'Portugal': '$4,298,423',
  'Russia (CIS)': '$44,767,708',
  'Slovakia': '$1,994,570',
  'South Korea': '$105,229,504',
  'Spain': '$32,658,448',
  'Taiwan': '$29,385,526',
  'Thailand': '$25,600,000',
  'Turkey': '$7,878,857',
  'United Kingdom': '$115,040,263'},
 {'href': '/movie/Avatar#tab=international',
  'Argentina': '$1,851,813',
  'Australia*': '$76,6