In [None]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures


def get_page_urls_array(link, maxpage, page_size=100):
    """Build the list of URLs for a paginated listing.

    The first page is ``link`` itself; every following page is addressed as
    ``link/<offset>`` where the offsets are ``page_size + 1``,
    ``2 * page_size + 1``, ... (101, 201, ... with the default page size).

    Args:
        link: URL of the first page, without a trailing slash.
        maxpage: Largest offset that may be generated (inclusive). Despite the
            name this is compared against the offset, not a page count.
        page_size: Distance between two consecutive offsets. Defaults to 100.

    Returns:
        list[str]: ``link`` followed by one URL per offset ``<= maxpage``.

    Raises:
        ValueError: if ``page_size`` is not positive (the offset would never
            advance past ``maxpage``).
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive number")

    results = [link]
    pagination = page_size + 1
    while pagination <= maxpage:
        results.append(link + '/' + str(pagination))
        pagination += page_size
    return results
       
   
def get_html(page_url, timeout=30):
    """Download a page and return its body as text.

    Args:
        page_url: URL to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block the calling
            worker indefinitely.

    Returns:
        str: the decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or a 4xx/5xx status (via ``raise_for_status``).
    """
    print('parallel url: ' + page_url)
    response = requests.get(page_url, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text


# Number of leading table columns that are always extracted.
_BASE_COLUMN_COUNT = 6
# Index of the column whose link target is stored under "<header>_href".
_LINK_COLUMN_INDEX = 2


def _parse_ranking_rows(raw_html, column_num=None):
    """Parse one ranking page into a list of row dictionaries (prints nothing).

    The first row matched by ``center table tr`` supplies the column names
    (the text of its ``th`` cells). Every matched row that has at least one
    ``td`` cell becomes a dict keyed by those names. The third column is stored
    twice: its text under the header name, and the ``href`` of its first ``a``
    element under ``<header>_href``.

    Args:
        raw_html: HTML text of the page.
        column_num: Column-count hint. A value above 6 also extracts the
            seventh column; any other value extracts the first six. ``None``
            uses the number of ``th`` cells in the header row.

    Returns:
        list[dict]: one dict per data row; keys follow table order with the
        ``_href`` key placed right after the third column.

    Raises:
        IndexError: if no row matches the selector, or the header row or a
            data row has fewer cells than requested.
    """
    parsed_html = BeautifulSoup(raw_html, "html.parser")
    rows = parsed_html.select('center table tr')
    header_cells = rows[0].select('th')

    if column_num is None:
        column_num = len(header_cells)
    if column_num > _BASE_COLUMN_COUNT:
        wanted = _BASE_COLUMN_COUNT + 1
    else:
        wanted = _BASE_COLUMN_COUNT

    column_names = [header_cells[index].get_text() for index in range(wanted)]
    href_name = column_names[_LINK_COLUMN_INDEX] + "_href"

    results = []
    for row in rows:
        tds = row.select('td')
        if len(tds) == 0:
            # Header rows carry only <th> cells.
            continue
        result = {}
        for index, name in enumerate(column_names):
            result[name] = tds[index].get_text()
            if index == _LINK_COLUMN_INDEX:
                anchor = tds[index].find("a")
                # A cell without a link yields an empty href instead of
                # aborting the whole page.
                result[href_name] = anchor.get("href", "") if anchor is not None else ""
        results.append(result)
    return results


def extract_ranking_from_html(raw_html, column_num=None):
    """Return the ranking rows found in ``raw_html`` and print how many there are.

    Args:
        raw_html: HTML text of one ranking page.
        column_num: Column-count hint; a value above 6 also extracts the
            seventh column. ``None`` (the default) derives it from the number
            of header cells on the page, so no module-level variable is needed.

    Returns:
        list[dict]: one dict per table row, keyed by the header texts plus
        ``<third header>_href``.

    Raises:
        IndexError: if the page has no matching table rows or too few cells.
    """
    results = _parse_ranking_rows(raw_html, column_num)
    print('got ' + str(len(results)) + ' results')
    return results


def get_BoxOffice_ranking_parallel(urls, column_num, max_workers=500):
    """Download all ``urls`` concurrently and collect their ranking rows.

    Pages are fetched with ``get_html`` on a thread pool and parsed as they
    complete. A page that fails to download or parse is reported on stdout and
    contributes no rows.

    Args:
        urls: Iterable of page URLs.
        column_num: Column-count hint forwarded to the parser (a value above 6
            also extracts the seventh column).
        max_workers: Upper bound on concurrent download threads.

    Returns:
        list[dict]: rows of all pages, ordered by the position of their page
        in ``urls`` regardless of which download finished first.
    """
    urls = list(urls)
    rows_per_page = [[] for _ in urls]
    total = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(get_html, url): (index, url)
            for index, url in enumerate(urls)
        }
        for future in concurrent.futures.as_completed(future_to_page):
            index, url = future_to_page[future]
            try:
                data = future.result()
                rows_per_page[index] = extract_ranking_from_html(data, column_num)
                total += len(rows_per_page[index])
                print ('total have ' + str(total) + ' results')
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
    # Flatten in input order so repeated runs yield the same row sequence.
    return [row for page_rows in rows_per_page for row in page_rows]


def get_BoxOffice_ranking(urls, column_num):
    """Download ``urls`` one after another and collect their ranking rows.

    Sequential counterpart of ``get_BoxOffice_ranking_parallel``; errors are
    not caught and propagate to the caller.

    Args:
        urls: Iterable of page URLs.
        column_num: Column-count hint (a value above 6 also extracts the
            seventh column).

    Returns:
        list[dict]: rows of all pages in ``urls`` order.
    """
    results = []
    for url in urls:
        print('doing url ' + url)
        raw_html = get_html(url)
        results.extend(_parse_ranking_rows(raw_html, column_num))
    return results
   

# Main entry point: scrape the all-time domestic box-office listing.
# Boxoffice_first_link is the URL of the first page of the paginated table.
Boxoffice_first_link = "https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/all-time"
# Passed to get_page_urls_array as `maxpage`, the upper bound for generated page suffixes.
total_pages_BO = 16901

# Walk the pagination and build the URL of every page.
all_Boxoffice_links = get_page_urls_array(Boxoffice_first_link, total_pages_BO)
# all_Boxoffice_links now holds one URL per page of box-office results.

# Column count for the scraper: a value above 6 means the seventh table column is extracted too.

column_num = 7
# Report how many pages will be fetched, then gather the rows from every page.
print(len(all_Boxoffice_links))
boxOffice_result = get_BoxOffice_ranking_parallel(all_Boxoffice_links, column_num)

# Total number of rows collected.
print(len(boxOffice_result))


In [6]:
def get_csv_header(dict_list):
    """Collect every key used by the dictionaries, in first-seen order.

    Args:
        dict_list: Iterable of dictionaries (the rows destined for a CSV file).

    Returns:
        list: each distinct key once; keys of the first dictionary come first,
        followed by keys that only appear in later dictionaries. An empty
        input yields an empty list.
    """
    header = []
    seen = set()  # constant-time membership test instead of scanning `header`
    for item in dict_list:
        for key in item:
            if key not in seen:
                seen.add(key)
                header.append(key)
    return header

import csv

def create_csv(filename, dict_list, header):
    """Write ``dict_list`` to ``filename`` as a UTF-8 CSV file with a header row.

    Args:
        filename: Path of the file to create; an existing file is overwritten.
        dict_list: Iterable of dictionaries, one per CSV row.
        header: Column names, in output order. Keys missing from a row are
            written as empty fields.

    Raises:
        ValueError: if a row contains a key that is not listed in ``header``
            (``csv.DictWriter`` default behaviour).
    """
    # newline='' lets the csv module control line endings; without it an extra
    # blank line appears between rows on platforms that translate "\n".
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        writer.writerows(dict_list)
# Derive the CSV column names from the keys of the scraped box-office rows.
boxoffice_csv_header=get_csv_header(boxOffice_result)

# Save the box-office rows to a CSV file. Columns noted by the author: Rank, Year, Movie, Movie_href, Distributor, DomesticBox Office, InternationalBox Office, WorldwideBox Office
create_csv("Boxoffice_Metadata1.csv", boxOffice_result, boxoffice_csv_header)

In [8]:
# Scrape the movie-budget listing using the same pagination and table helpers.
Budget_first_link = "https://www.the-numbers.com/movie/budgets/all"
# Passed to get_page_urls_array as `maxpage`, the upper bound for generated page suffixes.
total_pages_BG = 6065
all_Budget_links = get_page_urls_array(Budget_first_link,total_pages_BG)

# all_Budget_links now holds one URL per page of budget results.

# Column count for the scraper: with 6, only the first six table columns are extracted.

column_num = 6
# Report how many pages will be fetched, then gather the rows from every page.
print(len(all_Budget_links))
budget_result = get_BoxOffice_ranking_parallel(all_Budget_links, column_num)

# Total number of rows collected.
print(len(budget_result))

61
parallel url: https://www.the-numbers.com/movie/budgets/allparallel url: https://www.the-numbers.com/movie/budgets/all/101

parallel url: https://www.the-numbers.com/movie/budgets/all/201
parallel url: https://www.the-numbers.com/movie/budgets/all/301
parallel url: https://www.the-numbers.com/movie/budgets/all/401parallel url: https://www.the-numbers.com/movie/budgets/all/501

parallel url: https://www.the-numbers.com/movie/budgets/all/601
parallel url: https://www.the-numbers.com/movie/budgets/all/701parallel url: https://www.the-numbers.com/movie/budgets/all/801

parallel url: https://www.the-numbers.com/movie/budgets/all/901
parallel url: https://www.the-numbers.com/movie/budgets/all/1001
parallel url: https://www.the-numbers.com/movie/budgets/all/1101
parallel url: https://www.the-numbers.com/movie/budgets/all/1201
parallel url: https://www.the-numbers.com/movie/budgets/all/1301
parallel url: https://www.the-numbers.com/movie/budgets/all/1401parallel url: https://www.the-numbers

In [9]:
# Derive the CSV column names from the keys of the scraped budget rows.
budget_csv_header=get_csv_header(budget_result)

# Save the budget rows to a CSV file; the columns are whatever keys the scraped rows carry.
create_csv("Budget_Metadata1.csv", budget_result, budget_csv_header)

In [None]:
def get_genres_box_links(boxOffice_result):
    """Turn the relative movie links of scraped rows into absolute URLs.

    Args:
        boxOffice_result: Iterable of row dictionaries, each holding the
            site-relative movie path under the key ``"Movie_href"``.

    Returns:
        list[str]: ``"https://www.the-numbers.com"`` + path for every row, in
        input order. Rows whose path is empty or ``None`` are skipped, because
        they would otherwise produce a request for the bare site root.

    Raises:
        KeyError: if a row has no ``"Movie_href"`` key.
    """
    genres_links = []
    for infor in boxOffice_result:
        href = infor["Movie_href"]
        if not href:
            continue
        genres_links.append("https://www.the-numbers.com" + href)
    return genres_links

def get_Genres_time_language(genres_links, max_workers=500):
    """Download every movie page concurrently and collect its summary-table fields.

    Each URL is fetched with ``get_html`` on a thread pool. On every page, each
    table matched by ``#summary table`` that has more than 10 rows yields one
    dictionary: every two-cell row contributes ``first cell text -> second
    cell text``, and the key ``"href"`` holds the URL with the site prefix
    ``https://www.the-numbers.com`` removed. A page that fails to download or
    parse is reported on stdout. The number of collected dictionaries is
    printed before returning.

    Args:
        genres_links: Iterable of absolute movie-page URLs.
        max_workers: Upper bound on concurrent download threads.

    Returns:
        list[dict]: the collected dictionaries, ordered by the position of
        their page in ``genres_links`` regardless of download completion order.
    """
    site_root = "https://www.the-numbers.com"
    genres_links = list(genres_links)
    records_per_link = [[] for _ in genres_links]
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_link = {
            executor.submit(get_html, url): (index, url)
            for index, url in enumerate(genres_links)
        }
        for future in concurrent.futures.as_completed(future_to_link):
            index, url = future_to_link[future]
            page_records = records_per_link[index]
            href = url.replace(site_root, "")
            try:
                raw_html = future.result()
                parsed_html = BeautifulSoup(raw_html, "html.parser")
                for table in parsed_html.select('#summary table'):
                    trs = table.select('tr')
                    # Only tables with more than 10 rows are kept; presumably
                    # this singles out the movie-details table (TODO confirm).
                    if len(trs) > 10:
                        result = {}
                        for tr in trs:
                            tds = tr.find_all("td")
                            if len(tds) == 2:
                                result["href"] = href
                                result[tds[0].get_text()] = tds[1].get_text()
                        page_records.append(result)
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
    # Flatten in input order so repeated runs yield the same sequence.
    results = [record for records in records_per_link for record in records]
    print(len(results))
    return results


# Convert the relative movie links of the box-office rows into absolute URLs.
genres_links = get_genres_box_links(boxOffice_result)

# Fetch every movie page and collect the fields of its summary table.
genres_time_lanugage = get_Genres_time_language(genres_links)

# Print the complete list of collected records.
print(genres_time_lanugage)

parallel url: https://www.the-numbers.com/movie/Lethal-Weapon-3#tab=summary
parallel url: https://www.the-numbers.com/movie/Fast-and-the-Furious-The#tab=summary
parallel url: https://www.the-numbers.com/movie/Doctor-Dolittle-(1998)#tab=summary
parallel url: https://www.the-numbers.com/movie/Mamma-Mia#tab=summary
parallel url: https://www.the-numbers.com/movie/Pokemon-Detective-Pikachu-(2019)#tab=summary
parallel url: https://www.the-numbers.com/movie/Rio#tab=summaryparallel url: https://www.the-numbers.com/movie/Kung-Fu-Panda-3#tab=summary

parallel url: https://www.the-numbers.com/movie/Juno#tab=summaryparallel url: https://www.the-numbers.com/movie/Marley-and-Me#tab=summary

parallel url: https://www.the-numbers.com/movie/Smurfs-The#tab=summary
parallel url: https://www.the-numbers.com/movie/Once-Upon-a-Time-in-Hollywood-(2019)#tab=summaryparallel url: https://www.the-numbers.com/movie/xXx#tab=summary

parallel url: https://www.the-numbers.com/movie/Jungle-Book-The-(1967)#tab=summary