In [1]:
# Dependencies
import pandas as pd

In [2]:
# Wikipedia page listing the highest-grossing film openings (scraped below)
wiki_url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_highest-grossing_openings_for_films'
)

In [3]:
# Parse every HTML <table> found at the URL; read_html returns a list of DataFrames
tables = pd.read_html(io=wiki_url)

In [4]:
# Worldwide record holders: take the first table on the page and keep
# only the rank, film and opening-gross columns
wiki_records = tables[0][
    ['Rank', 'Film', 'Opening (USD)[1]']
]

In [5]:
# Clean the dataframe: snake_case the column names and index the rows by film title
wiki_records = (
    wiki_records
    .rename(columns={
        'Rank': 'rank',
        'Film': 'title',
        # NOTE(review): no 'Year' column is selected in the previous cell, so this
        # entry matches nothing (rename skips missing labels by default) - confirm
        # whether the release year was meant to be kept.
        'Year': 'release_year',
        'Opening (USD)[1]': 'gross_opening_usd',
    })
    .set_index('title', drop=True)
)
wiki_records.head()

Unnamed: 0_level_0,rank,gross_opening_usd
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avengers: Endgame,1,"$1,223,641,414"
Avengers: Infinity War,2,"$640,521,291"
The Fate of the Furious,3,"$541,937,239"
Star Wars: The Force Awakens,4,"$528,966,675"
Jurassic World,5,"$524,909,010"


In [6]:
# Write the cleaned Wikipedia records to disk; the title index is kept as the first column
wiki_records.to_csv(path_or_buf='highest_grossing_films_wiki.csv', index=True)

Wikipedia only lists the top 50 highest-grossing openings, so I decided to scrape a second site for more data.

In [7]:
# Scrape the all-time worldwide grosses page from Box Office Mojo;
# read_html returns one DataFrame per HTML table on the page
box_office_mojo_url = (
    'https://www.boxofficemojo.com/alltime/world/'
    '?pagenum=1&sort=rank&order=ASC&p=.htm'
)
box_office_mojo = pd.read_html(io=box_office_mojo_url)

In [8]:
# The second parsed table (index 1) is the one used as the rankings; preview its first rows
top_100_grossing = box_office_mojo[1]
top_100_grossing.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Rank,Title,Studio,Worldwide,Domestic / %,Domestic / %,Overseas / %,Overseas / %,Year^
1,1,Avengers: Endgame,BV,"$2,796.3",$858.4,30.7%,"$1,937.9",69.3%,2019
2,2,Avatar,Fox,"$2,789.7",$760.5,27.3%,"$2,029.2",72.7%,2009^
3,3,Titanic,Par.,"$2,187.5",$659.4,30.1%,"$1,528.1",69.9%,1997^
4,4,Star Wars: The Force Awakens,BV,"$2,068.2",$936.7,45.3%,"$1,131.6",54.7%,2015


## Notes on the column headers:
* Rank = movies are ranked by worldwide gross revenue
* Grossing amounts are in millions of dollars
* '^' in the Year column indicates that the movie earned its gross over multiple releases

In [9]:
# Clean the scraped table: the real column names arrive as the first data row,
# so promote that row to the header and index the remaining rows by movie title.
header = top_100_grossing.iloc[0]        # row 0 holds 'Rank', 'Title', 'Studio', ...
top_100_df = top_100_grossing.iloc[1:]   # everything after the header row
# NOTE: the scraped header repeats 'Domestic / %' and 'Overseas / %', so those
# labels appear twice in the resulting columns.
top_100_df.columns = header
top_100_df = top_100_df.set_index(keys='Title', drop=True)
top_100_df.head()

Unnamed: 0_level_0,Rank,Studio,Worldwide,Domestic / %,Domestic / %,Overseas / %,Overseas / %,Year^
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Avengers: Endgame,1,BV,"$2,796.3",$858.4,30.7%,"$1,937.9",69.3%,2019
Avatar,2,Fox,"$2,789.7",$760.5,27.3%,"$2,029.2",72.7%,2009^
Titanic,3,Par.,"$2,187.5",$659.4,30.1%,"$1,528.1",69.9%,1997^
Star Wars: The Force Awakens,4,BV,"$2,068.2",$936.7,45.3%,"$1,131.6",54.7%,2015
Avengers: Infinity War,5,BV,"$2,048.4",$678.8,33.1%,"$1,369.5",66.9%,2018


In [10]:
# Write the Box Office Mojo rankings to disk; the Title index is kept as the first column
top_100_df.to_csv(path_or_buf='highest_grossing_mojo.csv', index=True)