In [8]:
#Import necessary things
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from bs4 import BeautifulSoup

In [20]:
#Request data from Box Office Mojo and save the files
import os
import requests

#The all-time domestic chart is spread over pages 1..144
FIRST_PAGE = 1
LAST_PAGE = 144

#open() does not create missing directories, so make sure html/ exists first
if not os.path.isdir('html'):
    os.makedirs('html')

for i in range(FIRST_PAGE, LAST_PAGE + 1):
    url = "http://www.boxofficemojo.com/alltime/domestic.htm?page=" + str(i) + "&p=.htm"
    #A timeout keeps one stalled connection from hanging the whole download loop
    response = requests.get(url, timeout=30)
    #Stop on HTTP errors instead of saving an error page as if it were chart data
    response.raise_for_status()
    filename = "html/page" + str(i) + ".html"

    #Now save that file
    #Binary mode because the payload has already been encoded to UTF-8 bytes
    with open(filename, 'wb') as f:
        f.write(response.text.encode('utf-8'))

In [55]:
#Loop through all 144 pages of movies and create two lists
#One list captures number, title, studio, gross, and year (one dict per movie)
#The other captures the URL of each movie, in the same order

import os
mojo_info = []
all_urls = []

#Column names, in the order the <td> cells appear in each table row
header = ['number', 'title', 'studio', 'gross', 'year']

#NOTE(review): os.listdir gives no ordering guarantee, so the order of
#mojo_info / all_urls follows whatever the filesystem returns. Any code that
#addresses movies by list position depends on this order staying the same.
for page_name in os.listdir('html/'):

    #Ignore stray entries (editor/OS metadata, checkpoints) that are not saved pages
    if not page_name.endswith('.html'):
        continue

    #Use a context manager so the file handle is closed once the page is parsed
    with open(os.path.join('./html/', page_name)) as page_file:
        soup = BeautifulSoup(page_file.read(), 'lxml')
    table = soup.find(lambda tag: tag.name=='table' and tag.has_attr('cellpadding') and tag['cellpadding']=="5")

    #Fail with the file name instead of an opaque AttributeError on None
    if table is None:
        raise ValueError("No table with cellpadding=5 found in " + page_name)

    movie_info = []
    url = []

    for row in table.findAll('tr'):
        #The first link in the row is taken as the movie's URL
        url.append(row.find('a')['href'])
        row_dict = {}
        for i, cell in enumerate(row.findAll('td')):
            #Each value is the list of text nodes found inside the cell
            row_dict[header[i]] = cell.findAll(text = True)
        movie_info.append(row_dict)

    #Strip out the first dictionary in movie_info and url because they just contain the table headers
    url = url[1:]
    movie_info = movie_info[1:]

    mojo_info += movie_info
    all_urls += url

In [64]:
#Store each movie's link in its dictionary under the 'url' key
#(mojo_info and all_urls are parallel lists, matched by position)
for position, movie in enumerate(mojo_info):
    movie['url'] = all_urls[position]

In [66]:
#Sanity check: show the first five movie dictionaries
print(mojo_info[:5])

[{'gross': [u'$936,662,225'], 'title': [u'Star Wars: The Force Awakens'], 'url': '/movies/?id=starwars7.htm', 'number': [u'1'], 'studio': [u'BV'], 'year': [u'2015']}, {'gross': [u'$760,507,625'], 'title': [u'Avatar'], 'url': '/movies/?page=releases&id=avatar.htm', 'number': [u'2'], 'studio': [u'Fox'], 'year': [u'2009^']}, {'gross': [u'$658,672,302'], 'title': [u'Titanic'], 'url': '/movies/?page=releases&id=titanic.htm', 'number': [u'3'], 'studio': [u'Par.'], 'year': [u'1997^']}, {'gross': [u'$652,270,625'], 'title': [u'Jurassic World'], 'url': '/movies/?id=jurassicpark4.htm', 'number': [u'4'], 'studio': [u'Uni.'], 'year': [u'2015']}, {'gross': [u'$623,357,910'], 'title': [u"Marvel's The Avengers"], 'url': '/movies/?id=avengers11.htm', 'number': [u'5'], 'studio': [u'BV'], 'year': [u'2012']}]


In [70]:
#Spot check: the 'url' value stored on the first movie dictionary
mojo_info[0]['url']

'/movies/?id=starwars7.htm'

In [73]:
#Split the movie dictionaries into one list per field (everything except 'number').
#The scraped fields are lists of text nodes, so keep only the first node of each;
#'url' is already a plain string.
title = [movie['title'][0] for movie in mojo_info]
studio = [movie['studio'][0] for movie in mojo_info]
gross = [movie['gross'][0] for movie in mojo_info]
year = [movie['year'][0] for movie in mojo_info]
url = [movie['url'] for movie in mojo_info]

In [95]:
# Now use regex to get years correct

#But first, found some movies missing the year, looked up on IMDB, and was able to replace
#Where I replaced year with 0000, it wasn't actually a movie, and I will want to go through and remove from the dataframe
#NOTE(review): these fixes are keyed by list position, so they are only valid
#while the movies are loaded in the same order as when the positions were found.
year_fixes = {
    9738: '1988',
    10150: '1984',
    11879: '1996',
    11889: '0000',
    12019: '1999',
    12064: '0000',
    14353: '1973',
}
for position, fixed_year in year_fixes.items():
    year[position] = fixed_year

import re
#Compile once; the same pattern is applied to every entry
four_digits = re.compile(r'\d\d\d\d')

#Keep only the first run of four digits (drops trailing markers such as '^')
year_regex = []
for position, raw_year in enumerate(year):
    match = four_digits.search(raw_year)
    if match is None:
        #Name the offending entry instead of failing with an AttributeError on None
        raise ValueError("No four-digit year in %r at position %d" % (raw_year, position))
    year_regex.append(match.group(0))

year = year_regex

14385
[u'2015', u'2009', u'1997', u'2015', u'2012', u'2008', u'2016', u'1999', u'1977', u'2015', u'2012', u'2004', u'1982', u'2013', u'2006', u'1994', u'2010', u'2013', u'2016', u'2012']


In [98]:
# Create a dataframe from these lists
#Build the frame in one call; `columns` pins the column order explicitly
mojo_df = pd.DataFrame(
    {'title': title, 'year': year, 'url': url, 'studio': studio, 'gross': gross},
    columns=['title', 'year', 'url', 'studio', 'gross']
)

#info() prints its summary itself and returns None, so wrapping it in print
#would add a stray "None" line to the output
mojo_df.info()
mojo_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14385 entries, 0 to 14384
Data columns (total 5 columns):
title     14385 non-null object
year      14385 non-null object
url       14385 non-null object
studio    14385 non-null object
gross     14385 non-null object
dtypes: object(5)
memory usage: 562.0+ KB
None


Unnamed: 0,title,year,url,studio,gross
0,Star Wars: The Force Awakens,2015,/movies/?id=starwars7.htm,BV,"$936,662,225"
1,Avatar,2009,/movies/?page=releases&id=avatar.htm,Fox,"$760,507,625"
2,Titanic,1997,/movies/?page=releases&id=titanic.htm,Par.,"$658,672,302"
3,Jurassic World,2015,/movies/?id=jurassicpark4.htm,Uni.,"$652,270,625"
4,Marvel's The Avengers,2012,/movies/?id=avengers11.htm,BV,"$623,357,910"
5,The Dark Knight,2008,/movies/?page=releases&id=darkknight.htm,WB,"$534,858,444"
6,Finding Dory,2016,/movies/?id=pixar2015.htm,BV,"$484,234,949"
7,Star Wars: Episode I - The Phantom Menace,1999,/movies/?page=releases&id=starwars.htm,Fox,"$474,544,677"
8,Star Wars,1977,/movies/?page=releases&id=starwars4.htm,Fox,"$460,998,007"
9,Avengers: Age of Ultron,2015,/movies/?id=avengers2.htm,BV,"$459,005,868"


In [100]:
#Save my results to a CSV file locally
#NOTE(review): no `index=` argument is passed, so pandas' default applies and the
#row index is presumably written as an unnamed first column - confirm that is wanted.
mojo_df.to_csv('mojo_list.csv', encoding='utf-8')