# Movie Budgets Data Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
from helper import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Let's start our scraping by taking a look at the first page:

In [2]:
page = requests.get('http://www.the-numbers.com/movie/budgets/all/1')

In [3]:
page.content[0:10000]

b'<!DOCTYPE html>\n<html>\n<head>\n<meta http-equiv="PICS-Label" content=\'(PICS-1.1 "http://www.icra.org/ratingsv02.html" l gen true for "http://www.the-numbers.com/" r (cb 1 lz 1 nz 1 oz 1 vz 1) "http://www.rsac.org/ratingsv01.html" l gen true for "http://www.the-numbers.com/" r (n 0 s 0 v 0 l 0))\'>\n<!--<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" >-->\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta name="format-detection" content="telephone=no">   <!-- for apple mobile --> \n<meta property="fb:admins" content="521546213" />\n\n\n<meta name="viewport" content="initial-scale=1">\n<meta name="description" content="The budget of (nearly) every movie ever made">\n<meta name="robots" content="NOODP">\n<meta name="keywords" content="movies, box office, The Numbers, Numbers, daily box office, weekly box office, movie stars, dvd sales, Blu-ray sales, release schedule">\n<title>The Numbers - Movie Budgets</title>\n<link rel="stylesheet"

In the table section there is a problem with some tags that aren't closed properly; we replace them in order to have a correct format:

In [4]:
html = page.text.replace('<tr>\n<tr>', '</tr>\n<tr>')

In [5]:
soup = BeautifulSoup(html, 'html.parser')

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta content='(PICS-1.1 "http://www.icra.org/ratingsv02.html" l gen true for "http://www.the-numbers.com/" r (cb 1 lz 1 nz 1 oz 1 vz 1) "http://www.rsac.org/ratingsv01.html" l gen true for "http://www.the-numbers.com/" r (n 0 s 0 v 0 l 0))' http-equiv="PICS-Label">
   <!--<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" >-->
   <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
    <meta content="telephone=no" name="format-detection">
     <!-- for apple mobile -->
     <meta content="521546213" property="fb:admins"/>
     <meta content="initial-scale=1" name="viewport">
      <meta content="The budget of (nearly) every movie ever made" name="description">
       <meta content="NOODP" name="robots">
        <meta content="movies, box office, The Numbers, Numbers, daily box office, weekly box office, movie stars, dvd sales, Blu-ray sales, release schedule" name="keywords">
         <title>
          The Numbers - 

Now it looks good, we can read the table in the page:

In [7]:
table = soup.find('table')

and the rows and columns.

In [8]:
data = []

for i, row in enumerate(table.find_all('tr')[1:6]):
    data_row = []
    for j, col in enumerate(row.find_all('td')):
        data_row.append(col.text)
    data.append(data_row)

data

[['1',
  '12/18/2009',
  'Avatar',
  '$425,000,000',
  '$760,507,625',
  '$2,783,918,982'],
 ['2',
  '12/18/2015',
  'Star Wars Ep. VII: The Force Awakens',
  '$306,000,000',
  '$936,662,225',
  '$2,058,662,225'],
 ['3',
  '5/24/2007',
  'Pirates of the Caribbean: At Worldâ\x80\x99s End',
  '$300,000,000',
  '$309,420,425',
  '$963,420,425'],
 ['4', '11/6/2015', 'Spectre', '$300,000,000', '$200,074,175', '$879,620,923'],
 ['5',
  '7/20/2012',
  'The Dark Knight Rises',
  '$275,000,000',
  '$448,139,099',
  '$1,084,439,099']]

We now define a function and use it to scrape all pages:

In [9]:
def scrape_movie_budgets(url, page_number):
    page = requests.get(url + str(page_number))
    html = page.text.replace('<tr>\n<tr>', '</tr>\n<tr>')
    
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    
    data = []
    for i, row in enumerate(table.find_all('tr')):
        data_row = []
        for j, col in enumerate(row.find_all('td')):
            # print(i, j, col.text)
            data_row.append(col.text)
        if data_row != []:
            data.append(data_row)

    return data

In [10]:
all_data = []
for i in range(1, 5501, 100):
    data = scrape_movie_budgets('http://www.the-numbers.com/movie/budgets/all/', i)
    all_data = all_data + data
    print('{} movies scraped'.format(i+100-1))

100 movies scraped
200 movies scraped
300 movies scraped
400 movies scraped
500 movies scraped
600 movies scraped
700 movies scraped
800 movies scraped
900 movies scraped
1000 movies scraped
1100 movies scraped
1200 movies scraped
1300 movies scraped
1400 movies scraped
1500 movies scraped
1600 movies scraped
1700 movies scraped
1800 movies scraped
1900 movies scraped
2000 movies scraped
2100 movies scraped
2200 movies scraped
2300 movies scraped
2400 movies scraped
2500 movies scraped
2600 movies scraped
2700 movies scraped
2800 movies scraped
2900 movies scraped
3000 movies scraped
3100 movies scraped
3200 movies scraped
3300 movies scraped
3400 movies scraped
3500 movies scraped
3600 movies scraped
3700 movies scraped
3800 movies scraped
3900 movies scraped
4000 movies scraped
4100 movies scraped
4200 movies scraped
4300 movies scraped
4400 movies scraped
4500 movies scraped
4600 movies scraped
4700 movies scraped
4800 movies scraped
4900 movies scraped
5000 movies scraped
5100 movi

Finally, we create a dataframe, clean it a little bit and save it as a pickle.

In [24]:
budgets = pd.DataFrame(all_data, columns=['id', 'release_date', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross'])

In [25]:
budgets.drop('id', axis=1, inplace=True)

In [26]:
budgets.release_date = pd.to_datetime(budgets.release_date)

In [27]:
budgets.production_budget = budgets.production_budget.str.replace(',', '').str.replace('$', '').astype('float')
budgets.domestic_gross = budgets.domestic_gross.str.replace(',', '').str.replace('$', '').astype('float')
budgets.worldwide_gross = budgets.worldwide_gross.str.replace(',', '').str.replace('$', '').astype('float')

We have some odd characters instead of apostrophes, let's correct them:

In [28]:
budgets[budgets.movie.str.contains('â')]

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
2,2007-05-24,Pirates of the Caribbean: At Worldâs End,300000000.0,309420425.0,963420400.0
22,2006-07-07,Pirates of the Caribbean: Dead Manâs Chest,225000000.0,423315812.0,1066216000.0
232,2001-11-16,Harry Potter and the Sorcererâs Stone,125000000.0,317575550.0,974755400.0
249,2016-12-21,Assassinâs Creed,125000000.0,54647948.0,240497900.0
277,2016-04-22,The Huntsman: Winterâs War,115000000.0,48003015.0,165149300.0
295,2016-09-30,Miss Peregrineâs Home for Peculiar Children,110000000.0,87242834.0,296642800.0
452,2009-12-25,Itâs Complicated,85000000.0,112735375.0,224614700.0
683,2014-05-09,Legends of Oz: Dorothyâs Return,70000000.0,8462347.0,20107930.0
725,2016-08-12,Peteâs Dragon,65000000.0,76233151.0,137769000.0
1007,2015-12-25,Daddyâs Home,50000000.0,150357137.0,238757100.0


In [29]:
budgets.movie = budgets.movie.str.replace('â', '\'')

In [30]:
budgets.to_pickle('pickle/budgets_scraped.p')