In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import requests
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Plot styling for the whole notebook: seaborn's "notebook" context with the
# whitegrid style, and fonts scaled by a factor of 1.2.
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

### Fetching one movie page and parsing it into a soup

In [13]:
# Box Office Mojo title page used as the worked example in this notebook.
url = 'https://www.boxofficemojo.com/title/tt0499549/?ref_=bo_cso_table_1'

# The timeout keeps this cell from hanging forever on a stalled connection.
# raise_for_status() stops the notebook here on a 4xx/5xx reply, so later
# cells never try to parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [14]:
# Show the HTTP status code of the fetch (200 means the request succeeded).
response.status_code

200

In [15]:
# Body of the HTTP response decoded to a string (the page's HTML).
page = response.text

In [16]:
# Parse the HTML into a searchable tree using the lxml parser backend.
soup = BeautifulSoup(page, "lxml")

### Helper function for extracting movie values

In [60]:
def get_movie_value(soup, field_name):
    '''Grab a value from Box Office Mojo HTML.

    Finds the first text node in ``soup`` that matches ``field_name`` and
    returns the text of the element that follows it in the document, which
    for the fields used in this notebook is the value shown for that label.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to search.
    field_name : str
        Label to look for. It is compiled as a regular expression, so a
        partial label (e.g. 'Run') matches any text containing it, and regex
        metacharacters in the label are interpreted as such.

    Returns
    -------
    str or None
        Text of the element following the matched label, or None when no
        text matches or nothing follows the match.
    '''
    # `string=` is the current name of the keyword formerly spelled `text=`.
    label = soup.find(string=re.compile(field_name))
    if not label:
        return None

    # find_next() (the non-deprecated spelling of findNext) returns the next
    # element in document order; this works for most of the values.
    value_element = label.find_next()
    return value_element.text if value_element else None

### Helper functions for cleaning the scraped values

In [22]:
import dateutil.parser

def money_to_int(moneystring):
    '''Convert a dollar amount such as '$760,507,625' to an int.

    Parameters
    ----------
    moneystring : str or None
        Amount as displayed on the page; '$' signs and thousands commas are
        removed before conversion.

    Returns
    -------
    int or None
        The amount as an integer, or None when ``moneystring`` is None or
        blank (i.e. the field was not found on the page).

    Raises
    ------
    ValueError
        If the remaining text is not a valid integer.
    '''
    # get_movie_value() returns None for a missing field; pass that through
    # rather than failing with AttributeError on None.replace.
    if moneystring is None:
        return None

    digits = moneystring.replace('$', '').replace(',', '').strip()
    if not digits:
        return None
    return int(digits)

def runtime_to_minutes(runtimestring):
    '''Convert a runtime such as '2 hr 42 min' to a total number of minutes.

    Each "<number> <unit>" pair is read on its own, so hour-only ('2 hr') and
    minute-only ('45 min') runtimes are handled as well as the combined form.
    A unit starting with 'h' counts as hours and one starting with 'm' as
    minutes (case-insensitive).

    Parameters
    ----------
    runtimestring : str or None
        Runtime text as displayed on the page.

    Returns
    -------
    int or None
        Total minutes, or None when the input is not a string, contains no
        "<number> <unit>" pair, or uses a unit that is not hours/minutes.
    '''
    if not isinstance(runtimestring, str):
        return None

    total_minutes = 0
    matched_any = False
    for amount, unit in re.findall(r'(\d+)\s*([A-Za-z]+)', runtimestring):
        unit = unit.lower()
        if unit.startswith('h'):
            total_minutes += int(amount) * 60
        elif unit.startswith('m'):
            total_minutes += int(amount)
        else:
            # Unknown unit: refuse to guess rather than return a wrong total.
            return None
        matched_any = True

    return total_minutes if matched_any else None

def to_date(datestring):
    '''Parse a date string with dateutil's parser and return the result.'''
    return dateutil.parser.parse(datestring)

## FIRST TABLE

### Getting title

In [25]:
# The page <title> holds the movie title followed by a " - <site name>" suffix
# (this page yields 'Avatar'). Cut only at the LAST ' - ' separator so that
# hyphens inside a movie title (e.g. "Spider-Man") are not mistaken for it.
title_string = soup.find('title').text
title = title_string.rsplit(' - ', 1)[0].strip()

In [27]:
# Display the parsed title to check the suffix was removed.
title

'Avatar'

### Getting domestic gross

In [31]:
# Money amounts in the performance summary are <span class="money"> elements;
# the first one (index 0) is used as the domestic total.
summary_table = soup.find(class_='mojo-performance-summary-table')
money_spans = summary_table.find_all('span', class_='money')
raw_domestic_total_gross = money_spans[0].text
domestic_total_gross = money_to_int(raw_domestic_total_gross)

In [32]:
# Display the domestic total as an integer number of dollars.
domestic_total_gross

760507625

### Getting International gross

In [33]:
# Money amounts in the performance summary are <span class="money"> elements;
# the second one (index 1) is used as the international total.
summary_table = soup.find(class_='mojo-performance-summary-table')
money_spans = summary_table.find_all('span', class_='money')
raw_international_total_gross = money_spans[1].text
international_total_gross = money_to_int(raw_international_total_gross)

In [34]:
# Display the international total as an integer number of dollars.
international_total_gross

2086738578

### Total worldwide gross

In [266]:
# Worldwide total is derived here as domestic + international rather than
# being scraped from the page.
worldwide_total_gross = domestic_total_gross + international_total_gross

### Getting domestic distributor

In [77]:
# The value element's text also contains the "See full company information"
# link text and surrounding whitespace/newlines; remove the link text and
# strip the rest so only the distributor name is stored.
domestic_distributor = (get_movie_value(soup, 'Domestic Distributor')
                            .replace('See full company information', '')
                            .strip()
                       )
print(domestic_distributor)

Twentieth Century Fox




### Getting domestic opening

In [78]:
# Keep the scraped text under a raw_* name (as the gross cells do) so that
# `domestic_opening` only ever holds the cleaned integer.
raw_domestic_opening = get_movie_value(soup, 'Domestic Opening')

# get_movie_value returns None when the field is absent from the page; carry
# that through as None instead of failing inside money_to_int.
domestic_opening = (money_to_int(raw_domestic_opening)
                    if raw_domestic_opening is not None else None)
domestic_opening

77025481

### Getting budget

In [84]:
# Keep the scraped text under a raw_* name (as the gross cells do) so that
# `budget` only ever holds the cleaned integer.
raw_budget = get_movie_value(soup, 'Budget')

# get_movie_value returns None when the page lists no budget; carry that
# through as None instead of failing inside money_to_int.
budget = money_to_int(raw_budget) if raw_budget is not None else None
budget

237000000

### Getting release date

In [231]:
# The value element's text continues after a newline; keep only the part
# before the first newline, which is the date itself.
release_date = get_movie_value(soup, 'Release Date').partition('\n')[0]
print(release_date)

December 16, 2009


### Getting MPAA rating

In [85]:
# Look up the text containing 'MPAA'; the element after it holds the rating.
rating = get_movie_value(soup,'MPAA')
print(rating)

PG-13


### Getting run time

In [87]:
# Search for the full "Running Time" label. The bare pattern 'Run' matches the
# first text node containing "Run" anywhere on the page, so a movie title
# such as "Blade Runner" would be picked up instead of the runtime row.
raw_runtime = get_movie_value(soup, 'Running Time')

# get_movie_value returns None when the field is absent; carry that through
# as None rather than handing None to the parser.
runtime = runtime_to_minutes(raw_runtime) if raw_runtime is not None else None
print(runtime)

162


### Getting Genres

In [271]:
# The genre names are separated by whitespace/newlines in the scraped text,
# so split on whitespace directly. Stripping the whitespace and re-splitting
# on capital letters breaks any hyphenated genre other than Sci-Fi
# (e.g. 'Film-Noir' would become 'Film-' and 'Noir').
raw_genres = get_movie_value(soup, 'Genres')
genres = ['Science fiction' if genre == 'Sci-Fi' else genre
          for genre in raw_genres.split()]
print(genres)

['Action', 'Adventure', 'Fantasy', 'Science fiction']


## SECOND TABLE

### Getting Original Release row

In [272]:
# soup.find('h3') gives None on a page without any <h3>, whereas
# find_all('h3')[0] raised IndexError there and never reached the fallback.
first_heading = soup.find('h3')

if first_heading is not None and first_heading.text == 'By Release':
    # Sixth <td> (index 5) of the first table on the page; used here as the
    # worldwide gross of the original release row.
    original_release_worldwide = soup.find('table').find_all('td')[5].text
    original_release_worldwide = money_to_int(original_release_worldwide)
else:
    # No "By Release" section: fall back to the computed worldwide total.
    original_release_worldwide = worldwide_total_gross

In [268]:
# Show the worldwide gross attributed to the original release.
print(original_release_worldwide)

2743577587
