In [None]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

Our standard imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## First, an HTML refresher
HTML is the basic language used to create a web page. 

It tells the web browser what text/media to display, where to display it, and how to display it (style).

HTML is very structured/hierarchical. 

Every page is made up of discrete "elements."

Elements are labeled with "tags."

For example:

    <p>You are beginning to learn HTML.</p>

A start tag also often contains "attributes" with info about the element.

Attributes usually have a name and value.

Example:

    <p class="my_red_sentences">You are beginning to learn HTML.</p>

A full HTML document has a structure more like this:

```
<html> 
  <head> </head>
  <body>
     <p class="red">You are beginning to learn HTML.</p>
     <h1> This is a header </h1>
     <a href="https://www.google.com"> Some link </a>
  </body>
</html>
```

Let's explore some live HTML!

Go to http://boxofficemojo.com/movies/?id=biglebowski.htm in your browser,
right-click the page and choose Inspect Element, and also try View Page Source.




In [None]:
#conda install beautifulsoup4

from bs4 import BeautifulSoup



### Get the HTML from a page and convert to a BeautifulSoup object

We'll start by scraping some of that information about [The Big Lebowski](http://boxofficemojo.com/movies/?id=biglebowski.htm).

In [None]:
# if needed: pip install requests
import requests

url = 'http://boxofficemojo.com/movies/?id=biglebowski.htm'

# requests applies no timeout by default; without one a stalled server
# would hang this cell indefinitely.
response = requests.get(url, timeout=30)

For information on HTTP status codes, see:

https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

In [None]:
response.status_code

In [None]:
print(response.text)

In [None]:
page = response.text
# Name the parser explicitly: otherwise bs4 warns and falls back to whichever
# parser happens to be installed, so the tree can differ between machines.
soup = BeautifulSoup(page, 'html.parser')
print(soup)

In [None]:
print(soup.prettify())

## `soup.find()`

`soup.find()` is the most common function we will use from this package.  

Let's try out some common variations of `soup.find()`

In [None]:
# soup.find() returns the first matched tag it finds.
# It searches the entire tree.

# Search for a type of tag by using the tag as a string
# (like 'body','div','p','a') as an argument.

print(soup.find('a'))

In [None]:
# Equivalently:
print(soup.a)

In [None]:
# soup.find_all() returns a list of all matches

for link in soup.find_all('a'): 
    print(link)

In [None]:
# retrieve the url from an anchor tag
soup.find('a')['href']

In [None]:
# You can match on an attribute like an id or class.
# Take a look at what the 'mp_box_content' classes
# look like on the webpage, with Inspect Element.

for element in soup.find_all(class_='mp_box_content'):
    print(element, '\n')

In [None]:
# We can find all the columns in the first mp_box_content table
# by "chaining" `find` and `find_all`.

print(soup.find(class_='mp_box_content').find_all('td'))

In [None]:
# To extract just the value of interest:

soup.find(class_='mp_box_content').find_all('td')[1].text

In [None]:
# find with an "id". (ID is unique.)

print(soup.find(id='hp_footer'))

### Consistency

Web scraping is made simple by the consistent format of information among like pages of a website.

### Items to scrape for each movie:
- movie title
- total domestic gross
- release date
- runtime
- rating

In [None]:
# Movie Title

print(soup.find('title'))

In [None]:
title_string = soup.find('title').text
print(title_string)

In [None]:
print(title_string.split('('))

In [None]:
title = title_string.split('(')[0].strip()
print(title)

In [None]:
# Domestic Total Gross

## text does an exact match search!
print(soup.find(text="Domestic Total Gross"))

In [None]:
# You could find a perfect match:

print(soup.find(text="Domestic Total Gross: "))

#### You could also use regular expressions
![regular expressions](http://imgs.xkcd.com/comics/regular_expressions.png)

[Handy Tool for making RegEx](http://pythex.org/)

In [None]:
import re
domestic_total_regex = re.compile('Domestic Total')
soup.find(text=domestic_total_regex)

In [None]:
dtg_string = soup.find(text=re.compile('Domestic Total'))
print(dtg_string)

In [None]:
print(dtg_string.findNextSibling())

In [None]:
# Take the text of the tag that follows the label, drop the currency
# formatting, and convert what is left to an integer.
raw_gross = dtg_string.findNextSibling().text
dtg = raw_gross
for symbol in ('$', ','):
    dtg = dtg.replace(symbol, '')
domestic_total_gross = int(dtg)
print(domestic_total_gross)

### We can actually do several of these using the text matching method, so let's make a function for that

In [None]:
def get_movie_value(soup, field_name):
    '''Look up one labelled value in a boxofficemojo movie page.

    Finds the first text node matching ``field_name`` (compiled as a
    regular expression) and returns the text of the tag that follows it
    as its next sibling.

    Returns None when either the label or its next sibling is missing.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    # For most fields the value sits in the tag right after the label.
    value_tag = label.find_next_sibling()
    return value_tag.text if value_tag else None

In [None]:
# domestic total gross
dtg = get_movie_value(soup,'Domestic Total')
print(dtg)

In [None]:
# runtime
runtime = get_movie_value(soup,'Runtime')
print(runtime)

In [None]:
# rating
rating = get_movie_value(soup,'MPAA Rating')
print(rating)

In [None]:
release_date = get_movie_value(soup,'Release Date')
print(release_date)

### We need a few helper methods to parse the strings we've gotten

In [None]:
import dateutil.parser

def to_date(datestring):
    """Parse a date string with dateutil and return the parsed result."""
    return dateutil.parser.parse(datestring)

def money_to_int(moneystring):
    """Convert a money string such as '$1,234' to an int.

    Dollar signs and comma separators are stripped before the conversion.
    """
    digits = moneystring
    for symbol in ('$', ','):
        digits = digits.replace(symbol, '')
    return int(digits)

def runtime_to_minutes(runtimestring):
    """Convert a runtime string like '1 hrs. 57 min.' to total minutes.

    The first whitespace-separated token is read as hours and the third
    as minutes.

    Returns None when the input is missing (None or empty) or does not
    have that shape, so a field that was not found on the page flows
    through as None instead of raising.
    """
    if not runtimestring:
        return None
    parts = runtimestring.split()
    try:
        return int(parts[0]) * 60 + int(parts[2])
    except (IndexError, ValueError):
        # Too few tokens, or tokens that are not integers.
        return None

In [None]:
# Let's get these again and format them all in one swoop

from pprint import pprint

# Scrape the three raw strings first ...
raw_release_date = get_movie_value(soup, 'Release Date')
raw_domestic_total_gross = get_movie_value(soup, 'Domestic Total')
raw_runtime = get_movie_value(soup, 'Runtime')

# ... then convert each one to its final type.
release_date = to_date(raw_release_date)
domestic_total_gross = money_to_int(raw_domestic_total_gross)
runtime = runtime_to_minutes(raw_runtime)

headers = ['movie title', 'domestic total gross',
           'release date', 'runtime (mins)', 'rating']
# `title` and `rating` were computed in earlier cells.
values = [title, domestic_total_gross, release_date, runtime, rating]

movie_dict = dict(zip(headers, values))
movie_data = [movie_dict]

pprint(movie_data)

### What about scraping tables? 

In [None]:
# Explicit timeout: requests would otherwise wait indefinitely on a stalled server.
response = requests.get("https://en.wikipedia.org/wiki/List_of_highest-grossing_films",
                        timeout=30)
# Explicit parser so every machine builds the same tree (bs4 otherwise warns
# and uses whichever parser happens to be installed).
soup = BeautifulSoup(response.text, 'html.parser')

movie_list = soup.find("table",{ "class" : "wikitable sortable plainrowheaders" })
print(movie_list)

In [None]:
movie_data = []

header = ['Rank','Peak','Worldwide Gross','Year','Reference(s)']
for row in movie_list.find_all("tr"):
    # zip() stops at the shorter sequence, so a row with more cells than
    # column names can no longer index past the end of `header`.
    row_dict = {name: cell.find(text=True)
                for name, cell in zip(header, row.find_all("td"))}
    # The title lives in the row's <th>; guard against rows that have none
    # instead of calling .find on None.
    title_cell = row.find("th")
    link = title_cell.find("a") if title_cell is not None else None
    if link:
        row_dict['Title'] = link.text
    movie_data.append(row_dict)

movies_df = pd.DataFrame(movie_data)
# Displayed without incomplete rows (such as the header row); movies_df itself is unchanged.
movies_df.dropna()

### Scraping all relevant movie links on a page

In [None]:
url2 = 'http://www.boxofficemojo.com/alltime/adjusted.htm'
# Explicit timeout: requests would otherwise wait indefinitely on a stalled server.
soup2 = BeautifulSoup(requests.get(url2, timeout=30).text, 'html.parser')

In [None]:
# Raw string: '\?' in a normal string literal is an invalid escape sequence
# and triggers a warning on modern Python. The pattern itself is unchanged.
movie_href_regex = re.compile(r'/movies/\?*id=')
all_links = soup2.find_all('a', href=movie_href_regex)
for link in all_links:
    print(link['href'])

# Scraping the Unscrapable

### What happens if I try to parse my gmail with `requests` and `BeautifulSoup`?

In [None]:
import requests
from bs4 import BeautifulSoup

gmail_url = "https://mail.google.com"
# Explicit timeout and parser: requests never times out by default, and bs4
# otherwise warns and picks whichever parser is installed.
soup = BeautifulSoup(requests.get(gmail_url, timeout=30).text, 'html.parser')
print(soup.prettify())

In [None]:
new_url = "https://mail.google.com/mail"

# The get method fetches the requested URL. Explicit timeout and parser:
# requests never times out by default, and bs4 otherwise warns and picks
# whichever parser is installed.
soup = BeautifulSoup(requests.get(new_url, timeout=30).text, 'html.parser')
print(soup.prettify())

In [None]:
print(soup.find(id='Email'))

In [None]:
# pip install selenium 

#http://chromedriver.storage.googleapis.com/index.html?path=2.19/

import getpass
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Location of the chromedriver binary -- a placeholder path; point it at your own download.
chromedriver = "/Users/username/Downloads/chromedriver"
# NOTE(review): presumably meant to tell Selenium where the driver lives; the same
# path is also passed to webdriver.Chrome below, so confirm this variable is needed.
os.environ["webdriver.chrome.driver"] = chromedriver


# Start a Chrome session using that binary and open Gmail in it.
# NOTE(review): confirm the installed Selenium version still accepts the driver
# path as a positional argument.
driver = webdriver.Chrome(chromedriver)
driver.get("https://mail.google.com")

# Alternatives to Chrome:
# Firefox, phantomjs

#### Fill out username and password, hit enter to log in

In [None]:
# find_element(By.ID, ...) works on old and new Selenium alike; the
# find_element_by_id shortcut was removed in Selenium 4.
username_form = driver.find_element(By.ID, "Email")
username_form.send_keys("username@gmail.com")

In [None]:
username_form.send_keys(Keys.RETURN)

In [None]:
# find_element(By.ID, ...) replaces find_element_by_id, removed in Selenium 4.
password_form = driver.find_element(By.ID, 'Passwd')
# Prompt for the password rather than typing it into the notebook source,
# where it would be saved (and possibly committed) in plain text.
password_form.send_keys(getpass.getpass('Gmail password: '))

In [None]:
password_form.send_keys(Keys.RETURN)

#### Click compose button to start a new email draft!

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, removed in Selenium 4.
compose_button = driver.find_element(By.XPATH, '//div[text()="COMPOSE"]')
compose_button.click()

In [None]:
# find_element(By.NAME, ...) replaces find_element_by_name, removed in Selenium 4.
to_field = driver.find_element(By.NAME, "to")
to_field.send_keys("username@gmail.com")

In [None]:
# find_element(By.NAME, ...) replaces find_element_by_name, removed in Selenium 4.
subject = driver.find_element(By.NAME, "subjectbox")
subject.send_keys("This is an alert!")

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, removed in Selenium 4.
message_body = driver.find_element(By.XPATH, "//div[@aria-label='Message Body']")
message_body.send_keys("Hello,")
# Two RETURN presses leave a blank line between the greeting and the text.
message_body.send_keys([Keys.RETURN, Keys.RETURN])
message_body.send_keys("I am a computer and I just became self aware.")

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, removed in Selenium 4.
send_button = driver.find_element(By.XPATH, "//div[contains(@aria-label, 'Send')]")
send_button.click()

References:
- Documentation on finding elements: http://selenium-python.readthedocs.org/en/latest/locating-elements.html
- XPath tutorial: http://www.w3schools.com/xpath/xpath_syntax.asp