# Web Scraping with Python

In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from IPython.display import display

What we are scraping can be found here:

https://mafudge.github.io/web-scraping/


## Reading an HTML table

Reading an HTML table is trivial, thanks to pandas. The `read_html` function returns a list of all the tables found on the page.

In [6]:
# Table read example: pull every <table> found at the URL into a list of DataFrames
tables = pd.read_html(io="https://mafudge.github.io/web-scraping/emptable.html")
# read_html always hands back a list, so take the first entry to get the table itself
table = tables[0]
display(table)

Unnamed: 0,Name,Email,Department
0,Barb Barion,bbarion@fudgemart.com,Customer Service
1,Al Fresco,afresco@fudgemart.com,Housewares
2,Bob Enweave,benweave@fudgemart.com,Housewares
3,Carrie Dababbi,cdababbi@fudgemart.com,Electronics
4,Rowan Debote,rdebote@fudgemart.com,Customer Service
5,Sandi Shores,sshores@fudgemart.com,Customer Service


## Read JSON Data 

Reading JSON data is also trivial, thanks to pandas!

In [7]:
# Read JSON example: load the JSON document at the URL directly into a DataFrame
data = pd.read_json(path_or_buf="https://mafudge.github.io/web-scraping/empjson.json")
display(data)

Unnamed: 0,Department,Email,Name
0,Customer Service,bbarion@fudgemart.com,Barb Barion
1,Housewares,afresco@fudgemart.com,Al Fresco
2,Housewares,benweave@fudgemart.com,Bob Enweave
3,Electronics,cdababbi@fudgemart.com,Carrie Dababbi
4,Customer Service,rdebote@fudgemart.com,Rowan Debote
5,Customer Service,sshores@fudgemart.com,Sandi Shores


## Web Scraping Data 

Web scraping is a bit more involved, because we must use Python logic to extract the structure from the page.

The document we are scraping is here: https://mafudge.github.io/web-scraping/empweb.html

You can't scrape without knowing the HTML structure. When the HTML changes, we must rewrite our code. This is why an API is always preferable to scraping. Web scraping is a method of last resort!

In [61]:
# First we request the contents of the page and send them to Beautiful Soup.
# A timeout is passed because requests otherwise waits indefinitely for a stalled server.
response = requests.get("https://mafudge.github.io/web-scraping/empweb.html", timeout=10)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
print(str(soup)[:200]) # there's a lot, so only print the first 200 chars

<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<!-- Beg


In [62]:
# The departments are the <h3> headings; select() returns every matching tag as a list
h3_tags = soup.select("h3")
for h3_tag in h3_tags:
    print(h3_tag.get_text())

# Printing the list itself shows the raw tags rather than just their text
print(h3_tags)

Customer Service
Housewares
Electronics
[<h3 id="customer-service">Customer Service</h3>, <h3 id="housewares">Housewares</h3>, <h3 id="electronics">Electronics</h3>]


In [29]:
# The employees are the <li> items; print each one's text, then the raw tag list
li_tags = soup.select("li")
for li_tag in li_tags:
    print(li_tag.get_text())

print(li_tags)

Barb Barion
Rowan Debote
Sandi Shores
Al Fresco
Bob Enweave
Carrie Dababbi
[<li><a href="mailto:bbarion@fudgemart.com">Barb Barion</a></li>, <li><a href="mailto:rdebote@fudgemart.com">Rowan Debote</a></li>, <li><a href="mailto:sshores@fudgemart.com">Sandi Shores</a></li>, <li><a href="mailto:afresco@fudgemart.com">Al Fresco</a></li>, <li><a href="mailto:benweave@fudgemart.com">Bob Enweave</a></li>, <li><a href="mailto:cdababbi@fudgemart.com">Carrie Dababbi</a></li>]


In [33]:
# How do we tie each employee to a department?
# The tags are nested on the page, so the loops are nested too: the outer loop
# walks the <ul> lists, the inner loop walks the <li> items inside each list.
h3_tags = soup.select("h3")  # department headings, as a list
tag_index = 0                # position of the heading paired with the current <ul>
for ul_tag in soup.select("ul"):
    for li_tag in ul_tag.select("li"):
        name = li_tag.text
        department = h3_tags[tag_index].text
        print("Name: {0} Department: {1}".format(name, department))
    tag_index += 1

Name: Barb Barion Department: Customer Service
Name: Rowan Debote Department: Customer Service
Name: Sandi Shores Department: Customer Service
Name: Al Fresco Department: Housewares
Name: Bob Enweave Department: Housewares
Name: Carrie Dababbi Department: Electronics


In [48]:
# How do we extract the email? It is part of the anchor's href attribute.
# Pick the example <li> explicitly (the last one on the page) so this cell does not
# depend on a loop variable left behind by whichever cell happened to run before it.
li_tag = soup.select("li")[-1]
a_tag = li_tag.select_one("a")  # select_one gives back a single tag, not a list
print("Entire list item =======> ", li_tag)
print("Anchor (a) Tag inside ==> ", a_tag)
print("Href attribute in (a) ==> ", a_tag["href"]) # attributes are read like dictionary keys
print("Strip out :mailto ======> ", a_tag["href"].replace("mailto:",""))


Anchor (a) Tag inside ==>  <a href="mailto:cdababbi@fudgemart.com">Carrie Dababbi</a>
Href attribute in (a) ==>  mailto:cdababbi@fudgemart.com


In [53]:
# The extraction logic gets its own function for readability
def get_email_from_li(tag):
    """Return the email address linked from the first <a> inside ``tag``.

    Args:
        tag: An object exposing ``select_one`` (for example a Beautiful Soup
            ``<li>`` tag) whose first anchor carries a ``mailto:`` href.

    Returns:
        The address with the ``mailto:`` scheme and any ``?subject=...`` style
        parameters removed. An href that is not a mailto link is returned
        unchanged. ``None`` is returned when there is no anchor or no href.
    """
    anchor = tag.select_one("a")
    if anchor is None:
        return None
    href = anchor.get("href")
    if href is None:
        return None
    scheme = "mailto:"
    # Only strip a leading scheme; "mailto:" appearing elsewhere in the href is left alone
    if href.lower().startswith(scheme):
        address = href[len(scheme):]
        # mailto links may carry header fields after "?", which are not part of the address
        return address.split("?", 1)[0]
    return href

# quick check of the helper against the current list item
email = get_email_from_li(li_tag)
print(email)

cdababbi@fudgemart.com


In [56]:
# The complete scrape: name, department and email for every employee
h3_tags = soup.select("h3")
tag_index = 0  # position of the heading paired with the current <ul>
for ul_tag in soup.select("ul"):
    for li_tag in ul_tag.select("li"):
        name = li_tag.text
        department = h3_tags[tag_index].text
        email = get_email_from_li(li_tag)
        print("Name: {0} Department: {1} Email: {2}".format(name, department, email))
    tag_index += 1

Name: Barb Barion Department: Customer Service Email: bbarion@fudgemart.com
Name: Rowan Debote Department: Customer Service Email: rdebote@fudgemart.com
Name: Sandi Shores Department: Customer Service Email: sshores@fudgemart.com
Name: Al Fresco Department: Housewares Email: afresco@fudgemart.com
Name: Bob Enweave Department: Housewares Email: benweave@fudgemart.com
Name: Carrie Dababbi Department: Electronics Email: cdababbi@fudgemart.com


In [57]:
# A list of dictionaries is easier to work with than printed text,
# and pandas can turn it into a DataFrame in one call
employees = []
h3_tags = soup.select("h3")
tag_index = 0  # position of the heading paired with the current <ul>
for ul_tag in soup.select("ul"):
    for li_tag in ul_tag.select("li"):
        # one dictionary per employee, filled key by key
        employee = {}
        employee["Name"] = li_tag.text
        employee["Department"] = h3_tags[tag_index].text
        employee["Email"] = get_email_from_li(li_tag)
        employees.append(employee)
    tag_index += 1

data = pd.DataFrame(employees)
display(data)

Unnamed: 0,Department,Email,Name
0,Customer Service,bbarion@fudgemart.com,Barb Barion
1,Customer Service,rdebote@fudgemart.com,Rowan Debote
2,Customer Service,sshores@fudgemart.com,Sandi Shores
3,Housewares,afresco@fudgemart.com,Al Fresco
4,Housewares,benweave@fudgemart.com,Bob Enweave
5,Electronics,cdababbi@fudgemart.com,Carrie Dababbi


In [59]:
# The whole scrape as a function that returns a Python list of dictionaries
def scrape_fudgemart_employees(url="https://mafudge.github.io/web-scraping/empweb.html", timeout=10):
    """Scrape the employee page and return one dictionary per employee.

    Args:
        url: Address of the page to scrape. Defaults to the Fudgemart
            employee page used throughout this notebook.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dictionaries, each with the keys "Name", "Department"
        and "Email", in the order the employees appear on the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than scraping an error page into an empty result
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    h3_tags = soup.select("h3")
    employees = []
    # Lists and headings are paired purely by position: the n-th <ul> is
    # reported under the n-th <h3>.
    for tag_index, ul_tag in enumerate(soup.select("ul")):
        for li_tag in ul_tag.select("li"):
            employees.append({
                "Name": li_tag.text,
                "Department": h3_tags[tag_index].text,
                "Email": get_email_from_li(li_tag),
            })
    return employees


In [60]:
# With the scraper wrapped in a function, building the table is just a call away :-)
employees = scrape_fudgemart_employees()
data = pd.DataFrame(data=employees)
display(data)

Unnamed: 0,Department,Email,Name
0,Customer Service,bbarion@fudgemart.com,Barb Barion
1,Customer Service,rdebote@fudgemart.com,Rowan Debote
2,Customer Service,sshores@fudgemart.com,Sandi Shores
3,Housewares,afresco@fudgemart.com,Al Fresco
4,Housewares,benweave@fudgemart.com,Bob Enweave
5,Electronics,cdababbi@fudgemart.com,Carrie Dababbi
