# Web Scraping with Python

In [1]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
from IPython.display import display

What we are scraping can be found here:

https://mafudge.github.io/web-scraping/


## Reading an HTML table

Reading an HTML table is trivial, thanks to pandas. The `read_html` function returns a list of all the tables on the page, one DataFrame per table.

In [None]:
# Table read example
# pd.read_html downloads the page and hands back one DataFrame per table it finds.
emptable_url = "https://mafudge.github.io/web-scraping/emptable.html"
tables = pd.read_html(emptable_url)
# The result is always a list, so pick the first table out of it.
table = tables[0]
display(table)

## Read JSON Data 

Reading JSON is also trivial, thanks to pandas!

In [None]:
# Read JSON example
# pd.read_json accepts a URL and loads the JSON document into a DataFrame.
empjson_url = "https://mafudge.github.io/web-scraping/empjson.json"
data = pd.read_json(empjson_url)
display(data)

In [None]:
# or this if you don't want Pandas...
response = requests.get(
    "https://mafudge.github.io/web-scraping/empjson.json",
    timeout=10,  # seconds; without a timeout a stalled server would hang this cell
)
# Stop here on a 4xx/5xx reply instead of handing an error page to the JSON parser.
response.raise_for_status()
# json.loads turns the response body (a string) into Python lists and dictionaries.
data = json.loads(response.text)
data

## Web Scraping Data 

Web scraping is a bit more involved, because we must write Python logic to pull the structured data out of the page's HTML.

The document we are scraping is here: https://mafudge.github.io/web-scraping/empweb.html

You can't scrape without knowing the HTML structure. When the HTML changes, we must rewrite our code. This is why an API is always preferable to scraping. Web scraping is a method of last resort!

In [None]:
# first we request the contents of the page and send them to Beautiful Soup
response = requests.get(
    "https://mafudge.github.io/web-scraping/empweb.html",
    timeout=10,  # seconds; without a timeout a stalled server would hang this cell
)
# Stop here on a 4xx/5xx reply; otherwise the error page would be parsed as if it were the data.
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
print(str(soup)[:200]) # there's a lot, so only print the first 200 chars

In [None]:
# The departments are in <h3> tags.
# select() searches the parsed page for a tag name and returns every match as a list.
h3_matches = soup.select("h3")
for h3_tag in h3_matches:
    print(h3_tag.text)

# printing the list itself shows the matched tags, markup included
print(h3_matches)

In [None]:
# The employees are in <li> tags: same approach as for the departments.
li_matches = soup.select("li")
for li_tag in li_matches:
    print(li_tag.text)

# and the matched tags themselves
print(li_matches)

In [None]:
# How do we tie each employee to a department?
# The tags are nested, so the loops are nested too. This code assumes the
# n-th <ul> on the page belongs to the n-th <h3>, and pairs them by position.
h3_tags = soup.select("h3")  # the departments, as a list
line_template = "Name: {0} Department: {1}"
# enumerate() supplies each <ul>'s position, used to look up the matching <h3>
for tag_index, ul_tag in enumerate(soup.select("ul")):
    for li_tag in ul_tag.select("li"):
        department = h3_tags[tag_index].text
        print(line_template.format(li_tag.text, department))

In [None]:
# Where is the email? It is in the href attribute of the <a> tag inside the <li>.
# Note: li_tag is not created in this cell; it is the loop variable left behind
# by an earlier cell's for loop, so that cell must have been run first.
print("Entire list item =======> ", li_tag)
anchor_tag = li_tag.select_one("a")  # select_one() gives back a single tag, not a list
print("Anchor (a) Tag inside ==> ", anchor_tag)
href_value = anchor_tag["href"]  # attributes are read like dictionary keys
print("Href attribute in (a) ==> ", href_value)
print("Strip out :mailto ======> ", href_value.replace("mailto:",""))


In [None]:
# so much logic here, belongs in its own function for readability
def get_email_from_li(tag):
    """Return the email address from the first link inside ``tag``.

    Parameters
    ----------
    tag : object with a ``select_one`` method (e.g. a Beautiful Soup ``<li>`` tag)
        The element to search for an ``<a>`` tag.

    Returns
    -------
    str or None
        For a ``mailto:`` link, the address with the scheme and any
        ``?subject=...`` style query removed. For any other link, the href
        unchanged. ``None`` when there is no ``<a>`` tag or it has no href, so
        one incomplete list item does not abort a whole scrape.
    """
    anchor = tag.select_one("a")
    if anchor is None:
        return None

    href = anchor.get("href")
    if not href:
        return None

    scheme = "mailto:"
    candidate = href.strip()
    # URI schemes are case-insensitive, and only a leading "mailto:" is a scheme.
    if candidate[:len(scheme)].lower() != scheme:
        return href

    # Anything after "?" is header data (subject, cc, ...), not part of the address.
    return candidate[len(scheme):].split("?", 1)[0]

# quick check, using the li_tag left over from the earlier loop
sample_email = get_email_from_li(li_tag)
print(sample_email)

In [None]:
# And now the entire scrape: name, department and email for every employee.
h3_tags = soup.select("h3")
report_line = "Name: {0} Department: {1} Email: {2}"
# enumerate() supplies each <ul>'s position, used to look up the <h3> at the same position
for tag_index, ul_tag in enumerate(soup.select("ul")):
    for li_tag in ul_tag.select("li"):
        name = li_tag.text
        department = h3_tags[tag_index].text
        email = get_email_from_li(li_tag)
        print(report_line.format(name, department, email))

In [None]:
# A list of dictionaries is easier to work with than printed text:
# each employee becomes one dictionary, and the list becomes a DataFrame.
employees = []
h3_tags = soup.select("h3")
# enumerate() supplies each <ul>'s position, used to look up the <h3> at the same position
for tag_index, ul_tag in enumerate(soup.select("ul")):
    for li_tag in ul_tag.select("li"):
        # build the dictionary and add it to the list in one step
        employees.append({
            "Name" : li_tag.text,
            "Department" : h3_tags[tag_index].text,
            "Email" : get_email_from_li(li_tag)
        })

# one row per dictionary, one column per key
data = pd.DataFrame(employees)
display(data)

In [None]:
# how about writing this as a function to return a python list of dictionary
def scrape_fudgemart_employees(url="https://mafudge.github.io/web-scraping/empweb.html", timeout=10):
    """Scrape the employee page and return one dictionary per employee.

    Parameters
    ----------
    url : str, optional
        Page to scrape. Defaults to the Fudgemart employee page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout,
        a stalled server would block the call indefinitely.

    Returns
    -------
    list of dict
        One dictionary per ``<li>`` found inside a ``<ul>``, with the keys
        ``"Name"``, ``"Department"`` and ``"Email"``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is never
        parsed as if it were the employee list.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    IndexError
        If a ``<ul>`` containing items has no ``<h3>`` at the same position.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    employees = []
    h3_tags = soup.select("h3")
    # Departments and employee lists are paired by position: this assumes the
    # n-th <ul> on the page belongs to the n-th <h3>.
    for tag_index, ul_tag in enumerate(soup.select("ul")):
        for li_tag in ul_tag.select("li"):
            employees.append({
                "Name" : li_tag.text,
                "Department" : h3_tags[tag_index].text,
                "Email" : get_email_from_li(li_tag)
            })
    return employees


In [None]:
# With the function in place the whole scrape is one call :-)
employees = scrape_fudgemart_employees()
# one row per employee dictionary
data = pd.DataFrame(data=employees)
display(data)