# Chapter 1. Beginning to Scrape

In [9]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# this opens the html page returned from the url and
# prints the html to stdout
html = urlopen('http://pythonscraping.com/pages/page1.html')
print(html.read())
# type(html)

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [15]:
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")

# BeautifulSoup is allows us to read in the html page given and
# create a bs datatype that holds the data for the html page as
# object attributes
bsObj = BeautifulSoup(html.read());
print(bsObj.h1)


<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>

In [11]:
# the url open can fail if there does not exist a page with this url
# so we will include exceptions for the case with try/except blocks


try:
    html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
except HttpError as e:
    print(e)
else:
    # otherwise continue the program
    if html is None:
        print("URL not found!")
    else:
        print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [12]:
# print all tags that dont exist in the bsObj
print(bsObj.nonExistentTag)

# this line will produce an Attribute error since there is no attribute
# someTage to bsObj's nonExistentTag
# print(bsObj.nonExistentTag.someTag)

# to prevent above we can add try/excepts
try:
    badContent = bsObj.nonExistentTag.someTag
except AttributeError as e:
    print("Tag not found")
else:
    if badContent is None:
        print("Tag was not found")
    else:
        print(badContent)

None
Tag not found


In [13]:
# this function is similar to the try/except developed earlier but in a compact functional way
def getTitle(url):
    try:
        html = urlopen(url)
    except HttpError as e:
        # This is if the HTML errors out (no url exists)
        return None
    
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.html.h1
    except AttributeError as e:
        # This is if the html.h1 attribute does not exist in the bsObj object
        return None
    else:
        return title

title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print("Title could not be found :(")
else:
    print(title)

<h1>An Interesting Title</h1>


# Chapter 2. Advanced HTML Parsing

In [35]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# we will scrape and parse a webpage to help with poorly formatted html pages
# in this html, green represents proper nouns, names, titles
# and red is dialogue
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)

In [36]:
# to grab all the proper nouns, we will focus to grab all the css values
# that have color green
nameList = bsObj.findAll("span", {"class":"green"})
for name in nameList:
    print(name.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [37]:
# one param will find all tags in the collection you ask
headers = bsObj.findAll({"h1","h2","h3","h4","h5","h6"})

# the second pram will find all tags of described type
# with the requested attributes
highlight = bsObj.findAll("span", {"class":"green", "class":"red"})

# this will find all instances of the text, and 
# print out how many times the key text appears
nameList = bsObj.findAll(text="the prince") 
print(len(nameList))

# next is k(ey)w(ord)arg, which will find all tags with the specified attr
allText = bsObj.findAll(id="text")
# print(allText[0].get_text())

7


In [38]:
# Navigating trees with another example webpage
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)

# this will print all children of the table (the table rows)
for child in bsObj.find('table', {'id': 'giftList'}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [44]:
# This will print all the siblings (table rows) of the title row of the table
for sibling in bsObj.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [45]:
# This will find the img tag that has this img1 src 
# and print its parent's previous sibling
# the img tag is wrapped around a table data tag (td) and its previous
# sibling is the td representing the item's price
print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"
                       }).parent.previous_sibling.get_text())


$15.00



In [75]:
import re

# we will now use regex to find all the images
images = bsObj.findAll('img', {"src" : re.compile("\.\.\/img\/gifts/img.*\.jpg")})
print(images)
# print(type(images))
for image in images:
    print(image['src'])

[<img src="../img/gifts/img1.jpg"/>, <img src="../img/gifts/img2.jpg"/>, <img src="../img/gifts/img3.jpg"/>, <img src="../img/gifts/img4.jpg"/>, <img src="../img/gifts/img6.jpg"/>]
../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


In [77]:
# using lambda expressions so we can find all tags with 
# only two attributes
lda = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
for e in lda:
    print(e.get_text())
    print(e.attrs)


{'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}

Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00



{'id': 'gift1', 'class': ['gift']}

Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!

$10,000.52



{'id': 'gift2', 'class': ['gift']}

Fish Painting

If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!

$10,005.00



{'id': 'gift3', 'class': ['gift']}

Dead Parrot

This is an ex-parrot! Or maybe he's only resting?

$0.50



{'id': 'gift4', 'class': ['gift']}

Mystery Box

If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!

$1.50



{'id': 'gift5', 'class': ['gift']}


# Part i. Table -> Dataframe

In [None]:
# we will now transform the data from the 
# table in the html into a pandas dataframe
# column values will include: item title & cost
# import pandas as pd

