# Scraping and Parsing Text from Websites with Python and BeautifulSoup

## Using `html.find()`

In [3]:
# Open the specified URL and read its content as text
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/aphrodite"
# The context manager closes the HTTP response once the body has been read,
# even if reading raises, so the connection is not left open in the kernel
with urlopen(url) as page:
    html_bytes = page.read()
# read() returns raw bytes; decode them to get a str
html = html_bytes.decode('utf-8')
print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [5]:
# Extract the page title with plain string searching (str.find()).
# Expects `html` to hold the page source as a str.
opening_tag = '<title>'
closing_tag = '</title>'

# Index where the opening <title> tag begins
title_index = html.find(opening_tag)
print(title_index)

# Index of the first character of the title text (just past the opening tag)
start_index = title_index + len(opening_tag)
print(start_index)

# Index of the closing </title> tag. The search starts after the opening tag
# so an earlier occurrence of "</title>" cannot yield an end before the start.
end_index = html.find(closing_tag, start_index)
print(end_index)

# str.find() reports "not found" as -1, which would otherwise be used as a
# valid-looking slice bound and silently produce the wrong text
if title_index == -1 or end_index == -1:
    raise ValueError('No <title>...</title> element found in the page')

# Slice out the text between the two tags
title = html[start_index:end_index]
print(title)

14
21
39
Profile: Aphrodite


## Using the `re` module

In [6]:
import re
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the HTTP response once the body has been read
with urlopen(url) as page:
    html = page.read().decode('utf-8')

# Tolerant <title> pattern: the non-greedy ".*?" parts allow extra characters
# (spaces, attributes) inside either tag, and IGNORECASE accepts <TITLE> too
pattern = '<title.*?>.*?</title.*?>'
match_results = re.search(pattern, html, re.IGNORECASE)
# re.search() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on .group()
if match_results is None:
    raise ValueError('No <title> element found at ' + url)
title = match_results.group()
# Remove HTML tags
title = re.sub('<.*?>', '', title)

print(title)

Profile: Dionysus


## Using `BeautifulSoup4`

In [7]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the HTTP response once the body has been read
with urlopen(url) as page:
    html = page.read().decode("utf-8")
# HTML to be parsed. Python's built-in HTML parser used behind the scenes
soup = BeautifulSoup(html, "html.parser")
# Print the document's text content with the markup removed
print(soup.get_text())



Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






In [8]:
# Find every <img> tag on the web page. The result is a list of tag objects,
# not URL strings; each image's URL is in the tag's src attribute.
soup.find_all('img')

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [9]:
# Unpack the page's two <img> tags into separate names
image1, image2 = soup.find_all('img')
# Show the tag's name, then the value of its src attribute, one per line.
# Attributes are looked up by key, like entries in a dictionary.
print(image1.name, image1.attrs['src'], sep='\n')

img
/static/dionysus.jpg


In [11]:
# Tags in the document are reachable as attributes of the soup object;
# soup.title is the page's <title> tag
page_title = soup.title
print(page_title)
# .string gives just the text inside the tag, without the markup
print(page_title.string)

<title>Profile: Dionysus</title>
Profile: Dionysus


In [14]:
from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup

base_url = "http://olympus.realpython.org"

# The context manager closes the HTTP response once the body has been read
with urlopen(base_url + "/profiles") as html_page:
    html_text = html_page.read().decode("utf-8")

soup = BeautifulSoup(html_text, "html.parser")

# href=True restricts the search to anchors that actually carry an href, so
# link["href"] cannot raise KeyError. urljoin resolves relative hrefs against
# base_url and leaves already-absolute URLs intact, where plain string
# concatenation would glue the host in front of them.
for link in soup.find_all("a", href=True):
    link_url = urljoin(base_url, link["href"])
    print(link_url)

http://olympus.realpython.org/profiles/aphrodite
http://olympus.realpython.org/profiles/poseidon
http://olympus.realpython.org/profiles/dionysus
