# Introduction to BeautifulSoup

In [39]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [40]:
# Store a small sample HTML page as a multi-line string.
# The page holds a heading, one image, two links, an ordered list and an
# unordered list; the cells below parse this string and query those elements.
html = """
<!DOCTYPE html>
<html lang="en-us">

<head>
  <meta charset="UTF-8">
  <title>My First Page</title>
</head>

<body>
  <!-- Header -->
  <h1>Hello World!</h1>

  <!-- Image -->
  <img
    src="https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211"
    alt="Spongebob!" />
  <br />
  <a href="https://www.bing.com">Bing</a>
  <!-- Link with New Tab -->
  <a href="https://www.google.com">Google</a>
  <br />

  <!-- An ordered list -->
  <ol>
    <li>Visit Grand Canyon</li>
    <li>Hike the trails</li>
    <li>Take photos</li>
  </ol>

  <ul>
    <li>Bach</li>
    <li>Mozart</li>
    <li>Beethoven</li>
    <li>Adele</li>
  </ul>
</body>

</html>
"""

In [41]:
# Parse the HTML string into a navigable BeautifulSoup tree,
# using the "html.parser" backend
soup = BeautifulSoup(markup=html, features="html.parser")

In [42]:
# Print the parsed document; the soup object renders back to HTML markup
print(soup)


<!DOCTYPE html>

<html lang="en-us">
<head>
<meta charset="utf-8"/>
<title>My First Page</title>
</head>
<body>
<!-- Header -->
<h1>Hello World!</h1>
<!-- Image -->
<img alt="Spongebob!" src="https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211"/>
<br/>
<a href="https://www.bing.com">Bing</a>
<!-- Link with New Tab -->
<a href="https://www.google.com">Google</a>
<br/>
<!-- An ordered list -->
<ol>
<li>Visit Grand Canyon</li>
<li>Hike the trails</li>
<li>Take photos</li>
</ol>
<ul>
<li>Bach</li>
<li>Mozart</li>
<li>Beethoven</li>
<li>Adele</li>
</ul>
</body>
</html>



In [43]:
# Check the type of the object returned by the BeautifulSoup constructor
type(soup)

bs4.BeautifulSoup

In [44]:
# Access the <head> element with attribute-style navigation
# (as the cell's last expression it is displayed as the cell result)
soup.head

<head>
<meta charset="utf-8"/>
<title>My First Page</title>
</head>

In [45]:
# Access the <title> element, tag markup included
soup.title

<title>My First Page</title>

In [46]:
# Get only the text inside the <title> element, without the tag markup
soup.title.text

'My First Page'

In [47]:
# Attribute-style access returns only the first <a> tag in the document
print(soup.a)

<a href="https://www.bing.com">Bing</a>


In [48]:
# Use the find method to locate the first <img> tag
image_html = soup.find('img')
# Read the tag's alt attribute with dictionary-style indexing
image_html['alt']

'Spongebob!'

In [51]:
# Find every <a> tag in the document and print each one.
# find_all() is the current method name; findAll() is a deprecated
# Beautiful Soup 3-era alias for the same search.
for link in soup.find_all('a'):
    print(link)

<a href="https://www.bing.com">Bing</a>
<a href="https://www.google.com">Google</a>


In [53]:
# Use the find method to locate the first link
# (named first_link rather than `l`, which is easily misread as 1 or I)
first_link = soup.find('a')
# Print the html for the link
print(first_link)
# Extract and print the URL stored in the href attribute
print(first_link['href'])
# Print the link's visible text
print(first_link.text)

<a href="https://www.bing.com">Bing</a>
https://www.bing.com
Bing


In [56]:
# Extract the first bulleted (unordered) list in the document
b_l = soup.ul

In [65]:
# Extract the first list item of the bulleted list
b_l.li

<li>Bach</li>

In [59]:
# Extract the text from the first list item, without the <li> markup
b_l.li.text

'Bach'

In [62]:
# Extract the image's source URL from the src attribute of the first <img> tag
soup.find('img')['src']

'https://static.wikia.nocookie.net/spongebob/images/4/46/SVG_SpongeBob_SquarePants.svg/revision/latest/scale-to-width-down/195?cb=20181117230211'

In [63]:
# Extract the image's alt text from the alt attribute of the first <img> tag
soup.find('img')['alt']

'Spongebob!'

In [66]:
# Locate the first ordered list in the document
ol_tag = soup.find('ol')

In [70]:
# Collect every <li> inside the ordered list.
# find_all() replaces the deprecated findAll() alias.
ol_list_items = ol_tag.find_all('li')

In [72]:
# Print the text of every item in the ordered list, one per line
for list_item in ol_list_items:
    print(list_item.text)

Visit Grand Canyon
Hike the trails
Take photos


In [73]:
# Searching from the document root returns the <li> tags of both lists.
# find_all() replaces the deprecated findAll() alias.
soup.find_all('li')

[<li>Visit Grand Canyon</li>,
 <li>Hike the trails</li>,
 <li>Take photos</li>,
 <li>Bach</li>,
 <li>Mozart</li>,
 <li>Beethoven</li>,
 <li>Adele</li>]