In [1]:
from bs4 import BeautifulSoup

In [3]:
# NOTE(review): `doc` is not defined in any cell visible in this notebook (the
# cell that created it appears to be missing) — confirm where it comes from,
# otherwise this cell fails on a fresh kernel.
doc.text  # displays the text content of the document

'INSERT FOOTER HERE'

In [4]:
# Find the placeholder text node and swap it for `footer`. replace_with()
# returns the node that was removed, which is why the placeholder is echoed.
# `string=` is the current keyword for matching text (`text=` is the deprecated
# older spelling) and agrees with the find_all(string=...) calls used later on.
# NOTE(review): `footer` (like `doc`) is not defined in any visible cell — confirm.
doc.find(string="INSERT FOOTER HERE").replace_with(footer)

'INSERT FOOTER HERE'

In [8]:
# A Comment is a special type of NavigableString: it holds the text of an
# HTML comment.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string  # the only child of <b> is the comment node

In [7]:
# Display the comment; its text is shown without the <!-- --> delimiters.
comment

'Hey, buddy. Want to buy a used parser?'

## Navigating the tree

### 1. Going Down

In [11]:
# Sample document used by the navigation and search examples that follow.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# NOTE(review): BeautifulSoup is already imported in the first cell; this
# repeated import is redundant but harmless.
from bs4 import BeautifulSoup
# Rebinds `soup` to the parsed sample document, using the 'html.parser' parser.
soup = BeautifulSoup(html_doc, 'html.parser')

In [23]:
# Attribute access by tag name: displays the document's <head> tag.
soup.head

<head><title>The Dormouse's story</title></head>

In [37]:
# Attribute lookups can be chained: the <title> tag inside <head>.
soup.head.title  # soup.title also works: it finds the first <title> tag in the soup object

<title>The Dormouse's story</title>

In [38]:
# Only the first <p> tag is returned, even though the document has several.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

### Nested 


In [39]:
# The first <b> tag beneath <body> (here it is nested inside the first <p>).
soup.body.b

<b>The Dormouse's story</b>

In [35]:
# Attribute access only ever returns the first tag with the given name;
# use find_all() to get every tag with that name as a list.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### Tag contents and children


In [40]:
# .contents and .children

In [58]:
# A tag's direct children are available as a list called .contents.
body_tag = soup.body
body_tag.contents[1]  # index 0 is the newline text right after <body>, so index 1 is the first <p>

<p class="title"><b>The Dormouse's story</b></p>

In [59]:
# .name is an attribute (not a method) holding the tag's name.
body_tag.contents[3].name  # index 3 is the second <p>; index 2 is the whitespace between the paragraphs

'p'

In [64]:
# Instead of indexing .contents, iterate over the direct children with
# .children and print the text of each one.
for node in body_tag.children:
    print(node.text)



The Dormouse's story


Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.


...




In [66]:
# .children only walks a tag's direct children. Compare with .descendants in
# the next cell, which also visits their children, and so on.
head_tag = soup.head
for child in head_tag.children:
    print(child)

# Only the direct child of <head>, the <title> tag, is printed.

<title>The Dormouse's story</title>


In [68]:
# .descendants walks the whole subtree under <head>:
# the <title> tag first, then the string that <title> contains.
for node in head_tag.descendants:
    print(node)

<title>The Dormouse's story</title>
The Dormouse's story


In [73]:
len(list(soup.children))   # direct children of the soup object only (one generation)

2

In [74]:
len(list(soup.descendants))     # every node beneath the soup object, across all generations

27

In [78]:
# If a tag contains only a NavigableString, or only one tag that itself
# contains a string, its content can be accessed with .string.
head_tag.string

"The Dormouse's story"

In [80]:
# If a tag contains more than one child, it is not clear which string
# .string should refer to, so it returns None.
print(body_tag.string)

None


In [88]:
# Even when .string is None, every string under the tag is reachable with .strings:
#   for string in body_tag.strings:
#       print(repr(string))
# .stripped_strings yields those strings with the extra whitespace removed.
for fragment in body_tag.stripped_strings:
    print(fragment)

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...


### 2. Going Up

In [94]:
# .parent: start from the <title> tag, then move up the tree in the next cells.
title_tag = soup.title
title_tag

<title>The Dormouse's story</title>

In [95]:
# The parent of <title> is the <head> tag that contains it.
title_tag.parent

<head><title>The Dormouse's story</title></head>

In [98]:
title_tag.string.parent # a string's parent is the tag that contains it, so this is the <title> tag itself

<title>The Dormouse's story</title>

In [111]:
# .parents iterates over every ancestor of an element, from the nearest
# parent outwards; print the name of each one.
for ancestor in title_tag.parents:
    print(ancestor.name)

head
html
[document]


### 3. Going Sideways

In [114]:
# Siblings are nodes that share the same direct parent and sit on the same level.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup)
# <b> and <c> are siblings: both are direct children of <a>.

<a><b>text1</b><c>text2</c></a>


In [115]:
# .next_sibling and .previous_sibling step to the adjacent node on the same level.
sibling_soup.b.next_sibling

<c>text2</c>

In [116]:
# Going the other way: the node just before <c> is <b>.
sibling_soup.c.previous_sibling

<b>text1</b>

In [118]:
# .next_siblings and .previous_siblings are the plural forms of the attributes
# above. The cells below start from the link whose id is "link3".
last_a_tag = soup.find('a', id='link3')
last_a_tag

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [132]:
last_a_tag.next_sibling   # the next node after this <a> within its parent: here a text node, not a tag

';\nand they lived at the bottom of a well.'

In [133]:
last_a_tag.next_element   # think of <a>something</a> as three parts: the opening <a>, "something", and the closing </a>
# .next_element is the content immediately after the opening <a> tag — here the string inside the tag

'Tillie'

## Searching the Tree

In [138]:
# find() and find_all() take filters as arguments.
# 1. A string filter: match tags by name and print the string inside each.
for link in soup.find_all('a'):
    print(link.string)

Elsie
Lacie
Tillie


In [144]:
# 2. A regular expression filter, matched against tag names.
import re

for tag in soup.find_all(re.compile('^ti')):
    print(tag)
# Prints every tag whose name starts with "ti" (here only <title>).

<title>The Dormouse's story</title>


In [145]:
# 3. A list filter: matches tags whose name is any item in the list.
soup.find_all(['a', 'b'])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [157]:
#find_all(name, attrs, recursive, string, limit, **kwargs)

In [163]:
soup.find_all('a', class_='sister')  # tag name 'a', CSS class 'sister'

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [164]:
import re  # NOTE(review): `re` is already imported in an earlier cell; the repeat is harmless
soup.find_all(string=re.compile('sisters'))  # search the text nodes rather than the tags

['Once upon a time there were three little sisters; and their names were\n']

In [165]:
soup.find_all(string="The Dormouse's story")  # exact text match; found twice, in <title> and in <b>

["The Dormouse's story", "The Dormouse's story"]

In [166]:
soup.find_all(attrs={'id': 'link3'})  # filter on attribute values given as a dict

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [173]:
# Searching by CSS class with the class_ keyword
# (the same query as the earlier find_all('a', class_='sister') cell).
soup.find_all( 'a', class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [170]:
# find_parents() and find_parent() search upwards from a node;
# start from the text node "Lacie".
a_string = soup.find( string='Lacie')

In [187]:
# The enclosing <p class="story"> paragraph of that string.
a_string.find_parent('p', class_='story')

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

In [188]:
# insert_before() and insert_after()

In [191]:
# NOTE: `soup` is rebound to a new one-tag document here, so the html_doc tree
# parsed earlier is no longer reachable through `soup`.
soup = BeautifulSoup("<b>leave</b>", 'html.parser')
tag = soup.new_tag("i")  # create a new <i> tag
tag.string = "Don't"
soup.b.string.insert_before(tag)  # place <i> immediately before the text inside <b>
soup.b

<b><i>Don't</i>leave</b>

In [192]:
#deleting a tag from the tree
# tag.decompose()

In [194]:
# decompose() removes a tag from the tree: afterwards the displayed soup no
# longer contains the <i> tag.
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a  # not used again in this cell
i_tag = soup.i

i_tag.decompose()
soup

<a href="http://example.com/">I linked to </a>

In [204]:
# replace_with() swaps one element of the tree for another.
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')

# Grab the existing tags first ...
a_tag = soup.a
i_tag = soup.i

# ... then build the replacement <b> tag.
newTag = soup.new_tag('b')
newTag.string = "new string"

# i_tag is the same <i> element that a_tag.i resolves to.
i_tag.replace_with(newTag)
soup

<a href="http://example.com/">I linked to <b>new string</b></a>

### Prettify

In [206]:
# prettify() renders the parse tree as an indented string, with each tag and
# each string on its own line.
# NOTE(review): this markup puts <body> inside <head> and leaves tags unclosed;
# presumably intentional, to show the tree exactly as parsed — confirm.
markup = '<html><head><body><a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <body>
   <a href="http://example.com/">
    I linked to
    <i>
     example.com
    </i>
   </a>
  </body>
 </head>
</html>


### Line Number 

In [207]:
#keeping track of where each tag belongs to which line

In [209]:
# Each tag records where it was found in the original markup:
# print its .sourceline, its .sourcepos and the string it contains.
markup = "<p\n>Paragraph 1</p>\n    <p>Paragraph 2</p>"
soup = BeautifulSoup(markup, 'html.parser')
for tag in soup.find_all('p'):
    print(f"{tag.sourceline} {tag.sourcepos} {tag.string}")

1 0 Paragraph 1
3 4 Paragraph 2
