In [2]:
import re

import lxml
import requests
from bs4 import BeautifulSoup

## Make a soup

> Beautiful Soup transforms a complex HTML document into a complex tree of Python objects.

soup = BeautifulSoup(markup, features)

     :param markup: A string or a file-like object representing
         markup to be parsed.

     :param features: Desirable features of the parser to be
      used. This may be the name of a specific parser ("lxml",
      "lxml-xml", "html.parser", or "html5lib") or it may be the
      type of markup to be used ("html", "html5", "xml"). It's
      recommended that you name a specific parser, so that
      Beautiful Soup gives you the same results across platforms
      and virtual environments.


![https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup](parser-comparison.png)
Ref: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup

Differences between parsers: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers

Full documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#calling-a-tag-is-like-calling-find-all

### Kinds of Objects

There are four kinds of objects:
1. Tag ==> Represents a tag in the document.
2. NavigableString ==> Represents the text inside a Tag object.
3. BeautifulSoup ==> Represents the parsed document as a whole.
4. Comment ==> Represents a comment inside a Tag object.

In [33]:
# BeautifulSoup object: represents the parsed document as a whole.

# The markup is well-formed XML (note the terminated "</document>" closing tag),
# so the parser does not have to recover from a broken tag.
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")
print(doc.name)  # the BeautifulSoup object carries the special name "[document]"
print(type(doc))

[document]
<class 'bs4.BeautifulSoup'>


In [10]:
# Tag object: corresponds to a tag in the original markup.

soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b  # attribute access by tag name yields the <b> element
type(tag)  # last expression of the cell, so the notebook displays the type

bs4.element.Tag

In [89]:
# Every Tag has a name, exposed as `.name`.
print("Full tag: ", tag)
print("Name of tag: ", tag.name)

# A Tag's attributes are exposed as a dict through `.attrs`; a single attribute
# can be read with dict-style indexing. Note that `class` comes back as a list.
print("Attributes of tag: ", tag.attrs)
print("Get class attribute: ", tag['class'])

# NOTE: You can add, remove, and modify a tag's attributes.
tag['id'] = 'verybold'
print("Aftering adding 'id' attribute : ", tag)
del tag['id']  # remove the attribute again so `tag` is back to its original form

# `.text` returns the text contained in the tag.
print("Text of tag: ", tag.text)

Full tag:  <b class="boldest">Extremely bold</b>
Name of tag:  b
Attributes of tag:  {'class': ['boldest']}
Get class attribute:  ['boldest']
Aftering adding 'id' attribute :  <b class="boldest" id="verybold">Extremely bold</b>
Text of tag:  Extremely bold


In [18]:
# NavigableString object: the text inside a tag, reached through `.string`.

type(tag.string)

bs4.element.NavigableString

In [26]:
# Convert a NavigableString to a plain Python (Unicode) string with str().

unicode_string = str(tag.string)
print("Unicode string: ", unicode_string)
print("Unicode string type: ", type(unicode_string))

# Note: printing tag.string directly shows the same text.

# A NavigableString cannot be edited in place, but it can be swapped for
# another string with replace_with(). The call returns the string that was
# replaced, much like a "cut" operation.

tag.string.replace_with("No longer bold")
print(tag)
# Restore the original text. Because this is the cell's last expression, its
# return value (the string that was just replaced) is what the notebook displays.
tag.string.replace_with("Extremely bold")

Unicode string:  Extremely bold
Unicode string type:  <class 'str'>
<b class="boldest">No longer bold</b>


'No longer bold'

In [29]:
# Comment object: an HTML comment found inside a tag.

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string  # the only child of <b> is the comment
type(comment)

bs4.element.Comment

### Navigating the tree

#### A. Going Down

In [30]:
# Sample document used by the navigation examples in this section.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# BeautifulSoup is already imported in the notebook's first cell, so it is not
# imported again here.
soup = BeautifulSoup(html_doc, 'html.parser')

In [39]:
# Navigating using tag names: `soup.<name>` returns the first tag with that name.

print("Get head Tag object: ", soup.head)
print("Get title Tag object: ", soup.title)
print("Get 'a' Tag object: ", soup.body.a)  # first <a> tag inside <body>
print("Get 'a' tag object: ", soup.a)  # first <a> tag in the whole document (the same tag here)

Get head Tag object:  <head><title>The Dormouse's story</title></head>
Get title Tag object:  <title>The Dormouse's story</title>
Get 'a' Tag object:  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Get 'a' tag object:  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [48]:
# `.contents` ==> returns the tag's direct children as a list
# (the whitespace between tags shows up as '\n' strings).
print("Get all children list: \n", soup.body.contents)

# `.children` ==> returns an iterator over the same direct children (not a list),
# so printing it only shows the iterator object itself.
print("Get all children list: (as generator) ", soup.body.children)

Get all children list: 
 ['\n', <p class="title"><b>The Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>, '\n']
Get all children list: (as generator)  <list_iterator object at 0x0000029707BC25C0>


In [46]:
# `.descendants` ==> returns a generator over all descendants of the tag,
# not only its direct children: here the <title> tag, then the string inside it.

for child in soup.head.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [58]:
# `.string` ==> returns the tag's NavigableString.
# Note: if the tag contains more than one thing, `.string` is None. <head> has
# a single child (<title>) which itself holds a single string, so that string
# is what gets printed here.
print("Using 'string' attribute: ", soup.head.string)

# `.strings` ==> returns a generator over all NavigableString objects.
# Note: it includes the newlines and spaces that separate tags.
print("Using 'strings' attribute: \n")
for string in soup.strings:
    print(repr(string))

Using 'string' attribute:  The Dormouse's story
Using 'strings' attribute: 

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


#### B. Going Up

In [60]:
# `.parent` ==> returns the element's direct parent.
# Note: the parent of <title> is the <head> tag that contains it.

print(soup.title.parent)

# `.parents` ==> returns a generator over all ancestors of the element, from
# its direct parent up to the BeautifulSoup object itself ("[document]").
for parent in soup.a.parents:
    print(parent.name)

<head><title>The Dormouse's story</title></head>
p
body
html
[document]


#### C. Going sideways

In [62]:
# Small document in which <b> and <c> are siblings: both are direct children of <a>.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup.prettify())

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>



In [63]:
# `.next_sibling` / `.previous_sibling`
#   ==> return the element directly after / before this one on the same level

print("Next sibling b tag element: ", sibling_soup.b.next_sibling)
print("Previous sibling c tag element: ", sibling_soup.c.previous_sibling)


# Note: In real documents, the .next_sibling or .previous_sibling of
#         a tag will usually be a string containing whitespace.

Next sibling b tag element:  <c>text2</c>
Previous sibling c tag element:  <b>text1</b>


In [64]:
# `.next_siblings` / `.previous_siblings` (plural)
#   ==> return a generator over all following / preceding siblings
# Note: this includes the strings (newlines, spaces, punctuation) between tags

for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


### Searching The Tree


#### A. Descending search tree

The most common methods:
```
1. find()

find(name, attrs, recursive, string, **kwargs)

=> return Object

2. find_all()

find_all(name, attrs, recursive, string, limit, **kwargs)

=> return List

```

- name: The name of the tag you want to search for (e.g., 'div', 'a'). Accepts a string, a compiled regular expression, a list of names, or a function that takes a tag and returns a boolean.
- attrs: A dictionary of attributes you want to search for (e.g., {'class': 'example'}).
- recursive: If set to True (default), the method will search all descendants; if set to False, it will search only direct children.
- string: A string to search for in the tag's content.
- **kwargs: Other keyword arguments can also be used as shortcuts for attributes (e.g., class_='example').
- limit: The maximum number of results to return.


Note: Remember that find_all() and find() work their way down the tree, looking at a tag’s descendants.

In [65]:
# Sample document used by the search examples below.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Re-parse the document so `soup` refers to it for the cells that follow.
soup = BeautifulSoup(html_doc, 'html.parser')

# prettify() renders the parse tree as an indented, nested structure.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [90]:
# find_all() accepts several kinds of filters for `name`:
# a string, a compiled regular expression, or a list of names.
# (`re` is imported in the notebook's first cell.)

# String filter: matches tags whose name is exactly 'a'.
print("Find all elements using name argument string: ", soup.find_all(name='a'))
print()
# Regex filter: matches tags whose name starts with "b" (<body> and <b> here).
print("Find all elements using name argument regex: ", soup.find_all(name=re.compile("^b")))
print()
# List filter: matches tags whose name is any item of the list.
print("Find all elements using name argument list: ", soup.find_all(name=['a', 'b']))
print()

def has_class_but_no_id(tag):
    """Tell whether ``tag`` defines a ``class`` attribute but no ``id`` attribute.

    ``tag`` is any object exposing ``has_attr(name)``; the function is meant to
    be passed to ``find_all()`` as a filter.
    """
    # Guard clause: without a class attribute the tag can never match.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
# Function filter: find_all() keeps the tags for which the function returns True.
print("Find all elements using name argument function: ", soup.find_all(name=has_class_but_no_id))

Find all elements using name argument string:  [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

Find all elements using name argument regex:  [<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>, <b>The Dormouse's story</b>]

Find all elements using name argument list:  [<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie

#### B. Ascending search tree

The most common methods:
```
find_parents(name, attrs, string, limit, **kwargs)

=> return List

find_parent(name, attrs, string, **kwargs)

=> return Object
```

- name: The name of the tag you want to search for (e.g., 'div', 'a'). Accepts a string, a compiled regular expression, a list of names, or a function that takes a tag and returns a boolean.
- attrs: A dictionary of attributes you want to search for (e.g., {'class': 'example'}).
- recursive: Not applicable here — find_parent() and find_parents() do not take a `recursive` argument (see the signatures above); they always search upward through the element's ancestors.
- string: A string to search for in the tag's content.
- **kwargs: Other keyword arguments can also be used as shortcuts for attributes (e.g., class_='example').
- limit: The maximum number of results to return.


In [75]:
# Locate the string "Lacie"; it is the current node from which the upward
# search starts.
a_string = soup.find(string="Lacie")

# find_parent() with no arguments returns the closest ancestor of the node.
print("Find the direct parent of the current node: ", a_string.find_parent())

Find the direct parent of the current node:  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


In [76]:
# find_parents() with no arguments returns all direct and indirect parents of
# the current node, up to and including the BeautifulSoup object itself.
print(len(a_string.find_parents()))
a_string.find_parents()  # displayed as the cell's output, closest ancestor first

5


[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 </body>,
 <html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <