# Beautiful Soup

### Introduction

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the sandbox catalogue page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page be parsed silently in the cells below.
url = "https://books.toscrape.com/"
resp = requests.get(url, timeout=10)
resp.raise_for_status()

In [34]:
# "html.parser" is a parser written in Python.
# "lxml" is an alternative parser written in C, which makes it faster.
soup = BeautifulSoup(resp.content, "html.parser")

In [6]:
# Show only the beginning of the prettified document: printing the whole page
# floods the output area and buries the rest of the notebook.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

In [7]:
type(soup)

bs4.BeautifulSoup

In [8]:
soup.html

<html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" rel="stylesheet"/>
<link href="static/oscar/css/datetimepicker.css" rel="stylesheet" type="text/css"/>
</head>
<body class="default" id="default">
<header class="header container-fluid">
<div class="page_inner">
<div class

In [9]:
soup.name

'[document]'

### Tags

In [10]:
soup.title

<title>
    All products | Books to Scrape - Sandbox
</title>

In [11]:
soup.head

<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" rel="stylesheet"/>
<link href="static/oscar/css/datetimepicker.css" rel="stylesheet" type="text/css"/>
</head>

In [16]:
# Attribute-style access (soup.<tag name>) returns only the first matching tag
soup.h1

<h1>All products</h1>

In [21]:
first_div = soup.div
first_div

<div class="page_inner">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>

In [18]:
first_div.attrs

{'class': ['page_inner']}

In [22]:
first_div.div.div.attrs

{'class': ['col-sm-8', 'h1']}

In [26]:
# attrs["class"] is a plain list, so an unconditional append() would add the
# class again on every re-run of this cell. Guard it to keep the cell idempotent.
if "some-other-class" not in first_div.attrs["class"]:
    first_div.attrs["class"].append("some-other-class")

In [27]:
first_div

<div class="page_inner some other class some-other-class">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>

In [28]:
# first_div is a reference into the parsed tree, not a copy,
# so the change made through it is visible via soup.div as well
soup.div

<div class="page_inner some other class some-other-class">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>

### Parent, Children and Descendants

In [29]:
soup.ul

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

In [31]:
print(soup.ul.prettify())

<ul class="breadcrumb">
 <li>
  <a href="index.html">
   Home
  </a>
 </li>
 <li class="active">
  All products
 </li>
</ul>



In [35]:
soup.ul.children

<list_iterator at 0x108e29900>

In [36]:
list(soup.ul.children)

['\n',
 <li>
 <a href="index.html">Home</a>
 </li>,
 '\n',
 <li class="active">All products</li>,
 '\n']

In [37]:
from bs4 import NavigableString

In [39]:
# isinstance() also drops NavigableString subclasses (e.g. Comment), which an
# exact type() comparison would let through.
[child for child in soup.ul.children if not isinstance(child, NavigableString)]

[<li>
 <a href="index.html">Home</a>
 </li>,
 <li class="active">All products</li>]

In [40]:
def no_nav_string(iterable):
    """Return a list of the items in ``iterable`` that are not text nodes.

    ``isinstance`` is used rather than an exact ``type()`` comparison so that
    ``NavigableString`` subclasses such as ``Comment`` are filtered out too.

    Args:
        iterable: Any iterable of parse-tree nodes, e.g. ``tag.children`` or
            ``tag.descendants``.

    Returns:
        list: The nodes that are not ``NavigableString`` instances, in their
        original order.
    """
    return [node for node in iterable if not isinstance(node, NavigableString)]

In [41]:
no_nav_string(soup.ul.children)

[<li>
 <a href="index.html">Home</a>
 </li>,
 <li class="active">All products</li>]

In [46]:
desc = no_nav_string(soup.ul.descendants)

In [47]:
desc[0]

<li>
<a href="index.html">Home</a>
</li>

In [48]:
desc[0].parent

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

### Siblings

In [49]:
soup.ul.li

<li>
<a href="index.html">Home</a>
</li>

In [51]:
soup.ul.li.next_sibling.next_sibling

<li class="active">All products</li>

In [54]:
# Two hops forward (the "\n" text node, then the second <li>) followed by two
# hops back lands on the first <li> again.
(
    soup.ul.li
    .next_sibling
    .next_sibling
    .previous_sibling
    .previous_sibling
)

<li>
<a href="index.html">Home</a>
</li>

### Extracting Text

In [60]:
soup.a

<a href="index.html">Books to Scrape</a>

In [65]:
# Only the last expression in a cell is echoed, so .attrs is evaluated here
# but its value is not shown; the output below is .contents.
soup.a.attrs
soup.a.contents

['Books to Scrape']

In [66]:
soup.a.get_text()

'Books to Scrape'

In [68]:
soup.a.string

'Books to Scrape'

In [69]:
soup.a.text

'Books to Scrape'

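- `.get_text()` and `.text` behave the same: both return the text inside the tag and all of its descendants, joined into a single `str`.
- The `.string` attribute is narrower: it returns the tag's single `NavigableString` child (or that of its only child tag), and `None` when the tag has more than one child.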

In [70]:
soup.ul

<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>

In [71]:
soup.ul.get_text()

'\n\nHome\n\nAll products\n'

In [72]:
soup.ul.text

'\n\nHome\n\nAll products\n'

In [74]:
# .string evaluates to None here (hence no output): <ul> has several children,
# "\n" text nodes and <li> tags, so there is no single string to return.
soup.ul.string

In [75]:
soup.ul.contents

['\n',
 <li>
 <a href="index.html">Home</a>
 </li>,
 '\n',
 <li class="active">All products</li>,
 '\n']

In [78]:
# Compare the three ways of reading a tag's text and the type each one yields.
print(
    *(
        f"{value} of type {type(value)}"
        for value in (soup.a.text, soup.a.get_text(), soup.a.string)
    ),
    sep="\n",
)

Books to Scrape of type <class 'str'>
Books to Scrape of type <class 'str'>
Books to Scrape of type <class 'bs4.element.NavigableString'>


In [79]:
soup.ul.text

'\n\nHome\n\nAll products\n'

In [80]:
soup.ul.get_text()

'\n\nHome\n\nAll products\n'

In [82]:
soup.ul.get_text(separator=",")

'\n,\n,Home,\n,\n,All products,\n'

In [83]:
soup.ul.get_text(separator=", ", strip=True)

'Home, All products'

### All Strings

In [85]:
soup.stripped_strings

<generator object PageElement.stripped_strings at 0x1089dfed0>

In [88]:
all_strings = list(soup.stripped_strings)

In [99]:
all_strings[:10]

['All products | Books to Scrape - Sandbox',
 'Books to Scrape',
 'We love being scraped!',
 'Home',
 'All products',
 'Books',
 'Travel',
 'Mystery',
 'Historical Fiction',
 'Sequential Art']

In [90]:
len(all_strings)

147

In [95]:
unstripped_strings = list(soup.strings)

In [96]:
len(unstripped_strings)

852

In [98]:
unstripped_strings[:10]

['\n',
 '\n',
 '\n',
 '\n',
 ' ',
 ' ',
 '\n',
 '\n',
 '\n    All products | Books to Scrape - Sandbox\n',
 '\n']

### Search

- find() -- like find_all() but returns only first result
- find_all()

In [100]:
soup.find_all()

[<html class="no-js" lang="en-us"> <!--<![endif]-->
 <head>
 <title>
     All products | Books to Scrape - Sandbox
 </title>
 <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
 <meta content="24th Jun 2016 09:29" name="created"/>
 <meta content="" name="description"/>
 <meta content="width=device-width" name="viewport"/>
 <meta content="NOARCHIVE,NOCACHE" name="robots"/>
 <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
 <!--[if lt IE 9]>
         <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
         <![endif]-->
 <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
 <link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
 <link href="static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" rel="stylesheet"/>
 <link href="static/oscar/css/datetimepicker.css" rel="stylesheet" type="text/css"/>
 </head>
 <body class="default" id="default">
 <header class="header container-fluid">
 <div class="p

In [101]:
len(soup.find_all())

541

In [102]:
type(soup.find_all())

bs4.element.ResultSet

In [104]:
soup.find_all("a")

[<a href="index.html">Books to Scrape</a>,
 <a href="index.html">Home</a>,
 <a href="catalogue/category/books_1/index.html">
                             
                                 Books
                             
                         </a>,
 <a href="catalogue/category/books/travel_2/index.html">
                             
                                 Travel
                             
                         </a>,
 <a href="catalogue/category/books/mystery_3/index.html">
                             
                                 Mystery
                             
                         </a>,
 <a href="catalogue/category/books/historical-fiction_4/index.html">
                             
                                 Historical Fiction
                             
                         </a>,
 <a href="catalogue/category/books/sequential-art_5/index.html">
                             
                                 Sequential Art
            

In [103]:
len(soup.find_all("a"))

94

In [105]:
len(soup.find_all(["a", "p"]))

154

In [111]:
# Filter on the tag name ("p") and on the class attribute ("price_color")
# through the attrs dict, then pull the text out of every matching tag.
price_tags = soup.find_all(name="p", attrs={"class": "price_color"})
price_list = [tag.get_text() for tag in price_tags]
price_list

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.59',
 '£51.33',
 '£45.17']

In [112]:
soup.find_all("p", class_="price_color")

[<p class="price_color">£51.77</p>,
 <p class="price_color">£53.74</p>,
 <p class="price_color">£50.10</p>,
 <p class="price_color">£47.82</p>,
 <p class="price_color">£54.23</p>,
 <p class="price_color">£22.65</p>,
 <p class="price_color">£33.34</p>,
 <p class="price_color">£17.93</p>,
 <p class="price_color">£22.60</p>,
 <p class="price_color">£52.15</p>,
 <p class="price_color">£13.99</p>,
 <p class="price_color">£20.66</p>,
 <p class="price_color">£17.46</p>,
 <p class="price_color">£52.29</p>,
 <p class="price_color">£35.02</p>,
 <p class="price_color">£57.25</p>,
 <p class="price_color">£23.88</p>,
 <p class="price_color">£37.59</p>,
 <p class="price_color">£51.33</p>,
 <p class="price_color">£45.17</p>]

In [116]:
buttons = soup.find_all("button", attrs={"data-loading-text": "Adding..."})

In [117]:
len(buttons)

20

In [123]:
# Match buttons whose data-loading-text contains "add" or "remove", ignoring case.
# Beautiful Soup calls the filter with None for tags that lack the attribute,
# so check for None before calling .lower() to avoid an AttributeError.
add_buttons = soup.find_all(
    "button",
    attrs={
        "data-loading-text": lambda text: text is not None
        and ("add" in text.lower() or "remove" in text.lower())
    },
)
len(add_buttons)

20