# Beautiful Soup

Import relevant libraries

In [14]:
# import BeautifulSoup library
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

We will use the following HTML document to parse with BeautifulSoup

In [15]:
html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol><li class="special">This list item is special.</li>
  <li class="special">This list item is also special.</li>
  <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""

Initialise BeautifulSoup object and pass it the HTML document

In [16]:
soup = BeautifulSoup(html, "html.parser")
#help(BeautifulSoup)                    

In [17]:
type(soup)

bs4.BeautifulSoup

Print the HTML page fetched by BeautifulSoup

In [18]:
soup.prettify()
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   First HTML Page
  </title>
 </head>
 <body>
  <div id="first">
   <h3 data-example="yes">
    hi
   </h3>
   <p>
    more text.
   </p>
  </div>
  <ol>
   <li class="special">
    This list item is special.
   </li>
   <li class="special">
    This list item is also special.
   </li>
   <li>
    This list item is not special.
   </li>
  </ol>
  <div data-example="yes">
   bye
  </div>
 </body>
</html>



Let's see some commands to navigate the BeautifulSoup object. 

Navigate to a particular tag. 

In [19]:
soup.head

<head>
<meta charset="utf-8"/>
<title>First HTML Page</title>
</head>

Navigate a nested tag

In [20]:
soup.head.title

<title>First HTML Page</title>

Extract the text from the tag

In [21]:
soup.head.title.get_text()

'First HTML Page'

In [22]:
soup.div.h3.get_text()

'hi'

In [23]:
soup.ol.li

<li class="special">This list item is special.</li>

Extract the css attribute of a tag

In [24]:
soup.div.h3.attrs

{'data-example': 'yes'}

In [25]:
soup.div.h3.attrs['data-example']

'yes'

Experiment with some more commonly used tags. 

In [26]:
soup.body

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol><li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>

In [27]:
soup.body.contents

['\n',
 <div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text.</p>
 </div>,
 '\n',
 <ol><li class="special">This list item is special.</li>
 <li class="special">This list item is also special.</li>
 <li>This list item is not special.</li>
 </ol>,
 '\n',
 <div data-example="yes">bye</div>,
 '\n']

In [28]:
soup.body.contents[1]

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>

Find an element by its class name

In [29]:
soup.find(class_='special')

<li class="special">This list item is special.</li>

Find all the elements matching a class name

In [30]:
soup.find_all(class_='special')

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>]

In [31]:
soup.find_all(class_='special')[1].text

'This list item is also special.'

In [32]:
soup.find(class_='special').find_parent

<bound method PageElement.find_parent of <li class="special">This list item is special.</li>>

In [33]:
soup.find_all('li')[2].text

'This list item is not special.'

In [34]:
soup.find(id="first").p.get_text()

'more text.'

In [35]:
soup.head.contents

['\n', <meta charset="utf-8"/>, '\n', <title>First HTML Page</title>, '\n']

In [36]:
print(soup.select("#first")[0].get_text())


hi
more text.



In [37]:
soup.select(".first")

[]

## Example

Use the requests library to fetch a page from the internet. Use the requests module again to fetch the next pages. 

In [38]:
import requests
URL = "http://quotes.toscrape.com"
html = requests.get(URL)
soup = BeautifulSoup(html.text, "html.parser")


In [39]:
print(html.text)

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div class="col-md-4">
                <p>
                
                    <a href="/login">Login</a>
                
                </p>
            </div>
        </div>
    

<div class="row">
    <div class="col-md-8">

    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
        <span>by <small class="author" itempr

In [40]:
while True:
    # scrape each page
    for i in soup.find_all(class_="quote"):
        print(i.find(class_='text').text)
        print(i.find(class_='author').text)
        print('------------------------------')
        
    # going to the next page to scrape
    nextlink = soup.find(class_="next")
    
    if nextlink:
        nextlink = URL + nextlink.a['href']
        subhtml = requests.get(nextlink)
        soup = BeautifulSoup(subhtml.text, "html.parser")
    else:
        break
        

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
Albert Einstein
------------------------------
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
J.K. Rowling
------------------------------
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
Albert Einstein
------------------------------
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
Jane Austen
------------------------------
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
Marilyn Monroe
------------------------------
“Try not to become a man of success. Rather become a man of value.”
Albert Einstein
------------------------------
“It is better to be hated for what you are than to be loved for what you are not.”
André Gide
-------------

Now, lets get a page that is rendered dynamically using Javascript. In this case, we will use Selenium to fetch the page.

In [41]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

In [42]:
URL = "http://quotes.toscrape.com/js"
html = requests.get(URL)
soup = BeautifulSoup(html.text, "html.parser")
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<script src="/static/jquery.js"></script>
<script>
    var data = [
    {
        "tags": [
            "change",
            "deep-thoughts",
            "thinking",
            "world"
        ],
        "author": {
            "name": "Albert Einstein",
            "goodreads_link": "/author/show/9810.Albert_Einstein",
            "slug": "Albert-Einstein"
        },
        "text": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d"
    },
    {
        "tags": [
   

In [44]:
# Use Selenium to access the HTML AFTER simulating going to the webpage

URL = "http://quotes.toscrape.com/js"
driver = webdriver.Chrome()
driver.get(URL)
subhtml = driver.page_source
soup = BeautifulSoup(subhtml, "html.parser")
    
while True:
    # scrape each page
    for i in soup.find_all(class_="quote"):
        print(i.find(class_='text').text)
        print(i.find(class_='author').text)
        print('------------------------------')
        
    # going to the next page to scrape
    nextlink = soup.find(class_="next")

    if nextlink:
        nextlink = URL[:-2] + nextlink.a['href']
        driver.get(nextlink)
        subhtml = driver.page_source
        soup = BeautifulSoup(subhtml, "html.parser")
    else:
        break


WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


### References

Beautiful Soup Documentation - https://www.crummy.com/software/BeautifulSoup/bs4/doc/