In [66]:
html_doc = """
<html>
<head>
<title>My First webpage</title>
</head>
<body bgcolor="lightgreen">
<p class="title">
<b>Overview of Beautiful Soup</b>   
</p>
<p class="story">
Beautiful is used for Web scraping or extracting contents from a wesbite
Some websites are
<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a> Browse  Wiki
<a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a> Browse ISRO and
<a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a> Welcome
</p>
<p class="other_details">
Few random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.

</p>
</body>
</html>

"""

In [67]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [68]:
print(soup.prettify())

<html>
 <head>
  <title>
   My First webpage
  </title>
 </head>
 <body bgcolor="lightgreen">
  <p class="title">
   <b>
    Overview of Beautiful Soup
   </b>
  </p>
  <p class="story">
   Beautiful is used for Web scraping or extracting contents from a wesbite
Some websites are
   <a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
    Wikipedia
   </a>
   Browse  Wiki
   <a class="isro" href="https://www.isro.gov.in/" id="link2">
    ISRO
   </a>
   Browse ISRO and
   <a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
    Inner Engineering
   </a>
   Welcome
  </p>
  <p class="other_details">
   Few random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.
  </p>
 </body>
</html>



#### Extract head tag of the page

In [69]:
print(soup.head)

<head>
<title>My First webpage</title>
</head>


#### Extract contents and children inside head

<b> Note </b> - The .contents and .children attributes only consider a tag’s direct children

In [70]:
print(soup.head.contents)
print(soup.head.contents[1])

['\n', <title>My First webpage</title>, '\n']
<title>My First webpage</title>


In [71]:
title_tag = soup.head.contents[1]
print(title_tag)

<title>My First webpage</title>


In [72]:
for child in title_tag.children:
    print(child)

My First webpage


#### Extract descendents

In [73]:
head_tag = soup.head
for child in head_tag.descendants:
    print(child)



<title>My First webpage</title>
My First webpage




In [74]:
for i in soup.children:
    print(i)



<html>
<head>
<title>My First webpage</title>
</head>
<body bgcolor="lightgreen">
<p class="title">
<b>Overview of Beautiful Soup</b>
</p>
<p class="story">
Beautiful is used for Web scraping or extracting contents from a wesbite
Some websites are
<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a> Browse  Wiki
<a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a> Browse ISRO and
<a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a> Welcome
</p>
<p class="other_details">
Few random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.

</p>
</body>
</html>




In [75]:
print(len(list(soup.children)))
print(len(list(soup.descendants)))

3
34


#### Extract string

Note - If a tag has only one child, and that child is a NavigableString, the child is made available as .string

In [76]:
head_tag.contents

['\n', <title>My First webpage</title>, '\n']

In [77]:
head_tag.string

<b>.stripped_string </b>- Remove whiltespace

#### Extract all strings

In [78]:
for string in soup.strings:
    print(repr(string))

'\n'
'\n'
'\n'
'My First webpage'
'\n'
'\n'
'\n'
'\n'
'Overview of Beautiful Soup'
'\n'
'\n'
'\nBeautiful is used for Web scraping or extracting contents from a wesbite\nSome websites are\n'
'\nWikipedia\n'
' Browse  Wiki\n'
'\nISRO\n'
' Browse ISRO and\n'
'\nInner Engineering\n'
' Welcome\n'
'\n'
'\nFew random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.\n\n'
'\n'
'\n'
'\n'


#### Extract all stripped strings

In [79]:
for string in soup.stripped_strings:
    print(repr(string))

'My First webpage'
'Overview of Beautiful Soup'
'Beautiful is used for Web scraping or extracting contents from a wesbite\nSome websites are'
'Wikipedia'
'Browse  Wiki'
'ISRO'
'Browse ISRO and'
'Inner Engineering'
'Welcome'
'Few random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.'


#### 
.parent

In [80]:
title_tag = soup.title
print(title_tag)
print('-----------')
print(title_tag.parent)

<title>My First webpage</title>
-----------
<head>
<title>My First webpage</title>
</head>


In [81]:
print(title_tag.string.parent)

<title>My First webpage</title>


In [82]:
html_tag = soup.html
type(html_tag.parent)

bs4.BeautifulSoup

#### .next_sibling and .previous_sibling

In [87]:
last_a_tag = soup.find("a", id="link2")
print(last_a_tag)
print('-------')
print(last_a_tag.next_sibling)

<a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a>
-------
 Browse ISRO and



In [88]:
print(last_a_tag.previous_sibling)

 Browse  Wiki



<b>.next_element</b>  - For that anchor tag, returns the thing that was parsed immediately after the anchor tag, is not the rest of that sentence: it’s the word "ISRO" (for the given example)<br>
<b>.previous_element </b>- It points to whatever element was parsed immediately before this one</b>

In [89]:
print(last_a_tag.previous_element)
print(last_a_tag.next_element)

 Browse  Wiki


ISRO



In [90]:
for element in last_a_tag.next_elements:
    print(repr(element))

'\nISRO\n'
' Browse ISRO and\n'
<a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a>
'\nInner Engineering\n'
' Welcome\n'
'\n'
<p class="other_details">
Few random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.

</p>
'\nFew random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.\n\n'
'\n'
'\n'
'\n'


In [91]:
for element in last_a_tag.previous_elements:
    print(repr(element))

' Browse  Wiki\n'
'\nWikipedia\n'
<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a>
'\nBeautiful is used for Web scraping or extracting contents from a wesbite\nSome websites are\n'
<p class="story">
Beautiful is used for Web scraping or extracting contents from a wesbite
Some websites are
<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a> Browse  Wiki
<a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a> Browse ISRO and
<a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a> Welcome
</p>
'\n'
'\n'
'Overview of Beautiful Soup'
<b>Overview of Beautiful Soup</b>
'\n'
<p class="title">
<b>Overview of Beautiful Soup</b>
</p>
'\n'
<body bgcolor="lightgreen">
<p class="title">
<b>Overview of Beautiful Soup</b>
</p>
<p class="story">
Beautiful is used for Web scraping or extracting contents from a wesbite
Some websites are
<a class="wikipedia" href="https://www.wikip

#### Find title of webpage

In [7]:
print(soup.title)
print(soup.title.string)

<title>My First webpage</title>
My First webpage


#### Find first bold tag inside body

In [17]:
print(soup.body.b)

<b>Overview of Beautiful Soup</b>


#### Find all bold tags

In [18]:
print(soup.find_all('b'))

[<b>Overview of Beautiful Soup</b>]


#### Find first paragraph with a class

In [8]:
print(soup.p)
print(soup.p['class'])

<p class="title">
<b>Overview of Beautiful Soup</b>
</p>
['title']


#### Find first anchor tag

In [16]:
print(soup.a)

<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a>


####  Find all anchor tags

In [12]:
print(soup.find_all('a'))

[<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a>, <a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a>, <a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a>]


#### Find anchor tag corresponding to a particular link

In [13]:
soup.find(id="link3")

<a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a>

#### Find all links (URLs)

In [11]:
for i in soup.find_all('a'):
    print(i.get('href'))

https://www.wikipedia.org/
https://www.isro.gov.in/
https://www.innerengineering.com/


#### Extract all text from a page

In [14]:
print(soup.get_text())




My First webpage



Overview of Beautiful Soup


Beautiful is used for Web scraping or extracting contents from a wesbite
Some websites are

Wikipedia

   ,

ISRO

and

Inner Engineering



Few random paragraphs. Some arbitary content without any relevance or significane to anyone living or dead.







#### Find all

In [96]:
print(soup.find_all("title"))

[<title>My First webpage</title>]


In [92]:
print(soup.find_all("p", "title"))

[<p class="title">
<b>Overview of Beautiful Soup</b>
</p>]


In [93]:
print(soup.find_all("a"))

[<a class="wikipedia" href="https://www.wikipedia.org/" id="link1">
Wikipedia
</a>, <a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a>, <a class="inner_engineering" href="https://www.innerengineering.com/" id="link3">
Inner Engineering
</a>]


In [95]:
print(soup.find_all(id="link2"))

[<a class="isro" href="https://www.isro.gov.in/" id="link2">
ISRO
</a>]


In [97]:
import re
soup.find(string=re.compile("sisters"))

#### Searching by CSS class

In [98]:
soup.find_all("a", class_="isro")

[<a class="isro" href="https://www.isro.gov.in/" id="link2">
 ISRO
 </a>]

In [99]:
soup.find_all("p", class_="title")

[<p class="title">
 <b>Overview of Beautiful Soup</b>
 </p>]

In [100]:
soup.find('title')

<title>My First webpage</title>

In [101]:
soup.find_all('title')

[<title>My First webpage</title>]