In [1]:
# Sample HTML page (header with nav links, two articles, a sidebar and a footer)
# kept in a string so the cells below can parse and explore it without any network access.
html='''<!DOCTYPE html> 
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Basic Website Structure (No CSS)</title>

</head>
<body>

    <header>
        
        <h1 id="site-title">My Website</h1>

        <nav id="main-nav">
            <ul>
                <li><a href="#" class="nav-link">Home</a></li>
                <li><a href="#" class="nav-link">About</a></li>
                <li><a href="#" class="nav-link">Contact</a></li>
            </ul>
        </nav>
    </header>

    <hr>

    <main id="main-content">

        <section id="content-area">

            <article class="post">
                <h2 id="post-title-1">First Article Title</h2>
                <p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
            </article>

            <article class="post">
                <h2 id="post-title-2">Second Article Title</h2>
                <p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.</p>
            </article>

        </section>

        <aside id="sidebar">
            <h3>Sidebar</h3>
            <ul>
                <li><a href="#">Link 1</a></li>
                <li><a href="#">Link 2</a></li>
                <li><a href="#">Link 3</a></li>
            </ul>
        </aside>

    </main>

    <hr>

    <footer id="page-footer">
        <p class="copyright">&copy; 2025 My Website. All rights reserved.</p>
    </footer>

</body>
</html>
'''

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Syntax: BeautifulSoup(markup, parser) -- parse the `html` string with the 'html.parser' parser
data=BeautifulSoup(html,'html.parser')
data  # last expression of the cell, so the parsed document is displayed

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>
<body>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<hr/>
<main id="main-content">
<section id="content-area">
<article class="post">
<h2 id="post-title-1">First Article Title</h2>
<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
</article>
<article class="post">
<h2 id="post-title-2">Second Article Title</h2>
<p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elem

In [4]:
# The parsed document is a bs4.BeautifulSoup object, not a plain string
type(data)

bs4.BeautifulSoup

In [5]:
# prettify() returns the document as an indented string (one tag per line); print() renders it
print(data.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Basic Website Structure (No CSS)
  </title>
 </head>
 <body>
  <header>
   <h1 id="site-title">
    My Website
   </h1>
   <nav id="main-nav">
    <ul>
     <li>
      <a class="nav-link" href="#">
       Home
      </a>
     </li>
     <li>
      <a class="nav-link" href="#">
       About
      </a>
     </li>
     <li>
      <a class="nav-link" href="#">
       Contact
      </a>
     </li>
    </ul>
   </nav>
  </header>
  <hr/>
  <main id="main-content">
   <section id="content-area">
    <article class="post">
     <h2 id="post-title-1">
      First Article Title
     </h2>
     <p>
      This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.
     </p>
    </article>
  

In [6]:
# Extracting a whole tag (the tag together with its contents)

# Syntax: data.<tagname>
data.title

<title>Basic Website Structure (No CSS)</title>

In [7]:
# Attribute access works for any tag name; here it returns the whole <head> element
data.head

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>

In [8]:
# The <h1> element, shown with its id attribute
data.h1

<h1 id="site-title">My Website</h1>

In [9]:
data.p  # the page has several <p> tags; attribute access returns only the first one

<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>

In [10]:
data.kkkkj  # no such tag exists: this evaluates to None instead of raising an error, so nothing is displayed

In [11]:
# First <h2> in the document (the second article's <h2> is not returned)
data.h2

<h2 id="post-title-1">First Article Title</h2>

In [12]:
# Inspecting a tag: the tag itself, its name, and the text it contains

print(data.title)
print(data.title.name) # name of the tag, as a string
print(data.title.string) # text inside the <title> tag


<title>Basic Website Structure (No CSS)</title>
title
Basic Website Structure (No CSS)


In [13]:
print(data.title.attrs)  # .attrs holds the tag's attributes (id, class, ...)
# as a dictionary; <title> has no attributes, so an empty dict is printed

{}


In [14]:
# Attributes of the first <meta> tag (the charset declaration)
data.meta.attrs

{'charset': 'UTF-8'}

In [15]:
# Attributes of the first <article>; note that the class value comes back as a list
data.article.attrs

{'class': ['post']}

In [16]:
# Getting the value of a single attribute

print(data.article['class'])  # dict-style lookup: a missing attribute raises KeyError

# or

print(data.article.get('class')) # .get() returns None when the attribute is missing (no error)

['post']
['post']


In [17]:
# All text of the document as one string, with tags and attributes removed
# (the whitespace between tags is kept, hence the many '\n' in the result)

data.get_text()

'\n\n\n\n\nBasic Website Structure (No CSS)\n\n\n\nMy Website\n\n\nHome\nAbout\nContact\n\n\n\n\n\n\n\nFirst Article Title\nThis is the main content of the first article. An \'id\' attribute (like "post-title-1") must be unique on the entire page. It\'s often used for internal page links (anchors) or for JavaScript to find a specific element.\n\n\nSecond Article Title\nThis is the second article. It has the same \'post\' class as the one above. A \'class\' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.\n\n\n\nSidebar\n\nLink 1\nLink 2\nLink 3\n\n\n\n\n\n© 2025 My Website. All rights reserved.\n\n\n\n'

In [18]:
# find(name) returns the first tag with that name (or None if there is none)
# Similar to data.<tagname>

print(data.find('Website'))  # 'Website' is text, not a tag name, so this prints None
print(data.find('p'))
print(data.find('nav'))

None
<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>


In [19]:
# find_all(name) returns every occurrence of the tag,
# as a list-like result

data.find_all('p')

[<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>,
 <p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.</p>,
 <p class="copyright">© 2025 My Website. All rights reserved.</p>]

In [20]:
# Collect every <p> tag, then print them one per line
paragraphs = data.find_all('p')

for paragraph in paragraphs:
    print(paragraph)

<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
<p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.</p>
<p class="copyright">© 2025 My Website. All rights reserved.</p>


# Navigating the parse tree

In [21]:
data.find_all(['p','a'])  # a list of names matches any of them: every <p> and every <a> tag, in document order

[<a class="nav-link" href="#">Home</a>,
 <a class="nav-link" href="#">About</a>,
 <a class="nav-link" href="#">Contact</a>,
 <p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>,
 <p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.</p>,
 <a href="#">Link 1</a>,
 <a href="#">Link 2</a>,
 <a href="#">Link 3</a>,
 <p class="copyright">© 2025 My Website. All rights reserved.</p>]

In [22]:
# Passing True matches every tag in the document (no name filter);
# the result starts with <html>, which contains the whole page

data.find_all(True)

[<html lang="en">
 <head>
 <meta charset="utf-8"/>
 <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
 <title>Basic Website Structure (No CSS)</title>
 </head>
 <body>
 <header>
 <h1 id="site-title">My Website</h1>
 <nav id="main-nav">
 <ul>
 <li><a class="nav-link" href="#">Home</a></li>
 <li><a class="nav-link" href="#">About</a></li>
 <li><a class="nav-link" href="#">Contact</a></li>
 </ul>
 </nav>
 </header>
 <hr/>
 <main id="main-content">
 <section id="content-area">
 <article class="post">
 <h2 id="post-title-1">First Article Title</h2>
 <p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
 </article>
 <article class="post">
 <h2 id="post-title-2">Second Article Title</h2>
 <p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for gro

In [24]:
# Filter by attribute value: tags whose id is 'post-title-1' (still returned as a list)
data.find_all(id='post-title-1')

[<h2 id="post-title-1">First Article Title</h2>]

In [26]:
# Filter by CSS class; the keyword is spelled class_ because `class` is a reserved word in Python
data.find_all(class_='post')

[<article class="post">
 <h2 id="post-title-1">First Article Title</h2>
 <p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
 </article>,
 <article class="post">
 <h2 id="post-title-2">Second Article Title</h2>
 <p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.</p>
 </article>]

In [27]:
# Going down the parse tree (from a tag to the tags nested inside it)

In [28]:
# Print the indented tree again as a reference for the navigation examples that follow
print(data.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Basic Website Structure (No CSS)
  </title>
 </head>
 <body>
  <header>
   <h1 id="site-title">
    My Website
   </h1>
   <nav id="main-nav">
    <ul>
     <li>
      <a class="nav-link" href="#">
       Home
      </a>
     </li>
     <li>
      <a class="nav-link" href="#">
       About
      </a>
     </li>
     <li>
      <a class="nav-link" href="#">
       Contact
      </a>
     </li>
    </ul>
   </nav>
  </header>
  <hr/>
  <main id="main-content">
   <section id="content-area">
    <article class="post">
     <h2 id="post-title-1">
      First Article Title
     </h2>
     <p>
      This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.
     </p>
    </article>
  

In [29]:
# Start from <head> ...
data.head

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>

In [30]:
data.head.title  # chained access: document -> <head> -> <title>

<title>Basic Website Structure (No CSS)</title>

In [32]:
data.p  # first <p> found anywhere in the document

<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>

In [33]:
data.footer.p  # first <p> inside <footer> only

<p class="copyright">© 2025 My Website. All rights reserved.</p>

In [34]:
data.footer.p.string   # .string gives the text held by the footer's <p>

'© 2025 My Website. All rights reserved.'

In [37]:
# All <p> tags; `l` is reused by the following cells
l = data.find_all('p')

# First pass: each tag with its markup
for p_tag in l:
    print(p_tag)

print()  # blank separator line

# Second pass: only the text of each <p> tag
for p_tag in l:
    print(p_tag.string)

<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
<p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.</p>
<p class="copyright">© 2025 My Website. All rights reserved.</p>

This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.
This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles.
© 2025 My Website. All rights reserved.


In [38]:
print()
# .strings is a generator, so printing it directly shows the generator object, not the text
for p_tag in l:
    print(p_tag.strings)


<generator object Tag._all_strings at 0x00000256D9B09640>
<generator object Tag._all_strings at 0x00000256D9B09640>
<generator object Tag._all_strings at 0x00000256D9B09640>


In [39]:
print()
# Wrap the generator in list() to see the strings it produces
for p_tag in l:
    print(list(p_tag.strings))


['This is the main content of the first article. An \'id\' attribute (like "post-title-1") must be unique on the entire page. It\'s often used for internal page links (anchors) or for JavaScript to find a specific element.']
["This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles."]
['© 2025 My Website. All rights reserved.']


In [40]:
# Same idea with .stripped_strings (also a generator, so it is wrapped in list())

print()
for p_tag in l:
    print(list(p_tag.stripped_strings))


['This is the main content of the first article. An \'id\' attribute (like "post-title-1") must be unique on the entire page. It\'s often used for internal page links (anchors) or for JavaScript to find a specific element.']
["This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or would (with CSS) share visual styles."]
['© 2025 My Website. All rights reserved.']


In [41]:
# The <html> element: starting point for the children/descendants examples below
data.html

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>
<body>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<hr/>
<main id="main-content">
<section id="content-area">
<article class="post">
<h2 id="post-title-1">First Article Title</h2>
<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
</article>
<article class="post">
<h2 id="post-title-2">Second Article Title</h2>
<p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a

In [43]:
# .contents gives the direct children of <html> as a list
html_children = data.html.contents
print(len(html_children))    # the '\n' between tags are counted too (as text nodes, not tags)
print(html_children)

5
['\n', <head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>, '\n', <body>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<hr/>
<main id="main-content">
<section id="content-area">
<article class="post">
<h2 id="post-title-1">First Article Title</h2>
<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
</article>
<article class="post">
<h2 id="post-title-2">Second Article Title</h2>
<p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a 

In [44]:
# .children exposes the same direct children as an iterator rather than a list
data.html.children

<list_iterator at 0x256da0a0910>

In [45]:
# The iterator has no useful repr, so loop over it to see each direct child of <html>
c = data.html.children
for child in c:
    print(child)



<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>


<body>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<hr/>
<main id="main-content">
<section id="content-area">
<article class="post">
<h2 id="post-title-1">First Article Title</h2>
<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
</article>
<article class="post">
<h2 id="post-title-2">Second Article Title</h2>
<p>This is the second article. It has the same 'post' class as the one above. A 'class' attribute is perfect for grouping elements that share a purpose or w

In [48]:
# .descendants walks everything nested under <html>: its children, their children, and so on
# (text such as '\n' included). It is wrapped in list() here so it can be counted and printed.

desc=list(data.html.descendants)
print(len(desc))
print(desc)

93
['\n', <head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Basic Website Structure (No CSS)</title>
</head>, '\n', <meta charset="utf-8"/>, '\n', <meta content="width=device-width, initial-scale=1.0" name="viewport"/>, '\n', <title>Basic Website Structure (No CSS)</title>, 'Basic Website Structure (No CSS)', '\n', '\n', <body>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<hr/>
<main id="main-content">
<section id="content-area">
<article class="post">
<h2 id="post-title-1">First Article Title</h2>
<p>This is the main content of the first article. An 'id' attribute (like "post-title-1") must be unique on the entire page. It's often used for internal page links (anchors) or for JavaScript to find a specific element.</p>
</article>
<articl

In [51]:
# Going up the tree: .parent is the tag that directly contains the first <li> (its <ul>)
data.li.parent

<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>

In [53]:
# .parents yields every ancestor of the first <li> in turn, innermost first
# (<ul>, then <nav>, <header>, <body>, ...), so it is consumed with a loop
l=data.li.parents
for i in l:
    print(i)

<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<body>
<header>
<h1 id="site-title">My Website</h1>
<nav id="main-nav">
<ul>
<li><a class="nav-link" href="#">Home</a></li>
<li><a class="nav-link" href="#">About</a></li>
<li><a class="nav-link" href="#">Contact</a></li>
</ul>
</nav>
</header>
<hr/>
<main id="main-content">
<section id="content-area">
<article class="post">
<h2 id="post-title-1">First Article Title</h2>
<p>This is the main content of the first article.