In [1]:
from bs4 import BeautifulSoup
from urllib import request
from lxml import html

# Download the python.org front page and parse it into a BeautifulSoup tree.
# NOTE: this rebinds `html`, shadowing the `html` module imported from lxml above.
# The `with` block closes the HTTP response once parsing is done, and the parser
# is named explicitly ('lxml', already installed per the import above) so the
# resulting tree does not depend on which parsers happen to be available.
with request.urlopen('https://www.python.org') as response:
    html = BeautifulSoup(response, 'lxml')
html.title

<title>Welcome to Python.org</title>

In [2]:
# .text returns only the string inside <title>, without the tag markup.
html.title.text

'Welcome to Python.org'

In [3]:
# Attribute-style access: look up an <h1> element by its tag name.
html.h1

<h1 class="site-headline">
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>
</h1>

In [4]:
# find('h1') yields the same <h1> markup as the html.h1 shortcut in the previous cell.
html.find('h1')

<h1 class="site-headline">
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>
</h1>

In [5]:
# Tag-name access can be chained: reach the <img> nested inside the <h1>
# (it sits within the <a>, as the earlier <h1> output shows).
html.h1.img

<img alt="python™" class="python-logo" src="/static/img/python-logo.png"/>

In [6]:
# .attrs exposes the tag's attributes as a dict; note that 'class' comes back as a list.
html.h1.img.attrs

{'class': ['python-logo'],
 'src': '/static/img/python-logo.png',
 'alt': 'python™'}

In [7]:
# Subscripting a tag reads a single attribute value.
html.h1.img['src']

'/static/img/python-logo.png'

In [8]:
# Keyword arguments to find() filter on attributes: here, the element whose id is 'back-to-top-1'.
html.find(id='back-to-top-1')

<a class="jump-link" href="#python-network" id="back-to-top-1"><span aria-hidden="true" class="icon-arrow-up"><span>▲</span></span> Back to Top</a>

In [10]:
# Combine a tag name with an attrs dict: an <li> whose class is 'shop-meta'.
html.find('li', attrs={'class':'shop-meta'})

<li class="shop-meta">
<a href="/community-landing/">Community</a>
</li>

In [11]:
import re

# Print the target of every hyperlink on the page.
# href=True restricts the match to <a> tags that actually carry an href
# attribute, so url['href'] cannot raise KeyError on a bare anchor.
url_list = html.find_all('a', href=True)
for url in url_list:
    print(url['href'])

#content
#python-network
/
/psf-landing/
https://docs.python.org
https://pypi.org/
/jobs/
/community-landing/
#top
/
https://psfmember.org/civicrm/contribute/transact?reset=1&id=2
#site-map
#
javascript:;
javascript:;
javascript:;
#
https://www.facebook.com/pythonlang?fref=ts
https://twitter.com/ThePSF
/community/irc/
/about/
/about/apps/
/about/quotes/
/about/gettingstarted/
/about/help/
http://brochure.getpython.info/
/downloads/
/downloads/
/downloads/source/
/downloads/windows/
/downloads/mac-osx/
/download/other/
https://docs.python.org/3/license.html
/download/alternatives
/doc/
/doc/
/doc/av
https://wiki.python.org/moin/BeginnersGuide
https://devguide.python.org/
https://docs.python.org/faq/
http://wiki.python.org/moin/Languages
http://python.org/dev/peps/
https://wiki.python.org/moin/PythonBooks
/doc/essays/
/community/
/community/survey
/community/diversity/
/community/lists/
/community/irc/
/community/forums/
/psf/annual-report/2020/
/community/workshops/
/community/sigs/
/co

In [12]:
# Filter on the href attribute with a regex: keep elements whose link starts
# with http://docs or https://docs. limit=2 stops after the first two hits.
docs_href = re.compile('^http(s)?://docs')
docs_list = html.find_all(href=docs_href, limit=2)

In [13]:
# Show the href of each match; only two lines print because find_all was called with limit=2.
for doc in docs_list:
    print(doc['href'])

https://docs.python.org
https://docs.python.org/3/license.html


In [14]:
# Locate the <div> whose id is "nojs" (keyword-filter form of find) and display it.
tag = html.find('div', id='nojs')
tag

<div class="do-not-print" id="nojs">
<p><strong>Notice:</strong> While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience. </p>
</div>

In [15]:
# strip=True trims whitespace around each text fragment, which is why
# "Notice:" and "While" run together in the output.
print(tag.get_text(strip=True))

Notice:While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience.


In [16]:
# separator is placed between consecutive text fragments, making the
# fragment boundaries (including whitespace-only ones) visible.
print(tag.get_text(separator='-- '))


-- Notice:--  While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience. -- 



In [17]:
# print() on a tag emits its markup as stored (compare with prettify() in the next cell).
print(html.h1)

<h1 class="site-headline">
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>
</h1>


In [18]:
# prettify() re-indents the markup: one tag per line, nesting shown by indentation.
print(html.h1.prettify())

<h1 class="site-headline">
 <a href="/">
  <img alt="python™" class="python-logo" src="/static/img/python-logo.png"/>
 </a>
</h1>



In [19]:
# Snapshot of the <h1> before the tree-modification cells that follow.
html.h1

<h1 class="site-headline">
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>
</h1>

In [20]:
# Insert the string 'ham' as the first child of <h1>.
# This mutates the parsed tree in place (see the next cell's output).
html.h1.insert(0, 'ham')

In [21]:
# Confirm the insertion: 'ham' now appears right after the opening <h1> tag.
html.h1

<h1 class="site-headline">ham
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>
</h1>

In [22]:
# Insert 'egg' at child index 3; per the output, that places it directly
# after the closing </a>. The index depends on the current children of <h1>
# (including the 'ham' string added earlier), so cell order matters here.
html.h1.insert(3,'egg')
html.h1

<h1 class="site-headline">ham
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png"/></a>egg
</h1>

In [23]:
# Build a fresh <span> containing 'ham egg' and swap it in for the <img>.
# The cell output is the old <img>: replace_with() hands back the element it removed.
new_tag = html.new_tag('span')
new_tag.string = 'ham egg'
html.h1.img.replace_with(new_tag)

<img alt="python™" class="python-logo" src="/static/img/python-logo.png"/>

In [24]:
# The <a> now wraps the new <span> instead of the <img>.
html.h1

<h1 class="site-headline">ham
<a href="/"><span>ham egg</span></a>egg
</h1>

In [25]:
# clear() empties the <span> but leaves the tag itself in the tree.
html.h1.span.clear()
html.h1

<h1 class="site-headline">ham
<a href="/"><span></span></a>egg
</h1>

In [26]:
# decompose() removes the <span> from the tree entirely; the <a> is left empty.
html.h1.span.decompose()
html.h1

<h1 class="site-headline">ham
<a href="/"></a>egg
</h1>

In [27]:
# extract() detaches the <a> from the tree and returns it, so the cell output is the removed tag.
html.h1.a.extract()

<a href="/"></a>

In [28]:
# Only the two inserted strings ('ham' and 'egg') remain inside the <h1>.
html.h1

<h1 class="site-headline">ham
egg
</h1>

In [29]:
# Create a <div class="wrapper"> and wrap the <h1> in it.
# wrap() returns the new wrapper, which is what the cell displays.
wrapper_tag = html.new_tag('div')
wrapper_tag['class'] = 'wrapper'
html.h1.wrap(wrapper_tag)

<div class="wrapper"><h1 class="site-headline">ham
egg
</h1></div>

In [30]:
# html.h1 still resolves to the <h1> itself; the output does not include the new <div> wrapper around it.
html.h1

<h1 class="site-headline">ham
egg
</h1>