In [41]:
"""
urllib is a standard Python library that contains functions for requesting data across the web, handling cookies,
and even changing metadata such as headers and your user agent.

urlopen is a function in module urllib.request. It is used to open a remote object across a network and read it.

help(urllib)
"""
# Import only the urlopen function.
from urllib.request import urlopen

# The response holds an open network connection; the `with` block closes it
# as soon as the body has been read and printed.
with urlopen('http://172.17.0.2/page1.html') as html:
    # read() returns the raw body as bytes, hence the b'...' in the output.
    print(html.read())


b'<html>\n<head>\n<title> A Useful Page </title>\n</head>\n<body>\n<h1> An interesting Title </h1>\n<div>\n\nOm Asato Maa Sad-Gamaya |\nTamaso Maa Jyotir-Gamaya |\nMrtyor-Maa Amrtam Gamaya |\nOm Shaantih Shaantih Shaantih ||\n\n</div>\n</body>\n</html>\n'
<class 'http.client.HTTPResponse'>


In [20]:
'''
Use the BeautifulSoup library (package bs4) to grab the heading of the page, i.e.
*<h1> An interesting Title </h1>*

Check that it is installed:
pip list | grep --colour beautifulsoup
beautifulsoup4                     4.8.2

python3 ex1_script.py
<h1>An Interesting Title</h1>

If it is not installed, use one of:
pip install beautifulsoup4

apt-get install python3-bs4
'''
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse while the connection is open; the `with` block then closes it.
with urlopen('http://172.17.0.2/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# While exploring, bs.text and bs.html are also worth printing.
# Here only the first <h1> under <html> is shown.
print(bs.html.h1)

<h1> An interesting Title </h1>


In [11]:
'''
BeautifulSoup takes a specific parser: "lxml", "lxml-xml", "html.parser", or "html5lib".

- lxml has some advantages over html.parser, as it is better at parsing "messy" or malformed HTML code.
  It fixes problems such as unclosed tags, improperly nested tags, and missing head or body
  tags. It is also faster than html.parser. The only disadvantage of lxml is that it has to be installed
  separately and depends on third-party C libraries to function.

'''
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse while the connection is open; the `with` block then closes it.
with urlopen('http://172.17.0.2/page1.html') as html:
    bs1 = BeautifulSoup(html.read(), 'lxml')

# Show the <head> element as parsed by lxml.
print(bs1.head)

<head>
<title> A Useful Page </title>
</head>


In [45]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Same page as before, this time parsed with the 'html5lib' parser.
# The `with` block closes the connection once the body has been parsed.
with urlopen('http://172.17.0.2/page1.html') as html:
    bs2 = BeautifulSoup(html.read(), 'html5lib')

# Show the first <h1> of the document.
print(bs2.h1)

<h1> An interesting Title </h1>


In [58]:
# The page is not found on the server, or there was an error in retrieving it:
# urlopen raises HTTPError (here HTTP Error 404) for
# http://172.17.0.2/*page2.html*

from urllib.request import urlopen
from urllib.error import HTTPError

try:
    html = urlopen('http://172.17.0.2/page2.html')
except HTTPError as e:
    # The server answered with an error status; printing the exception
    # shows the status code and reason.
    print(e)
else:
    # Success path: the body is not needed here, so just close the response.
    with html:
        print("Page Found")


HTTP Error 404: Not Found


In [None]:
# The server is not found. In that case there is no HTTP response at all
# (so no HTTP status code either); urlopen raises URLError instead.
# NOTE: this cell requests the same page1.html URL that the first cell fetched
# successfully, so the except branch only runs when that host is unreachable.

from urllib.request import urlopen
from urllib.error import URLError

try:
    html = urlopen('http://172.17.0.2/page1.html')
except URLError:
    print("NOT WORKING")
else:
    # Close the connection once the body has been read and printed.
    with html:
        print(html.read())


In [3]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# Distinguish "the server answered with an error status" from
# "the server could not be reached at all".
try:
    html = urlopen('http://171.17.0.222/page1.html')
except HTTPError:
    # HTTPError is a subclass of URLError, so it has to be handled first.
    print("The server returned an HTTP error")
except URLError:
    print("The server could not be found!")
else:
    # Close the connection once the body has been read and printed.
    with html:
        print(html.read())

The server could not be found!


In [1]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

def getTitle(url):
    """Download ``url`` and return the first <div> under its <html> element.

    Parameters
    ----------
    url
        Address of the page to fetch; passed straight to ``urlopen``.

    Returns
    -------
    The value of ``bs.html.div`` for the parsed page, or ``None`` when

    * the request fails, either with an HTTP error status (``HTTPError``)
      or because the server cannot be reached / the URL cannot be opened
      (``URLError``), or
    * the parsed document has no <html> element or no <div> inside it.

    NOTE(review): despite its name, this function returns a <div>, not the
    <title> or <h1> element; confirm which element is intended.
    """
    try:
        # The `with` block closes the connection once the body is parsed.
        with urlopen(url) as html:
            bs = BeautifulSoup(html.read(), 'html.parser')
    except (HTTPError, URLError):
        # HTTPError is already a URLError; both are listed to make the two
        # failure modes explicit.
        return None

    try:
        # bs.html is None when the document has no <html> tag, which makes
        # the chained .div access raise AttributeError.
        title = bs.html.div
    except AttributeError:
        return None

    return title

# Fetch the element from the test page and report the outcome.
title = getTitle('http://172.17.0.2/page1.html')

# getTitle signals failure with None; test identity rather than equality.
if title is None:
    print('Title could not be found')
else:
    print(title)

<div>

Om Asato Maa Sad-Gamaya |
Tamaso Maa Jyotir-Gamaya |
Mrtyor-Maa Amrtam Gamaya |
Om Shaantih Shaantih Shaantih ||

</div>


In [14]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse while the connection is open; the `with` block then closes it.
with urlopen('http://172.17.0.2/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'lxml')
print(type(bs))

# Collect every <span> whose class attribute is "green".
nameList = bs.find_all('span', {'class':'green'})
# To print the matched text: for name in nameList: print(name.get_text())

<class 'bs4.BeautifulSoup'>


In [16]:
# Show the signature and docstring of find_all. `bs` is not created in this
# cell, so a cell that builds it must have been run first.
help(bs.find_all)

Help on method find_all in module bs4.element:

find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) method of bs4.BeautifulSoup instance
    Look in the children of this PageElement and find all
    PageElements that match the given criteria.
    
    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.
    
    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet



In [22]:
'''
find_all(tag, attributes, recursive, text, limit, keywords)

Keyword arguments such as id=... and class_=... filter on attribute values.
When several are given, an element has to match all of them.
'''
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse while the connection is open; the `with` block then closes it.
with urlopen('http://172.17.0.2/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'lxml')

# Elements that have id="title" and class "text" at the same time.
title = bs.find_all(id='title', class_='text')
print(list(title))

[]


In [25]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse while the connection is open; the `with` block then closes it.
with urlopen('http://172.17.0.2/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# A list of tag names matches any of them: collect every heading level.
title = bs.find_all(['h1','h2','h3','h4','h5','h6'])
print(list(title))

[<h1>War and Peace</h1>, <h2>Chapter 1</h2>]
