In [1]:
from urllib.request import urlopen

# Download the sample page and print its raw body.
# urlopen() hands back an http.client.HTTPResponse; using it as a context
# manager closes the underlying connection once the body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    # read() yields the undecoded payload, hence the b'...' in the output
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [6]:
from bs4 import BeautifulSoup

# Download the page, releasing the connection as soon as the body is read,
# and parse it with Python's built-in 'html.parser' backend.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# bs is a bs4.BeautifulSoup object; print(bs) would dump the whole document.
print(bs.h1)

<h1>An Interesting Title</h1>


In [7]:
# Reading HTML content with LXML parser
import lxml
# The context manager closes the HTTP response once its body has been read.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'lxml')
print(bs.h1)
# The result is the same

<h1>An Interesting Title</h1>


In [9]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

try:
    html = urlopen("https://pythonscrapingthisurldoesnotexist.com")
except HTTPError:
    # HTTPError is a subclass of URLError, so it must be listed first:
    # the server was reached but answered with an error status.
    print("The server returned an HTTP error")
except URLError as e:
    # No response at all (e.g. the host name could not be resolved).
    print("The server could not be found!")
    print(e)
else:
    # Only reached when the request succeeded; close the response after use.
    with html:
        print(html.read())

The server could not be found!
<urlopen error [Errno 11001] getaddrinfo failed>


In [12]:
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup


def getTitle(url):
    """Download ``url`` and return the ``<h1>`` element found in its body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``<h1>`` tag located under ``<body>``, or ``None`` when the page
        cannot be retrieved (unreachable server or HTTP error status) or
        when the document has no ``<body>`` or no ``<h1>``.
    """
    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        # URLError is the base class of HTTPError, so this single clause
        # covers both an unreachable server and an HTTP error status.
        return None
    try:
        bsObj = BeautifulSoup(markup, "lxml")
        title = bsObj.body.h1
    except AttributeError:
        # bsObj.body is None when the document has no <body> tag, which
        # makes the .h1 lookup fail.
        return None
    return title


title = getTitle("http://www.pythonscraping.com/pages/page1.html")
# getTitle signals failure with None; test identity rather than equality so
# the check never goes through the tag object's own __eq__.
if title is None:
    print("Title could not be found")
else:
    print(title)

<h1>An Interesting Title</h1>


##### Comments
<p align="center">
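    Note that <em>urlopen</em> never returns <em>None</em>: if the server cannot be reached it raises a <em>URLError</em>, and if the server answers with an error status it raises an <em>HTTPError</em> (a subclass of <em>URLError</em>). Both cases therefore have to be handled with an <em>except</em> clause around the <em>urlopen</em> call. The <em>AttributeError</em> handler in the second <em>try</em> block covers a different situation: when the parsed page has no <code>body</code> tag, <em>bsObj.body</em> is <em>None</em>, and accessing <em>.h1</em> on it raises an attribute error.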
</p>

# Notes

<p align="center">
    <ul>
        <li>When scraping, one has to think of all the possible errors that can happen so that the code can handle all the exceptions correctly. Functions can be defined to make the code reusable and more readable.</li>
    </ul>
</p>