In [1]:
from urllib.request import urlopen # Python module that provides a high-level
                                   # interface for fetching data across the World Wide Web
from bs4 import BeautifulSoup

In [2]:
# Context: while a scraper is running, it's common to hit errors, which stop the
# scraper's execution. That's why we have to anticipate the exceptions first.

In [7]:
# Conventional web scraping: download the page, then parse it

url = 'https://www.metrocuadrado.com/venta/medellin/?search=form'

# 1. The request is the step that can fail (see the note below).
with urlopen(url) as response:
    html = response.read()  # read the raw bytes while the connection is still open

bs = BeautifulSoup(html, 'html.parser')

bs.h1  # last expression of the cell: shows the first <h1> of the page

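# 1. Two things can go wrong when fetching the page:
#    - The page is not found on the server (or there was an error in retrieving it).
#      urlopen raises HTTPError, e.g. "404 Not Found" or "500 Internal Server Error".
#    - The server itself is not found (e.g. the site is down or the URL is mistyped).
#      urlopen raises URLError.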

<h1 class="H1-xsrgru-0 jdfXCo d-sm-inline-block breadcrumb-item active">  Inmuebles en Venta en Medellín</h1>

In [8]:
# HTTP error handling

from urllib.error import HTTPError # Raised by urlopen when the server answers
                                   # with an error status (e.g. 404 or 500)
url = 'https://www.metrocuadrado.com/venta/medellin/?search=form'

# urlopen() is the call that raises HTTPError, so the request itself must sit
# inside the try block. Wrapping only the parsing step would leave the error
# uncaught and the cell would still stop with a traceback.
try:
    with urlopen(url) as response: # Closes the connection once the block exits
        bs = BeautifulSoup(response, 'html.parser')
except HTTPError as e:
    print(e)
    # return null, break, or do some other "Plan B"
else:
    # Runs only when no exception was raised, so bs is guaranteed to exist here.
    print(bs.h1)
    # program continues. Note: If you return or break in the
    # exception catch, you do not need to use the "else" statement

<h1 class="H1-xsrgru-0 jdfXCo d-sm-inline-block breadcrumb-item active">  Inmuebles en Venta en Medellín</h1>


In [11]:
# HTTP + URL Error handling

# Both exception classes are imported here so this cell does not depend on an
# earlier cell having imported HTTPError.
from urllib.error import HTTPError, URLError


url = 'https://www.metrocuadrado.com/venta/medellin/?search=form'

# The request must be inside the try block: urlopen() is what raises HTTPError
# and URLError. With urlopen() outside the try, neither except clause could run.
try:
    with urlopen(url) as response: # Closes the connection once the block exits
        bs = BeautifulSoup(response, 'html.parser')
except HTTPError as e:
    # Must come before URLError: HTTPError is a subclass of URLError.
    print('HTTP error; ',e)
except URLError as e:
    print('URL error: ',e)
else:
    # Runs only when the page was fetched and parsed without an exception.
    print(bs.h1)

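# - HTTPError specifically handles HTTP response errors (e.g. 404, 500).
# - URLError is more general and catches a broader range of issues related to URL
# requests, such as network problems, failed connections, or incorrectly formatted URLs.
# - HTTPError is a subclass of URLError, so it must be caught first; otherwise the
# URLError clause would handle it and the HTTPError clause would never run.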

<h1 class="H1-xsrgru-0 jdfXCo d-sm-inline-block breadcrumb-item active">  Inmuebles en Venta en Medellín</h1>


In [None]:
# - When web scraping, it's important to think about the overall pattern for handling
# exceptions.
# - Reuse code.
# - Generic functions such as getSiteHtml or getTitle (with their own exception handling)
# make scraping easier.