<a href="https://colab.research.google.com/github/ajmbarron/web_scraping_with_python-/blob/main/Chapter_1_Your_First_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
# Fetch the example page and parse it. The with-block closes the HTTP
# response as soon as its body has been read, so no connection is leaked.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
  bs=BeautifulSoup(html.read(), 'html.parser')
# The same <h1> can also be reached via bs.html.body.h1, bs.body.h1 or bs.html.h1
print(bs.h1)

<h1>An Interesting Title</h1>


In [None]:
## Connecting reliably and handling exceptions ##

## First situation: the page is not found on the server:

from urllib.error import HTTPError

# `except` and `else` must sit at the same indentation level as `try`;
# nesting them inside the try body is a SyntaxError.
try:
  html=urlopen('http://www.pythonscraping.com/pages/page1.html')
except HTTPError as e:
  print(e)
  # return None, break, or do some other "Plan B"
else:
  # program continues. Note: if you return or break in the
  # exception catch, you do not need to use the "else" statement
  pass  # placeholder: a suite made only of comments is not valid Python


SyntaxError: ignored

In [None]:
# Handle both a page that is not found and a server that is not found

from urllib.error import URLError
from urllib.error import HTTPError

# HTTPError is a subclass of URLError, so it has to be caught first;
# otherwise the URLError branch would swallow HTTP status errors as well.
try:
  html=urlopen('http://www.pythonscraping.com/pages/page1.html')
except HTTPError as e:   # the server answered with an error status (page not found, ...)
  print(e)
except URLError:         # the server itself could not be reached
  print('The server could not be found!')
else:
  # The response is only opened to probe connectivity here; close it so the
  # underlying connection is not left open.
  html.close()
  print("It worked!")



It worked!


In [None]:
# Note: BeautifulSoup returns None for a tag that does not exist, so asking
# that None for a further tag raises an AttributeError. The error is caught
# and displayed here so that the remaining cells still execute on "Run All".
try:
  print(bs.nonExistent.someTag)
except AttributeError as e:
  print('AttributeError:', e)



AttributeError: ignored

In [None]:
# Check if the tag exists, and if something exists inside it:

try:
  badContent=bs.nonExistent.anotherTag
except AttributeError:
  # Raised when the outer tag (bs.nonExistent) is missing, i.e. None
  print('Tag was not found')
else:
  # The outer tag exists, but the inner one may still be missing.
  # Identity comparison is the correct way to test for None.
  if badContent is None:
    print('Tag was not found')
  else:
    print(badContent)


Tag was not found


In [None]:
## Rewriting the code block

### Create a function getTitle which returns either the title of the page,
### or None if there was a problem retrieving it.
### If the server did not exist, html would be a None object, and html.read()
### would throw an AttributeError.

def getTitle(url):
  """Return the first <h1> inside <body> of the page at `url`, or None.

  Parameters:
    url: address of the page to download.

  Returns:
    The BeautifulSoup tag for the page's <h1>, or None when the page could
    not be retrieved (HTTP error status, unreachable server) or when it has
    no <body>/<h1>.
  """
  try:
    html=urlopen(url)
  except (HTTPError, URLError):
    # HTTPError: the server replied with an error status.
    # URLError: the server (or resource) could not be reached at all.
    return None
  try:
    # Close the response as soon as its body has been consumed.
    with html:
      bs=BeautifulSoup(html.read(), 'html.parser')
    # bs.body is None for a page without <body>; .h1 then raises AttributeError
    title=bs.body.h1
  except AttributeError:
    return None
  return title


# getTitle signals every failure by returning None, so one identity check
# against None is enough to tell success from failure.
title=getTitle('https://www.salario.com.br/profissao/vitrificador-cbo-752420/')
if title is None:
  print('Title could not be found!')
else:
  print(title)

Title could not be found!
