## urllib

### Importing urllib


In [None]:
import urllib
import urllib.request
import webbrowser

from pprint import pprint

`urllib2` is not available in Python 3.
In Python 3, the functionality of Python 2's `urllib`, `urllib2`, `urlparse` and `robotparser` lives in the `urllib` package, which consists of these modules:
  1. urllib.request
  2. urllib.parse
  3. urllib.error
  4. urllib.robotparser

### urllib.request

It will return the response object of our request

In [None]:
# Sample page that the urllib cells below fetch.
url = "http://pythonscraping.com/pages/page1.html"

`urllib.request.urlopen()` returns the response as an `http.client.HTTPResponse` object.

In [None]:
# Send a GET request and keep the response open: the cells below inspect
# and read this same object.
# The timeout (in seconds) stops the cell from blocking forever if the
# server never answers.
html_response = urllib.request.urlopen(url, timeout=10)

html_response

<http.client.HTTPResponse at 0x7fb0c8e03f50>

In [None]:
# URL of the resource that was retrieved. The `url` attribute replaces
# geturl(), which is deprecated since Python 3.9.
html_response.url

'http://pythonscraping.com/pages/page1.html'

In [None]:
# The `status` attribute replaces getcode(), which is deprecated since Python 3.9.
html_response.status  # 200 means OK

# https://developer.mozilla.org/en-US/docs/Web/HTTP/Status  -> list of the various status codes.

200

#### info() 

Calling `info()` on the `html_response` object returns the response headers.

In [None]:
# info() returns the response headers; print() renders them one per line.
print(html_response.info())

Date: Sun, 09 May 2021 06:51:24 GMT
Content-Type: text/html; charset=utf-8
Content-Length: 9593
Connection: close
Server: gunicorn/19.9.0
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true




#### read()

Calling `.read()` on the response returned by `urllib.request.urlopen()` gives us the actual data (the body) of the response.

In [None]:
# read() returns the response body as bytes; keep it in `data` for later cells.
data = html_response.read()

# Decode the bytes as UTF-8 text and pretty-print the resulting HTML string.
pprint(data.decode('UTF-8'))

('<html>\n'
 '<head>\n'
 '<title>A Useful Page</title>\n'
 '</head>\n'
 '<body>\n'
 '<h1>An Interesting Title</h1>\n'
 '<div>\n'
 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod '
 'tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, '
 'quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo '
 'consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse '
 'cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat '
 'non proident, sunt in culpa qui officia deserunt mollit anim id est '
 'laborum.\n'
 '</div>\n'
 '</body>\n'
 '</html>\n')


In [None]:
# Reading the same response a second time yields an empty bytes object.
html_response.read()

b''

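The response body is a stream that can only be read once: the first `read()` consumed it, so a second `read()` on the same response returns `b''`. The response headers also list the connection as `close`, meaning the server closes the connection after this response, so the data cannot be fetched a second time from a single request. To use the data again, keep the bytes returned by the first `read()` or make a new request.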

In [None]:
# The headers are still available after the body has been read.
print(html_response.info())

Date: Sun, 09 May 2021 06:53:22 GMT
Server: Apache
Last-Modified: Sat, 09 Jun 2018 19:15:58 GMT
ETag: "4121bc8-234-56e3a58b39172"
Accept-Ranges: bytes
Content-Length: 564
Cache-Control: max-age=1209600
Expires: Sun, 23 May 2021 06:53:22 GMT
Connection: close
Content-Type: text/html




In [None]:
# Numeric HTTP status code and its textual reason phrase, shown as a tuple.
html_response.status, html_response.reason

(200, 'OK')

In [None]:
# Decode the bytes saved from the first read() into a UTF-8 string.
html = data.decode("UTF-8")

pprint(html)

('<html>\n'
 '<head>\n'
 '<title>A Useful Page</title>\n'
 '</head>\n'
 '<body>\n'
 '<h1>An Interesting Title</h1>\n'
 '<div>\n'
 'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod '
 'tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, '
 'quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo '
 'consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse '
 'cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat '
 'non proident, sunt in culpa qui officia deserunt mollit anim id est '
 'laborum.\n'
 '</div>\n'
 '</body>\n'
 '</html>\n')


### Exception Handling: URLError

When requesting a web page, make sure to handle exceptions and errors such as `URLError` and `HTTPError`.

In [None]:
from urllib.error import URLError, HTTPError

In [None]:
# Request a host that does not exist to demonstrate error handling.
# HTTPError is a subclass of URLError, so it has to be caught first.
try:
    # The timeout (in seconds) keeps the cell from hanging on an unresponsive host.
    with urllib.request.urlopen('https://this-site-does-not-exist.com', timeout=10) as resp:
        pprint(resp.read().decode('UTF-8'))
except HTTPError as e:
    # The server answered, but with an error status (e.g. 404 or 500).
    print(e.code, e.reason)
except URLError as e:
    # The server could not be reached at all (e.g. the host name does not resolve).
    print(e.reason)

[Errno -2] Name or service not known


## Requests 

`requests` is another commonly used library for making HTTP requests.

In [None]:
import requests
import json
import webbrowser

from pprint import pprint

In [None]:
# requests applies no timeout by default, so pass one explicitly (in seconds)
# to keep the cell from hanging forever on an unresponsive server.
resp = requests.get('http://pythonscraping.com/pages/page1.html', timeout=10)

type(resp)

requests.models.Response

In [None]:
# HTTP status code of the response.
resp.status_code

200

In [None]:
# Show the response headers.
pprint(resp.headers)

{'Date': 'Sun, 09 May 2021 07:08:34 GMT', 'Server': 'Apache', 'Last-Modified': 'Sat, 09 Jun 2018 19:15:58 GMT', 'ETag': '"4121bc8-234-56e3a58b39172"', 'Accept-Ranges': 'bytes', 'Content-Length': '564', 'Cache-Control': 'max-age=1209600', 'Expires': 'Sun, 23 May 2021 07:08:34 GMT', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Content-Type': 'text/html'}


In [None]:
# resp.text is the response body as a str.
# NOTE(review): this rebinds `data`, which held the raw bytes read in the
# urllib section above.
data = resp.text

data

'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'