In [7]:
# Question 1
# Parse the given URL and print it's components

from urllib.parse import urlparse
from pprint import pprint

url = r'http://localhost:8888/notebooks/17-urllib%20Package.ipynb'
url_parsed = urlparse(url)

print('Parsing result for:', url, '\n')
print(url_parsed, '\n')
print('Scheme of the URL is:', url_parsed.scheme)
print('The network location is:', url_parsed.netloc)
print('The path is:', url_parsed.path)

Parsing result for: http://localhost:8888/notebooks/17-urllib%20Package.ipynb 

ParseResult(scheme='http', netloc='localhost:8888', path='/notebooks/17-urllib%20Package.ipynb', params='', query='', fragment='') 

Scheme of the URL is: http
The network location is: localhost:8888
The path is: /notebooks/17-urllib%20Package.ipynb


In [14]:
# Question 2
# Access Quora's robots.txt file
# 1. Using urllib.robotparser module, parse Quora's robot.txt file
# 2. Of the given four test URLs, find which of the urls can be accessed legally
# Repeat step 2 as different client agents:
#      a. '*' client agent
#      b. 'PGBot' client agent
#      c. 'ia_archiver'  client agent


from urllib.robotparser import RobotFileParser

# URL to Quoras's robots.txt file
# find the url to robots.txt file
quora_robot_url = r'https://www.quora.com/robots.txt'

# test URLs
test_url1 = r'https://www.quora.com/topic/Robert-Downey-Jr-actor' # hint: only PGBot can access
test_url2 = r'https://www.quora.com/Why-is-Robert-Downey-Jr-so-popular' # hint: only PGBot can access
test_url3 = r'https://www.quora.com/about' # hint: only PGBot can access
test_url4 = r'https://www.quora.com/profile/Hillary-Clinton' # hint: only PGBot can access

# list of test urls for easy processing
test_urls = [test_url1, test_url2, test_url3, test_url4]

# Create a RobotFileParser object
# Pass Quoras's URL while creating the object
quora_robot_parser = RobotFileParser(quora_robot_url)
quora_robot_parser.read()

print('The robots file read is located at:', quora_robot_parser.url, '\n')


user_agents = ['*', 'PGBot', 'ia_archiver']  # fill in the names of the given user-agents

print('-'*60, '\n')


# Iterate over each user-agent in the list
#         then, for each user-agent, iterate over the url list
#                  validate the URL for a user-agent against the robots.txt file


# Note: Using for loops to access each url for multiple user-agents automates our task
# the outer loop can be urls and inner loop can iterate over user-agents.
# Use any other method you like.
# To check you correctness, see the comments next to the test URLs

for u in test_urls:
    
    # print('User agent:', agent)
    # print('Testing url: ' + u)
    
    for a in user_agents:
        # validating request
        print("Validating url {} with agent {}".format(u, a))
        valid_request = quora_robot_parser.can_fetch(a, u)
        print('The requested URL', url, 'can be crawled:', valid_request)
    
    print('-'*60, '\n')

The robots file read is located at: https://www.quora.com/robots.txt 

------------------------------------------------------------ 

Validating url https://www.quora.com/topic/Robert-Downey-Jr-actor with agent *
The requested URL http://localhost:8888/notebooks/17-urllib%20Package.ipynb can be crawled: False
Validating url https://www.quora.com/topic/Robert-Downey-Jr-actor with agent PGBot
The requested URL http://localhost:8888/notebooks/17-urllib%20Package.ipynb can be crawled: True
Validating url https://www.quora.com/topic/Robert-Downey-Jr-actor with agent ia_archiver
The requested URL http://localhost:8888/notebooks/17-urllib%20Package.ipynb can be crawled: False
------------------------------------------------------------ 

Validating url https://www.quora.com/Why-is-Robert-Downey-Jr-so-popular with agent *
The requested URL http://localhost:8888/notebooks/17-urllib%20Package.ipynb can be crawled: False
Validating url https://www.quora.com/Why-is-Robert-Downey-Jr-so-popular with

In [24]:
# Question 3
# Request the given URL and pretty print the HTML obtained from the request
# Spoof the client as a real user
# Request the page: https://www.quora.com/topic/United-Nations
#

from bs4 import BeautifulSoup
from urllib.parse import urlencode
from urllib.request import Request, urlopen

url = r'https://www.quora.com/topic/United-Nations'

# Creating request headers
req_headers = {}
req_headers['user-agent'] = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Creating a request object
req = Request(url, headers=req_headers)

# Requesting data from the website
try:
    response = urlopen(req)
    page_content = response.read()
    bs = BeautifulSoup(page_content, 'lxml')
    print(bs.prettify())
except Exception as e:
    print(e)

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://qsf.ec.quoracdn.net/-3-images.favicon.ico-26-3f34badcb59c8f6c.ico" rel="icon"/>
  <script type="text/javascript">
  </script>
  <script type="text/javascript">
   var clicks=[],handleClicks=!0,handleUnready=function(e){if(handleClicks){for(var n=e.target;n&&n.tagName&&"A"!=n.tagName;)n=n.parentNode;if(n&&n.getAttribute&&"#"===n.getAttribute("href")){clicks.push(e.target);var t=document.getElementById("async_spinner");if(!t){t=document.createElement("div"),t.setAttribute("id","async_spinner"),t.setAttribute("class","__live_spinner");var a=document.createElement("div");a.setAttribute("class","__live_spinner_indicator"),require.whenReady("shared/loading",function(){var e=require("shared/loading"),n=e.createDots("small");a.appendChild(n),t.appendChild(a),document.body.appendChild(t)})}}}};window.clearHandlers=function(){if(handleClicks){handleClicks=!1,document.detachEvent?document.detachEvent("onclick",handleUnready):document.

In [28]:
# Question 4
# Using the `bs` object created in the last question,
# pretty print just the <body> part of the HTML document received

print(bs.body.contents)

[<style class="q-inline-css" data-idx="2" type="text/css">.action_button{-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;transition:opacity ease-in-out 100ms,color ease-in-out 100ms,background-color ease-in-out 100ms,border-color ease-in-out 100ms;border-radius:3px;box-shadow:0 1px 1px 0 rgba(200,200,200,0.2);display:inline-block;font-weight:500;outline:0;padding:3px 7px 4px 7px;text-align:center;text-decoration:none;cursor:pointer;color:#6d6d6d;border:1px solid #cbcbd8;background:#f6f6f9}.action_button:active{opacity:.6;box-shadow:none}.action_button:hover{text-decoration:none}.action_button:focus{border:1px solid #656565;color:#656565}.action_button:hover{border:1px solid #656565;color:#656565}.action_button:focus .count{color:#656565}.action_button:hover .count{color:#656565}.action_button:focus .count:before{background:#c2c2d1}.action_button:hover .count:before{backgr

In [8]:
# Question 5
# Using Chrome dev-tools, inspect the question element
# and print out all the questions rendered in the page


# Hint: "question_text" encloses the class "rendered_qtext"
#       which encloses the questions.
#       Try fetching the contents of both the classes
#       See the results and chose the best one
#       Your final output should be a questions printed, one in each line

from bs4 import BeautifulSoup
from urllib.parse import urlencode
from urllib.request import Request, urlopen

# Creating request headers
req_headers = {}
req_headers['user-agent'] = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Creating a request object
req = Request(url, headers=req_headers)

response = urlopen(req)
page_content = response.read()
bs = BeautifulSoup(page_content, 'lxml')
questions = bs.select('span.question_text > span.rendered_qtext')
for q in questions:
    for i in q.contents: # Content of tag could be a list size > 0, not sure how likely though.
        print(i)
    print('\n', '-'*60, '\n') # Borrowed from module instructions.

What are some cultural faux pas among diplomats?

 ------------------------------------------------------------ 

Does the China’s sparing use of the veto in the UN Security council suggest that China is actually a friendlier member of the world than most people think?

 ------------------------------------------------------------ 

Why can't Nigeria have a permanent seat on the UN Security Council, since it is the most populous black nation on earth?

 ------------------------------------------------------------ 

If the UN Security Council were newly formed today, who would you choose as the five permanent members?

 ------------------------------------------------------------ 

Is the United Nations an effective organisation?

 ------------------------------------------------------------ 

