In [1]:
# YouTube Link:



# Ensure that you have beautifulsoup, requests, and lxml installed

# (lxml is the parser passed to BeautifulSoup below):

#   pip install beautifulsoup4

#   pip install requests

#   pip install lxml



import requests

from bs4 import BeautifulSoup



# Using the requests module, we call its "get" function to
# fetch the webpage whose URL is passed as the argument.
# A timeout (in seconds) is supplied because, without one,
# requests will wait indefinitely on a server that never
# responds, which would hang the notebook:
result = requests.get("https://www.google.com/", timeout=10)

# To make sure that the website is accessible, we can
# check that we obtained a 200 OK response, which indicates
# that the page is indeed present:
print(result.status_code)

# For other potential status codes you may encounter,
# consult the following Wikipedia page:
# https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

# We can also check the HTTP headers of the response to
# verify that we have indeed accessed the correct page:
print(result.headers)

# For more information on HTTP headers and the information
# one can obtain from them, you may consult:
# https://en.wikipedia.org/wiki/List_of_HTTP_header_fields

# Now, let us store the page content of the website accessed
# via requests (the raw response body) in a variable:
src = result.content



# Now that we have the page source stored, we will use the
# BeautifulSoup module to parse and process the source.
# To do so, we create a BeautifulSoup object based on the
# source variable we created above ('lxml' selects the parser):
soup = BeautifulSoup(src, 'lxml')

# Now that the page source has been processed via BeautifulSoup
# we can access specific information directly from it. For instance,
# say we want to see a list of all of the links on the page:
links = soup.find_all("a")
print(links)
print("\n")

# Perhaps we just want to extract the link that contains the text
# "About" on the page instead of every link. We can use the "text"
# attribute to access the text content between the <a> </a> tags.
for link in links:
    if "About" in link.text:
        print(link)
        # An <a> tag is not required to carry an href attribute, so use
        # .get(), which yields None for a missing attribute instead of
        # raising KeyError the way link.attrs['href'] would.
        print(link.get('href'))

200
{'Date': 'Mon, 27 Apr 2020 00:57:33 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2020-04-27-00; expires=Wed, 27-May-2020 00:57:33 GMT; path=/; domain=.google.com; Secure, NID=203=YrdoWHxUEJ-YLWnExhH4EOas3sIlvrY8sHxS0AJNnnpgzYLpLFAlvuC_C7GBvGXg4YEYA3xpgvnj4IhkrQJ7AkeJS7vg59Kt8WkdHDoGX8vSrb70fHUifCWhzoUPt8EXA-ZXoiVT3VFtv855QHib6-sSqToYX1L8m4WqbRjDMHw; expires=Tue, 27-Oct-2020 00:57:33 GMT; path=/; domain=.google.com; HttpOnly', 'Alt-Svc': 'quic=":443"; ma=2592000; v="46,43",h3-Q050=":443"; ma=2592000,h3-Q049=":443"; ma=2592000,h3-Q048=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,h3-T050=":443"; ma=2592000', 'Transfer-Encoding': 'chunked'}
[<a class="gb1" href="https://www.google.ca/imghp?hl=en&amp

In [2]:
# YouTube Link: https://www.youtube.com/watch?v=oDtLJEc5Ako



"""

In this video, we will be going over BeautifulSoup objects, namely:

    Tag, NavigableString, BeautifulSoup, and Comment



"""



from bs4 import BeautifulSoup





# To keep things simple and also reproducible, consider the following HTML code
# (kept as a string so the examples below do not depend on a live website):
html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>



<p class="story">Once upon a time there were three little sisters; their names:

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>



<p class="story">...</p>



<b class="boldest">Extremely bold</b>

<blockquote class="boldest">Extremely bold</blockquote>

<b id="1">Test 1</b>

<b another-attribute="1" id="verybold">Test 2</b>

"""

# Save a copy of the document to disk. The encoding is given explicitly
# so the file contents do not depend on the platform's default encoding.
with open('index.html', 'w', encoding='utf-8') as f:
    f.write(html_doc)

# Parse the in-memory document (not the file) with the lxml parser:
soup = BeautifulSoup(html_doc, "lxml")



#print(soup.prettify())



# Tag:



# Finds the first occurrence of a "b" (bold) tag

# in the document.

#print(soup.b)



# The "find" function also does the same, where it

# only finds the first occurrence in the HTML doc

# of a tag with "b".

#print(soup.find('b'))



# If we want to find all of the elements on the page

# with the "b" tag, we can use the "find_all" function.

#print(soup.find_all('b'))



# Name:



# This gives the name of the tag. In this case, the 

# tag name is "b".

#print(soup.b.name)



# We can alter the name and have that reflected in the

# source. For instance:

#tag = soup.b

#print(tag)

#tag.name = "blockquote"

#print(tag)



# Attributes:



#tag = soup.find_all('b')[2]

#print(tag)



# This specific tag has the attribute "id", which

# can be accessed like so:

#print(tag['id'])



#tag = soup.find_all('b')[3]

#print(tag)



# We can even access multiple attributes that are

# non-standard HTML attributes:

#print(tag['id'])

#print(tag['another-attribute'])



# If we want to see all attributes, we can access them

# as a dictionary object:

#tag = soup.find_all('b')[3]

#print(tag)



#print(tag.attrs)



# These properties are mutable, and we can alter them

# in the following manner.

#print(tag)

#tag['another-attribute'] = 2

#print(tag)



# We can also use Python's del statement to remove

# attributes, just as we would remove keys from a dictionary:

#del tag['id']

#del tag['another-attribute']

#print(tag)



# Multi-valued Attributes
# (Note: the example below works with the tag's string
# content rather than with a multi-valued attribute.)

# Collect every "b" tag, pick the one at index 3, and show
# both the tag itself and the string it contains:
bold_tags = soup.find_all('b')
tag = bold_tags[3]
print(tag)
print(tag.string)

# The string inside the tag offers a "replace_with" function,
# which swaps its content for something different. Printing
# the tag again shows the new text:
tag_text = tag.string
tag_text.replace_with("This is another string")
print(tag)

# NavigableString

# BeautifulSoup

# Comments

<b another-attribute="1" id="verybold">Test 2</b>
Test 2
<b another-attribute="1" id="verybold">This is another string</b>


In [3]:



# Let's obtain the links from the following website:

# https://www.whitehouse.gov/briefings-statements/



# One of the things this website consists of is records of presidential

# briefings and statements.



# Goal: Extract all of the links on the page that point to the 

# briefings and statements.



import requests

from bs4 import BeautifulSoup



# Fetch the listing page. The timeout (in seconds) keeps the notebook from
# hanging on an unresponsive server, and raise_for_status() stops with an
# HTTPError on a 4xx/5xx response instead of silently scraping an error page.
result = requests.get("https://www.whitehouse.gov/briefings-statements/", timeout=10)
result.raise_for_status()
src = result.content
soup = BeautifulSoup(src, 'lxml')

# Collect the href of the first <a> found inside each <h2> on the page.
urls = []
for h2_tag in soup.find_all('h2'):
    a_tag = h2_tag.find('a')
    # find() returns None when the heading contains no link, and an <a>
    # may lack an href; skip both cases rather than raising an error.
    if a_tag is None:
        continue
    href = a_tag.get('href')
    if href is not None:
        urls.append(href)

print(urls)

['https://www.whitehouse.gov/briefings-statements/joint-statement-president-donald-j-trump-president-vladimir-putin-russia-commemorating-75th-anniversary-meeting-elbe/', 'https://www.whitehouse.gov/briefings-statements/remarks-president-trump-signing-ceremony-h-r-266-paycheck-protection-program-health-care-enhancement-act/', 'https://www.whitehouse.gov/briefings-statements/text-letter-president-speaker-house-representatives-president-senate-71/', 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-tennessee-disaster-declaration-7/', 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-kentucky-disaster-declaration-5/', 'https://www.whitehouse.gov/briefings-statements/bill-announcement-95/', 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-remains-committed-providing-critical-relief-american-small-businesses-workers-healthcare-providers/', 'https://www.whitehouse.gov/briefings-statements/statement-pres

In [4]:
# YouTube Link:



# Let's obtain the links from the following website:

# https://www.whitehouse.gov/briefings-statements/



# One of the things this website consists of is records of presidential

# briefings and statements.



# Goal: Extract all of the links on the page that point to the 

# briefings and statements.



import requests

from bs4 import BeautifulSoup



# Fetch the listing page. The timeout (in seconds) keeps the notebook from
# hanging on an unresponsive server, and raise_for_status() stops with an
# HTTPError on a 4xx/5xx response instead of silently scraping an error page.
result = requests.get("https://www.whitehouse.gov/briefings-statements/", timeout=10)
result.raise_for_status()
src = result.content
soup = BeautifulSoup(src, 'lxml')

# Collect the href of the first <a> found inside each <h2> on the page.
urls = []
for h2_tag in soup.find_all('h2'):
    a_tag = h2_tag.find('a')
    # find() returns None when the heading contains no link, and an <a>
    # may lack an href; skip both cases rather than raising an error.
    if a_tag is None:
        continue
    href = a_tag.get('href')
    if href is not None:
        urls.append(href)

print(urls)


['https://www.whitehouse.gov/briefings-statements/joint-statement-president-donald-j-trump-president-vladimir-putin-russia-commemorating-75th-anniversary-meeting-elbe/', 'https://www.whitehouse.gov/briefings-statements/remarks-president-trump-signing-ceremony-h-r-266-paycheck-protection-program-health-care-enhancement-act/', 'https://www.whitehouse.gov/briefings-statements/text-letter-president-speaker-house-representatives-president-senate-71/', 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-tennessee-disaster-declaration-7/', 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-kentucky-disaster-declaration-5/', 'https://www.whitehouse.gov/briefings-statements/bill-announcement-95/', 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-remains-committed-providing-critical-relief-american-small-businesses-workers-healthcare-providers/', 'https://www.whitehouse.gov/briefings-statements/statement-pres