# Load necessary libraries

In [25]:
import requests
from bs4 import BeautifulSoup as bs
import re

# Load our first page

In [2]:
# Load webpage content

r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to a bs object
soup = bs(r.content)

# Print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using Beautiful Soup to Scrape

### find and find_all

In [57]:
first_header = soup.find("h1")

print(first_header)

headers = soup.find_all("h2")

print(headers)

<h1>HTML Webpage</h1>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [58]:
# Pass in a list of elements to look for all header tags (h1 and h2)
first_header = soup.find(["h2", "h1"]) # The order doesn't matter, it will return the first header on the content
print(first_header)

headers = soup.find_all(["h1", "h2"])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [59]:
# Pass in attribute in find/find_all function
para = soup.find_all("p", attrs={"id":"paragraph-id"})
print(para)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [64]:
# Nest in find/find_all cells
body = soup.find('body')
div = body.find('div') # Now we only scrap inside div cell
header = div.find_all("h1")

print(header)

[<h1>HTML Webpage</h1>]


In [79]:
# Search specific strings in our find/find_all cells

paras = soup.find_all("p", string=re.compile("Some"))
print(paras)

headers = soup.find_all(["h1","h2"], string=re.compile("(?i)H")) # or ("(H|h)eader")
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [27]:
# Use CSS selector

content = soup.select('div p')
print("Get p after div:\n", content)

paragraphs = soup.select('h2 ~ p')
print("Get all p after h2:\n", paragraphs)

bold_text = soup.select('p b')
print("Get bold text:\n", bold_text)

Get p after div:
 [<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]
Get all p after h2:
 [<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
Get bold text:
 [<b>Some bold text</b>]


In [31]:
# Using loop
paragraphs = soup.select('body > p') # direct descendant of the body

print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [33]:
# Grab by element with specific property
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of the HTML

In [43]:
# Get the string within the element, only the string excluding the tag
header = soup.find('h2')
print(header.string)

div = soup.find('div')
print(div.get_text()) # Use this to get string from multiple elements

# print(div.string)
# Return None because it doesn't has clear instruction wether to print the text within h1 tag or the p tag

A Header

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [48]:
# Get specific property from an element
link = soup.find('a')
print(link['href'])

paragraph_id = soup.select('p#paragraph-id')
print(paragraph_id[0]['id'])

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id


# Code Navigation

In [76]:
# Path syntax
soup.body.div.h1

<h1>HTML Webpage</h1>

In [51]:
# Know the term: Parent, Sibling, and Child

print(soup.body.prettify())
# 1.Body -> 1.1.Div -> 1.1.1.h1 -> 1.1.1.1.p -> 1.2. h2

print(soup.body.find('div').find_next_siblings())
# Find more on BS Documentation

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


# Exercise

## Load the webpage

In [168]:
url = 'https://keithgalli.github.io/web-scraping/'

r = requests.get(url+'webpage.html')
wp = bs(r.content)

print(wp.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

### Grab all the social links from the webpage

Do it in three different ways

In [160]:
# Solution 1
## Get ul element with class socials
socials = wp.find('ul', attrs={'class':'socials'})
## Grab all the a tags
links = socials.find_all('a')
## Get the actual link
actual_links = [link['href'] for link in links]

print("Solution1:")
actual_links

Solution1:


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [131]:
# Solution 2
## Grab all the link
a = wp.select('li a')
## Filter links for social media
sns = a[1:5]
# Get the link using loop
print('Solution2:')
for link in sns:
    print(link['href'])

Solution2:
https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [156]:
# Solution 3
## Grab all the link
links = wp.find_all('a')
## Get the actual social links
actual_links = [link['href'] for link in links[2:6]]

print("Solution3:")
actual_links

Solution3:


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [158]:
# Solution 4
## Get the a elements inside ul.socials 
links = wp.select('ul.socials a')
## Get the actual link
actual_links = [link['href'] for link in links]

print("Solution4:")
actual_links

Solution4:


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [159]:
# Solution 5
## Get the a elements inside li.social
links = wp.select('li.social a')
## Get the actual link
actual_links = [link['href'] for link in links]

print("Solution5:")
actual_links

Solution5:


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape an HTML table into a Pandas Dataframe

In [171]:
import pandas as pd

In [339]:
# Solution 1
## Get the table
table = wp.select('table.hockey-stats')[0]

## Get the head
columns = table.select('thead tr th')

## Get the column name
column_names = [columns.string for columns in columns]

## Get the rows
table_rows = wp.select('tbody tr')

## Get the dataframe
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=column_names)
# Remove None
df = df.dropna()



## Try to use [0] on table to get only the tag if met this kind error
## ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [340]:
## Delete \n\n
df.replace(regex=True,inplace=True,to_replace='\n',value='')

In [351]:
df.loc[df['Team']!= 'Did not play']

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [352]:
# Solution 2
## Get the table
table = wp.select('table.hockey-stats')[0] # [0] to get only the tag

## Get the column names string
columns = table.find('thead').find_all('th')
column_names = [c.string for c in columns]

## Get table row string
rows = table.find('tbody').find_all('tr')

l = []

for tr in rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)

df = pd.DataFrame(l, columns=column_names)
df.loc[df['Team']!= 'Did not play']
# Use .strip() to strip whitespace (\n)
# Change into python string object (str) to make it work
# Use .get_text() instead of .text/.string because of the nested element

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


### Grab all fun facts that contain the word “is”

In [135]:
# Grab the fun facts
ul_fun = wp.select("ul.fun-facts")
li_is = [li.find_all(string=re.compile('is')) for li in ul_fun]
li_is = [li for li in li_is if li]

In [184]:
# Solution
## Grab the fun facts
facts = wp.select('ul.fun-facts li')
## Return specific string "is"
facts_is = [fact.find(string=re.compile("is")) for fact in facts]
## Remove None and get the rest of the string
facts_is = [fact.find_parent().get_text() for fact in facts_is if fact]
facts_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Use beautiful soup to help download an image from a webpage

In [273]:

imgs = wp.select('div.column img')


img_link = imgs[0]['src']

url = 'https://keithgalli.github.io/web-scraping/'
full_url = url+img_link


img_data = requests.get(full_url).content
with open("dopeass_landscape.jpg", "wb") as handler:
    handler.write(img_data)

# img_link = [link['src'] for link in imgs]
# names = ["img1.jpg", "img2.jpg", "img3.jpg"]
# url = 'https://keithgalli.github.io/web-scraping/'

# for link in img_link:
#     full_url = url + link
#     img_data = requests.get(full_url).content
#     for n in names:
#         with open(n, "wb") as handler:
#             handler.write(img_data)

### Mystery Challange

Scrape the links below grabbing the p tag with id="secret-word", to discover a secret message :)

In [267]:
files = wp.select('div div.block ul li a')
file_url = [f["href"] for f in files]

url = 'https://keithgalli.github.io/web-scraping/'

for f in file_url:
    full_url = url+f
    pages = requests.get(full_url)
    soup = bs(pages.content)
    secret = soup.select('p#secret-word')
    secret_word = [word.get_text() for word in secret]
    print(secret_word)

['Make']
['sure']
['to']
['smash']
['that']
['like']
['button']
['and']
['subscribe']
['!!!']
