Import necessary libraries

In [62]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

Load the web page content

In [6]:
# Fetch the example page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops us from parsing an HTTP error page
# as if it were the real content.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

Convert to a beautiful soup object

In [9]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the resulting tree can differ between machines.
# "html.parser" ships with Python and needs no extra install.
soup = bs(r.content, "html.parser")

Print out our html

In [10]:
# Dump the parsed document so we can see the tags the cells below search for.
print(soup)

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>



find() and find_all()

In [12]:
# find() returns only the first matching element (here the first <h2>).
first_header = soup.find("h2")
# get_text() extracts the text content of that element.
first_header.get_text()

'A Header'

In [16]:
# find_all() returns every matching element; print the text of each <h2>,
# one per line.
headers = soup.find_all("h2")
for h2_tag in headers:
    heading_text = h2_tag.get_text()
    print(heading_text)

A Header
Another header


Pass in a list of elements to look for

In [17]:
# A list of tag names matches any of them; the result below contains both
# the <h1> and the <h2> elements.
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

Pass in attributes to the find/find_all function

In [20]:
# attrs narrows the search to <p> tags whose id attribute is "paragraph-id".
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [23]:
# NOTE: despite its name, `paragraph` holds the <b> tags here, not <p> tags.
paragraph = soup.find_all("b")
for bold_tag in paragraph:
    bold_text = bold_tag.get_text()
    print(bold_text)

Some bold text


Nest find/find_all calls

In [25]:
# find() can be called on any element, which scopes the search to that
# element: first locate <body>, then the first <div> inside it.
body = soup.find("body")
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

Search for specific strings in our find/find_all calls

Find any paragraph whose text contains "Some" (the match is case-sensitive)

In [28]:
import re

In [29]:
# The pattern is compiled without flags, so the match is case-sensitive:
# it looks for "Some" with a capital S.
paragraphs = soup.find_all("p", string = re.compile("Some"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [30]:
# (H|h) accepts either capitalisation, so both "Header" and "header" match.
headers = soup.find_all("h2", string = re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

#### Select

In [31]:
# select() takes a CSS selector; "p" selects every <p> element.
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [32]:
# The find_all() equivalent of the select("p") call, for comparison.
content = soup.find_all("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

Go deeper with select: match nested elements

In [33]:
# "div p" is a CSS descendant selector: only <p> elements located inside a
# <div> are returned.
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

##### Getting different properties of HTML

- get_text() for bigger objects (when there are multiple child elements)

In [37]:
# Text of the first <h2>.
soup.find("h2").get_text()

'A Header'

In [45]:
# On an element with several children, get_text() joins the text of all of
# them; strip=True trims the whitespace around each piece before joining,
# which is why the pieces run together in the result.
soup.find("div").get_text(strip = True)

'HTML WebpageLink to more interesting example:keithgalli.github.io/web-scraping/webpage.html'

- string

In [38]:
# .string works when the element holds a single string, as this <h2> does.
soup.find("h2").string

'A Header'

`.string` does not work here (it returns `None`) because the element contains more than one string

In [47]:
# The <div> contains several child elements, so .string has no single value
# to return and the cell displays nothing; use get_text() for such elements.
soup.find("div").string

Get a specific property from an element

In [49]:
# Attributes of an element are read with dictionary-style indexing.
soup.find("a")["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

#### Code navigation

In [52]:
# Tag names can be chained as attributes to walk down the tree:
# <body> -> first <div> -> first <h1>.
soup.body.div.h1.string

'HTML Webpage'

Know the terms: parent, sibling, and child. Elements on the same level of the tree are considered siblings.

In [53]:
# find_next_siblings() returns the elements that follow the <div> on the
# same level of the tree.
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

#### Exercises

In [54]:
# Load the page used for the exercises. The timeout keeps a stalled
# connection from hanging the kernel, and raise_for_status() stops us from
# parsing an HTTP error page as if it were the real content.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()
# Name the parser explicitly so the tree does not depend on which optional
# parsers are installed; "html.parser" ships with Python.
soup = bs(r.content, "html.parser")
soup

<html><head>
<title>Keith Galli's Page</title>
<style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
</style>
</head>
<body>
<h1>Welcome to my page!</h1>
<img src="./images/selfie1.jpg" width="300px"/>
<h2>About me</h2>
<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>
<p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></p>
<p>I grew up in the great s

##### Grab all of the social links from the webpage

In [60]:
# Locate the element with class "socials", then print the href of the first
# <a> inside each of its <li> items.
links = soup.find(class_ = "socials")
for item in links.find_all("li"):
    anchor = item.find("a")
    print(anchor["href"])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


##### Grab a table on the webpage

In [64]:
# read_html() returns one DataFrame per <table> it finds on the page;
# show the first one.
tables = pd.read_html("https://keithgalli.github.io/web-scraping/webpage.html")
tables[0]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 9,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


Grab all the fun facts that use the word "is"

In [79]:
# Match "is" only as a whole word. A bare "is" pattern would also match the
# letters "is" inside longer words such as "this" or "list".
is_word = re.compile(r"\bis\b")
facts = soup.find(class_ = "fun-facts").find_all("li")
# One entry per <li>: the list of strings inside it that contain the word
# "is" (an empty list when there is none). A matched string can be only part
# of the fact -- presumably when the rest of the sentence sits in a nested tag.
[fact.find_all(string = is_word) for fact in facts]

[[],
 ['Middle name is Ronald'],
 [],
 ['Dunkin Donuts coffee is better than Starbucks'],
 ['A favorite book series of mine is '],
 ['Current video game of choice is '],
 ["The band that I've seen the most times live is the "]]