In [1]:
import re
from io import StringIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

### Loading a Webpage - requests Library

In [2]:
# Load the webpage content.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error
# page be parsed later as if it were the example document.
url = "https://keithgalli.github.io/web-scraping/example.html"
r = requests.get(url, timeout=10)
r.raise_for_status()

In [3]:
# Convert to a BeautifulSoup object.
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and emits a warning), so the parsed tree can differ between
# machines. "html.parser" ships with Python and needs no extra install.
soup = bs(r.content, "html.parser")

In [4]:
# Printing out our html: printing the soup shows the parsed document's markup
print(soup)

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>



In [5]:
# prettify() renders the same document with each tag and string on its own
# line, indented by nesting depth
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Starting to Scrape

#### find & find_all

In [6]:
# Finding the first element: find() returns a single tag, the first <h2>
# encountered in the document
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [7]:
# Finding all elements: find_all() returns a list of every matching tag,
# in document order
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [8]:
# Pass in a list of elements to look for: find() returns the first tag in the
# document whose name is ANY of the listed names
first_header = soup.find(["h1","h2"])
first_header

<h1>HTML Webpage</h1>

In [9]:
# The order of the names in the list is irrelevant: <h1> is still returned
# because it comes first in the document, not because of its list position
first_header = soup.find(["h2","h1"])
first_header

<h1>HTML Webpage</h1>

In [10]:
# With find_all, a list of names collects every <h1> and <h2>, in document order
headers = soup.find_all(["h1","h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [11]:
# Baseline before filtering by attribute: every <p> in the document
paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
# attrs narrows the match to tags whose attributes equal the given values
# (here: only the <p> with id="paragraph-id")
paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [13]:
# You can nest find/find_all calls: each call searches only inside the tag
# returned by the previous one (body -> div -> h1)
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [14]:
# Searching for a specific string: string= must equal the tag's whole text,
# so passing just one word out of it (e.g. "Some") would match nothing
paragraph = soup.find_all("p", string="Some bold text")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [15]:
# Using regex for a specific word: a compiled pattern matches any tag whose
# text contains it, so both paragraphs starting with "Some" are returned
import re
paragraph = soup.find_all("p", string=re.compile("Some"))
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [16]:
# Strings of different capitalization
# The pattern is case-sensitive, so this example only gives the header whose
# text contains the lowercase word "header"
headers = soup.find_all("h2", string=re.compile("header"))
headers

[<h2>Another header</h2>]

In [17]:
# Now looking for both capitalizations.
# "[Hh]eader" matches "Header" or "header". The earlier pattern "H|header"
# parsed as ("H") OR ("header"), so it would also have matched any <h2> that
# merely contained a capital H somewhere in its text.
headers = soup.find_all("h2", string=re.compile("[Hh]eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select method (CSS path selections)

In [18]:
# select() takes a CSS selector and returns a list of matches;
# the selector "p" matches every <p> tag
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
# Reprint the body as a reference for the selectors used in the following cells
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [20]:
# "div p" uses the descendant combinator: <p> tags nested inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [21]:
# "h2 ~ p" uses the general sibling combinator: <p> tags that come after an
# <h2> under the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [22]:
# "p#paragraph-id b": <b> tags inside the <p> whose id is "paragraph-id"
# ("#" in a selector means "id")
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [23]:
# ">" is the child combinator: it keeps only <p> tags that are direct children
# of <body>, which excludes the <p> nested inside the <div>
paragraphs = soup.select("body > p") # Direct children of the body
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
# Nested calls: select() also works on an individual tag and then searches
# only inside it; a paragraph that holds no <i> yields an empty list
print(paragraphs)
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [25]:
# Grab by element with specific property: [attr=value] matches any tag whose
# attribute has that value, regardless of tag name
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Grabbing the string/text from an HTML Element

In [26]:
# .string returns the text held by the tag (here the <h2>'s only content)
header = soup.find("h2")
print(header)
header.string

<h2>A Header</h2>


'A Header'

In [27]:
# Displaying a tag such as this div shows its full markup, including all of
# the elements nested inside it
div = soup.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [28]:
# get_text() joins the text of every element nested in the div and drops the
# tags themselves
div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



### Getting a property of HTML element (href, src, id, class, etc)

In [29]:
# Getting a link: a tag's attributes are read with dictionary-style indexing
link = soup.find("a")
print(link)
link["href"]

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>


'https://keithgalli.github.io/web-scraping/webpage.html'

In [30]:
# select() returns a list, so index into it before reading the id attribute
paragraph = soup.select("p#paragraph-id")
print(paragraph)
paragraph[0]["id"]

[<p id="paragraph-id"><b>Some bold text</b></p>]


'paragraph-id'

### Code navigation (parents, children, siblings)

#### Path Syntax

In [31]:
# The full document again, as a reference for the attribute-style navigation
# in the following cells
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [32]:
# A tag name used as an attribute navigates to that tag: soup.body is <body>
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [33]:
# Attribute access chains: the <div> inside <body>
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [34]:
# One level deeper: the <h1> inside that <div>
soup.body.div.h1

<h1>HTML Webpage</h1>

In [35]:
# .string at the end of the path gives the heading's text
soup.body.div.h1.string

'HTML Webpage'

### Parent, Sibling, Child

In [36]:
# Reprint the body as a reference for the parent/sibling/child navigation below
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [37]:
# find_next_siblings() lists the tags that follow the div at the same nesting
# level (the two <h2>/<p> pairs directly under <body>)
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Exercises

https://keithgalli.github.io/web-scraping/webpage.html

In [38]:
import requests
from bs4 import BeautifulSoup as bs

In [39]:
# Fetch the exercise page. The timeout prevents the cell from hanging on a
# stalled connection; raise_for_status() stops here on an HTTP error instead
# of parsing an error page.
url = "https://keithgalli.github.io/web-scraping/webpage.html"
r = requests.get(url, timeout=10)
r.raise_for_status()
# Explicit parser so the parsed tree does not depend on which optional parser
# libraries are installed (and so bs4 does not warn about guessing).
webpage = bs(r.content, "html.parser")

In [40]:
# print(webpage.prettify())

##### Exercise #1: Grab all social links on webpage in 3 different ways

<b>Method 1</b>

In [41]:
# My method: every <li class="social"> wraps a single <a>; print its target
links = webpage.find_all("li", class_="social")
# print(links)
for item in links:
    anchor = item.a
    print(anchor["href"])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [42]:
# video method: locate the <ul class="socials"> container first, then gather
# the href of every anchor inside it
ulist = webpage.find("ul", attrs="socials")
links = ulist.find_all("a")
actual_links = []
for anchor in links:
    actual_links.append(anchor["href"])
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

<b>Method 2</b>

In [43]:
# My method
# CSS selector: every <a> nested inside the <ul> whose class is "socials"
# (a leading "." in a selector means "class")
links = webpage.select("ul.socials a")
# print(links)
hrefs = (anchor["href"] for anchor in links)
for href in hrefs:
    print(href)

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [44]:
# Video method: same selector, but collect the hrefs into a list
links = webpage.select("ul.socials a")
actual_links = []
for anchor in links:
    actual_links.append(anchor["href"])
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

<b>Method 3</b>

In [45]:
# My Method: start from each <li class="social">, step into its <a>, read href
links = webpage.find_all("li", class_="social")
anchors = [item.a for item in links]
actual_links = [anchor["href"] for anchor in anchors]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [46]:
# Video Method: select the anchors under <li class="social"> directly
links = webpage.select("li.social a")
actual_links = list(map(lambda anchor: anchor["href"], links))
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

##### Exercise #2: Scrape an HTML table into a Pandas Dataframe

In [47]:
import pandas as pd

In [48]:
# Headers: the text of every <th> cell on the page
headers = [th.string for th in webpage.find_all("th")]
print(headers)

['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']


In [49]:
# Half Method: pull individual columns out of the table by the CSS class of
# their <td> cells.
# Column 1: season cells, with surrounding whitespace stripped
data1 = [
    cell.string.strip()
    for cell in webpage.find_all("td", attrs={"class":["season sorted"]})
]
# Column 2: text of the link inside each team cell
data2 = [anchor.string for anchor in webpage.select("td.team a")]
# Column 3: text of the link inside each league cell
data3 = [anchor.string for anchor in webpage.select("td.league a")]
# Column 4: regular-season games-played cells
data4 = [
    cell.string
    for cell in webpage.find_all("td", attrs={"class":["regular gp"]})
]
data2

[' MIT (Mass. Inst. of Tech.) ',
 ' MIT (Mass. Inst. of Tech.) ',
 ' MIT (Mass. Inst. of Tech.) ',
 ' MIT (Mass. Inst. of Tech.) ']

In [50]:
# Build a two-column frame from the season and league columns scraped above,
# labelled with the matching header texts
season_label = headers[0]
league_label = headers[2]
table = pd.DataFrame({season_label: data1, league_label: data3})
table

Unnamed: 0,S,League
0,2014-15,ACHA II
1,2015-16,ACHA II
2,2016-17,ACHA II
3,2017-18,
4,2018-19,ACHA III


<b>Built-in Method</b>

In [51]:
# Built in method
# pandas can parse <table> markup directly. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated
# (FutureWarning since pandas 2.1); a file-like object is the supported form.
table = webpage.find_all("table")
df = pd.read_html(StringIO(str(table)))[0]
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 9,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


<b>Video Method</b>

In [52]:
import pandas as pd

In [53]:
# Headers: restrict the search to the hockey-stats table, then read the text
# of each <th> in its <thead>
table = webpage.select("table.hockey-stats")[0]
thead = table.find("thead")
columns = thead.find_all("th")
column_names = list(map(lambda c: c.string, columns))
print(column_names)

['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']


In [54]:
# Rows in tables: one list of stripped cell texts per <tr> of the <tbody>
l = []
table_rows = webpage.find("tbody").find_all("tr")
for tr in table_rows:
    l.append([cell.get_text().strip() for cell in tr.find_all("td")])

print(l)

[['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3', '9', '12', '20', '', '|', '', '', '', '', '', '', ''], ['2015-16', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '9', '1', '1', '2', '2', '', '|', '', '', '', '', '', '', ''], ['2016-17', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '12', '5', '5', '10', '8', '0', '|', '', '', '', '', '', '', ''], ['2017-18', 'Did not play', '', '', '', '', '', '', '', '|', '', '', '', '', '', '', ''], ['2018-19', 'MIT (Mass. Inst. of Tech.)', 'ACHA III', '8', '5', '10', '15', '8', '', '|', '', '', '', '', '', '', '']]


In [55]:
# Creating DataFrame
# Label the columns with column_names, which was scraped from this table's own
# <thead>. The previous version used `headers`, a variable left over from an
# earlier exercise cell that collected every <th> on the page; it only lined
# up by coincidence and made this cell depend on unrelated state.
df = pd.DataFrame(l, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [57]:
# Selecting a single column by its label returns that column as a Series
df["Team"]

0    MIT (Mass. Inst. of Tech.)
1    MIT (Mass. Inst. of Tech.)
2    MIT (Mass. Inst. of Tech.)
3                  Did not play
4    MIT (Mass. Inst. of Tech.)
Name: Team, dtype: object

In [58]:
# Keep only the seasons that were actually played
played_mask = df["Team"].ne("Did not play")
df.loc[played_mask]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


##### Exercise #3: Grab all fun facts that contain the word “is”

In [86]:
# My Method
# Collect the text of every <li> in the fun-facts list, then keep the facts
# containing "is" as a whole word. \b anchors the match to word boundaries;
# a plain `"is" in text` test would also accept "this", "his", "list", ...
fact_items = webpage.find("ul", attrs={"class": "fun-facts"}).find_all("li")
fact_texts = [item.get_text() for item in fact_items]
words = [text for text in fact_texts if re.search(r"\bis\b", text)]
words

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

In [88]:
# video method
import re

# Match "is" only as a whole word; the bare pattern "is" would also match
# inside words such as "this".
is_word = re.compile(r"\bis\b")

facts = webpage.select("ul.fun-facts li")
# Test each <li> for a matching string anywhere inside it, then take the text
# of the <li> itself. Going through find_parent() of the matched string could
# return a nested inline tag (e.g. <i>) and yield only part of the fact.
facts_with_is = [fact.get_text() for fact in facts if fact.find(string=is_word)]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

##### Exercise #4: Use beautiful soup to help download an image from a webpage

In [107]:
# Download the first image on the page.
base_url = "https://keithgalli.github.io/web-scraping/"
image = webpage.find_all("img")[0]
image_url = image["src"]
print(image_url)
# urljoin resolves the relative src ("./images/...") against the page's base
# URL. Slicing off the first two characters only worked because this
# particular src happened to start with "./".
full_url = urljoin(base_url, image_url)
response = requests.get(full_url, timeout=10)
# Stop on an HTTP error rather than saving an error page as image.jpg
response.raise_for_status()
image_data = response.content
with open("image.jpg","wb") as im:
    im.write(image_data)

./images/selfie1.jpg


In [115]:
# video method
url = "https://keithgalli.github.io/web-scraping/"
images = webpage.select("div.row div.column img")  # or image = webpage.select("div.column img")
image_url = images[0]["src"]
# urljoin handles relative srcs with or without a leading "./" and absolute
# srcs alike, where plain string concatenation would build a wrong URL.
full_url = urljoin(url, image_url)
response = requests.get(full_url, timeout=10)
# Stop on an HTTP error rather than saving an error page as lake_como.jpg
response.raise_for_status()
img_data = response.content
with open("lake_como.jpg", "wb") as handler:
    handler.write(img_data)

##### Exercise #5: Solve the mystery challenge!!!

In [140]:
# My Method
# Follow every link in the "block" list, read the <p id="secret-word"> on each
# linked page, and join the words into one message.
url = "https://keithgalli.github.io/web-scraping/"
files = webpage.select("div.block li a")
# Resolve each relative href against the site's base URL
files = [urljoin(url, file["href"]) for file in files]

secret_words = []
for file_url in files:
    response = requests.get(file_url, timeout=10)
    # Stop on an HTTP error rather than searching an error page
    response.raise_for_status()
    web_file = bs(response.content, "html.parser")
    msg = web_file.find("p", attrs={"id": "secret-word"})
    secret_words.append(msg.string)

# join() puts a single space BETWEEN words; the earlier `s_msg += " " + word`
# loop left a stray leading space at the start of the message.
s_msg = " ".join(secret_words)

print(s_msg)

 Make sure to smash that like button and subscribe !!!


In [139]:
# video method
# Collect the relative hrefs of the linked files, then print the secret word
# found on each linked page, one per line.
files = webpage.select("div.block a")
relative_files = [f["href"] for f in files]

url = "https://keithgalli.github.io/web-scraping/"

for f in relative_files:
    full_url = urljoin(url, f)
    page = requests.get(full_url, timeout=10)
    # Stop on an HTTP error rather than searching an error page
    page.raise_for_status()
    # Explicit parser: same tree on every machine, no parser-guessing warning
    bs_page = bs(page.content, "html.parser")
    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    # Separate name for the text so one variable is not first a tag, then a string
    secret_word = secret_word_element.string
    print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
