In [1]:
import requests
from bs4 import BeautifulSoup


### Load our first page

In [2]:
# load the webpage content

r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

soup = BeautifulSoup(r.content)
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful Soup to Scrape
#### find and find_all

In [3]:
first_header = soup.find('h1')
first_header

<h1>HTML Webpage</h1>

In [4]:
all_header = soup.find_all('h2')
all_header

[<h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
headers = soup.find_all(['h1','h2'])

headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
soup.find('h1').get_text()

'HTML Webpage'

In [7]:
paragraph = soup.find_all('p')

paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [8]:
paragraph_by_id  = soup.find_all('p',attrs={'id':'paragraph-id'})

paragraph_by_id

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [9]:
body = soup('body')

body

[<body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>
 <h2>A Header</h2>
 <p><i>Some italicized text</i></p>
 <h2>Another header</h2>
 <p id="paragraph-id"><b>Some bold text</b></p>
 </body>]

In [10]:
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [11]:
div = body.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [12]:
header1 = div.find('h1')
header1

<h1>HTML Webpage</h1>

In [13]:
soup.find_all('p',string='Some')

[]

In [14]:
import re
soup.find_all("p", string=re.compile("Some"))

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [15]:
header = soup.find_all("h2",string=re.compile("(H|h)eader"))
header

[<h2>A Header</h2>, <h2>Another header</h2>]

#### select (CSS selector)

In [16]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [17]:
soup.select('p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [18]:
soup.select('div p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [19]:
soup.select('h2 ~ p')

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [20]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [21]:
paragraphs = soup.select("body > p")
print(paragraphs)


[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [22]:
for paragraph in paragraphs:
  print(paragraph.select("i"))

[<i>Some italicized text</i>]
[]


### Get different properties of the HTML

In [23]:
# use .string
header = soup.find("h2")
print(header.string)

# If multiple child elements use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())

A Header
<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [24]:

# Get a specific property from an element
link = soup.find("a")
print(link['href'])

paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

https://keithgalli.github.io/web-scraping/webpage.html


'paragraph-id'

In [25]:
anchors = soup.find_all('a')

all_link = set()
for link in anchors:
    if link.get('href') != '#':
        all_link.add('https://codewithharry.com' + link.get('href'))

print(all_link)

{'https://codewithharry.comhttps://keithgalli.github.io/web-scraping/webpage.html'}


### Code navigation

In [26]:

# Path Syntax
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [27]:
# Know the terms: Parent, Sibling, Child

soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Exercises!
#### Go to https://keithgalli.github.io/web-scraping/webpage.html

#### Load the webpage

In [30]:
r  = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')

soup = BeautifulSoup(r.content)

print(soup.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

### Grab all of the social links from the webpage
Do this in at least 3 different ways

In [32]:
anchors = soup.find_all('a')

for link in anchors:
  print(link['href'])

https://www.youtube.com/kgmit
#footer
https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli
https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats
https://www.eliteprospects.com/league/acha-ii/stats/2014-2015
https://www.eliteprospects.com/league/acha-ii/stats/2014-2015
https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats
https://www.eliteprospects.com/league/acha-ii/stats/2015-2016
https://www.eliteprospects.com/league/acha-ii/stats/2015-2016
https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats
https://www.eliteprospects.com/league/acha-ii/stats/2016-2017
https://www.eliteprospects.com/stats
https://www.eliteprospects.com/stats
https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats
https://www.eliteprospects.com/league/acha-iii/stats/2018-2019
https://www.eliteprospe

In [152]:
all_link = set()

url = "https://keithgalli.github.io/web-scraping/"
for link in anchors:
  
  if link['href'][:5] == 'https':
    all_link.add(link['href'])

  elif link['href'][:1]!='#':
    all_link.add(url+link['href'])

for link in all_link:
  print(link)


https://www.eliteprospects.com/league/acha-ii/stats/2016-2017
https://keithgalli.github.io/web-scraping/challenge/file_9.html
https://keithgalli.github.io/web-scraping/challenge/file_1.html
https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats
https://www.youtube.com/kgmit
https://www.eliteprospects.com/league/acha-ii/stats/2014-2015
https://www.linkedin.com/in/keithgalli/
https://keithgalli.github.io/web-scraping/challenge/file_2.html
https://keithgalli.github.io/web-scraping/challenge/file_10.html
https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://keithgalli.github.io/web-scraping/challenge/file_7.html
https://keithgalli.github.io/web-scraping/challenge/file_4.html
https://keithgalli.github.io/web-scraping/challenge/file_3.html
https://keithgalli.github.io/web-scraping/challenge/file_5.html
https://www.eliteprospects.com/league/acha-iii/stats/2018-2019
https://www.eliteprospects.com/league/acha-ii/stats/2015-2016
https://keit

In [50]:
links = soup.select('ul.socials a')

actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [54]:
ulist = soup.find("ul", attrs={"class": "socials"})
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']


 ### Scrape the Table

In [69]:
table = soup.select('table.hockey-stats')[0]
print(table.prettify())

<table class="hockey-stats">
 <thead>
  <tr>
   <th class="season" data-sort="">
    S
   </th>
   <th class="team" data-sort="team">
    Team
   </th>
   <th class="league" data-sort="league">
    League
   </th>
   <th class="regular gp" data-sort="gp">
    GP
   </th>
   <th class="regular g" data-sort="g">
    G
   </th>
   <th class="regular a" data-sort="a">
    A
   </th>
   <th class="regular tp" data-sort="tp">
    TP
   </th>
   <th class="regular pim" data-sort="pim">
    PIM
   </th>
   <th class="regular pm" data-sort="pm">
    +/-
   </th>
   <th class="separator">
   </th>
   <th class="postseason">
    POST
   </th>
   <th class="postseason gp" data-sort="playoffs-gp">
    GP
   </th>
   <th class="postseason g" data-sort="playoffs-g">
    G
   </th>
   <th class="postseason a" data-sort="playoffs-a">
    A
   </th>
   <th class="postseason tp" data-sort="playoffs-tp">
    TP
   </th>
   <th class="postseason pim" data-sort="playoffs-pim">
    PIM
   </th>
   <th class=

In [72]:
import pandas as pd

table = soup.select('table.hockey-stats')[0]
header = table.find_all('th')

col = [name.get_text() for name in header]    # you can use .get_text or .string

row = table.find('tbody').find_all('tr')

l = []
for tr in row:
  td = tr.find_all('td')
  row = [str(tr.get_text()).strip() for tr in td]
  l.append(row)

df = pd.DataFrame(l, columns=col)

In [73]:
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   S       5 non-null      object
 1   Team    5 non-null      object
 2   League  5 non-null      object
 3   GP      5 non-null      object
 4   G       5 non-null      object
 5   A       5 non-null      object
 6   TP      5 non-null      object
 7   PIM     5 non-null      object
 8   +/-     5 non-null      object
 9           5 non-null      object
 10  POST    5 non-null      object
 11  GP      5 non-null      object
 12  G       5 non-null      object
 13  A       5 non-null      object
 14  TP      5 non-null      object
 15  PIM     5 non-null      object
 16  +/-     5 non-null      object
dtypes: object(17)
memory usage: 808.0+ bytes


### Grab all fun facts that use word "is"

In [92]:
import re

facts = soup.select("ul.fun-facts li")
facts_with_is = [fact.find(string=re.compile("is")) for fact in facts]
facts_with_is = [fact.find_parent().get_text() for fact in facts_with_is if fact]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Download an Image

In [93]:


# Load the webpage content
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(url+"webpage.html")

# Convert to a beautiful soup object
webpage = bs(r.content)

images = webpage.select("div.row div.column img")
image_url = images[0]['src']
full_url = url + image_url

img_data = requests.get(full_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

### Solve the mystery challenge!

In [151]:
files = soup.select('div.block ul a')

relative_files = [f['href'] for f in files]


url = "https://keithgalli.github.io/web-scraping/"
for f in relative_files:
  full_url = url + f
  page = requests.get(full_url)
  bs_page = BeautifulSoup(page.content)
  secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
  secret_word = secret_word_element.string
  print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!


# Thankyou !!