# Imports

In [347]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Load the First Page

In [348]:
# load webpage content with requests
# (timeout so a stalled connection cannot hang the notebook forever)
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
# fail fast on 4xx/5xx so an error page is never parsed as if it were the example
r.raise_for_status()
# convert to BeautifulSoup object; the parser is named explicitly so the tree
# does not depend on which optional parsers happen to be installed (and bs4
# does not emit GuessedAtParserWarning)
soup = bs(r.content, "html.parser")

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# find and find_all

In [349]:
# collect every <h2> tag in the document
headers = soup.find_all(name="h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [350]:
# a list of tag names matches any of them
headers = soup.find_all(name=["h2", "h1"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [351]:
# passing in attributes
# (written as a keyword filter; equivalent to attrs={"id": "paragraph-id"})
paragraph = soup.find_all("p", id="paragraph-id")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [352]:
# nesting find/find_all calls
# each find() searches only inside the tag it is called on, so the lookup
# narrows step by step: <body> -> first <div> -> its <h1>
body = soup.find(name="body")
div = body.find(name="div")
header = div.find(name="h1")
header

<h1>HTML Webpage</h1>

In [353]:
import re
# search for strings in find/find_all calls with regex
# the compiled pattern is passed as the `string` filter for <p> tags
string = re.compile("Some")
string_search = soup.find_all(name="p", string=string)
string_search

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [354]:
# <h2> tags whose text contains "Header" or "header"
headers = soup.find_all(
    name="h2",
    string=re.compile("(H|h)eader"),
)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

# select 

In [355]:
# descendant combinator: <p> tags nested anywhere inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [356]:
# general sibling combinator: <p> tags that come after an <h2> sharing the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [357]:
# <b> tags inside the paragraph whose id is "paragraph-id"
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [358]:
# child combinator: only <p> tags that are direct children of <body>
paragraphs = soup.select("body > p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

# Getting different properties of HTML

In [359]:
# keep the first <div> in a variable and show its indented markup
div = soup.find(name="div")
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [360]:
# text content of the <div> with all markup removed
print(div.text)


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [361]:
# get a specific property from an element
# (attributes of a tag are read with dictionary-style indexing)
soup.a["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [362]:
# select() returns a list, so index into it before reading the attribute
paragraphs = soup.select('p#paragraph-id')
paragraphs[0].attrs["id"]

'paragraph-id'

# Code navigation

In [363]:
#path syntax
# tag names are chained as attributes to walk body -> div -> h1;
# .string then reads the text held by that <h1>
soup.body.div.h1.string

'HTML Webpage'

In [364]:
# Parent, sibling, and child
# every tag that follows the first <div> of <body> at the same nesting level
soup.body.div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

# Exercises

In [365]:
# load webpage
# (timeout so a stalled connection cannot hang the notebook forever)
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
# fail fast on 4xx/5xx so an error page is never parsed as the exercise page
r.raise_for_status()
# convert to BeautifulSoup object; the parser is named explicitly so every
# later query runs against the same tree regardless of installed parsers
webpage = bs(r.content, "html.parser")

print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

In [366]:
# grab all social links from webpage in 3 different ways
# way 1: find() the <ul class="socials">, then print the href of each <a> in it
social_links = webpage.find("ul", class_="socials")
for link in social_links.find_all("a"):
    print(link["href"])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [367]:
# way 2: CSS selector.
# Select the anchors through the list's own class instead of "a <ul> that
# follows a <br>": the positional selector matches any such list on the page
# and stops working as soon as the surrounding markup changes. Iterating the
# result directly also avoids an IndexError when nothing matches.
social_links = webpage.select("ul.socials a")
for link in social_links:
    print(link["href"])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [368]:
# way 3: find_all() returns a list of matching <ul> tags; use the first one
social_links = webpage.find_all(name="ul", attrs={"class": "socials"})
for link in social_links[0].find_all("a"):
    print(link["href"])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [369]:
#scrape HTML table into pandas dataframe
import pandas as pd

# first <table> element on the page; the following cells read its header and rows
table = webpage.find(name="table")
table

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"

In [370]:
# get all table headers
# one label per <th>, taken from the cell's text
columns = table.find_all(name="th")
column_names = [header_cell.get_text() for header_cell in columns]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [371]:
# Collect the raw text of every <td> in each row of the table body.
# The iteration variables live inside the comprehension so they do not leak
# into the notebook namespace: a plain `for r in rows` loop would overwrite
# the requests response that earlier cells stored under the name `r`.
rows = table.find('tbody').find_all('tr')
l = [[cell.get_text() for cell in body_row.find_all('td')] for body_row in rows]

In [372]:
# Strip the surrounding whitespace/newlines from every cell.
# The cells are already str (they come from get_text()), so no str() conversion
# is needed.
new_l = [[element.strip() for element in table_row] for table_row in l]
        

In [373]:
# display the cleaned rows: one list of stripped cell strings per table row
new_l

[['2014-15',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '17',
  '3',
  '9',
  '12',
  '20',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2015-16',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '9',
  '1',
  '1',
  '2',
  '2',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2016-17',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '12',
  '5',
  '5',
  '10',
  '8',
  '0',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2017-18',
  'Did not play',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2018-19',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA III',
  '8',
  '5',
  '10',
  '15',
  '8',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  '']]

In [374]:
# build the frame from the cleaned rows, labelled with the scraped header text
pd.DataFrame(data=new_l, columns=column_names)

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [375]:
# get all facts that include the word is
facts = webpage.select('ul.fun-facts li')
# \b anchors the match to the whole word, so "this" or "list" no longer count.
# The full text of each <li> is searched and returned, so a fact is never
# truncated to the text of a nested inline tag.
word_is = re.compile(r'\bis\b')
facts_with_is = [fact.get_text() for fact in facts if word_is.search(fact.get_text())]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

# Download an image

In [376]:
url = 'https://keithgalli.github.io/web-scraping/'
r = requests.get(url + "webpage.html", timeout=10)
# stop here on 4xx/5xx instead of parsing an error page
r.raise_for_status()
# explicit parser: same tree on every machine, no GuessedAtParserWarning
webpage = bs(r.content, "html.parser")

# get image URL
img_url = webpage.select('div.row div.column img')[0]['src']

#download the image
# urljoin resolves the src against the page it came from, so relative,
# root-relative and absolute src values all produce a valid URL (plain string
# concatenation only works for the simple relative case).
image_response = requests.get(urljoin(r.url, img_url), timeout=10)
# the request is made and checked before the file is opened, so a failed
# download neither leaves an empty lake.jpg nor saves an HTML error body as a JPEG
image_response.raise_for_status()
with open('lake.jpg', 'wb') as handler:
    handler.write(image_response.content)

# Challenge

In [377]:
# collect the href of every <a> found inside a div.block
mystery_links = webpage.select('div.block a')
links = [link['href'] for link in mystery_links]

for link in links:
    # resolve each href against the base URL (handles relative and absolute hrefs)
    full_url = urljoin(url, link)
    # The response gets its own name: storing it in `webpage` would replace the
    # parsed page with a Response object, so re-running this cell (or any cell
    # that queries `webpage`) would fail.
    response = requests.get(full_url, timeout=10)
    response.raise_for_status()
    webpage_bs = bs(response.content, "html.parser")
    word = webpage_bs.find("p", attrs = {"id": "secret-word"})
    if word is None:
        # find() returns None when the page has no such paragraph; report it
        # instead of crashing on None.string
        print(f"(no secret word found at {full_url})")
    else:
        print(word.string)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
