## Import packages

In [2]:
import requests
from bs4 import BeautifulSoup as bs

## Load a simple HTML page


In [5]:
# Load the webpage content.
# The timeout stops the cell from hanging forever, and raise_for_status()
# fails loudly on an HTTP error instead of parsing an error page.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert HTML content into a BS object.
# Naming the parser explicitly avoids bs4's parser-guessing warning and
# keeps the parse result the same regardless of which parsers are installed.
soup = bs(r.content, "html.parser")

# Print out the HTML
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Using Beautiful Soup

### find and find_all methods

In [8]:
# find() returns only the first tag that matches, here the first <h2>
first_header = soup.find("h2")
first_header

<h2>A Header</h2>

In [11]:
# find_all() returns every matching tag, so now we see all h2 headers in the HTML
all_h2_headers = soup.find_all("h2")
all_h2_headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [49]:
# We can pass in a list of tag names to match several kinds of tag at once
all_headers = soup.find_all(name=["h1", "h2"])
all_headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [51]:
# We can pass in attributes to the find/find_all functions
# Suppose we want to only get the paragraph whose id is "paragraph-id"
# We can pass in attributes as a dictionary object
# If no tag has the requested attribute value, an empty list is returned
paragraphs = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [56]:
# Nesting find/find_all calls: each call searches only inside the tag
# returned by the previous one
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [70]:
# Search for specific strings in the find/find_all searches
# Use a regex to enhance string searches: a compiled pattern matches
# partial text instead of requiring the whole string to be equal
import re

paragraphs = soup.find_all(name="p", string=re.compile("Some"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [74]:
# The (H|h) alternation lets the pattern match both "Header" and "header"
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [116]:
# With only a tag name given, find_all returns every <p> in the document
paragraphs = soup.find_all("p")
paragraphs

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Select (a CSS selector)

In [123]:
# Let's get only the paragraphs
# A bare tag name as the selector works the same as .find_all("p")
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [139]:
# Get only the paragraphs that are nested inside a div
# (a space between two selectors means "descendant of")
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [135]:
# Print the parsed HTML again
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [137]:
# Get the paragraphs that come AFTER an h2 on the same level
# ("~" selects later siblings, i.e. tags that share the same parent)
content = soup.select("h2 ~ p")
content

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [166]:
# Let's grab the bold text in the paragraph whose id="paragraph-id"
# Essentially "select all b elements inside the p with id="paragraph-id""
# [id="..."] is an exact attribute match. The ~= operator instead matches
# one word of a whitespace-separated list, which is meant for attributes
# such as class, not for an id.
bold_text = soup.select('p[id="paragraph-id"] b')
bold_text

[<b>Some bold text</b>]

In [198]:
# Alternate way to achieve the same thing: "#" is the CSS shorthand for an id
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [208]:
# Third way: select the paragraph itself, then step into its <b> child
bold_text = soup.select("p#paragraph-id") # select() returns a list, so index into it
bold_text[0].b

<b>Some bold text</b>

In [199]:
# Get paragraphs that are DIRECT CHILDREN of body (i.e. ONLY one level below)
# The paragraph nested inside the div is therefore not selected
body_paragraphs = soup.select("body > p")
body_paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [184]:
# Print the text of each paragraph selected above.
# The loop variable is singular so it does not overwrite the notebook-wide
# `paragraphs` list with a single tag.
for paragraph in body_paragraphs:
    print(paragraph.string)

Some italicized text
Some bold text


### Get different properties of the HTML

In [186]:
# .string gives the text held by a tag; the h1 holds a single piece of text
header = soup.find('h1')
header.string

'HTML Webpage'

In [193]:
div = soup.find('div')
div.string # Returns None because the div has several children, so there is no single string

# (The bare expression above is not displayed: only a cell's last expression is echoed.)
# Use the .get_text() method to collect the text of all nested elements
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [212]:
# Get a specific property from an element
# We can treat tags like dictionaries and access their attributes by name
link = soup.find("a")
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

### Navigation

In [218]:
# Navigate the HTML with paths, using tag names as attributes
soup.body.div.h1

<h1>HTML Webpage</h1>

#### If a tag is on the same level as another tag, they are known as siblings
- use `.find_next_siblings()` to get all the siblings that come after a tag
- use `.find_next_sibling()` to get only the next sibling

In [235]:
# Let's get the siblings that come after the div tag
soup.body.div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [236]:
# Get the immediate parent of the h1 tag (the enclosing div)
soup.body.div.h1.find_parent()

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [237]:
# Get all the parents of the h1 tag, from the closest one outwards
# This returns div, body and html, followed by the top-level document
soup.body.div.h1.find_parents()

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>,
 <body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>
 <h2>A Header</h2>
 <p><i>Some italicized text</i></p>
 <h2>Another header</h2>
 <p id="paragraph-id"><b>Some bold text</b></p>
 </body>,
 <html>
 <head>
 <title>HTML Example</title>
 </head>
 <body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>
 <h2>A Header</h2>
 <p><i>Some italicized text</i></p>
 <h2>Another header</h2>
 <p id="paragraph-id"><b>Some bold text</b></p>
 </body>
 </html>,
 <html>
 <h

In [251]:
# find_all_next() returns every tag that comes after <body> in the document.
# Nested tags such as <a>, <i> and <b> are included, so this is more than
# just body's direct children.
soup.body.find_all_next()

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>,
 <h1>HTML Webpage</h1>,
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>,
 <h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <i>Some italicized text</i>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>,
 <b>Some bold text</b>]

## Practice using BS4
- use https://keithgalli.github.io/web-scraping/webpage.html

### Load the webpage

In [259]:
# Request the webpage.
# The timeout stops the cell from hanging forever, and raise_for_status()
# fails loudly on an HTTP error instead of parsing an error page.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()

# Load into a soup object.
# Naming the parser explicitly avoids bs4's parser-guessing warning and
# keeps the parse result the same regardless of which parsers are installed.
webpage = bs(r.content, "html.parser")

# Print HTML content
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

### 1. Grab all social media links 
- do this at least 3 ways 
    - find/find_all and select

In [289]:
# Inspecting the HTML, I find the youtube link
# string= is matched against the link's text, not against its href
youtube_link = webpage.find(name="a", string=re.compile('(Y|y)outube'))


In [291]:
# Show the text of the matched link
youtube_link.string

'youtube.com/kgmit'

In [408]:
# CSS selector: every <a> inside an <li> inside the <ul> with class "socials"
social_media_links = webpage.select("ul.socials li a")
# Show the text of each link
[link.string for link in social_media_links]

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [409]:
# find() returns the first <a> in the document, which on this page is the
# YouTube link; read its target from the href attribute
youtube_link = webpage.find("a")
youtube_link['href']

'https://www.youtube.com/kgmit'

In [410]:
# Take the whole <ul class="socials"> element and flatten it to its text
# pieces; stripped_strings yields each fragment with surrounding whitespace
# removed, so both the labels and the URLs appear in the result
social_media_links = webpage.find(name="ul", attrs={'class': 'socials'})
list(social_media_links.stripped_strings)

['Instagram:',
 'https://www.instagram.com/keithgalli/',
 'Twitter:',
 'https://twitter.com/keithgalli',
 'LinkedIn:',
 'https://www.linkedin.com/in/keithgalli/',
 'TikTok:',
 'https://www.tiktok.com/@keithgalli']

In [411]:
# A compiled regex as the class value matches <li> tags whose class contains "social"
social_media_links = webpage.find_all(name="li", attrs={'class': re.compile('social')})
# Each <li> wraps one <a>; show the text of that link
[link.a.string for link in social_media_links]

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [412]:
# Same list items as above, but read the link target (href) instead of the link text
[link.a['href'] for link in social_media_links]

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Get the table and put it into a Pandas dataframe 

In [548]:
# Locate the stats table by its CSS class, then split it into header and body
table = webpage.find(class_="hockey-stats")
table_head = table.find("thead")
table_body = table.find("tbody")

# Column names come from the <th> cells of the header
column_labels = [th.get_text("", strip=True) for th in table_head.find_all("th")]

# One <tr> per data row; all_info is filled from these rows in a later cell
data_rows = table_body.find_all("tr")
all_info = []

def get_hockey_info(row_data):
    """Return the whitespace-stripped text of every <td> cell in a table row.

    row_data is expected to offer find_all('td'), and each cell it returns
    is expected to offer get_text(strip=True), as BeautifulSoup tags do.
    The result is a list of strings in the order the cells are returned.
    """
    cells = row_data.find_all('td')
    return [cell.get_text(strip=True) for cell in cells]


In [549]:
# Build the list of rows in a single assignment. Appending to a list that
# was created in an earlier cell would add duplicate rows every time this
# cell is re-run; rebuilding it here keeps the cell idempotent.
all_info = [get_hockey_info(row) for row in data_rows]

In [546]:
import pandas as pd

In [550]:
# Build the DataFrame: one row per table row, columns named after the table header cells
df = pd.DataFrame(all_info, columns=column_labels)

In [551]:
# Display the resulting DataFrame
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [567]:
# Every <li> inside the <ul> with class "fun-facts"
facts = webpage.select("ul.fun-facts li")

In [583]:
# For each fact, look for a piece of text matching the pattern "is";
# find() gives None for facts that have no such text
is_pattern = re.compile('is')
matching_strings = [fact.find(string=is_pattern) for fact in facts]

# Keep only the facts that matched, and show the full text of the tag that
# directly contains the matching string
facts_with_is = [
    matched.find_parent().get_text()
    for matched in matching_strings
    if matched
]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

In [None]:
# 