### Beautiful Soup! 

Here I am, learning webscraping!
Note: You need to use _Python 3_

In [1]:
import requests
from bs4 import BeautifulSoup as bs

In [4]:
#get info from webpage
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

#convert to a beautiful soup object
soup = bs(r.content)

#prints out elements in a pretty way
print(soup.prettify()) 


<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [5]:
first_header =soup.find("h2") #generates the first element that corresponds to h2
headers = soup.find_all("h2") #creates a list of all the elements corresponding to h2
print(headers)

In [9]:
first_header = soup.find(["h1", "h2"])
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [12]:
#passing in attributes to the find all function
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [18]:
#you can nest multiple calls
body = soup.find('body')
div = body.find('div')
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [22]:
import re #use the re library to correctly find infor on

#search for specific strings 
paragraphs = soup.find_all("p", string=re.compile("Some"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [25]:
print(soup.body.prettify())


<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [26]:
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [27]:
paragraphs = soup.select("h2 ~p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [28]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [33]:
paragraphs = soup.select("body > p")
paragraphs

for paragraph in paragraphs:
    paragraph.select("i")
    
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

In [37]:
header = soup.find("h2")  
header.string #this will give u just the string


div = soup.find("div")
print(div.prettify())
print(div.string) #unsure if it should print out the webpage or something else bc of multiple child elements
print(div.get_text()) #this will work if string does not

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [45]:
#get a specific property of an element
link = soup.find("a")
link['href']

paragraphs = soup.select("p#paragraph-id")
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [47]:
#path syntax
soup.body.div.h1.string

'HTML Webpage'

In [49]:
#know the terms: parent, sibling, child
print(soup.body.prettify())
#nested structure
#parent is the object directly above it in the hierarchy
#child is below t
#if it is on the same level, they are siblings

soup.body.find("div").find_next_sibling()

#bs4 has a lot of different functions 

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



<h2>A Header</h2>

### Exercises

In [50]:
##load webage
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

webpage = bs(r.content)

print(webpage.prettify()) #pretty extensive!

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

#### Task 1: Grab all social links in webpage

Do it in three different ways

In [74]:
links = webpage.select("ul.socials a")
a_links = [link['href'] for link in links]
a_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [79]:
ulist = webpage.find("ul", attrs={"class": "socials"})
links = ulist.find_all("a")
a_links = [link['href'] for link in links]
a_links


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

#### Task 2: Grab the table

In [94]:
import pandas as pd

table = webpage.select("table.hockey-stats")[0]
table

columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]

table_rows = table.find("tbody").find_all("tr")
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text().strip()) for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=column_names)
df['Team']

0    MIT (Mass. Inst. of Tech.)
1    MIT (Mass. Inst. of Tech.)
2    MIT (Mass. Inst. of Tech.)
3                  Did not play
4    MIT (Mass. Inst. of Tech.)
Name: Team, dtype: object