# Loading necessary libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs

# Load our first page

In [2]:
# Load the web content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to a beautiful soup object
soup = bs(r.content)

# print the html content
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using beautiful soup for scraping

## find and find_all

In [3]:
first_header = soup.find("h2")
#print(first_header)

header = soup.find_all("h2")
print(header)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [4]:
# Passing a list of element o look for
first_header = soup.find(["h1","h2"]) #will find the one which occour first
#print(first_header)

header = soup.find_all(["h1","h2"])
print(header)

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [5]:
# You can pass attributes to find/find_all function
paragraph = soup.find_all("p",attrs={"id": "paragraph-id"})
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [6]:
# You can nest find/find_all calls
body = soup.find("body")
#print(body)
div = body.find("div")
#print(div)
header = div.find("h1")
print(header)

<h1>HTML Webpage</h1>


In [7]:
# Search specific string in find/find_all calls
import re

paragraph = soup.find_all("p",string= re.compile("Some"))
paragraph

header = soup.find_all("h2",string= re.compile("(H|h)eader"))
header

[<h2>A Header</h2>, <h2>Another header</h2>]

## Select (css selector)

For refrence (https://www.w3schools.com/cssref/css_selectors.asp)

In [8]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [9]:
paragraph = soup.select("div p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [10]:
paragraph = soup.select("h2 ~ p")
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [12]:
paragraphs = soup.select("body > p")
print(paragraphs)
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


## Getting different properties of HTML

In [13]:
header = soup.find("h2")
header.string

div = soup.find("div")
print(div.prettify())
print(div.string)# It will not work as it did not know which text to print
print(div.get_text())# Use this if multiple child element

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [14]:
# Get a specific property form a elemnet
link = soup.find("a")
link["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [15]:
#path syntax
soup.body.div.h1.string

'HTML Webpage'

# Another link for testing the knowledge

## Getting all social link

In [16]:
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

soup = bs(r.content)

In [17]:
links = soup.select(".socials a")
for link in links:
    print(link["href"])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


## Making the table into datafram

In [18]:
import pandas as pd

In [19]:
columns = soup.find("thead").find_all("th")
column_names = [c.string for c in columns]

table_rows = soup.find("tbody").find_all("tr")
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Grab all funfact with world "is"

In [20]:
facts = soup.select("ul.fun-facts li")
fun_fact=[fact.find(string = re.compile("is")) for fact in facts]
fun_fact = [fact.find_parent().get_text() for fact in fun_fact if fact]
fun_fact

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

## Getting the secret msg

In [21]:
links = soup.select(".block ul li a")

In [22]:
file_link =[]
for link in links:
    file_link.append(link["href"])
file_link

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

In [31]:
paragraph= []
for link in file_link:
    r = requests.get("https://keithgalli.github.io/web-scraping/"+link)
    site = bs(r.content)
    for wrapper in site.find_all("p",attrs={"id": "secret-word"}):
        paragraph.append(wrapper.text)

In [37]:
secret_msg = " ".join(paragraph)
print(secret_msg)

Make sure to smash that like button and subscribe !!!
