## Import Libraries

In [1]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

## Load our first web page

In [2]:
# Load the webpage content. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of silently parsing an error page.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

# Convert to a bs object. The parser is named explicitly so the tree is built
# the same way regardless of which optional parsers are installed.
soup = bs(r.content, 'html.parser')

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using bs for scraping

In [3]:
# find_all returns every match as a list, so the name reflects that this holds
# all <h2> elements rather than only the first one.
h2_headers = soup.find_all('h2')
# Passing a list of tag names matches any of them.
headers = soup.find_all(['h1', 'h2'])
print(h2_headers)
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [4]:
# Collect every <h1> and <h2> in a single call by passing a list of tag names.
heading_tags = ['h1', 'h2']
headers = soup.find_all(heading_tags)
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
# find/find_all accept an `attrs` dict to filter on HTML attributes;
# here: <p> tags inside <body> whose id is "paragraph-id".
body_tag = soup.find('body')
id_filter = {'id': 'paragraph-id'}
paras = body_tag.find_all('p', attrs=id_filter)
paras

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [6]:
# find/find_all can also filter on an element's text through `string`.
# A compiled regex only has to match somewhere in that text.
some_string = soup.find_all('p', string=re.compile(r'Some'))
print(some_string)

# Accept either capitalisation of "header".
headers = soup.find_all(['h1', 'h2'], string=re.compile(r'[Hh]eader'))
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


In [7]:
# Pretty-print only the <body> subtree of the document.
body_markup = soup.body.prettify()
print(body_markup)

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [8]:
# select() takes a CSS selector: here, every <b> nested under <p id="paragraph-id">.

bold_selector = 'p#paragraph-id b'
content = soup.select(bold_selector)
print(content)

[<b>Some bold text</b>]


In [9]:
# select() is also available on individual tags, so calls can be nested.

paragraphs = soup.select('body > p')
print(paragraphs)

# For each <p> that is a direct child of <body>, list the <i> elements inside it.
for paragraph in paragraphs:
    italics = paragraph.select('i')
    print(italics)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [10]:
# Attribute selector: match any element whose align attribute equals "middle".

middle_aligned = soup.select('[align="middle"]')
print(middle_aligned)

[<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>]


## Get different properties of the HTML

In [11]:
# .string yields the header text for the <h2> but None for the <div>, which
# has several children; get_text() returns all the nested text instead.
first_h2 = soup.find('h2')
first_div = soup.find('div')
print(first_h2.string)
print(first_div.string)
print(first_div.get_text())

A Header
None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [12]:
# Tags support dict-style access to their HTML attributes.

first_link = soup.find('a')
print(first_link['href'])

id_paragraph = soup.select('p#paragraph-id')[0]
print(id_paragraph['id'])

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id


In [13]:
# Attribute-style navigation: walk body -> p -> a and read the link's text.

link_tag = soup.body.p.a
print(link_tag.get_text())

keithgalli.github.io/web-scraping/webpage.html


In [14]:
# Terminology used when navigating the tree: parent, sibling, child.
body_tag = soup.body
print(body_tag.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [15]:
# List the siblings that come after the first <div> inside <body>.
body_div = soup.body.find('div')
print(body_div.find_next_siblings())

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


# Next WebPage

In [16]:
# Fetch the second example page; the timeout and status check make a hung
# connection or an HTTP error surface immediately instead of going unnoticed.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
r.raise_for_status()
# Name the parser explicitly so the tree is built the same way in every environment.
webpage = bs(r.content, 'html.parser')

In [17]:
# Show the indented markup of the whole page.
webpage_markup = webpage.prettify()
print(webpage_markup)

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Grab all of the social links from the web page

In [18]:
# CSS route: anchors nested inside <li class="social"> elements of the body.
social_anchors = webpage.body.select('li.social a')
links = []
for anchor in social_anchors:
    links.append(anchor['href'])
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [19]:
# find route: locate <ul class="socials">, then read the href of each <a> inside it.
socials_filter = {'class': 'socials'}
ul_socials = webpage.find('ul', attrs=socials_filter)
social_anchors = ul_socials.find_all('a')
links = [anchor['href'] for anchor in social_anchors]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Grab the table

In [24]:
# Header row: the text of every <th> under the page's <thead>.
header_cells = webpage.thead.select('th')
headers = [cell.get_text() for cell in header_cells]
# Start the table as a list of rows whose first row is the header.
my_table = [headers]
headers

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [25]:
# Body rows: one list of stripped cell texts per <tr> under the page's <tbody>.
body_rows = [
    [cell.get_text().strip() for cell in table_row.select('td')]
    for table_row in webpage.tbody.select('tr')
]

# Rebuild the table from its header row (row 0) rather than appending to the
# existing list, so re-running this cell does not add the same rows again.
my_table = my_table[:1] + body_rows

for row in my_table:
    print(row)


['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']
['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3', '9', '12', '20', '', '|', '', '', '', '', '', '', '']
['2015-16', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '9', '1', '1', '2', '2', '', '|', '', '', '', '', '', '', '']
['2016-17', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '12', '5', '5', '10', '8', '0', '|', '', '', '', '', '', '', '']
['2017-18', 'Did not play', '', '', '', '', '', '', '', '|', '', '', '', '', '', '', '']
['2018-19', 'MIT (Mass. Inst. of Tech.)', 'ACHA III', '8', '5', '10', '15', '8', '', '|', '', '', '', '', '', '', '']


## Save as csv

In [26]:
# Build a frame from the raw rows, promote the first row to column labels,
# then keep only the data rows that follow it.
table_frame = pd.DataFrame(my_table)
table_frame.columns = table_frame.iloc[0]
df = table_frame.iloc[1:]
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
1,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
2,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
3,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
4,2017-18,Did not play,,,,,,,,|,,,,,,,
5,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [27]:
# Write the stats table to disk; index=False leaves the row index out of the file.
csv_path = 'MIT_hockey_stats.csv'
df.to_csv(csv_path, index=False)

## Grab all fun facts with 'is'

In [54]:
# Each fun fact is an <li> inside <ul class="fun-facts">.
fun_facts_list = webpage.select('ul.fun-facts li')

# \b anchors the match to the standalone word "is", so words that merely
# contain those letters (e.g. "this") are not counted.
is_word = re.compile(r'\bis\b')

# Approach 1: let bs4 apply the pattern to the strings found inside each <li>.
fun_facts1 = [fact.get_text() for fact in fun_facts_list if fact.find(string=is_word)]
print(fun_facts1)
# Approach 2: apply the same pattern to the extracted text of each <li>.
fun_facts2 = [li.get_text() for li in fun_facts_list if is_word.search(li.get_text())]
print(fun_facts2)

['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]
['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]


## Getting the Secret Message

In [83]:
# Base URL that the relative file links on the page are resolved against.
url = 'https://keithgalli.github.io/web-scraping/'
# NOTE(review): the [5:] slice skips the first five <li> links on the page, so it
# depends on the page layout - confirm it still isolates the secret-file links.
links_list = webpage.select('li  a')[5:]
# urljoin resolves relative hrefs against the base and leaves absolute ones intact.
links = [urljoin(url, link['href']) for link in links_list]

secret_message = []
for link in links:
    response = requests.get(link, timeout=10)
    # Stop on a missing or failing page rather than parsing an error body.
    response.raise_for_status()
    file1 = bs(response.content, 'html.parser')
    # Each linked file carries its word in <p id="secret-word">.
    secret_message.append(file1.select('p#secret-word')[0].get_text())
print(' '.join(secret_message))

Make sure to smash that like button and subscribe !!!
