## Load in the necessary libraries 

In [1]:
pip install requests





In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup as bs

## Load our First Page

In [4]:
#Load Webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out our HTML
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful Soup to Scrape

In [5]:
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

# Find and Find-All

In [6]:
first_headers = soup.find("h2")
first_headers

<h2>A Header</h2>

In [7]:
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

 Passing List

In [8]:
headers = soup.find(["h1", "h2"])
headers

<h1>HTML Webpage</h1>

In [9]:
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

Passing attributes to the find/find_all function

In [10]:
paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

You can nest find/find_all calls

In [12]:
body = soup.find("body")
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [13]:
body = soup.find("body")
div = body.find("div")
header = div.find("p")
header

<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>

We can search specific strings in our find/find_all calls

In [14]:
paragraph = soup.find_all("p", string="Some bold text")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

## Regex 
by using this we don't need to use full text to search 

In [15]:
#using regex 
import re

paragraph = soup.find_all("p", string=re.compile("Some"))
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [16]:
#if there is a capital letters (A / a)

paragraph = soup.find_all("h2", string=re.compile("(H|h)eader"))
paragraph

[<h2>A Header</h2>, <h2>Another header</h2>]

## select (CSS selector)

In [17]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [18]:
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [19]:
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [20]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [21]:
paragraphs = soup.select("body > p")
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [22]:
# Grab by elemet with specific proprty 
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the HTML

In [23]:
#Getting only string 

header = soup.find("h2")
header.string

'A Header'

In [24]:
div = soup.find("div")
print(div.string)

None


In [25]:
#to over come above we have a built-in function
#use this when u have multiple child elements

div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [26]:
#get a specific property from an element [in this case we r extracting only link]
link = soup.find("a")
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

## Code navigation

In [27]:
#path syntax
soup.body.h2.string

'A Header'

In [28]:
#know the terms : parents, child, sibling   (documentation)

soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Practice

## Load a web page

In [39]:
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

webpage = bs(r.content)
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Grab all social link from the web page
Do this in 3 ways

In [50]:
link = webpage.find_all("a")
link

[<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>,
 <a href="#footer"><sup>1</sup></a>,
 <a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> ACHA II </a>,
 <a href="https://www.elite

In [56]:
links = webpage.select("ul.socials")
links

[<ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
 </ul>]

In [59]:
links = webpage.select("ul.socials a") 1a
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [60]:
# ** For actual links select

links = webpage.select("ul.socials a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [65]:
# know with find 2
links = webpage.find("ul", attrs={"class": "socials"})
links

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

In [70]:
ulinks = webpage.find("ul", attrs={"class": "socials"})
links = ulinks.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [77]:
#Another Method 3
links = webpage.select("li.social a")
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [78]:
#actual link
links = webpage.select("li.social a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Scraping the Table
best practice/ ideal to load into pandas dataframe

In [118]:
import pandas as pd

table = webpage.select("table.hockey-stats")[0]
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]     # we get colunm Headers Name

table_rows = table.find("tbody").find_all("tr")   # Table Rows from Table body
l = []       # storing in list                                    
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]   # .strip() we striping empty sapce, using get_text() beacuse nested elements [insted of string]  ** str().strip() because we can't strip to string we need to convert to python string ** 
    l.append(row)
    
df = pd.DataFrame(l, columns = column_names)  # loading into DataFrame (Pandas)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [None]:
#Extract Data from Each Row [Explanation purpose]
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]   # .strip() we striping empty sapce, using get_text() beacuse nested elements [insted of string]  ** str().strip() because we can't strip to string we need to convert to python string ** 
    l.append(row)
    

## Grab all  - fun facts - that contains the word 'is'
fun facts is a module

In [124]:
facts = webpage.select("ul.fun-facts li")
facts

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>,
 <li>Middle name is Ronald</li>,
 <li>Never had been on a plane until college</li>,
 <li>Dunkin Donuts coffee is better than Starbucks</li>,
 <li>A favorite book series of mine is <i>Ender's Game</i></li>,
 <li>Current video game of choice is <i>Rocket League</i></li>,
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>]

In [137]:
#get a list (li) where "is" present   [ import re(regex) which allows for pattern matching in strings.]
import re

facts = webpage.select("ul.fun-facts li")
facts_with_is = [fact.find(string=re.compile("is")) for fact in facts]
facts_with_is

[None,
 'Middle name is Ronald',
 None,
 'Dunkin Donuts coffee is better than Starbucks',
 'A favorite book series of mine is ',
 'Current video game of choice is ',
 "The band that I've seen the most times live is the "]

In [138]:
#get rid of None
import re

facts = webpage.select("ul.fun-facts li")
facts_with_is = [fact.find(string = re.compile("is")) for fact in facts]
facts_with_is = [fact for fact in facts_with_is if fact]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 'A favorite book series of mine is ',
 'Current video game of choice is ',
 "The band that I've seen the most times live is the "]

## Download Images

In [146]:
#load webpage content
url = "https://keithgalli.github.io/web-scraping/" #The base URL
r = requests.get(url +"webpage.html")              #The script fetches the webpage by appending to the base URL

#convert to a beautiful soup object
webpage = bs(r.content)

#image selecting  -----1
images = webpage.select("div.row div.column img")

#selecting first image  ------2
image_url = images[0]["src"]
full_url = url + image_url

#Download and Save the Image  -----3
img_data = requests.get(full_url).content
with open ('lake_como.jpg', 'wb') as handler:            # ** wb ** w = write mode, b = binary together is used to write binary data (such as an image)
    handler.write(img_data)



In [142]:
import os
print(os.getcwd())  # Get Current Working Directory # downloaded image is stored in this Directory


C:\Users\Shaik


## Mystery Message Challenge!
https://keithgalli.github.io/web-scraping/webpage.html     from this page

In [152]:
files = webpage.select("div.block a")
relative_files = [f["href"] for f in files]


url = "https://keithgalli.github.io/web-scraping/" #The base URL
for f in relative_files:
    full_url = url + f
    page = requests.get(full_url)
    bs_page = bs(page.content)
    secret_word_elemet = bs_page.find("p", attrs={"id": "secret-word"})
    secret_word = secret_word_elemet.string
    print(secret_word)
   

Make
sure
to
smash
that
like
button
and
subscribe
!!!
