In [1]:
import requests
from bs4 import BeautifulSoup as bs

In [5]:
# Download the example page. The timeout keeps the cell from hanging forever on
# a dead connection, and raise_for_status() stops here on an HTTP error instead
# of silently parsing an error page.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert the raw bytes to a BeautifulSoup object. The parser is named
# explicitly so the parse tree does not depend on which optional parsers happen
# to be installed (and so bs4 does not emit GuessedAtParserWarning).
soup = bs(r.content, "html.parser")
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Some basic methods/properties of the BeautifulSoup object (scraping)

In [6]:
soup.title # first <title> tag encountered in the document

<title>HTML Example</title>

In [7]:
soup.head # first <head> tag encountered in the document

<head>
<title>HTML Example</title>
</head>

In [9]:
# prettify() returns a string; without print() the notebook shows its repr,
# which is why the newlines appear as literal \n in the output below
soup.html.prettify()

'<html>\n <head>\n  <title>\n   HTML Example\n  </title>\n </head>\n <body>\n  <div align="middle">\n   <h1>\n    HTML Webpage\n   </h1>\n   <p>\n    Link to more interesting example:\n    <a href="https://keithgalli.github.io/web-scraping/webpage.html">\n     keithgalli.github.io/web-scraping/webpage.html\n    </a>\n   </p>\n  </div>\n  <h2>\n   A Header\n  </h2>\n  <p>\n   <i>\n    Some italicized text\n   </i>\n  </p>\n  <h2>\n   Another header\n  </h2>\n  <p id="paragraph-id">\n   <b>\n    Some bold text\n   </b>\n  </p>\n </body>\n</html>\n'

In [10]:
soup.title.name  # .name is the name of the tag itself, as a string

'title'

In [11]:
dir(soup.title) # lists every attribute and method available on the tag object

['HTML_FORMATTERS',
 'XML_FORMATTERS',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_attr_value_as_string',
 '_attribute_checker',
 '_find_all',
 '_find_one',
 '_formatter_for_name',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_select_debug',
 '_selector_combinators',
 '_should_pretty_print',
 '_tag_name_matches_and',
 'append',
 'attribselect_re',
 'attrs',
 'can_be_empty_element',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decomp

In [12]:
soup.title.text # gets the text inside the first encountered title tag

'HTML Example'

In [13]:
# .string also gets the text inside the first encountered title tag.
# Difference from .text: .string only works when the tag holds a single string
# (it is None when the tag has several children), while .text joins all the
# text nested inside the tag.
soup.title.string

'HTML Example'

In [14]:
soup.div # first <div> tag encountered, returned with everything nested inside it

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [15]:
soup.p # first <p> tag encountered (here, the one nested inside the div)

<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>

In [16]:
soup.find_all("p") # all occurrences of the p tag, returned as a list

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [17]:
# Pitfall: dot access on a tag looks for a CHILD TAG with that name, not for an
# HTML attribute. The <a> tag has no <href> child, so this prints None.
for link in soup.find_all("a"):
  print(link.href)

None


In [18]:
# use tag.get(...) to read attributes of a tag such as href, class, etc.
for link in soup.find_all("a"):
  print(link.get('href'))

https://keithgalli.github.io/web-scraping/webpage.html


In [19]:
print(soup.get_text()) # all the text in the document with the tags stripped out



HTML Example



HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html

A Header
Some italicized text
Another header
Some bold text





In [21]:
first_header = soup.find('h2') # find can also be used to get the first occurrence of a tag
print(first_header)

<h2>A Header</h2>


In [22]:
# given a list of tag names, find returns the first tag in the document that
# matches any of them (here the h1, since it comes before both h2 tags)
first_header = soup.find(['h1', 'h2'])
print(first_header)

<h1>HTML Webpage</h1>


In [23]:
# attributes of the tag can be passed as a dictionary to narrow the match;
# find_all still returns a list even when only one tag matches
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
# find can be called on any tag, not just on the soup: the second search only
# looks at elements inside the body returned by the first one.
# Useful for narrowing down searches in large pages.
body = soup.find("body")
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [26]:
# string= matches the exact text only (case sensitive); partial matches do not
# work out of the box. The <p> is matched here because the only thing inside it
# is the <b> tag holding exactly this text.
paragraphs = soup.find_all("p", string = "Some bold text")
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [27]:
import re

In [30]:
# passing a compiled regex instead of a plain string allows partial matches:
# every p whose text contains "Some" is returned
paragraph2 = soup.find_all("p", string = re.compile("Some"))
paragraph2

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [32]:
# the pattern matches both "Header" and "header", so both h2 tags are found
# (learn regex!)
headers = soup.find_all("h2", string = re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

# select (CSS selectors)

In [33]:
# select takes a CSS selector; a bare tag name matches every tag of that type,
# so the result is the same list of p tags that find_all("p") gave earlier
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [35]:
# "~" is the general sibling combinator: it selects every p that comes after an
# h2 under the same parent, not only the p immediately following it.
# Use "h2 + p" to get only the p directly after an h2.
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [40]:
# b tags nested anywhere inside the p tag whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [41]:
# ">" is the child combinator: only p tags that are direct children of body
# (the p nested inside the div is therefore not included)
paragraphs = soup.select("body > p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [42]:
# select can also be called on each selected paragraph: this shows the i tags
# inside it, or an empty list when the paragraph has none
for p in paragraphs: 
  print(p.select("i"))

[<i>Some italicized text</i>]
[]


In [44]:
# Grab elements by a specific attribute value: [name=value] matches every
# element whose attribute `name` is exactly `value`
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

In [45]:
# [name] on its own matches every element that has that attribute at all
soup.select("[href]")

[<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]

In [46]:
# Without the brackets, "href" is read as a tag name; there is no <href> tag in
# the page, so the result is an empty list
soup.select("href")

[]

# Get Different Properties of HTML

In [47]:
# I already experimented with this above; it is repeated here because Keith
# brings it up at this point in the tutorial
header2 = soup.find("h2")
header2.string

'A Header'

In [50]:
# grab the first div and pretty-print it to see what is nested inside
div = soup.find("div")
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [53]:
# No output: .string is None here because the div contains more than one child
# (an h1 and a p), so there is no single string to return
div.string

In [55]:
# get_text() joins the text of everything nested inside the div.
# Now you know the difference between string and text :P!
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



# Code Navigation

In [56]:
# path syntax: drill down through the tags, taking the first match at each step
soup.body.div.h1.string

'HTML Webpage'

In [58]:
# pretty-print the body to see the tree that the navigation cells below walk through
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [60]:
# know the terms: parent, sibling, child
# .parent climbs one level, so two of them go h1 -> div -> body
print(soup.body.div.h1.parent.parent)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>


In [61]:
# Pitfall: tags have no .sibling attribute. Unknown attribute names are treated
# as a search for a child tag with that name, so this prints None.
# The real API is .next_sibling / .previous_sibling or find_next_sibling().
print(soup.body.div.h1.sibling)

None


In [63]:
# Same pitfall: .siblings does not exist either, so it is looked up as a child
# tag named "siblings" and prints None.
# The real API is .next_siblings / .previous_siblings or find_next_siblings().
print(soup.body.div.h1.parent.siblings)

None


In [64]:
# find_next_sibling() is the working way to move sideways: it gives the next
# tag at the same level as the div, which here is the first h2
print(soup.body.div.find_next_sibling())

<h2>A Header</h2>


In [65]:
# find_next_siblings() returns a list of all the tags that follow the div at
# the same level
print(soup.body.div.find_next_siblings())

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]
