## Getting Data (Obtendo Dados)

Extracting data from the **Internet**

In [17]:
from bs4 import BeautifulSoup
import requests

# Sample HTML page that the parsing examples below are checked against.
url = ("https://raw.githubusercontent.com/"
       "joelgrus/data/master/getting-data.html")

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=10)
# Fail here on a 4xx/5xx response instead of parsing an error page and
# tripping the assertions further down with a confusing message.
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, 'html5lib')

In [6]:
# find() gives the first matching tag; soup.p is shorthand for the same lookup.
first_paragraph = soup.find('p')

expected_markup = '<p id="p1">This is the first paragraph.</p>'
assert str(first_paragraph) == expected_markup

In [11]:
# text of a tag

first_paragraph_text = soup.p.text
# Splitting on whitespace turns the paragraph text into a list of words.
first_paragraph_words = first_paragraph_text.split()

expected_words = ['This', 'is', 'the', 'first', 'paragraph.']
assert first_paragraph_words == expected_words

In [12]:
# attributes of a tag: a tag can be indexed like a dict

first_paragraph_id = soup.p['id']        # raises KeyError if there is no 'id'
first_paragraph_id2 = soup.p.get('id')   # returns None if there is no 'id'

# Both access styles must yield the same attribute value.
assert first_paragraph_id == first_paragraph_id2
assert first_paragraph_id2 == 'p1'

In [14]:
# every matching tag, not just the first

all_paragraphs = soup.find_all('p')  # soup('p') is shorthand for this call
# Keep only the paragraphs that carry a non-empty id attribute.
paragraphs_with_ids = [paragraph
                       for paragraph in all_paragraphs
                       if paragraph.get('id')]

assert len(all_paragraphs) == 2
assert len(paragraphs_with_ids) == 1

In [15]:
# three equivalent ways to select <p> tags with class "important"

# 1. explicit attribute filter
important_paragraphs = soup.find_all('p', {'class' : 'important'})
# 2. a bare string as second argument is treated as a CSS class
important_paragraphs2 = soup.find_all('p', 'important')
# 3. manual filter; tags without a class attribute fall back to []
important_paragraphs3 = []
for paragraph in soup.find_all('p'):
    if 'important' in paragraph.get('class', []):
        important_paragraphs3.append(paragraph)

assert important_paragraphs == important_paragraphs2
assert important_paragraphs2 == important_paragraphs3
assert len(important_paragraphs) == 1

In [20]:
# collect every <span> that sits inside a <div>

spans_inside_divs = []
for div in soup('div'):
    # div('span') searches only within this div's subtree.
    spans_inside_divs.extend(div('span'))

assert len(spans_inside_divs) == 3

In [22]:
# book ex: monitoring the congress

from bs4 import BeautifulSoup
import requests

# Page whose links are harvested below.
url = "https://www.house.gov/representatives"

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=10)
# Fail here on a 4xx/5xx response rather than harvesting links from an error page.
response.raise_for_status()

text = response.text
soup = BeautifulSoup(text, "html5lib")

# Collect the href of every anchor; anchors without an href are skipped
# because a['href'] would raise KeyError on them.
all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))

967


In [23]:
import re

# Must start with http:// or https://, and must end with .house.gov
# optionally followed by a single trailing slash.
regex = r"^https?://.*\.house\.gov/?$"

# Check the pattern against examples that must and must not match.
should_match = [
    "http://joel.house.gov",
    "https://joel.house.gov",
    "http://joel.house.gov/",
    "https://joel.house.gov/",
]
should_not_match = [
    "joel.house.gov",                    # no scheme
    "http://joel.house.com",             # wrong top-level domain
    "https://joel.house.gov/biography",  # has a path after the host
]

assert all(re.match(regex, candidate) for candidate in should_match)
assert not any(re.match(regex, candidate) for candidate in should_not_match)

In [25]:
# Keep only the links that look like a representative's own site.
matching_urls = [candidate
                 for candidate in all_urls
                 if re.match(regex, candidate)]

print(len(matching_urls))

# The first count is well above the 435 House seats, presumably because
# the same link occurs more than once on the page; drop the duplicates.
# Note that going through a set does not preserve the original order.
good_urls = list(set(matching_urls))

print(len(good_urls))

872
436


In [27]:
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get('https://jayapal.house.gov', timeout=10)
# Fail here on a 4xx/5xx response rather than scanning an error page.
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, 'html5lib')

# Use a set because the links might appear multiple times.
# The has_attr check skips anchors with no href, on which a['href'] would
# raise KeyError.
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}

# Example shape: {'/media/press-releases'}. The actual hrefs depend on the
# live site and may be absolute URLs rather than site-relative paths.
print(links)

{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


In [28]:
from typing import Dict, Set

# Maps each representative's site URL to the set of hrefs on its home page
# whose link text mentions "press releases".
press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    try:
        # Without a timeout, one stalled server would hang the whole crawl.
        html = requests.get(house_url, timeout=10).text
    except requests.RequestException as error:
        # One unreachable site should not abort the crawl of all the others;
        # report it and leave it out of press_releases.
        print(f"{house_url}: skipped ({error})")
        continue

    soup = BeautifulSoup(html, 'html5lib')
    # The has_attr check skips anchors with no href, on which a['href']
    # would raise KeyError.
    pr_links = {a['href']
                for a in soup('a')
                if a.has_attr('href') and 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://waters.house.gov: {'/media-center/press-releases'}
https://mccollum.house.gov: {'/media/press-releases'}
https://armstrong.house.gov: {'/media/press-releases'}
https://houlahan.house.gov/: set()
https://raskin.house.gov: {'/press-releases'}
https://adriansmith.house.gov/: {'/media/press-releases'}
https://nadler.house.gov: {'/news/documentquery.aspx?DocumentTypeID=1753'}
https://goldman.house.gov: {'/media/press-releases'}
https://houchin.house.gov: {'/media/press-releases'}
https://miller.house.gov/: {'/media/press-releases'}
https://meeks.house.gov: {'/media/press-releases', 'https://democrats-foreignaffairs.house.gov/press-releases'}
https://cardenas.house.gov: {'https://cardenas.house.gov/media-center/press-releases'}
https://cartwright.house.gov: {'/news/documentquery.aspx?DocumentTypeID=2442'}
https://maxmiller.house.gov: set()
https://laturner.house.gov: {'/media/press-releases'}
https://fry.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://sherrill.house

In [29]:
# paragraph mentions

def paragraph_mentions(text: str, keyword: str) -> bool:
    """Return True if any <p> element in `text` mentions `keyword`.

    `text` is parsed as HTML with html5lib. Only text inside <p> tags is
    searched, and the match is a case-insensitive substring test.
    """
    # Extract the text of every paragraph up front.
    paragraph_texts = [tag.get_text()
                       for tag in BeautifulSoup(text, 'html5lib')('p')]

    for paragraph_text in paragraph_texts:
        if keyword.lower() in paragraph_text.lower():
            return True
    return False

In [31]:
# checks for paragraph_mentions

text = """<body><h1>Facebook</h1><p>Twitter</p>"""

# "Twitter" is inside a <p>, so it counts; "Facebook" is only in the <h1>.
for keyword, expected in (("twitter", True), ("facebook", False)):
    assert paragraph_mentions(text, keyword) == expected

In [32]:
from urllib.parse import urljoin

# Print each representative's site that has at least one press-release page
# with a paragraph mentioning "data".
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        # The collected hrefs come in several shapes: root-relative
        # ('/media/press-releases'), absolute ('https://...'), and the base
        # URL may or may not end in '/'. Gluing them with f"{house_url}/{pr_link}"
        # produced 'https://x.house.gov//media/...' or
        # 'https://x.house.gov/https://...'; urljoin resolves all of these.
        url = urljoin(house_url, pr_link)

        try:
            # Without a timeout, one stalled server would hang the whole scan.
            text = requests.get(url, timeout=10).text
        except requests.RequestException as error:
            # Report the unreachable page and move on to the next link.
            print(f"skipping {url}: {error}")
            continue

        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break  # done with this house_url

https://cartwright.house.gov
https://laurellee.house.gov
https://grothman.house.gov
https://mchenry.house.gov
https://kean.house.gov
https://balint.house.gov
https://schakowsky.house.gov
https://carbajal.house.gov
https://degette.house.gov
https://biggs.house.gov
https://pallone.house.gov
https://danbishop.house.gov
https://dustyjohnson.house.gov/
https://phillips.house.gov/
https://luttrell.house.gov
https://sarajacobs.house.gov
https://tokuda.house.gov
https://veasey.house.gov
https://fallon.house.gov
https://delbene.house.gov
https://tenney.house.gov/
https://beyer.house.gov
