In [31]:
from collections import Counter
import csv
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import re
import json
from dateutil.parser import parse

In [3]:
def get_domain(email_address: str) -> str:
    """Return the lower-cased text that follows the last '@' in the address.

    If the string contains no '@', the whole lower-cased string is returned.
    """
    _, _, domain = email_address.lower().rpartition("@")
    return domain

assert get_domain('joelgrus@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

In [4]:
# Count how often each e-mail domain occurs; lines without an '@' are ignored.
with open('data/emails.txt', 'r') as f:
    domain_counts = Counter(
        map(get_domain, (line.strip() for line in f if '@' in line))
    )

print(domain_counts)

Counter({'gmail.com': 4, 'ukr.net': 3, 'yahoo.com': 2, 'mail.com': 1})


In [5]:
# Seed the price file with six tab-separated rows: date, symbol, closing price.
with open('data/tab_delimited_stock_prices.txt', 'w') as f:
    f.write(
        "6/20/2014\tAAPL\t90.91\n"
        "6/20/2014\tMSFT\t41.68\n"
        "6/20/2014\tFB\t64.5\n"
        "6/19/2014\tAAPL\t91.86\n"
        "6/19/2014\tMSFT\t41.51\n"
        "6/19/2014\tFB\t64.34\n"
    )

In [6]:
# Read the tab-delimited file back and echo every row, with the price as a float.
with open('data/tab_delimited_stock_prices.txt', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        date, symbol, closing_price = row[0], row[1], float(row[2])
        print(date, symbol, closing_price)
    

6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34


In [7]:
# Closing prices to append to the price file, keyed by ticker symbol.
todays_prices = {
    'AAPL': 90.91,
    'MSFT': 41.68,
    'FB': 64.5,
}

In [8]:
# Append one row per symbol, stamped with today's date, to the price file.
# newline='' lets the csv writer control line endings itself.
with open('data/tab_delimited_stock_prices.txt', 'a', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    now = datetime.today()
    # Build the un-padded M/D/YYYY stamp from the date attributes. The
    # "%#m/%#d" strftime flags used previously are specific to the Windows
    # C runtime and do not produce un-padded numbers on other platforms.
    today = f"{now.month}/{now.day}/{now.year}"
    for stock, price in todays_prices.items():
        writer.writerow([today, stock, price])

# HTML

In [9]:
# Download the sample page and parse it with the html5lib parser.
url = ("https://raw.githubusercontent.com/"
       "joelgrus/data/master/getting-data.html")

# Without a timeout the request can block indefinitely; raise_for_status()
# turns an HTTP error response into an exception instead of parsing its body.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

In [10]:
# Display the parsed document so its structure can be inspected.
soup

<!DOCTYPE html>
<html lang="en-US"><head>
    <title>Getting Data</title>
    <meta charset="utf-8"/>
</head>
<body>
    <h1>Getting Data</h1>
    <div class="explanation">
        This is an explanation.
    </div>
    <div class="comment">
        This is a comment.
    </div>
    <div class="content">
        <p id="p1">This is the first paragraph.</p>
        <p class="important">This is the second paragraph.</p>
    </div>
    <div class="signature">
        <span id="name">Joel</span>
        <span id="twitter">@joelgrus</span>
        <span id="email">joelgrus-at-gmail</span>
    </div>


</body></html>

In [11]:
# Locate the first <p> element once, then derive its text, words and id from it.
first_paragraph = soup.find('p')
first_paragraph_text = first_paragraph.text
first_paragraph_word = first_paragraph_text.split()
first_paragraph_id = first_paragraph.get('id')  # None when the tag has no id
first_paragraph, first_paragraph_text, first_paragraph_word, first_paragraph_id

(<p id="p1">This is the first paragraph.</p>,
 'This is the first paragraph.',
 ['This', 'is', 'the', 'first', 'paragraph.'],
 'p1')

In [12]:
# Every <p> element, and the subset that carries a non-empty id attribute.
all_paragraphs = soup('p')
paragraph_with_ids = [tag for tag in soup.find_all('p') if tag.get('id')]
all_paragraphs, paragraph_with_ids

([<p id="p1">This is the first paragraph.</p>,
  <p class="important">This is the second paragraph.</p>],
 [<p id="p1">This is the first paragraph.</p>])

In [13]:
# Three equivalent ways of selecting <p> elements whose class is "important".
impotant_paragraphs = soup.find_all('p', {'class': 'important'})
impotant_paragraphs2 = soup.find_all('p', 'important')
impotant_paragraphs3 = [
    tag
    for tag in soup.find_all('p')
    if 'important' in tag.get('class', [])
]
impotant_paragraphs, impotant_paragraphs2, impotant_paragraphs3

([<p class="important">This is the second paragraph.</p>],
 [<p class="important">This is the second paragraph.</p>],
 [<p class="important">This is the second paragraph.</p>])

In [14]:
# Collect, for each <div> in the document, the <span> elements found inside it.
spans_inside_divs = [
    inner_span
    for outer_div in soup.find_all('div')
    for inner_span in outer_div.find_all('span')
]
spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

## Scraping house.gov

In [15]:
# Fetch the list of representatives and collect every link target on the page.
url = 'https://www.house.gov/representatives'
# Bound the wait and fail loudly on an HTTP error rather than scraping an
# error page and silently getting a misleading link count.
response = requests.get(url, timeout=10)
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'html5lib')

# Only anchors that actually carry an href attribute are kept.
all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))

967


In [16]:
# Matches URLs that start with http:// or https://, end with ".house.gov"
# and have at most a trailing slash after it. Both dots of ".house.gov" are
# escaped so that they match a literal '.' only (an unescaped '.' would also
# accept hosts such as "joel.housexgov").
regex = r"^https?://.*\.house\.gov/?$"

assert re.match(regex, "http://joel.house.gov")
assert re.match(regex, "https://joel.house.gov")
assert re.match(regex, "http://joel.house.gov/")
assert re.match(regex, "https://joel.house.gov/")
assert not re.match(regex, "joel.house.gov")
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.gov/biography")
assert not re.match(regex, "http://joel.housexgov")

In [17]:
# Keep only the links that match the representative-site pattern.
good_urls = list(
    filter(lambda candidate: re.match(regex, candidate), all_urls)
)

print(len(good_urls))

876


In [18]:
# Drop duplicate links. Iteration order of a set of strings can differ from
# one interpreter run to the next, so sort the result to keep the list
# reproducible across kernel restarts.
good_urls = sorted(set(good_urls))

print(len(good_urls))

438


In [21]:
# Download one representative's home page and parse it.
# The timeout stops the cell from hanging forever; raise_for_status() stops
# an HTTP error page from being parsed as if it were the real site.
response = requests.get('https://jayapal.house.gov', timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

In [23]:
# Unique targets of the anchors whose text mentions "press releases"
# (case-insensitive). Anchors without an href attribute are skipped so that
# a['href'] cannot raise KeyError.
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}

print(links)

{'https://jayapal.house.gov/category/press-releases/', 'https://jayapal.house.gov/category/news/'}


## API

In [28]:
# Fetch the public repositories of a GitHub user as decoded JSON.
github_user = 'bohdin'
endpoint = f'https://api.github.com/users/{github_user}/repos'

# Fail here, with a clear HTTP error, if the API rejects the request (e.g.
# unknown user or rate limit) instead of later when the error payload is
# indexed as if it were a list of repositories.
response = requests.get(endpoint, timeout=10)
response.raise_for_status()
# NOTE(review): GitHub paginates this endpoint, so this presumably returns
# only the first page of repositories — confirm if the user has many repos.
repos = json.loads(response.text)

In [32]:
# Parse each repository's creation timestamp, then tally by month and weekday.
dates = [parse(created_at)
         for created_at in (repo["created_at"] for repo in repos)]
mounth_counts = Counter([created.month for created in dates])
weekday_counts = Counter([created.weekday() for created in dates])

In [36]:
# Languages of the five repositories with the greatest "created_at" values,
# newest first (the timestamps are compared as strings).
last_5_repositories = [
    repo['language']
    for repo in sorted(repos, key=lambda r: r["created_at"], reverse=True)[:5]
]

print(last_5_repositories)

['Jupyter Notebook', None, 'HTML', 'Jupyter Notebook', 'Python']
