In [1]:
import sys, re

# sys.argv is the list of command-line arguments
# sys.argv[0] is the name of the program itself
# sys.argv[1] will be the regex specified at the command line

def egrep(regex, line):
    """Write every stdin line that matches ``regex`` to stdout.

    Args:
        regex: Pattern handed to ``re.search``. When ``None``, the pattern
            is taken from ``sys.argv[1]`` (command-line usage).
        line: Unused; kept only so existing call sites keep working.
    """
    # Honor the caller's pattern. Previously it was unconditionally
    # overwritten with sys.argv[1], so the argument had no effect.
    pattern = sys.argv[1] if regex is None else regex

    # for every line passed into the script
    for text in sys.stdin:
        # if it matches the regex, write it to stdout
        if re.search(pattern, text):
            sys.stdout.write(text)

In [2]:
def line_count():
    """Count the lines arriving on stdin and print the total."""
    # Walk stdin once, tallying one per line read.
    total = sum(1 for _ in sys.stdin)

    # print() writes to sys.stdout
    print(total)

In [3]:
# Colon-delimited sample data: a header row followed by three price rows
# (there is no newline after the last row).
sample_text = """date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5"""

with open('SomeFile.txt', 'w') as out_file:
    out_file.write(sample_text)

In [4]:
# Seed a small file of example addresses, one per line.
with open('email_addresses.txt', 'w') as f:
    for address_line in ("ari.tyo52@gmail.com\n",
                         "ari.tyo@m.datascience.com\n",
                         "ari@m.datascience.com\n"):
        f.write(address_line)

def get_domain(email_address: str) -> str:
    """Return the lowercased text after the final '@' of an address.

    When the address contains no '@', the whole lowercased string
    is returned.
    """
    _, _, domain = email_address.lower().rpartition("@")
    return domain

# Read the addresses back and print the domain of each one.
with open("email_addresses.txt", 'r') as f:
    addresses = f.read().splitlines()

for address in addresses:
    print(get_domain(address))

gmail.com
m.datascience.com
m.datascience.com


In [5]:
from collections import Counter

# Tally how many addresses fall under each domain, skipping any
# line that has no '@' in it.
with open('email_addresses.txt', 'r') as f:
    domains = [get_domain(raw.strip()) for raw in f if "@" in raw]

domain_counts = Counter(domains)
print(domain_counts)

Counter({'m.datascience.com': 2, 'gmail.com': 1})


In [6]:
# Tab-separated sample: one "date<TAB>symbol<TAB>closing price" row per line.
tab_separated_rows = """6/20/2014\tAAPL\t90.91
6/20/2014\tMSFT\t41.68
6/20/2014\tFB\t64.5
6/19/2014\tAAPL\t91.86
6/19/2014\tMSFT\t41.51
6/19/2014\tFB\t64.34
"""

with open("tab_delimeted_stock_prices.txt", "w") as out_file:
    out_file.write(tab_separated_rows)


In [7]:
import csv

def process(date: str, symbol: str, closing_price: float) -> None:
    """Placeholder consumer for one price record.

    The only thing it currently does is assert that ``closing_price``
    is strictly positive.
    """
    assert 0.0 < closing_price

# Open with newline='' as the csv module documentation asks, so the
# reader itself decides how line endings inside the data are handled.
with open('tab_delimeted_stock_prices.txt', newline='') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        if not row:
            # csv.reader yields [] for a blank line; there is nothing to process.
            continue
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)

In [8]:
# Colon-delimited sample whose first row is the header.
colon_separated_rows = """date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
"""

with open('colon_delimited_stock_prices.txt', 'w') as out_file:
    out_file.write(colon_separated_rows)

# newline='' is what the csv module documentation recommends for any
# file object handed to a reader.
with open('colon_delimited_stock_prices.txt', newline='') as f:
    # DictReader takes its field names from the first row (the header).
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        process(date, symbol, closing_price)

In [9]:
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

# newline='' stops text-mode newline translation from turning the
# writer's '\r\n' row terminator into '\r\r\n' on Windows (which shows
# up as a blank line between rows).
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    # One "symbol,price" row per dictionary entry.
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

In [10]:
results = [["test1", "success", "Monday"],
           ["test2", "success, kind of", "Tuesday"],
           ["test3", "failure, kind of", "Wednesday"],
           ["test4", "failure, utter", "Thursday"]]

# Anti-example (don't do this!): hand-rolled CSV. Fields are joined with
# bare commas and never quoted, so a field that itself contains a comma
# (or a newline) produces a malformed row.
with open('bad_csv.txt', 'w') as f:
    for fields in results:
        f.write(",".join(str(field) for field in fields) + "\n")


In [11]:
from bs4 import BeautifulSoup
import requests

# The relevant HTML file lives on GitHub. (In the book the URL is split
# across two adjacent string literals, which Python concatenates; here it
# fits on one line.)
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"

# A timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces an HTTP error instead of silently
# parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

print(soup)

<!DOCTYPE html>
<html lang="en-US"><head>
    <title>Getting Data</title>
    <meta charset="utf-8"/>
</head>
<body>
    <h1>Getting Data</h1>
    <div class="explanation">
        This is an explanation.
    </div>
    <div class="comment">
        This is a comment.
    </div>
    <div class="content">
        <p id="p1">This is the first paragraph.</p>
        <p class="important">This is the second paragraph.</p>
    </div>
    <div class="signature">
        <span id="name">Joel</span>
        <span id="twitter">@joelgrus</span>
        <span id="email">joelgrus-at-gmail</span>
    </div>


</body></html>


In [12]:
# Grab the first <p> tag, then its text and the whitespace-split words.
first_paragraph = soup.find('p')
first_paragraph_text = first_paragraph.text
first_paragraph_words = first_paragraph_text.split()

for shown in (first_paragraph, first_paragraph_text, first_paragraph_words):
    print(shown)


<p id="p1">This is the first paragraph.</p>
This is the first paragraph.
['This', 'is', 'the', 'first', 'paragraph.']


In [13]:
first_p = soup.p
first_paragraph_id = first_p['id']          # indexing raises KeyError if there is no 'id'
first_paragraph_id2 = first_p.get('id')     # .get() returns None if there is no 'id'

print(first_paragraph_id, first_paragraph_id2, sep="\n")

p1
p1


In [14]:
all_paragraphs = soup.find_all('p')         # or just soup('p')

# Keep only the <p> tags that carry a non-empty 'id' attribute.
paragraph_with_ids = []
for p in soup('p'):
    if p.get('id'):
        paragraph_with_ids.append(p)

print(all_paragraphs)
print(paragraph_with_ids)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]
[<p id="p1">This is the first paragraph.</p>]


In [15]:
# Three ways to select <p> tags with class "important"
# (all three give the same result on this page).
important_paragraphs = soup('p', {"class": "important"})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = []
for p in soup('p'):
    if 'important' in p.get('class', []):
        important_paragraphs3.append(p)

for found in (important_paragraphs, important_paragraphs2, important_paragraphs3):
    print(found)

[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]


In [16]:
# Warning: this returns the same <span> multiple times
# if it sits inside multiple <div>s.
# Be more clever if that's the case.

spans_inside_divs = []
for div in soup('div'):             # for each <div> on the page
    for span in div('span'):        # find each <span> inside it
        spans_inside_divs.append(span)
print(spans_inside_divs)

[<span id="name">Joel</span>, <span id="twitter">@joelgrus</span>, <span id="email">joelgrus-at-gmail</span>]


In [17]:
from bs4 import BeautifulSoup
import requests

url = "https://www.house.gov/representatives"

# Bound the request time and fail loudly on an HTTP error rather than
# scraping links out of an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, "html5lib")

# Collect the href of every anchor that actually has one.
all_urls = [a['href'] for a in soup('a') if a.has_attr('href')]
print(all_urls)
print(len(all_urls))

['#main-content', '/', '/', '/representatives', '/leadership', '/committees', '/legislative-activity', '/the-house-explained', '/visitors', '/educators-and-students', '/media', '/doing-business-with-the-house', '/employment', '/representatives', '/leadership', '/committees', '/legislative-activity', '/the-house-explained', '/visitors', '/educators-and-students', '/media', '/doing-business-with-the-house', '/employment', '/the-house-explained', 'https://www.aoc.gov/explore-capitol-campus/buildings-grounds/house-office-buildings/cannon', 'https://www.aoc.gov/explore-capitol-campus/buildings-grounds/house-office-buildings/longworth', 'https://www.aoc.gov/explore-capitol-campus/buildings-grounds/house-office-buildings/rayburn', 'https://www.visitthecapitol.gov/visit/maps-and-brochures/us-capitol-map', '#room-numbers', '#by-state', '#by-name', '#state-alabama', '#state-california', '#state-delaware', '#state-florida', '#state-georgia', '#state-hawaii', '#state-idaho', '#state-kansas', '#sta

In [18]:
import re

# Must start with http:// or https://
# Must end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"

# Let's write some tests!
for should_match in ("http://joel.house.gov",
                     "https://joel.house.gov",
                     "http://joel.house.gov/",
                     "https://joel.house.gov/"):
    assert re.match(regex, should_match)

for should_not_match in ("joel.house.gov",
                         "http://joel.house.com",
                         "https://joel.house.gov/biography"):
    assert not re.match(regex, should_not_match)

# And now apply
good_urls = []
for candidate in all_urls:
    if re.match(regex, candidate):
        good_urls.append(candidate)

print(len(good_urls))  # the exact count changes as the live page changes
print(good_urls)

878
['https://barrymoore.house.gov', 'https://figures.house.gov/', 'https://mikerogers.house.gov/', 'https://aderholt.house.gov/', 'https://strong.house.gov', 'https://palmer.house.gov/', 'https://sewell.house.gov/', 'https://begich.house.gov/', 'https://radewagen.house.gov', 'https://schweikert.house.gov/', 'https://crane.house.gov', 'https://ansari.house.gov/', 'https://stanton.house.gov/', 'https://biggs.house.gov', 'https://ciscomani.house.gov', 'https://grijalva.house.gov/', 'https://hamadeh.house.gov/', 'https://gosar.house.gov/', 'https://crawford.house.gov/', 'https://hill.house.gov/', 'https://womack.house.gov/', 'https://westerman.house.gov/', 'https://lamalfa.house.gov', 'https://huffman.house.gov', 'https://kiley.house.gov', 'https://mikethompson.house.gov/', 'https://mcclintock.house.gov/', 'https://bera.house.gov', 'https://matsui.house.gov', 'https://garamendi.house.gov/', 'https://harder.house.gov/', 'https://desaulnier.house.gov/', 'https://pelosi.house.gov/', 'https:/

In [19]:
# De-duplicate while keeping first-seen order. list(set(...)) also removes
# duplicates, but its ordering is arbitrary and can differ between runs.
good_urls = list(dict.fromkeys(good_urls))

print(len(good_urls))

439


In [20]:
# Bound the request time and raise on an HTTP error status.
response = requests.get('https://jayapal.house.gov', timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html5lib")

# Use a set because the links might appear multiple times.
# has_attr guards against <a> tags with no href, which would
# otherwise raise KeyError on a['href'].
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}
print(links)

{'https://jayapal.house.gov/category/press-releases/', 'https://jayapal.house.gov/category/news/'}


In [21]:
# from typing import Dict, Set

# press_releases: Dict[str, Set[str]] = {}

# for house_url in good_urls:
#     html = requests.get(house_url).text
#     soup = BeautifulSoup(html, 'html5lib')
#     pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}

#     print(f"{house_url}: {pr_links}")
#     press_releases[house_url] = pr_links

In [22]:
# print(press_releases)

NameError: name 'press_releases' is not defined

In [None]:
# def paragraph_mentions(text: str, keyword: str) -> bool:
#     """
#     Returns True if a <p> inside the text mentions {keyword}
#     """
#     soup = BeautifulSoup(text, 'html5lib')
#     paragraphs = [p.get_text() for p in soup('p')]

#     return any(keyword.lower() in paragraph.lower() for paragraph in paragraphs)

# text = """<body><h1>Facebook</h1><p>Twitter</p>"""
# # assert paragraph_mentions(text, "twitter")
# assert not paragraph_mentions(text, "twitter")
# assert paragraph_mentions(text, "facebook")
# assert not paragraph_mentions(text, "facebook")

In [23]:
import requests, json

github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"

# Bound the request time and fail fast on a non-2xx reply: an error
# response would not be the expected list of repos and would otherwise
# only surface later as a confusing indexing error.
response = requests.get(endpoint, timeout=30)
response.raise_for_status()
repos = json.loads(response.text)

from collections import Counter
from dateutil.parser import parse

# Parse every repo's "created_at" value, then tally by month and weekday.
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(created.month for created in dates)
weekday_counts = Counter(created.weekday() for created in dates)

for summary in (dates, month_counts, weekday_counts):
    print(summary)

# Order by "pushed_at" descending and keep the first five.
repos_by_push = sorted(repos, key=lambda r: r["pushed_at"], reverse=True)
last_5_repositories = repos_by_push[:5]
last_5_languages = [repo["language"] for repo in last_5_repositories]

print(last_5_repositories)
print(last_5_languages)


[datetime.datetime(2017, 12, 2, 20, 13, 49, tzinfo=tzutc()), datetime.datetime(2018, 11, 30, 22, 41, 16, tzinfo=tzutc()), datetime.datetime(2019, 12, 1, 2, 57, 18, tzinfo=tzutc()), datetime.datetime(2020, 11, 21, 16, 21, 49, tzinfo=tzutc()), datetime.datetime(2021, 11, 24, 13, 53, 23, tzinfo=tzutc()), datetime.datetime(2022, 11, 22, 2, 25, 22, tzinfo=tzutc()), datetime.datetime(2023, 12, 2, 3, 15, 48, tzinfo=tzutc()), datetime.datetime(2018, 2, 23, 15, 51, 4, tzinfo=tzutc()), datetime.datetime(2017, 12, 19, 0, 12, 40, tzinfo=tzutc()), datetime.datetime(2018, 1, 31, 23, 51, 16, tzinfo=tzutc()), datetime.datetime(2018, 12, 19, 19, 44, 45, tzinfo=tzutc()), datetime.datetime(2018, 9, 5, 2, 43, 52, tzinfo=tzutc()), datetime.datetime(2019, 2, 1, 20, 25, 46, tzinfo=tzutc()), datetime.datetime(2013, 7, 5, 2, 2, 28, tzinfo=tzutc()), datetime.datetime(2023, 3, 19, 20, 15, 39, tzinfo=tzutc()), datetime.datetime(2017, 5, 10, 17, 22, 45, tzinfo=tzutc()), datetime.datetime(2013, 11, 15, 5, 33, 22, t

In [None]:
# TODO (assignment)!!
# Tonight: crawl Twitter data using tweepy
# (blocked for now: no Twitter account)