In [1]:
from bs4 import BeautifulSoup as Soup
from itertools import chain, islice, takewhile
import json
import re
import requests
import string
import time

# Page that is downloaded and scraped for bill rows.
TRACKER_URL = 'https://www.equalitytexas.org/legislature/legislative-bill-tracker-2023'
# Destination of the JSON dump; a relative path, so it is resolved against
# the current working directory at run time.
OUTPUT_PATH = '../datasets/equalitytexas.json'

def extract_row(row):
    """Convert one tracker table row into a bill record.

    Args:
        row: An element exposing ``find_all('td')`` whose first four cells
            hold, in order, the bill id, the sponsors, the description and
            a status text (e.g. a BeautifulSoup ``<tr>`` tag).

    Returns:
        A dict with the keys ``state`` (always ``'TX'``), ``bill_id``,
        ``sponsors`` (list of name tokens), ``description`` and
        ``status_date`` (the first ``MM/DD/YYYY``-shaped date found in the
        status cell, or ``''`` when there is none).

    Raises:
        IndexError: If the row has fewer than four ``<td>`` cells.
    """
    cells = row.find_all('td')
    # Index explicitly so that a short row still fails with IndexError.
    bill_id = cells[0].text.strip()
    sponsor_text = cells[1].text
    description = cells[2].text.strip()
    status_text = cells[3].text

    date_match = re.search(r'\d{2}/\d{2}/\d{4}', status_text)

    # str.split() with no argument splits on any whitespace run and never
    # yields empty tokens, so names separated by newlines or repeated spaces
    # are handled. Tokens made up solely of punctuation are separators
    # between names (e.g. '|' or '&'), not names, and are dropped.
    sponsors = [
        token
        for token in sponsor_text.split()
        if token.strip(string.punctuation)
    ]

    return {
        'state': 'TX',
        'bill_id': bill_id,
        'sponsors': sponsors,
        'description': description,
        'status_date': date_match.group(0) if date_match else '',
    }

# Refresh the local JSON dataset from the bill tracker page and report how
# many records were written and how long it took.
start_time = time.time()

# A timeout keeps the refresh from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as data.
page = requests.get(TRACKER_URL, timeout=30)
page.raise_for_status()

soup = Soup(page.content, 'html.parser')
bad_bills = soup.find(id='bad-bills')
if bad_bills is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `.parent` below.
    raise RuntimeError(
        f'No element with id "bad-bills" found at {TRACKER_URL}; '
        'the page layout may have changed'
    )

# Collect every table under the anchor's grandparent, skipping the first one.
# NOTE(review): presumably the first table is not a bill table - confirm
# against the live page.
bill_tables = islice(bad_bills.parent.parent.find_all('table'), 1, None)
# Flatten the tables into a single stream of rows, leaving out any row that
# contains a <th> cell (presumably header rows).
relevant_rows = chain.from_iterable(
    (row for row in tbl.find_all('tr') if not row.find('th'))
    for tbl
    in bill_tables
)

# takewhile stops at the first record whose bill id is '#N/A' and discards
# everything after it, including rows from any later tables.
dataset = list(
    takewhile(
        lambda r: r['bill_id'] != '#N/A',
        (extract_row(row) for row in relevant_rows),
    )
)

# An explicit encoding makes the output independent of the platform default.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=2)

end_time = time.time()

print(f'Data "equalitytexas.json" refreshed with {len(dataset)} items ({(end_time-start_time):.2f}s elapsed)')

Data "equalitytexas.json" refreshed with 139 items (0.17s elapsed)
