In [None]:
from bs4 import BeautifulSoup
import html
import math
import os
import pandas as pd
import requests
import socket
import time
import urllib.request

In [None]:
# Bill records read from a local aclu.json file.
# Data from https://www.aclu.org/legislative-attacks-on-lgbtq-rights?state
aclu_pandas = pd.read_json('aclu.json')
# State codes that have no formatter; the download loop skips these without
# printing its "can't resolve" message.
# NOTE(review): presumably these states serve bill text through a web app that
# cannot be reached by a plain URL pattern -- confirm.
WEBAPP_SHENANIGANS = ['GA', 'IN', 'ME', 'ND', 'NH', 'NJ', 'SD']

#pd.set_option('display.max_colwidth', None)
#aclu_pandas.loc[pd.json_normalize(aclu_pandas['state']).loc[lambda item: item['value'] == 'PA'].index]['link'].transform(lambda x: html.unescape(x))

In [None]:
def expand(short_prefix: str):
    """Translate a bill prefix into a chamber name.

    Returns 'house' when the prefix begins with 'h' and 'senate' when it
    begins with 's' (case-insensitive). Any other prefix, including the
    empty string, yields None.
    """
    chambers = {'h': 'house', 's': 'senate'}
    # Slicing (rather than indexing) keeps the empty string from raising.
    return chambers.get(short_prefix.lower()[:1])

def format_ak(frame):
    """Build the bill-text URL for an AK bill.

    `frame['name']` must be '<prefix> <number>'; the number is zero-padded
    to four digits. Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://www.akleg.gov/basis/Bill/Text/33?Hsid={kind}{number.zfill(4)}A'
    return ('html', [url])

def format_ar(frame):
    """Build the PDF URL for an AR bill.

    `frame['name']` must be '<prefix> <number>'; both parts are used as-is.
    Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    base = 'https://www.arkleg.state.ar.us/Bills/FTPDocument?path=%2FBills%2F2023R%2FPublic%2F'
    return ('pdf', [f'{base}{kind}{number}.pdf'])

def format_az(frame):
    """Build the bill-text URL for an AZ bill.

    `frame['name']` must be '<prefix> <number>'; the number is zero-padded
    to four digits. Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://www.azleg.gov/legtext/56leg/1R/bills/{kind}{number.zfill(4)}P.htm'
    return ('html', [url])

def format_co(frame):
    """Build the PDF URL for a CO bill.

    `frame['name']` must look like '<prefix> <session>-<number>'. Only the
    part after the dash ends up in the URL; the prefix and session part are
    discarded. Returns ('pdf', [url]).
    """
    # Both splits must produce exactly two pieces, otherwise unpacking raises.
    _, dashed = frame['name'].split(' ')
    _, serial = dashed.split('-')
    url = f'https://leg.colorado.gov/sites/default/files/documents/2023A/bills/2023a_{serial}_01.pdf'
    return ('pdf', [url])

def format_ct(frame):
    """Build the PDF URL for a CT bill.

    `frame['name']` must be '<prefix> <number>'. The first letter of the
    prefix selects the directory, the full prefix appears twice in the file
    name, and the number is zero-padded to five digits.
    Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    directory = f'https://www.cga.ct.gov/2023/TOB/{kind[0]}/PDF/'
    file_name = f'2023{kind}-{number.zfill(5)}-R00-{kind}.pdf'
    return ('pdf', [directory + file_name])

def format_fl(frame):
    """Build the filed-text PDF URL for an FL bill.

    `frame['name']` must be '<prefix> <number>'; only the number is used in
    the URL. Returns ('pdf', [url]).
    """
    _, number = frame['name'].split(' ')
    url = f'https://www.flsenate.gov/Session/Bill/2023/{number}/BillText/Filed/PDF'
    return ('pdf', [url])

def format_hi(frame):
    """Build the bill-text URL for an HI bill.

    `frame['name']` must be '<prefix> <number>'; both parts are used as-is.
    Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://www.capitol.hawaii.gov/sessions/session2023/bills/{kind}{number}_.HTM'
    return ('html', [url])

def format_ia(frame):
    """Build the PDF URL for an IA bill.

    `frame['name']` must be '<prefix> <number>'; both parts are used as-is.
    Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://www.legis.iowa.gov/docs/publications/LGI/90/{kind}{number}.pdf'
    return ('pdf', [url])

def format_id(frame):
    """Build the PDF URL for an ID bill.

    `frame['name']` must be '<prefix> <number>'. Only the first letter of the
    prefix is used, followed by the number zero-padded to four digits.
    Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    base = 'https://legislature.idaho.gov/wp-content/uploads/sessioninfo/2023/legislation/'
    return ('pdf', [f'{base}{kind[0]}{number.zfill(4)}.pdf'])

def format_ks(frame):
    """Build the PDF URL for a KS bill.

    `frame['name']` must be '<prefix> <number>'; the prefix is lower-cased
    and the number is used as-is. Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    base = 'http://kslegislature.org/li/b2023_24/measures/documents/'
    return ('pdf', [f'{base}{kind.lower()}{number}_00_0000.pdf'])

def format_ky(frame):
    """Build the original-bill PDF URL for a KY bill.

    `frame['name']` must be '<prefix> <number>'; the prefix is lower-cased
    and the number is used as-is. Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    base = 'https://apps.legislature.ky.gov/recorddocuments/bill/23RS/'
    return ('pdf', [f'{base}{kind.lower()}{number}/orig_bill.pdf'])

def format_md(frame):
    """Build the PDF URL for an MD bill.

    `frame['name']` must be '<prefix> <number>'. The lower-cased prefix names
    both the directory and the start of the file name; the number is
    zero-padded to four digits. Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    folder = kind.lower()
    url = f'https://mgaleg.maryland.gov/2023RS/bills/{folder}/{folder}{number.zfill(4)}F.pdf'
    return ('pdf', [url])

def format_mi(frame):
    """Build the introduced-text URL for an MI bill.

    The original version always produced a House URL, so Senate bills
    resolved to a House document path. 'SB' bills are now routed to the
    Senate path; every other prefix keeps the previous House default.

    Args:
        frame: row or mapping whose 'name' is '<prefix> <number>'.

    Returns:
        ('html', [url]). The number is zero-padded to four digits.

    Raises:
        ValueError: if the name is not exactly two space-separated parts.
    """
    prefix, num = frame['name'].split(' ')
    # NOTE(review): the Senate pattern (Senate/htm/2023-SIB-NNNN.htm) is
    # written from memory of the site layout -- confirm against a live bill.
    if prefix.upper() == 'SB':
        chamber, code = 'Senate', 'S'
    else:
        chamber, code = 'House', 'H'
    return ('html', [
        'http://www.legislature.mi.gov/documents/2023-2024/billintroduced/'
        f'{chamber}/htm/2023-{code}IB-{num.zfill(4)}.htm'
    ])

def format_mn(frame):
    """Build the bill-text URL for an MN bill.

    `frame['name']` must be '<prefix> <number>'; both parts are concatenated
    into the `number` query parameter. Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    query_tail = '&type=bill&version=0&session=ls93&session_year=2023&session_number=0'
    url = f'https://www.revisor.mn.gov/bills/text.php?number={kind}{number}{query_tail}'
    return ('html', [url])

def format_mo(frame):
    """Resolve the bill-text PDF URL for an MO bill.

    House bills ('HB') are resolved by fetching the bill's BillContent.aspx
    page (derived from `frame['link']`) and taking the first link inside the
    first element with class `textType`. Senate bills ('SB') follow a fixed
    URL pattern and need no network access.

    Args:
        frame: row or mapping with 'name' ('<prefix> <number>') and, for
            House bills, 'link'.

    Returns:
        ('pdf', [url]).

    Raises:
        ValueError: if the name is not exactly two space-separated parts, or
            the prefix is neither 'HB' nor 'SB' (the original fell through
            and returned None, which the caller could not unpack).
        IndexError: if the House page has no `textType` element.
    """
    bill_name = frame['name']
    prefix, num = bill_name.split(' ')

    if prefix == 'HB':
        host = 'https://www.house.mo.gov'
        # Use the row passed in. The original read the global `item`, which
        # only exists while the download loop is running.
        page = requests.get(frame['link'].replace('Bill.aspx', 'BillContent.aspx'))
        soup = BeautifulSoup(page.content)
        # The href on the page is host-relative, so resolve it against the host.
        relative_href = soup.find_all(class_='textType')[0].find('a')['href']
        return ('pdf', [urllib.parse.urljoin(host, relative_href)])

    if prefix == 'SB':
        return ('pdf', [f'https://senate.mo.gov/23info/pdf-bill/intro/{prefix}{num}.pdf'])

    raise ValueError(f'Unsupported MO bill prefix: {prefix!r}')


def format_ms(frame):
    """Build the introduced-text URL for an MS bill.

    Bills are stored in directories that each cover a block of one hundred
    bill numbers, named '<low>-<high>' with both ends zero-padded to four
    digits (e.g. '1100-1199').

    The original computed the block with floor/ceil on num/100, which
    produced an inverted range for exact multiples of 100 (1100 gave
    '1100-1099'). Integer arithmetic gives the correct block for every
    number.

    Args:
        frame: row or mapping whose 'name' is '<prefix> <number>'.

    Returns:
        ('html', [url]).

    Raises:
        ValueError: if the name is not exactly two space-separated parts or
            the number is not an integer.
    """
    prefix, num = frame['name'].split(' ')
    block_start = (int(num) // 100) * 100
    high_index = block_start + 99

    # The 2000 block's directory is named starting at 2001, not 2000.
    # NOTE(review): the 0 block may likewise be named '0001-0099' on the
    # server; this keeps the original '0000-0099' until that is confirmed.
    low_index = 2001 if block_start == 2000 else block_start

    return ('html', [
        'http://billstatus.ls.state.ms.us/documents/2023/html/'
        f'{prefix}/{str(low_index).zfill(4)}-{str(high_index).zfill(4)}/'
        f'{prefix}{num.zfill(4)}IN.htm'
    ])

def format_mt(frame):
    """Build the bill-text URL for an MT bill.

    `frame['name']` must be '<prefix> <number>'; the number is zero-padded
    to four digits. Returns ('html', [url]).
    """
    # Example result: https://leg.mt.gov/bills/2023/billhtml/SB0099.htm
    kind, number = frame['name'].split(' ')
    url = f'https://leg.mt.gov/bills/2023/billhtml/{kind}{number.zfill(4)}.htm'
    return ('html', [url])

def format_nc(frame):
    """Build the PDF URL for an NC bill.

    `frame['name']` must be '<prefix> <number>'. The chamber directory comes
    from `expand(prefix)`, and the file name is the first letter of the
    prefix followed by the number and 'v0.pdf'. Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    chamber = expand(kind)
    initial = kind[0]
    # Plain concatenation (not an f-string) so a None chamber still raises
    # TypeError instead of being rendered as the text 'None'.
    url = 'https://www.ncleg.gov/Sessions/2023/Bills/' + chamber + '/PDF/' + initial + number + 'v0.pdf'
    return ('pdf', [url])

def format_ne(frame):
    """Build the PDF URL for an NE bill.

    `frame['name']` must be '<prefix> <number>'; both parts are used as-is.
    Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://nebraskalegislature.gov/FloorDocs/108/PDF/Intro/{kind}{number}.pdf'
    return ('pdf', [url])

def format_nm(frame):
    """Build the bill-text URL for an NM bill.

    The original hard-coded the 'house' directory, so Senate measures got a
    URL under the House directory. The directory is now chosen from the
    prefix: prefixes starting with 'S' go to 'senate', everything else keeps
    the previous 'house' default.

    Args:
        frame: row or mapping whose 'name' is '<prefix> <number>'.

    Returns:
        ('html', [url]). The number is zero-padded to four digits.

    Raises:
        ValueError: if the name is not exactly two space-separated parts.
    """
    prefix, num = frame['name'].split(' ')
    # NOTE(review): assumes Senate text lives under .../bills/senate/ in
    # parallel with the House directory -- confirm against a live bill.
    chamber = 'senate' if prefix.upper().startswith('S') else 'house'
    return ('html', [
        f'https://www.nmlegis.gov/Sessions/23%20Regular/bills/{chamber}/{prefix}{num.zfill(4)}.html'
    ])

def format_ok(frame):
    """Build the introduced-text PDF URL for an OK bill.

    `frame['name']` must be '<prefix> <number>'. The prefix names the
    directory and also starts the file name. Returns ('pdf', [url]).
    """
    # TODO: this route seems busted
    # Reference URL: http://webserver1.lsb.state.ok.us/cf_pdf/2023-24%20INT/SB/SB789%20INT.PDF
    kind, number = frame['name'].split(' ')
    base = 'http://webserver1.lsb.state.ok.us/cf_pdf/2023-24%20INT/'
    return ('pdf', [f'{base}{kind}/{kind}{number}%20INT.pdf'])

def format_or(frame):
    """Build the introduced-text document URL for an OR bill.

    `frame['name']` must be '<prefix> <number>'; both parts are used as-is.
    Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    base = 'https://olis.oregonlegislature.gov/liz/2023R1/Downloads/MeasureDocument/'
    return ('pdf', [f'{base}{kind}{number}/Introduced'])

def format_pa(frame):
    """Build the bill-text URL for a PA bill.

    `frame['name']` must be '<prefix> <number>'; only the number is used,
    zero-padded to four digits. Returns ('html', [url]).
    """
    # TODO: hacky -- `billBody=H` and `pn=0122` are fixed in the URL no
    # matter which bill is requested.
    _, number = frame['name'].split(' ')
    endpoint = 'https://www.legis.state.pa.us/CFDOCS/Legis/PN/Public/btCheck.cfm?txtType=HTM&sessYr=2023&sessInd=0&billBody=H&billTyp=B&billNbr='
    return ('html', [f'{endpoint}{number.zfill(4)}&pn=0122'])

def format_ri(frame):
    """Build the PDF URL for an RI bill.

    `frame['name']` must be '<prefix> <number>'. The directory is the
    capitalized result of `expand(prefix)` followed by 'Text23'; the file
    name is the first letter of the prefix plus the number zero-padded to
    four digits. Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    directory = f'{expand(kind).capitalize()}Text23/'
    file_name = f'{kind[0]}{number.zfill(4)}.pdf'
    return ('pdf', ['http://webserver.rilegislature.gov/BillText/BillText23/' + directory + file_name])

def format_sc(frame):
    """Build the bill-text URL for an SC bill.

    `frame['name']` must be '<prefix> <number>'; only the number is used in
    the URL. Returns ('html', [url]).
    """
    # Example result: https://www.scstatehouse.gov/sess125_2023-2024/bills/3827.htm
    _, number = frame['name'].split(' ')
    url = f'https://www.scstatehouse.gov/sess125_2023-2024/bills/{number}.htm'
    return ('html', [url])

def format_tn(frame):
    """Build the bill-info page URL for a TN bill.

    `frame['name']` must be '<prefix> <number>'; the number is zero-padded
    to four digits. Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://wapp.capitol.tn.gov/apps/BillInfo/Default.aspx?BillNumber={kind}{number.zfill(4)}'
    return ('html', [url])

def format_tx(frame):
    """Build the bill-text URL for a TX bill.

    `frame['name']` must be '<prefix> <number>'; the number is zero-padded
    to five digits. Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://capitol.texas.gov/tlodocs/88R/billtext/html/{kind}{number.zfill(5)}I.htm'
    return ('html', [url])

def format_ut(frame):
    """Build the candidate bill-text URLs for a UT bill.

    `frame['name']` must be '<prefix> <number>'. The 'S<nn>' suffix at the
    end of the file name is not predictable, so eight candidates are
    returned (S01 through S08) for the caller to try in order.
    Returns ('html', [url, ...]).
    """
    kind, number = frame['name'].split(' ')
    # Everything up to the suffix is the same for all candidates.
    stem = f'https://le.utah.gov/~2023/bills/{kind.lower()[0]}billint/{kind}{number.zfill(4)}S'
    candidates = []
    for suffix in range(1, 9):
        candidates.append(f'{stem}{suffix:02d}.htm')
    return ('html', candidates)

def format_va(frame):
    """Build the bill-text URL for a VA bill.

    `frame['name']` must be '<prefix> <number>'; both parts are used as-is.
    Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    url = f'https://lis.virginia.gov/cgi-bin/legp604.exe?231+ful+{kind}{number}'
    return ('html', [url])

def format_wa(frame):
    """Build the PDF URL for a WA bill.

    `frame['name']` must be '<prefix> <number>'. The directory is the
    capitalized result of `expand(prefix)` followed by '%20Bills'; the file
    name is the number alone. Returns ('pdf', [url]).
    """
    kind, number = frame['name'].split(' ')
    chamber_dir = f'{expand(kind).capitalize()}%20Bills/'
    url = f'https://lawfilesext.leg.wa.gov/biennium/2023-24/Pdf/Bills/{chamber_dir}{number}.pdf'
    return ('pdf', [url])

def format_wv(frame):
    """Build the introduced-text URL for a WV bill.

    `frame['name']` must be '<prefix> <number>'. The lower-cased prefix and
    the number form the `billdoc` parameter, and the number is repeated as
    the `i` parameter. Returns ('html', [url]).
    """
    kind, number = frame['name'].split(' ')
    endpoint = 'http://www.wvlegislature.gov/Bill_Status/bills_text.cfm'
    url = f'{endpoint}?billdoc={kind.lower()}{number}%20intr.htm&yr=2023&sesstype=RS&i={number}'
    return ('html', [url])

def format_wy(frame):
    """Build the introduced-text PDF URL for a WY bill.

    `frame['name']` must be '<prefix> <number>'; the number is zero-padded
    to four digits. Returns ('pdf', [url]).
    """
    # Example result: https://wyoleg.gov/2023/Introduced/SF0144.pdf
    kind, number = frame['name'].split(' ')
    url = f'https://wyoleg.gov/2023/Introduced/{kind}{number.zfill(4)}.pdf'
    return ('pdf', [url])

# State code -> URL formatter. Each formatter takes a bill row and returns
# (file_extension, [candidate_urls]).
bill_formatters = dict(
    AK=format_ak,
    AR=format_ar,
    AZ=format_az,
    CO=format_co,
    CT=format_ct,
    FL=format_fl,
    HI=format_hi,
    IA=format_ia,
    ID=format_id,
    KS=format_ks,
    KY=format_ky,
    MD=format_md,
    MI=format_mi,
    MN=format_mn,
    MO=format_mo,
    MS=format_ms,
    MT=format_mt,
    NC=format_nc,
    NE=format_ne,
    NM=format_nm,
    OK=format_ok,
    OR=format_or,
    PA=format_pa,
    RI=format_ri,
    SC=format_sc,
    TN=format_tn,
    TX=format_tx,
    UT=format_ut,
    VA=format_va,
    WA=format_wa,
    WV=format_wv,
    WY=format_wy,
)

In [None]:
# Keep only the columns the downloader needs. Each `state` entry is reduced
# to its 'value' field and each `link` is HTML-unescaped.
subset = aclu_pandas[['name', 'state', 'link']].copy()
subset['state'] = subset['state'].transform(lambda x: x['value'])
subset['link'] = subset['link'].transform(lambda x: html.unescape(x))

# Some legislature sites (AZ and HI, for example) block the default urllib
# user agent, so present a browser-like one for every urllib request.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

# The WV server is occasionally very slow; time out quickly and pick the
# bill up on a later run instead of stalling the whole download.
socket.setdefaulttimeout(5)

# urlretrieve does not create missing directories, so make sure the target
# directory exists before the first download.
os.makedirs('bills', exist_ok=True)

failed = 0      # bills for which no candidate URL could be downloaded
skipped = 0     # bills with no formatter, disabled state, or already on disk
downloaded = 0  # bills saved during this run
start = time.time()
print('Downloading legislation')

for _, item in subset.iterrows():
    if item['state'] not in bill_formatters:
        if item['state'] not in WEBAPP_SHENANIGANS:
            print(f'I can\'t resolve {item["state"]} legislation yet')
        skipped += 1
        continue

    # The OK download route is currently broken, so skip those bills.
    if item['state'] == 'OK':
        skipped += 1
        continue

    # A formatter can raise (unexpected bill name, network error while
    # resolving the URL); record that bill as failed and keep going rather
    # than aborting the whole run.
    try:
        target_extension, URLs = bill_formatters[item['state']](item)
    except Exception as e:
        failed += 1
        print(f'Failed to resolve {item["state"]} {item["name"]}: {e}')
        continue

    local_name = 'bills/' + '_'.join([item['state'], item['name'].replace(' ', '_').replace('.','')])+'.' + target_extension

    if os.path.exists(local_name):
        # print(f'Skipping {item["state"]} {item["name"]}, already downloaded')
        skipped += 1
        continue

    # Try each candidate URL in order and stop at the first that downloads.
    for URL in URLs:
        try:
            print(f'Downloading {URL}')
            urllib.request.urlretrieve(URL, local_name)
        except Exception as e:
            print(f'Failed to download {URL}: {e}')
            # An interrupted transfer can leave a partial file behind, which
            # a later run would mistake for a finished download. The file did
            # not exist before this attempt, so it is safe to remove.
            if os.path.exists(local_name):
                try:
                    os.remove(local_name)
                except OSError as cleanup_error:
                    print(f'Could not remove partial file {local_name}: {cleanup_error}')
        else:
            downloaded += 1
            break
    else:
        # Count the bill once, not once per failed candidate URL.
        failed += 1

end = time.time()
print(f'Download complete ({downloaded} downloaded, {skipped} skipped, {failed} failed)')
print(f'{end-start:.2f}s elapsed')