In [1]:
import base64
import json
from legiscan import legiscan_api
import os
import pandas as pd
from pprint import pprint
import requests

def get_bill_contents_filename(row, extension):
    """Return the path under ``bills/`` where a bill's decoded document is stored.

    Args:
        row: Mapping-like record with ``'state'`` and ``'billId'`` entries.
        extension: File extension (without the leading dot) for the document.

    Returns:
        A relative path of the form ``bills/<state>_<billId>.<extension>``,
        with every space in the bill id replaced by an underscore.
    """
    state_code = row['state']
    # Spaces in the bill id become underscores so the name is a single token.
    bill_slug = row['billId'].replace(' ', '_')
    return f'bills/{state_code}_{bill_slug}.{extension}'

def get_bill_text_response_filename(row):
    """Return the cache path for a bill's raw bill-text API response.

    Args:
        row: Mapping-like record with ``'state'`` and ``'billId'`` entries.

    Returns:
        A relative path of the form
        ``tmp/bill_text_response_<state>_<billId>.json``, with spaces in the
        bill id replaced by underscores.
    """
    state_code = row['state']
    id_parts = row['billId'].split(' ')
    stem = f'{state_code}_' + '_'.join(id_parts)
    return f'tmp/bill_text_response_{stem}.json'

def get_bill_meta_filename(row):
    """Return the cache path for a bill's metadata API response.

    Args:
        row: Mapping-like record with ``'state'`` and ``'billId'`` entries.

    Returns:
        A relative path of the form ``tmp/bill_meta_<state>_<billId>.json``,
        with spaces in the bill id replaced by underscores.
    """
    state_code, raw_bill_id = row['state'], row['billId']
    return f'tmp/bill_meta_{state_code}_{raw_bill_id.replace(" ", "_")}.json'

@legiscan_api
def get_bill_meta(row, api_key: str):
    """Download the LegiScan ``getBill`` response for one bill and cache it on disk.

    If the cache file for this bill already exists, no request is made.

    Args:
        row: Mapping-like record with ``'state'``, ``'billId'`` and
            ``'legiscanId'`` entries.
        api_key: LegiScan API key. Callers in this notebook do not pass it, so
            it is presumably injected by ``legiscan_api`` -- TODO confirm.

    Returns:
        The path of the cached JSON response, or ``None`` if the request
        failed, the body was not JSON, or LegiScan reported an error.
    """
    local_filename = get_bill_meta_filename(row)
    legiscan_bill_id = row['legiscanId']

    if os.path.exists(local_filename):
        print(f'skipping {local_filename}, exists')
        return local_filename

    try:
        # `params` lets requests URL-encode the values; the timeout keeps a
        # stalled connection from hanging the notebook indefinitely.
        resp = requests.get(
            'https://api.legiscan.com/',
            params={'key': api_key, 'op': 'getBill', 'id': legiscan_bill_id},
            timeout=30,
        )
    except requests.RequestException as exc:
        # Report only the exception type: its message can embed the request
        # URL, which contains the API key.
        print(f'Error {type(exc).__name__} downloading {local_filename}')
        return None

    if not resp.ok:
        print(f'Error {resp.status_code} downloading {local_filename}')
        return None

    try:
        parsed = resp.json()
    except ValueError:
        # The body was not valid JSON (e.g. an HTML error page).
        print(f'Error invalid JSON response downloading {local_filename}')
        return None

    if parsed['status'].upper() == 'ERROR':
        print(f'Error {parsed["alert"]["message"]} downloading {local_filename}')
        return None

    # Make sure the cache directory exists on a fresh checkout.
    cache_dir = os.path.dirname(local_filename)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with open(local_filename, 'wb') as f:
        f.write(resp.content)

    print(f'got {local_filename}')
    return local_filename

@legiscan_api
def get_bill_text(row, bill_meta_path: str, api_key: str):
    """Download the LegiScan ``getBillText`` response for a bill's latest text.

    The document to fetch is the entry in the cached metadata's
    ``bill.texts`` list with the greatest ``'date'`` value. If the cache file
    for this bill already exists, nothing is read or requested.

    Args:
        row: Mapping-like record with ``'state'`` and ``'billId'`` entries.
        bill_meta_path: Path to the cached ``getBill`` JSON response. ``None``
            (the failure value of ``get_bill_meta``) is tolerated and reported.
        api_key: LegiScan API key. Callers in this notebook do not pass it, so
            it is presumably injected by ``legiscan_api`` -- TODO confirm.

    Returns:
        The path of the cached JSON response, or ``None`` if the metadata is
        unavailable or lists no texts, the request failed, the body was not
        JSON, or LegiScan reported an error.
    """
    local_filename = get_bill_text_response_filename(row)

    # Check the cache first so a cached bill needs no metadata parsing.
    if os.path.exists(local_filename):
        print(f'skipping {local_filename}, exists')
        return local_filename

    if bill_meta_path is None:
        print(f'Error missing bill metadata downloading {local_filename}')
        return None

    # Binary mode lets json.load detect the encoding instead of relying on
    # the platform's locale default.
    with open(bill_meta_path, 'rb') as f:
        meta = json.load(f)

    texts = meta['bill']['texts']
    if not texts:
        print(f'Error no bill texts listed downloading {local_filename}')
        return None

    # Pick the entry with the greatest 'date'; on ties max() keeps the first,
    # matching a stable descending sort. Assumes dates compare
    # chronologically (e.g. ISO 'YYYY-MM-DD' strings) -- TODO confirm.
    doc_id = max(texts, key=lambda text: text['date'])['doc_id']

    try:
        # `params` lets requests URL-encode the values; the timeout keeps a
        # stalled connection from hanging the notebook indefinitely.
        resp = requests.get(
            'https://api.legiscan.com/',
            params={'key': api_key, 'op': 'getBillText', 'id': doc_id},
            timeout=30,
        )
    except requests.RequestException as exc:
        # Report only the exception type: its message can embed the request
        # URL, which contains the API key.
        print(f'Error {type(exc).__name__} downloading {local_filename}')
        return None

    if not resp.ok:
        print(f'Error {resp.status_code} downloading {local_filename}')
        return None

    try:
        parsed = resp.json()
    except ValueError:
        # The body was not valid JSON (e.g. an HTML error page).
        print(f'Error invalid JSON response downloading {local_filename}')
        return None

    if parsed['status'].upper() == 'ERROR':
        print(f'Error {parsed["alert"]["message"]} downloading {local_filename}')
        return None

    # Make sure the cache directory exists on a fresh checkout.
    cache_dir = os.path.dirname(local_filename)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with open(local_filename, 'wb') as f:
        f.write(resp.content)

    print(f'got {local_filename}')
    return local_filename

def extract_bill_contents(row, response_path: str):
    """Decode the base64 document in a cached bill-text response and save it.

    Args:
        row: Mapping-like record with ``'state'`` and ``'billId'`` entries,
            used to build the output filename.
        response_path: Path to a cached JSON response whose ``'text'`` object
            carries a base64 ``'doc'`` payload and a ``'mime'`` type.

    Returns:
        The path of the written document. Its extension is the part of the
        MIME type after the last ``/`` (``application/pdf`` -> ``pdf``).
    """
    # Binary mode lets json.load detect the encoding instead of relying on
    # the platform's locale default.
    with open(response_path, 'rb') as f:
        result = json.load(f)['text']

    extension = result['mime'].split('/')[-1]
    local_filename = get_bill_contents_filename(row, extension)

    # Decode before opening the output so a malformed payload does not leave
    # an empty file behind.
    contents = base64.b64decode(result['doc'])

    # Make sure the output directory exists on a fresh checkout.
    out_dir = os.path.dirname(local_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(local_filename, 'wb') as f:
        f.write(contents)

    return local_filename


In [2]:
# Load the tracked bills and take a fixed, reproducible 15-row sample.
raw = pd.read_json('tracktranslegislation.json')
sample = raw.sample(n=15, random_state=1234)
sample

# Stage 1: fetch (or reuse cached) metadata for every sampled bill.
metas = [(row, get_bill_meta(row)) for _, row in sample.iterrows()]

# Stage 2: fetch the latest bill text. Both download helpers return None on
# failure, so failed rows are dropped here rather than crashing the next
# stage on open(None).
responses = [
    (row, get_bill_text(row, meta_path))
    for row, meta_path in metas
    if meta_path is not None
]

# Stage 3: decode each successfully downloaded response into bills/.
extracts = [
    extract_bill_contents(row, response_path)
    for row, response_path in responses
    if response_path is not None
]
extracts

skipping tmp/bill_meta_IA_HF8.json, exists
skipping tmp/bill_meta_AZ_HB2312.json, exists
skipping tmp/bill_meta_TX_HB1532.json, exists
skipping tmp/bill_meta_UT_SB0039.json, exists
skipping tmp/bill_meta_KY_HB470.json, exists
skipping tmp/bill_meta_KY_HB58.json, exists
skipping tmp/bill_meta_TN_HB0239.json, exists
skipping tmp/bill_meta_NJ_A1630.json, exists
skipping tmp/bill_meta_OK_SB731.json, exists
skipping tmp/bill_meta_TX_HB2722.json, exists
got tmp/bill_meta_IA_HF482.json
got tmp/bill_meta_MS_HB1125.json
got tmp/bill_meta_TX_HB3213.json
got tmp/bill_meta_ND_SB2199.json
got tmp/bill_meta_IN_HB1118.json
skipping tmp/bill_text_response_IA_HF8.json, exists
skipping tmp/bill_text_response_AZ_HB2312.json, exists
skipping tmp/bill_text_response_TX_HB1532.json, exists
skipping tmp/bill_text_response_UT_SB0039.json, exists
skipping tmp/bill_text_response_KY_HB470.json, exists
skipping tmp/bill_text_response_KY_HB58.json, exists
skipping tmp/bill_text_response_TN_HB0239.json, exists
skipp

['bills/IA_HF8.html',
 'bills/AZ_HB2312.html',
 'bills/TX_HB1532.html',
 'bills/UT_SB0039.pdf',
 'bills/KY_HB470.pdf',
 'bills/KY_HB58.pdf',
 'bills/TN_HB0239.pdf',
 'bills/NJ_A1630.html',
 'bills/OK_SB731.pdf',
 'bills/TX_HB2722.html',
 'bills/IA_HF482.html',
 'bills/MS_HB1125.html',
 'bills/TX_HB3213.html',
 'bills/ND_SB2199.pdf',
 'bills/IN_HB1118.pdf']