In [1]:
import sys
sys.path.append('..')

import base64
from bs4 import BeautifulSoup as Soup
from collections import namedtuple
from glob import glob
import json
from operator import itemgetter
import os.path
import pandas as pd
from lib.legiscan import legiscan_api, legiscan_auth, make_legiscan_session
from urllib.parse import urljoin
import zipfile

Dataset = namedtuple("Dataset", "state year session modified exported json_url csv_url")

def enumerate_datasets(session):
    return session.get('https://legiscan.com/datasets').text

def retrieve_archive(json_url: str, session, *, force: bool = False):
    local_name = '-'.join(os.path.basename(json_url).split('_')[0:3]) + '.zip'
    local_path = os.path.join('../tmp/neutral_corpus', local_name)
    if os.path.exists(local_path) and not force:
        print(f'{local_name} already exists')
        return
    
    with open(local_path, 'wb') as f:
        f.write(session.get(urljoin('https://legiscan.com/', json_url)).content)
        print(f'Created archive {local_name}')

with make_legiscan_session() as session:
    soup = Soup(enumerate_datasets(session))
    dataset_table = soup.find(id='gaits-datasets')
    table_data = [
        Dataset(*(
            cell.text if len(cell.find_all('a'))<1 else cell.find_all('a')[0].attrs['href']
            for cell 
            in row.find_all('td')))
        for row
        in dataset_table.find_all('tbody')[0].find_all('tr')
    ]

    [retrieve_archive(item.json_url, session, force=False) for item in table_data if '2023' in item.session]

Created archive AL-2023-2023-1st.zip
Created archive AL-2023-2023-Regular.zip
Created archive AK-2023-2024-33rd.zip
Created archive AZ-2023-2023-Fifty-sixth.zip
Created archive AR-2023-2023-94th.zip
Created archive CA-2023-2024-Regular.zip
Created archive CO-2023-2023-Regular.zip
Created archive CT-2023-2023-General.zip
Created archive DE-2023-2024-152nd.zip
Created archive FL-2023-2023-Regular.zip
Created archive FL-2023-2023-2nd.zip
Created archive GA-2023-2024-Regular.zip
Created archive HI-2023-2023-Regular.zip
Created archive ID-2023-2023-Regular.zip
Created archive IL-2023-2024-103rd.zip
Created archive IN-2023-2023-Regular.zip
Created archive IA-2023-2024-90th.zip
Created archive KS-2023-2024-Regular.zip
Created archive KY-2023-2023-Regular.zip
Created archive LA-2023-2023-Regular.zip
Created archive LA-2023-2023-1st.zip
Created archive ME-2023-2024-131st.zip
Created archive MD-2023-2023-Regular.zip
Created archive MA-2023-2024-193rd.zip
Created archive MI-2023-2024-102nd.zip
Cr

In [2]:
WORK_DIR = '../tmp/neutral_corpus'

def extract_one(zip: str, path: str):
    print(f'Unpacking {zip}')
    with zipfile.ZipFile(zip,"r") as zip_ref:
        zip_ref.extractall(WORK_DIR)

zips = glob(f'{WORK_DIR}/*.zip')

for zip in zips:
    extract_one(zip, WORK_DIR)

print('Done')

Unpacking ../tmp/neutral_corpus/IL-2023-2024-103rd.zip
Unpacking ../tmp/neutral_corpus/OR-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/WA-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/MO-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/NM-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/AK-2023-2024-33rd.zip
Unpacking ../tmp/neutral_corpus/OK-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/WI-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/LA-2023-2023-1st.zip
Unpacking ../tmp/neutral_corpus/PA-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/SD-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/LA-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/CO-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/HI-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/UT-2023-2023-General.zip
Unpacking ../tmp/neutral_corpus/MT-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/KS-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/US-2023-

In [3]:
WORK_DIR = '../tmp/neutral_corpus'

def read_one(json_path: str):
    bill = None
    with open(json_path, 'r') as f:
        bill = json.load(f)['bill']
        
    return {
        'state': bill['state'],
        'bill_id': bill['bill_number'],
        'legiscan_bill_id': bill['bill_id'],
        'status_date': bill['status_date'],
        'title': bill['title'],
        'description': bill['description'],
        'legiscan_doc_id': None if len(bill['texts']) < 1 else (
            sorted(
                bill['texts'], 
                key=lambda t: t['date'], 
                reverse=True
            )[0]['doc_id']
        )
    }

bills = glob(f'{WORK_DIR}/*/*/bill/*.json')
all_bills = (outer_bill for outer_bill in (read_one(bill) for bill in bills) if outer_bill['legiscan_doc_id'])

with open('../tmp/neutral_corpus/all_bills.json', 'w') as f:
    json.dump(list(all_bills), f, indent=2)

In [31]:
aggregate_frame = pd.read_json('../artifacts/aggregate.json')
states_with_bills = aggregate_frame.state.unique().tolist()

bills_frame = pd.read_json('../tmp/neutral_corpus/all_bills.json')
candidates = bills_frame.apply(lambda x: x['state'] in states_with_bills, axis=1)

# TODO: we can do smarter sampling based on proportion of bills or something
neutral_bills = bills_frame.loc[candidates].sample(n=1000, random_state=1)

In [29]:
def get_bill_text_response_filename(state, bill_id):
    return f'../tmp/neutral_corpus/metas/bill_text_response_{state}_{bill_id}.json'

@legiscan_api
def get_bill_text(state, bill_id, legiscan_bill_id, doc_id: str, api_key: str, session):
    local_filename = get_bill_text_response_filename(state, bill_id)
    if os.path.exists(local_filename):
        # print(f'skipping {local_filename}, exists')
        return local_filename

    assembled_url = f'https://api.legiscan.com/?key={api_key}&op=getBillText&id={doc_id}'
    resp = session.get(assembled_url)

    if not resp.ok:
        print(f'Error {resp.status_code} downloading {local_filename}')
        return None
    
    parsed = json.loads(resp.text)
    if parsed['status'].upper() == 'ERROR':
        print(f'Error {parsed["alert"]["message"]} downloading {local_filename}')
        return None
    
    with open(local_filename, 'wb') as f:
        f.write(resp.content)
    
    print(f'got {local_filename}')
    return local_filename

def get_bill_contents_filename(state, bill_id, extension):
    return f'../tmp/neutral_corpus/bills/{state}_{bill_id}.{extension}'

def extract_bill_contents(state, bill_id, legiscan_bill_id, _meta_path, response_path: str):
    result = None
    
    if not response_path:
        print(f'Missing response data {get_bill_text_response_filename(state, bill_id)}')
        return None
    
    with open(response_path, 'r') as f:
        result = json.load(f)['text']

    doc = result['doc']
    extension = result['mime'].split('/')[-1]
    local_filename = get_bill_contents_filename(state, bill_id, extension)

    with open(local_filename, 'wb') as f:
        f.write(base64.b64decode(doc))
    
    print(f'Created {local_filename}')
    return local_filename

In [32]:
for idx, neutral_bill in neutral_bills.iterrows():
    state, bill_id, legiscan_bill_id, doc_id = itemgetter('state', 'bill_id', 'legiscan_bill_id', 'legiscan_doc_id')(neutral_bill)
    response_json = get_bill_text(state, bill_id, legiscan_bill_id, doc_id)
    extract = extract_bill_contents(state, bill_id, legiscan_bill_id, None, response_json)

Created ../tmp/neutral_corpus/bills/SC_H3299.html
Created ../tmp/neutral_corpus/bills/TN_SB1274.pdf
Created ../tmp/neutral_corpus/bills/TX_SB311.html
Created ../tmp/neutral_corpus/bills/IA_HF5.html
Created ../tmp/neutral_corpus/bills/NV_AB187.pdf
Created ../tmp/neutral_corpus/bills/IL_SB0722.html
Created ../tmp/neutral_corpus/bills/VA_HB2140.html
Created ../tmp/neutral_corpus/bills/MT_HB725.pdf
Created ../tmp/neutral_corpus/bills/RI_H5049.pdf
Created ../tmp/neutral_corpus/bills/MN_HF505.pdf
got ../tmp/neutral_corpus/metas/bill_text_response_RI_H5426.json
Created ../tmp/neutral_corpus/bills/RI_H5426.pdf
got ../tmp/neutral_corpus/metas/bill_text_response_RI_H5754.json
Created ../tmp/neutral_corpus/bills/RI_H5754.pdf
got ../tmp/neutral_corpus/metas/bill_text_response_MS_SB2304.json
Created ../tmp/neutral_corpus/bills/MS_SB2304.html
got ../tmp/neutral_corpus/metas/bill_text_response_OR_HB2668.json
Created ../tmp/neutral_corpus/bills/OR_HB2668.pdf
got ../tmp/neutral_corpus/metas/bill_text_r