In [2]:
import sys
sys.path.append('..')

from bs4 import BeautifulSoup as Soup
from collections import namedtuple
import json
import os.path
from retrieval.legiscan import legiscan_auth, make_legiscan_session
from urllib.parse import urljoin

Dataset = namedtuple("Dataset", "state year session modified exported json_url csv_url")

def enumerate_datasets(session):
    return session.get('https://legiscan.com/datasets').text

def retrieve_archive(json_url: str, session, *, force: bool = False):
    local_name = '-'.join(os.path.basename(json_url).split('_')[0:3]) + '.zip'
    local_path = os.path.join('../tmp/neutral_corpus', local_name)
    if os.path.exists(local_path) and not force:
        print(f'{local_name} already exists')
        return
    
    with open(local_path, 'wb') as f:
        f.write(session.get(urljoin('https://legiscan.com/', json_url)).content)
        print(f'Created archive {local_name}')

with make_legiscan_session() as session:
    soup = Soup(enumerate_datasets(session))
    dataset_table = soup.find(id='gaits-datasets')
    table_data = [
        Dataset(*(
            cell.text if len(cell.find_all('a'))<1 else cell.find_all('a')[0].attrs['href']
            for cell 
            in row.find_all('td')))
        for row
        in dataset_table.find_all('tbody')[0].find_all('tr')
    ]

    [retrieve_archive(item.json_url, session, force=False) for item in table_data if '2023' in item.session]

AL-2023-2023-1st.zip already exists
AL-2023-2023-Regular.zip already exists
AK-2023-2024-33rd.zip already exists
AZ-2023-2023-Fifty-sixth.zip already exists
AR-2023-2023-94th.zip already exists
CA-2023-2024-Regular.zip already exists
CO-2023-2023-Regular.zip already exists
CT-2023-2023-General.zip already exists
DE-2023-2024-152nd.zip already exists
FL-2023-2023-Regular.zip already exists
FL-2023-2023-2nd.zip already exists
GA-2023-2024-Regular.zip already exists
HI-2023-2023-Regular.zip already exists
ID-2023-2023-Regular.zip already exists
IL-2023-2024-103rd.zip already exists
IN-2023-2023-Regular.zip already exists
IA-2023-2024-90th.zip already exists
KS-2023-2024-Regular.zip already exists
KY-2023-2023-Regular.zip already exists
LA-2023-2023-Regular.zip already exists
LA-2023-2023-1st.zip already exists
ME-2023-2024-131st.zip already exists
MD-2023-2023-Regular.zip already exists
MA-2023-2024-193rd.zip already exists
MI-2023-2024-102nd.zip already exists
MN-2023-2024-93rd.zip alrea

In [15]:
from glob import glob
import zipfile

WORK_DIR = '../tmp/neutral_corpus'

def extract_one(zip: str, path: str):
    print(f'Unpacking {zip}')
    with zipfile.ZipFile(zip,"r") as zip_ref:
        zip_ref.extractall(WORK_DIR)

zips = glob(f'{WORK_DIR}/*.zip')

for zip in zips:
    extract_one(zip, WORK_DIR)

print('Done')

Unpacking ../tmp/neutral_corpus/IL-2023-2024-103rd.zip
Unpacking ../tmp/neutral_corpus/OR-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/WA-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/MO-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/NM-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/AK-2023-2024-33rd.zip
Unpacking ../tmp/neutral_corpus/OK-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/WI-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/LA-2023-2023-1st.zip
Unpacking ../tmp/neutral_corpus/PA-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/SD-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/LA-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/CO-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/HI-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/UT-2023-2023-General.zip
Unpacking ../tmp/neutral_corpus/MT-2023-2023-Regular.zip
Unpacking ../tmp/neutral_corpus/KS-2023-2024-Regular.zip
Unpacking ../tmp/neutral_corpus/US-2023-

In [53]:
from glob import glob
import json

WORK_DIR = '../tmp/neutral_corpus'

def read_one(json_path: str):
    bill = None
    with open(json_path, 'r') as f:
        bill = json.load(f)['bill']
        
    return {
        'state': bill['state'],
        'bill_id': bill['bill_number'],
        'legiscan_bill_id': bill['bill_id'],
        'status_date': bill['status_date'],
        'title': bill['title'],
        'description': bill['description'],
        'legiscan_doc_id': None if len(bill['texts']) < 1 else (
            sorted(
                bill['texts'], 
                key=lambda t: t['date'], 
                reverse=True
            )[0]['doc_id']
        )
    }

bills = glob(f'{WORK_DIR}/*/*/bill/*.json')
all_bills = (outer_bill for outer_bill in (read_one(bill) for bill in bills) if outer_bill['legiscan_doc_id'])

with open('../tmp/neutral_corpus/all_bills.json', 'w') as f:
    json.dump(list(all_bills), f, indent=2)

In [73]:
import pandas as pd

aggregate_frame = pd.read_json('../artifacts/aggregate.json')
states_with_bills = aggregate_frame.state.unique().tolist()

bills_frame = pd.read_json('../tmp/neutral_corpus/all_bills.json')
candidates = bills_frame.apply(lambda x: x['state'] in states_with_bills, axis=1)

# TODO: we can do smarter sampling based on proportion of bills or something
neutral_bills = bills_frame.loc[candidates].sample(n=100, random_state=1)

Unnamed: 0,state,bill_id,legiscan_bill_id,status_date,title,description,legiscan_doc_id
5584,SC,H3299,1637518,2023-01-10,Grade inflation prohibition,Amend The South Carolina Code Of Laws By Amend...,2617075
101051,TN,SB1274,1688878,2023-01-31,"AN ACT to amend Tennessee Code Annotated, Titl...","As introduced, adds medical treatment to the l...",2673466
55451,TX,SB311,1639278,2022-12-19,Relating to exempting the intrastate manufactu...,Relating to exempting the intrastate manufactu...,2619317
36441,IA,HF5,1653298,2023-01-11,"A bill for an act relating to education, inclu...","A bill for an act relating to education, inclu...",2633595
62764,NV,AB187,1715347,2023-02-20,Prohibits certain instruction in public school...,AN ACT relating to education; prohibiting a pr...,2710041
...,...,...,...,...,...,...,...
62538,AR,SB42,1647597,2023-02-23,To Amend The Practice And Regulation Of Counse...,To Amend The Practice And Regulation Of Counse...,2709537
5400,SC,H3477,1637365,2023-01-10,"Search warrants, electronic data",Amend The South Carolina Code Of Laws By Addin...,2616933
93527,IL,HB3417,1712517,2023-02-17,MENTAL HLTH-COURT JURSIDICTION,Amends the Mental Health and Developmental Dis...,2705750
11120,MS,HB653,1659974,2023-01-31,"Autopsies; create ""Jenna's Law"" to require aut...","An Act To Create New Section 41-37-27, Mississ...",2642255


In [None]:
from retrieval.legiscan import legiscan_api

