In [36]:
import base64
import json
from pprint import pprint

filename = 'bills/IA_HF8'

# Load the saved JSON file and keep only its "text" entry.
result = None
with open(filename, 'r') as bill_file:
    result = json.load(bill_file)['text']

# "doc" is the base64 payload; the part of "mime" after the last "/" is used
# as the file extension for the decoded copy.
doc = result['doc']
# result['doc'] = base64.b64decode(doc).decode('ascii')[0:100]
extension = result['mime'].rsplit('/', 1)[-1]
new_filename = f'{filename}.{extension}'

# Write the decoded bytes next to the source file.
decoded_document = base64.b64decode(doc)
with open(new_filename, 'wb') as decoded_file:
    decoded_file.write(decoded_document)


In [16]:
import json
from legiscan import legiscan_api
import os
import pandas as pd
from pprint import pprint
import requests

@legiscan_api
def get_bill_text(legiscan_bill_id: str, api_key: str):
    """Fetch the raw ``getBillText`` response for one bill from the LegiScan API.

    Args:
        legiscan_bill_id: Value sent as the ``id`` query parameter.
        api_key: Value sent as the ``key`` query parameter. Callers in this
            notebook omit it; presumably ``legiscan_api`` supplies it --
            confirm against that decorator.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails, the HTTP status is not OK, the body is not JSON, or the parsed
        body has ``status == 'ERROR'``. A short diagnostic is printed for each
        failure.
    """
    # URL shape: https://api.legiscan.com/?key=<API_KEY>&op=getBillText&id=2736883
    # Keep real keys out of comments: notebooks get committed and shared.
    try:
        resp = requests.get(
            'https://api.legiscan.com/',
            # Let requests build and escape the query string.
            params={'key': api_key, 'op': 'getBillText', 'id': legiscan_bill_id},
            # Without a timeout a stalled connection blocks the cell forever.
            timeout=30,
        )
    except requests.RequestException as exc:
        print(f'request failed: {exc}')
        return None

    if not resp.ok:
        print(resp.status_code)
        return None

    try:
        parsed = json.loads(resp.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError; an HTML error page lands here.
        print('response body is not valid JSON')
        return None

    # The API can answer HTTP 200 and still report a failure in the body.
    if parsed['status'] == 'ERROR':
        print(parsed['alert']['message'])
        return None
    return resp

raw = pd.read_json('tracktranslegislation.json')
# Fixed random_state so the same three rows are picked on every run.
sample = raw.sample(n=3, random_state=1234)

# open(..., 'wb') does not create missing parent directories.
os.makedirs('bills', exist_ok=True)

for idx, row in sample.iterrows():
    # bills/<state>_<billId parts>: billId is split on spaces and re-joined
    # with underscores.
    local_filename = os.path.join(
        'bills',
        '_'.join([
            row["state"],
            *row["billId"].split(' '),
        ])
    )

    if os.path.exists(local_filename):
        print(f'skipping {local_filename}')
        # Already on disk: do not spend an API call or overwrite the file.
        continue

    resp = get_bill_text(row['legiscanId'])
    if not resp:
        print(f'Could not download {local_filename}')
        continue

    # Store the response body exactly as received.
    with open(local_filename, 'wb') as f:
        f.write(resp.content)


In [2]:
from functools import wraps
from legiscan import legiscan_api
import os

@legiscan_api
def sample_api_action(api_key: str):
    """Print the ``api_key`` argument this function was called with."""
    message = f'api key is {api_key}'
    print(message)

@legiscan_api
def get_bill_text(legiscan_bill_id: str, api_key: str):
    """Build the LegiScan ``getBillText`` URL for a bill and print it.

    No request is sent; this only shows which URL would be used.
    """
    # URL shape: https://api.legiscan.com/?key=<API_KEY>&op=getBillText&id=2736883
    bill_text_url = f'https://api.legiscan.com/?key={api_key}&op=getBillText&id={legiscan_bill_id}'
    print(bill_text_url)
    
# Here api_key is passed explicitly by the caller.
sample_api_action(api_key='foo')


# Here api_key is omitted; the recorded output shows a key in the printed URL,
# so legiscan_api presumably fills it in -- confirm in the legiscan module.
get_bill_text('12345')

api key is foo
https://api.legiscan.com/?key=<REDACTED>&op=getBillText&id=12345


In [None]:
from bs4 import BeautifulSoup
import requests
import urllib.parse

host = 'https://www.house.mo.gov'
# Hand-built equivalent: f'{host}/BillContent.aspx?bill=HB1258&year=2023&code=R'
url = urllib.parse.urljoin(host, 'BillContent.aspx?bill=HB1258&year=2023&code=R')
page = requests.get(url)

# print(page.text)
soup = BeautifulSoup(page.content)

# Take the first element with class "textType", then the href of its first anchor.
first_text_type = soup.find_all(class_='textType')[0]
bill_text_href = first_text_type.find('a')['href']

# Last expression of the cell: the joined URL is shown as the cell output.
urllib.parse.urljoin(host, bill_text_href)


In [None]:
import pandas as pd
import requests
from typing import Optional
import urllib.parse

def test_url(url: str, parent_id: str, anchor_index: int):
    """Fetch ``url`` and return the ``href`` of one anchor inside a given element.

    Args:
        url: Page to download.
        parent_id: ``id`` of the element whose ``<a>`` tags are searched.
        anchor_index: Index into that element's list of ``<a>`` tags.

    Returns:
        The anchor's ``href`` value as found in the page, or ``None`` when the
        element is missing or the anchor/attribute lookup fails.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content)
    container = soup.find(id=parent_id)
    if not container:
        return None

    try:
        return container.find_all('a')[anchor_index]['href']
    except Exception:
        # Lookup failures stay best-effort, but unlike a bare ``except:`` this
        # lets KeyboardInterrupt/SystemExit propagate.
        return None

def prepare_url(relative_url: Optional[str]):
    """Turn a LegiScan-relative link into a full URL.

    Args:
        relative_url: Link to join onto ``https://legiscan.com/``; may be
            ``None`` or empty.

    Returns:
        The joined URL, or the literal string ``'NO RESULT'`` when
        ``relative_url`` is falsy.
    """
    if relative_url:
        return urllib.parse.urljoin('https://legiscan.com/', relative_url)

    return 'NO RESULT'

def process_as_bill(frame) -> Optional[str]:
    """Look a row up through its own bill page.

    Args:
        frame: Row-like mapping providing ``billLink``.

    Returns:
        Whatever ``process_bill_link`` returns for ``frame['billLink']``.
    """
    bill_page_url = frame['billLink']
    return process_bill_link(bill_page_url)

def process_bill_link(url: str) -> Optional[str]:
    """Scrape a bill page for the last link inside its ``bill-last-action`` element.

    Args:
        url: Page to download; it is printed first as a progress trace.

    Returns:
        The ``href`` of the final ``<a>`` inside the element with id
        ``bill-last-action``, joined onto ``https://legiscan.com``; ``None``
        if the download or any lookup fails.
    """
    print(url)
    try:
        # Without a timeout a stalled connection would block the cell forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='bill-last-action')
        if container is None:
            return None
        anchors = container.find_all('a')
        href = anchors[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except Exception:
        # Best-effort scrape: network errors, an empty anchor list or a missing
        # href all mean "no result". Unlike a bare ``except:`` this lets
        # KeyboardInterrupt/SystemExit through so a long run can be stopped.
        return None

def process_as_text(frame) -> Optional[str]:
    """Look a row up through its LegiScan ``/text/`` page.

    Args:
        frame: Row-like mapping providing ``state``, ``billId`` and ``billLink``.

    Returns:
        Whatever ``process_text_link`` returns for
        ``https://legiscan.com/{state}/text/{billId without spaces}/{year}``.
    """
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    # Take the year from this row's own billLink (its last path segment), not
    # from a notebook-global ``row`` that may belong to a different bill.
    # statusDate is deliberately not trusted for this.
    year = frame['billLink'].split('/')[-1]
    text_link = f'https://legiscan.com/{state}/text/{bill_id}/{year}'
    return process_text_link(text_link)

def process_text_link(url: str) -> Optional[str]:
    """Scrape a page for the last link inside its ``gaits-wrapper`` element.

    Args:
        url: Page to download; it is printed first as a progress trace.

    Returns:
        The ``href`` of the final ``<a>`` inside the element with id
        ``gaits-wrapper``, joined onto ``https://legiscan.com``; ``None`` if
        the download or any lookup fails.
    """
    print(url)
    try:
        # Without a timeout a stalled connection would block the cell forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        if container is None:
            return None
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except Exception:
        # Best-effort scrape: any ordinary failure means "no result", while
        # KeyboardInterrupt/SystemExit still propagate (a bare ``except:``
        # would swallow them).
        return None

def process_as_draft(frame) -> Optional[str]:
    """Look a row up through its LegiScan ``/drafts/`` page.

    Args:
        frame: Row-like mapping providing ``state``, ``billId`` and ``billLink``.

    Returns:
        Whatever ``process_draft_link`` returns for
        ``https://legiscan.com/{state}/drafts/{billId without spaces}/{year}``.
    """
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    # Take the year from this row's own billLink (its last path segment), not
    # from a notebook-global ``row`` that may belong to a different bill.
    # statusDate is deliberately not trusted for this.
    year = frame['billLink'].split('/')[-1]
    draft_link = f'https://legiscan.com/{state}/drafts/{bill_id}/{year}'
    # Use the drafts scraper by name; it searches the same ``gaits-wrapper``
    # element as the text scraper.
    return process_draft_link(draft_link)
    
def process_draft_link(url: str) -> Optional[str]:
    """Scrape a drafts page for the last link inside its ``gaits-wrapper`` element.

    Args:
        url: Page to download; it is printed first as a progress trace.

    Returns:
        The ``href`` of the final ``<a>`` inside the element with id
        ``gaits-wrapper``, joined onto ``https://legiscan.com``; ``None`` if
        the download or any lookup fails.
    """
    print(url)
    try:
        # Without a timeout a stalled connection would block the cell forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        if container is None:
            return None
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except Exception:
        # Best-effort scrape: any ordinary failure means "no result", while
        # KeyboardInterrupt/SystemExit still propagate (a bare ``except:``
        # would swallow them).
        return None

# Load the tracked-bills dataset; `sample` is a full copy by default, with
# narrower selections kept below as commented-out alternatives.
raw = pd.read_json('tracktranslegislation.json')
sample = raw.copy()
# sample = raw.sample(n=20, random_state=2)
# sample = raw.loc[raw.state == 'AR']
# sample = raw.loc[0:20]
# print(sample)

# NOTE: the `continue` on the first line of the body skips everything after it
# on every iteration, so as written this loop prints nothing and makes no
# requests. It still assigns `idx` and `row` for each row, so both names stay
# bound to the last row after the loop ends.
for idx, row in sample.iterrows():
    continue
    bill_id = ' '.join([row['state'], row['billId']])
#    year = row['billLink'].split('/')[-1]
    
#    bill_link = row['billLink']
#    draft_link = f'https://legiscan.com/{row["state"]}/drafts/{row["billId"].replace(" ", "")}/{year}' # https://legiscan.com/AZ/drafts/HB2517/2023
#    text_link = f'https://legiscan.com/{row["state"]}/text/{row["billId"].replace(" ", "")}/{year}'
#    comments_link = f'https://legiscan.com/{row["state"]}/comments/{row["billId"].replace(" ", "")}/{year}'
    
    # Lookup strategies applied to each row, in this order.
    searches = [
        process_as_bill,
        process_as_text,
    ]

    # One header line per bill, then one result line per strategy.
    print(f'{bill_id}')
    for search in searches:
        print(search(row))

    print()


In [None]:
# https://legiscan.com/TX/comments/HB1029/2023

from bs4 import BeautifulSoup
from pprint import pprint
import requests
from typing import Optional
import urllib.parse

def process_as_bill(frame) -> Optional[str]:
    """Look a row up through its own bill page.

    Args:
        frame: Row-like mapping providing ``billLink``.

    Returns:
        Whatever ``process_bill_link`` returns for ``frame['billLink']``.
    """
    bill_page_url = frame['billLink']
    return process_bill_link(bill_page_url)

def process_bill_link(url: str) -> Optional[str]:
    """Scrape a bill page for the last link inside its ``bill-last-action`` element.

    Args:
        url: Page to download.

    Returns:
        The ``href`` of the final ``<a>`` inside the element with id
        ``bill-last-action``, joined onto ``https://legiscan.com``; ``None``
        if the download or any lookup fails.
    """
    try:
        # Without a timeout a stalled connection would block the cell forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='bill-last-action')
        if container is None:
            return None
        anchors = container.find_all('a')
        href = anchors[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except Exception:
        # Best-effort scrape: network errors, an empty anchor list or a missing
        # href all mean "no result". Unlike a bare ``except:`` this lets
        # KeyboardInterrupt/SystemExit through so a long run can be stopped.
        return None

def process_as_text(frame) -> Optional[str]:
    """Look a row up through its LegiScan ``/text/`` page.

    Args:
        frame: Row-like mapping providing ``state``, ``billId`` and ``billLink``.

    Returns:
        Whatever ``process_text_link`` returns for
        ``https://legiscan.com/{state}/text/{billId without spaces}/{year}``.
    """
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    # Take the year from this row's own billLink (its last path segment), not
    # from a notebook-global ``row`` that may belong to a different bill.
    # statusDate is deliberately not trusted for this.
    year = frame['billLink'].split('/')[-1]
    text_link = f'https://legiscan.com/{state}/text/{bill_id}/{year}'
    return process_text_link(text_link)

def process_text_link(url: str) -> Optional[str]:
    """Scrape a page for the last link inside its ``gaits-wrapper`` element.

    Args:
        url: Page to download.

    Returns:
        The ``href`` of the final ``<a>`` inside the element with id
        ``gaits-wrapper``, joined onto ``https://legiscan.com``; ``None`` if
        the download or any lookup fails.
    """
    try:
        # Without a timeout a stalled connection would block the cell forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        if container is None:
            return None
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except Exception:
        # Best-effort scrape: any ordinary failure means "no result", while
        # KeyboardInterrupt/SystemExit still propagate (a bare ``except:``
        # would swallow them).
        return None

def process_as_draft(frame) -> Optional[str]:
    """Look a row up through its LegiScan ``/drafts/`` page.

    Args:
        frame: Row-like mapping providing ``state``, ``billId`` and ``billLink``.

    Returns:
        Whatever ``process_draft_link`` returns for
        ``https://legiscan.com/{state}/drafts/{billId without spaces}/{year}``.
    """
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    # Take the year from this row's own billLink (its last path segment), not
    # from a notebook-global ``row`` that may belong to a different bill.
    # statusDate is deliberately not trusted for this.
    year = frame['billLink'].split('/')[-1]
    draft_link = f'https://legiscan.com/{state}/drafts/{bill_id}/{year}'
    # Use the drafts scraper by name; it searches the same ``gaits-wrapper``
    # element as the text scraper.
    return process_draft_link(draft_link)
    
def process_draft_link(url: str) -> Optional[str]:
    """Scrape a drafts page for the last link inside its ``gaits-wrapper`` element.

    Args:
        url: Page to download.

    Returns:
        The ``href`` of the final ``<a>`` inside the element with id
        ``gaits-wrapper``, joined onto ``https://legiscan.com``; ``None`` if
        the download or any lookup fails.
    """
    try:
        # Without a timeout a stalled connection would block the cell forever.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        if container is None:
            return None
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except Exception:
        # Best-effort scrape: any ordinary failure means "no result", while
        # KeyboardInterrupt/SystemExit still propagate (a bare ``except:``
        # would swallow them).
        return None

    
#text_link = 'https://legiscan.com/TX/text/HB3147/2023'
#print(text_link)
#print(process_text_link(text_link))

#bill_link = 'https://legiscan.com/TX/bill/HB976/2023'
#print(bill_link)
#print(process_bill_link(bill_link))
#print(process_as_bill(sample.loc[303]))

# Spot-check one row (label 251 of `raw`) with each lookup strategy in turn,
# printing one result per line.
bill_row = raw.loc[251]
for lookup in (process_as_bill, process_as_text, process_as_draft):
    print(lookup(bill_row))

#host = 'https://www.house.mo.gov'
# url = f'{host}/BillContent.aspx?bill=HB1258&year=2023&code=R'
#url = 'https://legiscan.com/SD/text/HB1080/2023' #urllib.parse.urljoin(host, 'BillContent.aspx?bill=HB1258&year=2023&code=R')
#page = requests.get(url)

# print(page.text)
#soup = BeautifulSoup(page.content)
#container = soup.find(id='gaits-wrapper')
#pprint(container.find_all('a'))
#urllib.parse.urljoin(
#    host, 
#    soup.find_all(class_='textType')[0].find('a')['href'],
#)