In [None]:
import pdfplumber
from pprint import pprint
from PyPDF2 import PdfReader

def extract_pdf_tokens_pypdf2(file_path: str) -> str:
    reader = PdfReader(file_path)
    return reader.pages[0].extract_text()

def extract_pdf_tokens_pdfplumber(file_path: str) -> str:
    with pdfplumber.open(file_path) as pdf:
        return pdf.pages[0].extract_text()

pprint(extract_pdf_tokens_pdfplumber('../archive/bills/OK_SB973.pdf'))

In [36]:
from glob import glob
import re

def find_hyphenate(text: str):
    return re.subn(r'(\w)+-\s*\n\s*(\w+)', r'\g<1>\g<2>\n', text, re.MULTILINE)

sample = 'the quick the \n brown fox jumps o- \n ver the \n lazy dog'
print(find_hyphenate(sample))

re.search(r'(\w)+-\s*\n\s+(\w+)', sample).groups()
# glob('../tmp/neutral_corpus/bills/*.pdf')

('the quick the \n brown fox jumps over\n the \n lazy dog', 1)


('o', 'ver')

In [40]:
import sys
sys.path.append('..')

from bs4 import BeautifulSoup as Soup
from collections import namedtuple
import json
from retrieval.legiscan import legiscan_auth

Dataset = namedtuple("Dataset", "state year session modified exported json csv")

@legiscan_auth
def enumerate_datasets(session):
    return session.get('https://legiscan.com/datasets').text

soup = Soup(enumerate_datasets())
dataset_table = soup.find(id='gaits-datasets')
table_data = [
    Dataset(*(
        cell.text if len(cell.find_all('a'))<1 else cell.find_all('a')[0].attrs['href']
        for cell 
        in row.find_all('td')))
    for row
    in dataset_table.find_all('tbody')[0].find_all('tr')
]

len([item.json for item in table_data if '2023' in item.session])


55

In [7]:
import json

with open('../datasets/geography.json', 'r') as f:
    geography = json.load(f)
    
pairs = [
('ALABAMA', 'AL'),
('ALASKA', 'AK'),
('AMERICAN SAMOA', 'AS'),
('ARIZONA', 'AZ'),
('ARKANSAS', 'AR'),
('CALIFORNIA', 'CA'),
('COLORADO', 'CO'),
('CONNECTICUT', 'CT'),
('DELAWARE', 'DE'),
('DISTRICT OF COLUMBIA', 'DC'),
('FLORIDA', 'FL'),
('GEORGIA', 'GA'),
('GUAM', 'GU'),
('HAWAII', 'HI'),
('IDAHO', 'ID'),
('ILLINOIS', 'IL'),
('INDIANA', 'IN'),
('IOWA', 'IA'),
('KANSAS', 'KS'),
('KENTUCKY', 'KY'),
('LOUISIANA', 'LA'),
('MAINE', 'ME'),
('MARYLAND', 'MD'),
('MASSACHUSETTS', 'MA'),
('MICHIGAN', 'MI'),
('MINNESOTA', 'MN'),
('MISSISSIPPI', 'MS'),
('MISSOURI', 'MO'),
('MONTANA', 'MT'),
('NEBRASKA', 'NE'),
('NEVADA', 'NV'),
('NEW HAMPSHIRE', 'NH'),
('NEW JERSEY', 'NJ'),
('NEW MEXICO', 'NM'),
('NEW YORK', 'NY'),
('NORTH CAROLINA', 'NC'),
('NORTH DAKOTA', 'ND'),
('NORTHERN MARIANA IS', 'MP'),
('OHIO', 'OH'),
('OKLAHOMA', 'OK'),
('OREGON', 'OR'),
('PENNSYLVANIA', 'PA'),
('PUERTO RICO', 'PR'),
('RHODE ISLAND', 'RI'),
('SOUTH CAROLINA', 'SC'),
('SOUTH DAKOTA', 'SD'),
('TENNESSEE', 'TN'),
('TEXAS', 'TX'),
('UNITED STATES', 'US'),
('UTAH', 'UT'),
('VERMONT', 'VT'),
('VIRGINIA', 'VA'),
('VIRGIN ISLANDS', 'VI'),
('WASHINGTON', 'WA'),
('WEST VIRGINIA', 'WV'),
('WISCONSIN', 'WI'),
('WYOMING', 'WY'),
]

def fix_case(state: str):
    return ' '.join(word.capitalize() for word in state.split(' '))

geography['state_abbreviations'] = {fix_case(tup[0]): tup[1] for tup in pairs}
geography['state_names'] = {tup[1]: fix_case(tup[0]) for tup in pairs}

with open('../datasets/geography.json', 'w') as f:
    json.dump(geography, f, indent=2)

In [42]:
import glob
from operator import itemgetter
import json

metas =  glob.glob('../tmp/legiscan/*meta*.json')
def get_meta(meta_name):
    with open(meta_name, 'r') as f:
        return json.load(f)
    
def summarize(meta):
    bill = meta['bill']
    state, bill_id, status_date, legiscan_bill_id = itemgetter('state', 'bill_number', 'status_date', 'bill_id')(bill)
    
    return {
        'state': state,
        'bill_id': bill_id,
        'status_date': status_date,
        'legiscan_bill_id': legiscan_bill_id,
    }

[summarize(get_meta(meta)) for meta in metas][0:20]

[{'state': 'US',
  'bill_id': 'HB1112',
  'status_date': '2023-02-21',
  'legiscan_bill_id': 1717626},
 {'state': 'OK',
  'bill_id': 'SB937',
  'status_date': '2023-02-06',
  'legiscan_bill_id': 1669660},
 {'state': 'KS',
  'bill_id': 'SB233',
  'status_date': '2023-03-01',
  'legiscan_bill_id': 1704027},
 {'state': 'MI',
  'bill_id': 'HB4075',
  'status_date': '2023-02-07',
  'legiscan_bill_id': 1697993},
 {'state': 'SC',
  'bill_id': 'H3801',
  'status_date': '2023-01-25',
  'legiscan_bill_id': 1679817},
 {'state': 'AR',
  'bill_id': 'SB270',
  'status_date': '2023-03-07',
  'legiscan_bill_id': 1708046},
 {'state': 'NM',
  'bill_id': 'HM57',
  'status_date': '2023-02-17',
  'legiscan_bill_id': 1714178},
 {'state': 'TX',
  'bill_id': 'SB437',
  'status_date': '2023-01-12',
  'legiscan_bill_id': 1657171},
 {'state': 'PA',
  'bill_id': 'HB138',
  'status_date': '2023-03-08',
  'legiscan_bill_id': 1730236},
 {'state': 'ID',
  'bill_id': 'S1100',
  'status_date': '2023-03-09',
  'legiscan

In [54]:
from bs4 import BeautifulSoup as Soup
from itertools import chain, islice, takewhile
import json
import re
import requests
import string
import time

TRACKER_URL = 'https://www.equalitytexas.org/legislature/legislative-bill-tracker-2023'
OUTPUT_PATH = '../datasets/equalitytexas.json'

def extract_row(row):
    cells = row.find_all('td')
    d = re.search(r'\d{2}/\d{2}/\d{4}', cells[3].text)
    return {
        'state': 'TX',
        'bill_id': cells[0].text,
        'sponsors': [sponsor for sponsor in cells[1].text.split(' ') if sponsor not in string.punctuation],
        'description': cells[2].text,
        'status_date': d.group(0) if d else '',
    }

start_time = time.time()

page = requests.get(TRACKER_URL)

soup = Soup(page.content, 'html.parser')
bad_bills = soup.find(id='bad-bills')

bill_tables = islice(bad_bills.parent.parent.find_all('table'), 1, None)
relevant_rows = chain.from_iterable(
    (row for row in tbl.find_all('tr') if not row.find('th'))
    for tbl
    in bill_tables
)

dataset = list((takewhile(lambda r: r['bill_id'] != '#N/A', (extract_row(row) for row in relevant_rows))))

with open(OUTPUT_PATH, 'w') as f:
    json.dump(dataset, f, indent=2)

end_time = time.time()

print(f'Data "equalitytexas.json" refreshed with {len(dataset)} items ({(end_time-start_time):.2f}s elapsed)')

Data "equalitytexas.json" refreshed with 139 items (0.16s elapsed)


In [265]:
from bs4 import BeautifulSoup as Soup
from itertools import chain
import json
from pprint import pprint
from pyjsparser import parse
import requests
from urllib.parse import urljoin

URL = 'https://tracktranslegislation.com'
page = requests.get(URL)
soup = Soup(page.content, 'html.parser')
script_tags = soup.find_all('script')
sources = [source_tag.attrs['src'] for source_tag in soup.find_all('script') if source_tag.has_attr('src')]
quarry = urljoin(URL, next(source for source in sources if 'chunks/70-' in source))
script_contents = requests.get(quarry).text

def find_in_graph(subgraph):
    results = []
    items = subgraph.values() if isinstance(subgraph, dict) else subgraph
    local_results = (item for item in items if isinstance(item, str) and len(item) > 10000)
    return (chain.from_iterable([local_results, *(find_in_graph(item) for item in items if isinstance(item, dict) or isinstance(item,list))]))

parsed = parse(script_contents)
candidates = find_graph(parsed)
jsonstr = next(candidates)
len(json.loads(jsonstr))


372


In [181]:
t1 = ('a', 3)
t2 = (*t1, 'b')
t2

('a', 3, 'b')

In [106]:
from itertools import chain, islice
import json

mapper = {}
with open('resolver_map.json', 'r') as f:
    mapper = json.load(f)

list(islice(chain.from_iterable(
    ((state, k, v) for k, v in m['bills'].items())
    for state, m 
    in mapper.items()
), 10))


[('AK', 'HB27', 1646385),
 ('AK', 'SB96', 1730580),
 ('AK', 'HB105', 1730818),
 ('AR', 'HB1156', 1662211),
 ('AR', 'HB1468', 1715730),
 ('AR', 'SB125', 1680399),
 ('AR', 'SB199', 1696352),
 ('AR', 'SB270', 1708046),
 ('AR', 'SB43', 1646838),
 ('AR', 'SB294', 1715618)]

In [77]:
from mergedeep import merge

a = {
    'foo': {
        'bar': 'baz',
        'glarch': 'glarch',
    },
    'bar': 3,
    'arr': [3]
}

b = {
    'foo': {
        'bar': 'bar',
    },
    'baz': 'baz',
    'bar': {
        'blah': 'blah'
    },
    'arr': [4]
}

c = {
    'foo': { }
}

dummy = {}
merge(dummy, a, b, c)
pprint(dummy)


{'arr': [4],
 'bar': {'blah': 'blah'},
 'baz': 'baz',
 'foo': {'bar': 'bar', 'glarch': 'glarch'}}


In [81]:
import json
from legiscan import legiscan_api
import requests

@legiscan_api
def do_search(state: str, term: str, api_key: str):
    short_url = 'https://api.legiscan.com/'
    assemble_url = f'https://api.legiscan.com/?key={api_key}&op=getSearch&state={state}&query={term.replace(" ", "+")}'
    assemble_params = {
        'key': api_key,
        'op': 'getSearch',
        'state': state,
        'query': term,
    }
    return requests.get(short_url, params=assemble_params).text

json.loads(do_search('ME', '577')) # needs to be exact or the search can't find it :/

{'status': 'OK',
 'searchresult': {'summary': {'page': '1 of 1',
   'range': '1 - 4',
   'relevancy': '100% - 91%',
   'count': 4,
   'page_current': 1,
   'page_total': 1,
   'query': '(577:(pos=1))'},
  '0': {'relevance': 100,
   'state': 'ME',
   'bill_number': 'LD930',
   'bill_id': 1724385,
   'change_hash': 'e268ccb1b9c79c60f817f50ca912edd3',
   'url': 'https://legiscan.com/ME/bill/LD930/2023',
   'text_url': 'https://legiscan.com/ME/text/LD930/2023',
   'research_url': 'https://legiscan.com/ME/research/LD930/2023',
   'last_action_date': '2023-03-02',
   'last_action': 'On motion by Senator Carney of Cumberland, REFERRED to the Committee on Judiciary, in concurrence.',
   'title': "An Act to Allow Only Students of Female Gender to Participate in Women's and Girls' Scholastic Sports"},
  '1': {'relevance': 99,
   'state': 'ME',
   'bill_number': 'LD577',
   'bill_id': 1702330,
   'change_hash': '17188394c2eec6b2163091eb1ddda484',
   'url': 'https://legiscan.com/ME/bill/LD577/2023

In [2]:
from bs4 import BeautifulSoup as Soup
from itertools import islice
from nltk import download as nltk_download, pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string

nltk_download('punkt')
nltk_download('averaged_perceptron_tagger')
nltk_download('wordnet')

lem = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def extract_html_file(file_path: str):
    soup = None
    with open(file_path, 'r') as f:
        soup = Soup(f, 'html.parser')

    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    text = soup.get_text()
    tokens = (token for token in word_tokenize(text) if token not in string.punctuation)
    return list(tokens)

[(orig, lem) for orig, lem in ((word, lem.lemmatize(word, get_wordnet_pos(pos))) for word, pos in pos_tag(extract_html_file('bills/FL_S0254.html'))) if orig != lem]

[nltk_data] Downloading package punkt to /Users/amy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/amy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[('entitled', 'entitle'),
 ('relating', 'relate'),
 ('treatments', 'treatment'),
 ('amending', 'amend'),
 ('granting', 'grant'),
 ('courts', 'court'),
 ('children', 'child'),
 ('are', 'be'),
 ('are', 'be'),
 ('being', 'be'),
 ('subjected', 'subject'),
 ('prescriptions', 'prescription'),
 ('procedures', 'procedure'),
 ('amending', 'amend'),
 ('requiring', 'require'),
 ('determining', 'determine'),
 ('is', 'be'),
 ('proceedings', 'proceeding'),
 ('amending', 'amend'),
 ('requiring', 'require'),
 ('courts', 'court'),
 ('as', 'a'),
 ('purposes', 'purpose'),
 ('determining', 'determine'),
 ('proceedings', 'proceeding'),
 ('prohibiting', 'prohibit'),
 ('treating', 'treat'),
 ('as', 'a'),
 ('circumstances', 'circumstance'),
 ('amending', 'amend'),
 ('defining', 'define'),
 ('purposes', 'purpose'),
 ('warrants', 'warrant'),
 ('proceedings', 'proceeding'),
 ('amending', 'amend'),
 ('providing', 'provide'),
 ('courts', 'court'),
 ('determinations', 'determination'),
 ('made', 'make'),
 ('circums

In [3]:
from bs4 import BeautifulSoup as Soup
from itertools import islice
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from pprint import pprint
import string

nltk.download('punkt')

def extract_html_file(file_path: str):
    soup = None
    with open(file_path, 'r') as f:
        soup = Soup(f, 'html.parser')

    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    text = soup.get_text()
    tokens = (token for token in word_tokenize(text) if token not in string.punctuation)
    return tokens
    # break into lines and remove leading and trailing space on each
    #lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    #chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    #text = "\n".join(chunk for chunk in chunks if chunk)

    #return text

ps = PorterStemmer()

pprint([(word, ps.stem(word)) for word in islice(extract_html_file('bills/FL_S0254.html'), 100)])


[('Florida', 'florida'),
 ('Senate', 'senat'),
 ('2023', '2023'),
 ('SB', 'sb'),
 ('254', '254'),
 ('By', 'by'),
 ('Senator', 'senat'),
 ('Yarborough', 'yarborough'),
 ('4-01859G-23', '4-01859g-23'),
 ('2023254__', '2023254__'),
 ('1', '1'),
 ('A', 'a'),
 ('bill', 'bill'),
 ('to', 'to'),
 ('be', 'be'),
 ('entitled', 'entitl'),
 ('2', '2'),
 ('An', 'an'),
 ('act', 'act'),
 ('relating', 'relat'),
 ('to', 'to'),
 ('treatments', 'treatment'),
 ('for', 'for'),
 ('sex', 'sex'),
 ('reassignment', 'reassign'),
 ('3', '3'),
 ('amending', 'amend'),
 ('s.', 's.'),
 ('61.517', '61.517'),
 ('F.S', 'f.'),
 ('granting', 'grant'),
 ('courts', 'court'),
 ('of', 'of'),
 ('this', 'thi'),
 ('4', '4'),
 ('state', 'state'),
 ('temporary', 'temporari'),
 ('emergency', 'emerg'),
 ('jurisdiction', 'jurisdict'),
 ('over', 'over'),
 ('children', 'children'),
 ('5', '5'),
 ('present', 'present'),
 ('in', 'in'),
 ('this', 'thi'),
 ('state', 'state'),
 ('if', 'if'),
 ('they', 'they'),
 ('are', 'are'),
 ('at', 'at')

[nltk_data] Downloading package punkt to /Users/amy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# import base64
import json
from pprint import pprint

filename = 'tmp/bill_meta_TX_HB1532.json'

result = None
with open(filename, 'r') as f:
    result = json.load(f)['bill']

extra = result['texts'][0].copy()
extra['date'] = '2023-01-01'
result['texts'].append(extra)
# pprint(result['texts'])
s = sorted(result['texts'], key=lambda x: x['date'], reverse=True)
doc_id = result['texts'][0]['doc_id']
doc_id

2658134

In [5]:
from legiscan import legiscan_api
import pandas as pd

raw = pd.read_json('tracktranslegislation.json')
sample = raw.sample(n=3, random_state=1234)

@legiscan_api
def get_bill_meta(legiscan_bill_id: str, api_key: str):
    assembled_url = f'https://api.legiscan.com/?key={api_key}&op=getBill&id={legiscan_bill_id}'
    resp = requests.get(assembled_url)

    if resp.ok:
        parsed = json.loads(resp.text)
        if parsed['status'] == 'ERROR':
            print(parsed['alert']['message'])
            return None
        return resp
    else:
        print(resp.status_code)
        return None
    
for idx, row in sample.iterrows():
    local_filename = os.path.join(
        'tmp',
        '_'.join([
            'bill',
            'meta',
            row["state"],
            *row["billId"].split(' '),
        ])
    ) + '.json'
    
    if os.path.exists(local_filename):
        print(f'skipping {local_filename}')
        continue
    
    resp = None
    resp = get_bill_meta(row['legiscanId'])
    if not resp:
        print(f'Could not download {local_filename}')
        continue
    
    print(f'got {local_filename}')
    with open(local_filename, 'wb') as f:
        f.write(resp.content)


NameError: name 'os' is not defined

In [None]:
raw = pd.read_json('tracktranslegislation.json')
sample = raw.sample(n=3, random_state=1234)
sample

# 7 has wrong legiscanId, should be 2721785 i think
# 44 also has wrong legiscanId

In [None]:
import base64
import json
from pprint import pprint

filename = 'bills/TX_HB1532'

result = None
with open(filename, 'r') as f:
    result = json.load(f)['text']

doc = result['doc']
extension = result['mime'].split('/')[-1]
new_filename = '.'.join([filename, extension])

with open(new_filename, 'wb') as f:
    f.write(base64.b64decode(doc))


In [None]:
import json
from legiscan import legiscan_api
import os
import pandas as pd
from pprint import pprint
import requests

@legiscan_api
def get_bill_text(legiscan_bill_id: str, api_key: str):
    # https://api.legiscan.com/?key=5f61f50916512f9f21500f38877c22f7&op=getBillText&id=2736883
    assembled_url = f'https://api.legiscan.com/?key={api_key}&op=getBillText&id={legiscan_bill_id}'
    resp = requests.get(assembled_url)

    if resp.ok:
        parsed = json.loads(resp.text)
        if parsed['status'] == 'ERROR':
            print(parsed['alert']['message'])
            return None
        return resp
    else:
        print(resp.status_code)
        return None

raw = pd.read_json('tracktranslegislation.json')
sample = raw.sample(n=3, random_state=1234)

for idx, row in sample.iterrows():
    local_filename = os.path.join(
        'bills',
        '_'.join([
            row["state"],
            *row["billId"].split(' '),
        ])
    )
    
    if os.path.exists(local_filename):
        print(f'skipping {local_filename}')
    
    resp = get_bill_text(row['legiscanId'])
    if not resp:
        print(f'Could not download {local_filename}')
        continue
    
    with open(local_filename, 'wb') as f:
        f.write(resp.content)


In [None]:
from functools import wraps
from legiscan import legiscan_api
import os

@legiscan_api
def sample_api_action(api_key: str):
    print(f'api key is {api_key}')

@legiscan_api
def get_bill_text(legiscan_bill_id: str, api_key: str):
    # https://api.legiscan.com/?key=5f61f50916512f9f21500f38877c22f7&op=getBillText&id=2736883
    assembled_url = f'https://api.legiscan.com/?key={api_key}&op=getBillText&id={legiscan_bill_id}'
    print(assembled_url)
    
sample_api_action(api_key='foo')


get_bill_text('12345')

In [None]:
from bs4 import BeautifulSoup
import requests
import urllib.parse

host = 'https://www.house.mo.gov'
# url = f'{host}/BillContent.aspx?bill=HB1258&year=2023&code=R'
url = urllib.parse.urljoin(host, 'BillContent.aspx?bill=HB1258&year=2023&code=R')
page = requests.get(url)

# print(page.text)
soup = BeautifulSoup(page.content)
urllib.parse.urljoin(
    host, 
    soup.find_all(class_='textType')[0].find('a')['href'],
)


In [None]:
import pandas as pd
import requests
from typing import Optional
import urllib.parse

def test_url(url: str, parent_id: str, anchor_index: int):
    page = requests.get(url)
    soup = BeautifulSoup(page.content)
    container = soup.find(id=parent_id)
    if not container:
        return None
    
    try:
        return container.find_all('a')[anchor_index]['href']
    except:
        return None

def prepare_url(relative_url: Optional[str]):
    if not relative_url:
        return 'NO RESULT'
    
    return urllib.parse.urljoin('https://legiscan.com/', relative_url)

def process_as_bill(frame) -> Optional[str]:
    return process_bill_link(frame['billLink'])

def process_bill_link(url: str) -> Optional[str]:
    print(url)
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='bill-last-action')
        anchors = container.find_all('a')
        href = anchors[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except:
        return None

def process_as_text(frame) -> Optional[str]:
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    year = row['billLink'].split('/')[-1] # don't trust statusDate
    text_link = f'https://legiscan.com/{state}/text/{bill_id}/{year}'
    return process_text_link(text_link)

def process_text_link(url: str) -> Optional[str]:
    print(url)
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except:
        return None

def process_as_draft(frame) -> Optional[str]:
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    year = row['billLink'].split('/')[-1] # don't trust statusDate
    text_link = f'https://legiscan.com/{state}/drafts/{bill_id}/{year}'
    return process_text_link(text_link)
    
def process_draft_link(url: str) -> Optional[str]:
    print(url)
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except:
        return None

raw = pd.read_json('tracktranslegislation.json')
sample = raw.copy()
# sample = raw.sample(n=20, random_state=2)
# sample = raw.loc[raw.state == 'AR']
# sample = raw.loc[0:20]
# print(sample)

for idx, row in sample.iterrows():
    continue
    bill_id = ' '.join([row['state'], row['billId']])
#    year = row['billLink'].split('/')[-1]
    
#    bill_link = row['billLink']
#    draft_link = f'https://legiscan.com/{row["state"]}/drafts/{row["billId"].replace(" ", "")}/{year}' # https://legiscan.com/AZ/drafts/HB2517/2023
#    text_link = f'https://legiscan.com/{row["state"]}/text/{row["billId"].replace(" ", "")}/{year}'
#    comments_link = f'https://legiscan.com/{row["state"]}/comments/{row["billId"].replace(" ", "")}/{year}'
    
    searches = [
        process_as_bill,
        process_as_text,
    ]

    print(f'{bill_id}')
    for search in searches:
        print(search(row))

    print()


In [None]:
# https://legiscan.com/TX/comments/HB1029/2023

from bs4 import BeautifulSoup
from pprint import pprint
import requests
from typing import Optional
import urllib.parse

def process_as_bill(frame) -> Optional[str]:
    return process_bill_link(frame['billLink'])

def process_bill_link(url: str) -> Optional[str]:
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='bill-last-action')
        anchors = container.find_all('a')
        href = anchors[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except:
        return None

def process_as_text(frame) -> Optional[str]:
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    year = row['billLink'].split('/')[-1] # don't trust statusDate
    text_link = f'https://legiscan.com/{state}/text/{bill_id}/{year}'
    return process_text_link(text_link)

def process_text_link(url: str) -> Optional[str]:
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except:
        return None

def process_as_draft(frame) -> Optional[str]:
    state = frame['state']
    bill_id = frame['billId'].replace(' ', '')
    year = row['billLink'].split('/')[-1] # don't trust statusDate
    text_link = f'https://legiscan.com/{state}/drafts/{bill_id}/{year}'
    return process_text_link(text_link)
    
def process_draft_link(url: str) -> Optional[str]:
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        container = soup.find(id='gaits-wrapper')
        href = container.find_all('a')[-1]['href']
        return urllib.parse.urljoin('https://legiscan.com', href)
    except:
        return None

    
#text_link = 'https://legiscan.com/TX/text/HB3147/2023'
#print(text_link)
#print(process_text_link(text_link))

#bill_link = 'https://legiscan.com/TX/bill/HB976/2023'
#print(bill_link)
#print(process_bill_link(bill_link))
#print(process_as_bill(sample.loc[303]))

print(process_as_bill(raw.loc[251]))
print(process_as_text(raw.loc[251]))
print(process_as_draft(raw.loc[251]))

#host = 'https://www.house.mo.gov'
# url = f'{host}/BillContent.aspx?bill=HB1258&year=2023&code=R'
#url = 'https://legiscan.com/SD/text/HB1080/2023' #urllib.parse.urljoin(host, 'BillContent.aspx?bill=HB1258&year=2023&code=R')
#page = requests.get(url)

# print(page.text)
#soup = BeautifulSoup(page.content)
#container = soup.find(id='gaits-wrapper')
#pprint(container.find_all('a'))
#urllib.parse.urljoin(
#    host, 
#    soup.find_all(class_='textType')[0].find('a')['href'],
#)