In [48]:
from itertools import chain, islice
import json
from legiscan import legiscan_api
from mergedeep import merge
from operator import itemgetter
import pandas as pd
from pprint import pprint
import re
import requests
from typing import Dict, Iterable, Set, Tuple

LEGISCAN_API_URL = 'https://api.legiscan.com/'

def infer_structure_updates(
    ttl_data_path: str = 'tracktranslegislation.json',
    resolver_map: str = 'resolver_map.json',
    hint_map: str = 'resolver_hints.json',
    persist_changes: bool = False,
):
    """Build the per-state bill resolver map from tracked-bill data.

    For every state found in the data at ``ttl_data_path`` this combines the
    ``billId -> legiscanId`` pairs observed there with whatever is already
    stored in the resolver map, then infers naming metadata (``prefixes`` and
    ``zfill``) from the combined bill names. Entries already present in the
    resolver map always win over freshly observed/inferred ones.

    Args:
        ttl_data_path: JSON file readable by ``pandas.read_json`` that
            provides ``state``, ``billId`` and ``legiscanId`` columns.
        resolver_map: JSON file holding the previously saved map. A missing
            or unreadable file is reported and treated as an empty map.
        hint_map: JSON file of explicit overrides merged on top of the
            inferred map. A missing or unreadable file is treated as empty.
        persist_changes: When true, write the inferred map (without hints)
            back to ``resolver_map``.

    Returns:
        A new dict: the inferred map with the hints merged into it.
    """
    def infer_prefixes(bill_ids: Iterable[str]) -> Set[str]:
        # Drop the trailing run of digits/whitespace: 'SB 17' -> 'SB'.
        return {re.sub(r'(\d|\s)+$', '', bill_id) for bill_id in bill_ids}

    def infer_zfill(bill_ids: Iterable[str]) -> int:
        # Length of the shortest numeric part seen, capped at 100 (which is
        # also the result when there are no bills at all).
        digits = [len(re.sub(r'^\D+', '', bill_id)) for bill_id in bill_ids]
        return min([100, *digits])

    def load_json_or_empty(path: str, label: str):
        # Deliberately best-effort: on a first run neither file may exist,
        # so any failure is reported and an empty map is used instead.
        try:
            with open(path, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f'Failed to load {label}: {e}')
            return {}

    mapper = load_json_or_empty(resolver_map, 'mapper')
    hints = load_json_or_empty(hint_map, 'hints')

    ttl_data = pd.read_json(ttl_data_path)

    for state, group in ttl_data.groupby(by='state'):
        state_map = mapper.get(state, {})

        observed_bills_map = {
            row['billId']: row['legiscanId']
            for _, row in group.iterrows()
        }
        # Later keys win, so previously stored ids override observed ones.
        state_map['bills'] = {
            **observed_bills_map,
            **state_map.get('bills', {}),
        }

        observed_meta = {
            'prefixes': list(infer_prefixes(state_map['bills'].keys())),
            'zfill': infer_zfill(state_map['bills'].keys()),
        }
        # Same precedence for metadata: stored values override inferred ones.
        state_map['meta'] = {
            **observed_meta,
            **state_map.get('meta', {}),
        }

        mapper[state] = state_map

    if persist_changes:
        # Write back to the file the map was loaded from; a hard-coded name
        # here would silently ignore a caller-supplied ``resolver_map``.
        with open(resolver_map, 'w') as f:
            json.dump(mapper, f)

    # after generating and saving inferred map, merge in explicit hints before returning
    hinted = {}
    merge(hinted, mapper, hints)
    return hinted

@legiscan_api
def locate_matches(state: str, candidate_name: str, api_key: str) -> Iterable[Dict]:
    """Search LegiScan for ``candidate_name`` within ``state``.

    Issues a single ``getSearch`` request and returns the hits contained in
    that response (no further pages are fetched).

    Args:
        state: State value passed as the ``state`` query parameter.
        candidate_name: Search text passed as the ``query`` parameter.
        api_key: LegiScan API key. Callers in this file omit it, so it is
            presumably supplied by the ``legiscan_api`` decorator (verify).

    Returns:
        A lazy iterable over the hit dicts, in ascending numeric key order.
    """
    assemble_params = {
        'key': api_key,
        'op': 'getSearch',
        'state': state,
        'query': candidate_name,
    }

    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(LEGISCAN_API_URL, params=assemble_params, timeout=30)
    search_result = response.json()['searchresult']

    # Hits are stored under the keys "0", "1", ... next to "summary".
    # Per the LegiScan manual, summary['count'] is the total over *all*
    # pages, so indexing range(count) raises KeyError once a search has more
    # hits than fit on one page. Iterate the keys actually present instead.
    hit_keys = sorted((key for key in search_result if key.isdigit()), key=int)
    return (search_result[key] for key in hit_keys)

def match_is_relevant(state: str, bill_id: str, match: Dict) -> bool:
    """Decide whether a search hit plausibly refers to ``bill_id``.

    A hit is accepted only when all of the following hold:

    * its ``state`` equals ``state``;
    * its ``relevance`` is at least 50;
    * its ``bill_number`` starts with the same character as ``bill_id``;
    * its ``bill_number`` ends with the numeric part of ``bill_id`` and that
      number is not merely the tail of a longer number (zero padding is
      allowed: ``HB0265`` matches ``HB 265``, ``HB1265`` does not).

    Args:
        state: Expected state value.
        bill_id: Bill name being resolved, e.g. ``'HB 265'``.
        match: Hit dict with ``state``, ``relevance`` and ``bill_number``.

    Returns:
        True if the hit passes every check, otherwise False.
    """
    if match['state'] != state:
        return False

    if match['relevance'] < 50:
        return False

    bill_number = match['bill_number']

    # Slicing instead of [0] so empty strings compare rather than raise.
    if bill_number[:1] != bill_id[:1]:
        return False

    wanted_digits = re.sub(r'^\D+', '', bill_id)
    if not bill_number.endswith(wanted_digits):
        return False

    if not wanted_digits:
        # No number to compare; nothing more to check.
        return True

    # A bare suffix test also accepts 'HB1265' for 'HB 265'. After removing
    # the matched suffix and any zero padding, whatever precedes it must not
    # end in a digit, otherwise the hit carries a different, longer number.
    leading = bill_number[:len(bill_number) - len(wanted_digits)].rstrip('0')
    return not leading[-1:].isdigit()

def attempt_resolve_one(mapper, state: str, bill_id: str, persist_changes=False):
    """Resolve one bill name against the map, searching LegiScan if needed.

    Resolution order:

    1. ``bill_id`` is already a key of the state's ``bills`` -> nothing to do.
    2. Exactly one known bill equals ``<prefix><zero-padded number>`` for a
       known prefix sharing ``bill_id``'s first character -> nothing to do.
    3. Otherwise search LegiScan for every such candidate name and keep the
       hits accepted by ``match_is_relevant``.

    Args:
        mapper: Resolver map; ``mapper[state]`` must provide ``bills`` and
            ``meta`` (with ``prefixes`` and ``zfill``).
        state: State key into ``mapper``.
        bill_id: Bill name to resolve, e.g. ``'SB 17'``.
        persist_changes: Unused here; kept for interface compatibility.

    Returns:
        ``None`` when the bill is already known (steps 1-2), otherwise the
        single relevant LegiScan hit dict.

    Raises:
        ValueError: unknown state, no known prefix for the bill, several
            known synonyms, no relevant hit, or several distinct relevant hits.
    """
    if state not in mapper:
        raise ValueError(f'Unknown state {state} for {state} {bill_id}')

    state_record = mapper[state]
    state_known_bills = state_record['bills'].keys()

    if bill_id in state_known_bills:
        return None

    prefixes, zfill = itemgetter('prefixes', 'zfill')(state_record['meta'])
    candidate_prefixes = [prefix for prefix in prefixes if prefix.startswith(bill_id[0])]
    if not candidate_prefixes:
        raise ValueError(f'Unknown prefix {bill_id[0]} for {state} {bill_id}')

    bill_digits = re.sub(r'^\D+', '', bill_id)

    # Rebuild the name the way this state's known bills are written.
    candidate_names = [f'{prefix}{bill_digits.zfill(zfill)}' for prefix in candidate_prefixes]
    synonyms = [bill for bill in state_known_bills if bill in candidate_names]

    if len(synonyms) == 1:
        return None
    if len(synonyms) > 1:
        raise ValueError(f'Multiple candidate synonyms {synonyms} for {state} {bill_id}')

    matches = chain.from_iterable(locate_matches(state, candidate_name) for candidate_name in candidate_names)

    # The same LegiScan bill can come back from more than one candidate-name
    # search. Key by bill_id so such repeats count once instead of raising a
    # spurious "Multiple relevant matches" error.
    relevant_by_id = {}
    for match in matches:
        if match_is_relevant(state, bill_id, match):
            relevant_by_id.setdefault(match['bill_id'], match)
    relevant_matches = list(relevant_by_id.values())

    if not relevant_matches:
        raise ValueError(f'No relevant matches for {state} {bill_id}')

    if len(relevant_matches) == 1:
        match = relevant_matches[0]
        print(f'Matching {state} {bill_id} with lsid {match["bill_id"]} {match["state"]} {match["bill_number"]} ({match["title"]})')
        return match

    match_ids = [match['bill_id'] for match in relevant_matches]
    raise ValueError(f'Multiple relevant matches for {state} {bill_id}: {match_ids}')

def augment_map(
    mapper,
    new_bills: Iterable[Tuple[str, str]],
    persist_changes: bool = False,
    resolver_map: str = 'resolver_map.json',
):
    """Resolve a batch of bills and record newly found LegiScan ids.

    Each ``(state, bill_id)`` pair is passed to ``attempt_resolve_one``. When
    that returns a hit, ``mapper[state]['bills']`` gains an entry mapping the
    hit's ``bill_number`` to its ``bill_id``. Failures are printed and do not
    stop the batch. A summary line is printed at the end.

    Args:
        mapper: Resolver map, updated in place.
        new_bills: Iterable of ``(state, bill_id)`` pairs; generators are
            accepted.
        persist_changes: When true, dump ``mapper`` as JSON afterwards.
        resolver_map: Destination file used when ``persist_changes`` is true.
    """
    # Materialise once so the total is known even for one-shot iterables.
    new_bills = list(new_bills)
    total_bills = len(new_bills)
    mapped = 0

    for new_bill in new_bills:
        try:
            match = attempt_resolve_one(mapper, *new_bill)
            if match:
                state, bill_number, bill_id = itemgetter('state', 'bill_number', 'bill_id')(match)
                mapper[state]['bills'][bill_number] = bill_id
            # Count only after any new entry was actually recorded.
            mapped += 1
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f'Error: {e}')

    print(f'Successfully mapped {mapped}/{total_bills} bills ({total_bills-mapped} not mapped)')
    if persist_changes:
        with open(resolver_map, 'w') as f:
            json.dump(mapper, f)
        
# Rebuild the resolver map from the tracked-bill data and save it to disk.
new_mapper = infer_structure_updates(persist_changes=True)

# Resolve every bill listed in the ACLU dataset against that map. Each pair
# is (state['value'], name), taken from the `state` and `name` columns.
aclu_data = pd.read_json('aclu.json')
aclu_bills = [
    (state_field['value'], bill_name)
    for state_field, bill_name in zip(aclu_data['state'], aclu_data['name'])
]
augment_map(new_mapper, aclu_bills, persist_changes=True)


Error: No relevant matches for ID HB 265
Error: No relevant matches for TX SB 17
Error: Unknown state NV for NV SB 288
Error: No relevant matches for TX SB 2199
Error: No relevant matches for TX SB 8
Error: Unknown prefix S for AK SB 96
Error: No relevant matches for AK HB 105
Error: No relevant matches for TX HB 4624
Error: No relevant matches for PA HB 138
Error: No relevant matches for IA SF 538
Error: No relevant matches for IA HF 616
Error: No relevant matches for IA HF 623
Error: No relevant matches for IA SF 482
Error: No relevant matches for IA SF 496
Error: No relevant matches for IA HF 622
Error: No relevant matches for TX HB 3164
Error: No relevant matches for KS SB 228
Error: No relevant matches for IA HF 508
Error: No relevant matches for ME LD 930
Error: Unknown prefix H for ME HP 577
Error: No relevant matches for IA HSB 208
Error: No relevant matches for IA HSB 222
Error: No relevant matches for FL SB 266
Error: No relevant matches for FL HB 999
Error: No relevant match