In [1]:
import os
import json 
import csv
import requests
from datetime import datetime as dt, timezone

import numpy as np
import pandas as pd

In [6]:
# --- Configuration -----------------------------------------------------------
# Congress number; every path below is derived from it.
CONGRESS = 116

# Input: per-bill directories holding the bill JSON and lastmod files.
CONGRESS_DATA_ROOT = f'E:/congress/data/{CONGRESS}'
BILL_DATA_ROOT = os.path.join(CONGRESS_DATA_ROOT, 'bills')

# Output / cache locations used by the BillTracker helpers in this notebook.
BT_DATA_ROOT = 'E:/BillTracker/data'
BT_CACHE_ROOT = 'E:/BillTracker/cache'
BT_LEGISLATORS = f'E:/BillTracker/legislators/legislators-{CONGRESS}.csv'
BT_PARTY_CACHE = os.path.join(BT_CACHE_ROOT, f'party-cache-{CONGRESS}.json')
BT_BILL_CACHE = os.path.join(BT_CACHE_ROOT, f'bill-cache-{CONGRESS}.json')
BT_BILLS = os.path.join(BT_DATA_ROOT, f'bills-{CONGRESS}.csv')
BT_SPONSORS = os.path.join(BT_CACHE_ROOT, f'bill-sponsors-{CONGRESS}.json')

# API key sent as the X-API-Key header to api.propublica.org.
# Read from the environment so the credential is never stored in the notebook;
# set PROPUBLICA_API_KEY before starting the kernel. An empty key makes the
# fallback party lookup fail with an HTTP error instead of leaking a secret.
KEY = os.environ.get('PROPUBLICA_API_KEY', '')

# Column order of the semicolon-delimited bill CSV files.
columns = ['bill_id', 'bill_type', 'party', 'introduced_at', 'status', 'subject', 'sponsor', 'cosponsors', 'status_at', 'lastmod']
# bill_types = ['hr', 's', 'hres', 'hconres', 'hjres', 'sconres', 'sjres', 'sres']



In [3]:
# Location of one sample bill's data.json, used for ad-hoc experiments with
# test(); the bare expression on the last line displays it as the cell output.
path = os.path.join(os.path.normpath(BILL_DATA_ROOT), 'hconres', 'hconres37', 'data.json')
path

def test(path):
    """Ad-hoc smoke test that exercises the bill helpers end to end.

    Loads the bill at ``path``, records the sample bill's lastmod value in a
    throwaway cache, then reads the combined bill CSV, prints it and writes
    it back out to a second ``-s`` CSV file.

    NOTE: relies on helpers defined in later cells, so those cells must be
    run before this function is called.
    """
    scratch_cache = {}
    _loaded_bill = load_bill(path)  # result is not inspected; only exercises the loader

    lastmod_file = os.path.join(BILL_DATA_ROOT, 'hconres', 'hconres37', 'fdsys_billstatus-lastmod.txt')
    lastmod_entry = {'lastmod': get_govinfo_lastmod(lastmod_file)}
    set_bill_cache('id', scratch_cache, entry=lastmod_entry)

    source_csv = os.path.join(BT_DATA_ROOT, f'bills-{CONGRESS}.csv')
    target_csv = os.path.join(BT_DATA_ROOT, f'bills-{CONGRESS}-s.csv')
    rows_by_id = get_bill_data(source_csv)
    print(rows_by_id)
    dump_to_csv(target_csv, list(rows_by_id.values()))

# test(path)

In [9]:
def get_legislators(path):
    """Read the legislators CSV at ``path`` and return it as a DataFrame."""
    legislator_table = pd.read_csv(path)
    return legislator_table

# Module-level legislator table, loaded once when this cell runs;
# get_party() reads this global for its primary lookup.
legislators = get_legislators(BT_LEGISLATORS)

def get_party(id):
    """Return the party of the legislator with bioguide id ``id``.

    The module-level ``legislators`` table is consulted first. When that
    lookup raises ``ValueError`` the party cache is checked, and if the cache
    has no truthy value the remote lookup ``get_party_req`` is used. Any
    other exception is printed and ``None`` is returned.
    """
    try:
        matching_rows = legislators[legislators['bioguide_id'] == id]
        return matching_rows['party'].item()
    except ValueError:
        # Series.item() raises ValueError unless exactly one row matched.
        cached_party = check_party_cache(id)
        if cached_party:
            return cached_party
        return get_party_req(id)
    except Exception as err:
        print("Unexpected Error: ", err)
        return None
    
# TODO: Create legislator cache  
def get_party_req(id, timeout=30):
    """Fetch a member's current party from the ProPublica Congress API.

    On success the party is also stored in the party cache via
    ``set_party_entry`` so later lookups avoid the network.

    Args:
        id: Bioguide id of the member to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``current_party`` string from the API response, or ``None`` when
        the request fails or the response cannot be interpreted.
    """
    url = f'https://api.propublica.org/congress/v1/members/{id}.json'
    print("Error getting {0}. Trying alternate download.".format(id))

    try:
        # The request itself is inside the try block: a connection failure or
        # timeout must yield None here rather than escaping to the caller,
        # where it would abort the bill row that is being built.
        r = requests.get(url, headers={"X-API-Key": KEY}, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        party = data['results'][0]['current_party']

        set_party_entry(id, entry={'party': party})

        return party
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        print("Error getting ProPublica request.", e)
        return None
    except Exception as e:
        print("Unexpected Error: ", e)
        return None

def get_json_cache(path):
    """Load a JSON cache file.

    Returns the decoded JSON content of ``path``, or an empty dict when the
    file does not exist.
    """
    if not os.path.exists(path):
        return {}

    with open(path, 'r') as cache_file:
        return json.load(cache_file)

# Module-level party cache loaded from disk (empty dict if the file is absent);
# read by check_party_cache() and written by set_party_entry().
party_cache = get_json_cache(BT_PARTY_CACHE)

def set_party_entry(id, entry):
    """Store ``entry`` under ``id`` in the module-level party cache."""
    party_cache.update({id: entry})

def dump_cache(path, cache):
    """Serialize ``cache`` as 4-space-indented JSON, overwriting ``path``."""
    with open(path, 'w') as cache_file:
        json.dump(obj=cache, fp=cache_file, indent=4)

def check_party_cache(id):
    """Return the cached party for ``id``, or ``None`` if ``id`` is not cached."""
    return party_cache[id]['party'] if id in party_cache else None

def set_cache_entry(id, cache, entry):
    """Store ``entry`` under ``id`` in ``cache`` and return the same cache object."""
    cache.update({id: entry})
    return cache

def is_in_cache(id, cache):
    """Return the value stored for ``id`` in ``cache``.

    Returns ``None`` when ``cache`` is empty/falsy or does not contain ``id``.
    """
    if not cache or id not in cache:
        return None

    return cache[id]

In [8]:
def load_bill(path):
    """Read one bill's ``data.json`` and flatten it into a CSV row dict.

    Args:
        path: Path to the bill's JSON file.

    Returns:
        A dict keyed by the bill CSV column names. If reading or any field
        extraction fails, the error is printed and the dict built so far is
        returned (it may be empty or only partially filled), so callers should
        not assume every key is present.
    """
    data = {}

    try:
        # JSON interchange text is UTF-8; do not depend on the platform's
        # default encoding when decoding it.
        with open(path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        # bill_id keeps only the part before the first '-'.
        data['bill_id'] = raw_data['bill_id'].split('-')[0]
        data['bill_type'] = raw_data['bill_type']
        data['party'] = get_party(raw_data['sponsor']['bioguide_id'])
        data['introduced_at'] = raw_data['introduced_at']
        data['status'] = raw_data['status']
        data['subject'] = raw_data['subjects_top_term']
        data['sponsor'] = raw_data['sponsor']['name']
        data['cosponsors'] = len(raw_data['cosponsors'])
        data['status_at'] = raw_data['status_at']
        # Time this row was generated, as a timezone-aware UTC timestamp.
        data['lastmod'] = dt.now(timezone.utc).isoformat(timespec='minutes')
    except Exception as e:
        print("Error loading: {0}".format(path), e)

    # Returned after the try block rather than from a ``finally`` clause, so
    # KeyboardInterrupt / SystemExit are no longer silently swallowed.
    return data

def load_sponsor_id(path):
    """Read one bill's ``data.json`` and extract its sponsor record.

    Args:
        path: Path to the bill's JSON file.

    Returns:
        A dict with the sponsor's bioguide id (``bio_id``) plus bill details.
        If reading or any field extraction fails, the error is printed and the
        dict built so far is returned (it may be empty, in which case
        ``bio_id`` is absent).
    """
    data = {}

    try:
        # JSON interchange text is UTF-8; do not depend on the platform's
        # default encoding when decoding it.
        with open(path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        data['bio_id'] = raw_data['sponsor']['bioguide_id']
        # bill_id keeps only the part before the first '-'.
        data['bill_id'] = raw_data['bill_id'].split('-')[0]
        data['party'] = get_party(raw_data['sponsor']['bioguide_id'])
        data['introduced_at'] = raw_data['introduced_at']
        data['subject'] = raw_data['subjects_top_term']
        data['sponsor'] = raw_data['sponsor']['name']
        data['cosponsors'] = len(raw_data['cosponsors'])
        data['status_at'] = raw_data['status_at']
        # Time this record was generated, as a timezone-aware UTC timestamp.
        data['lastmod'] = dt.now(timezone.utc).isoformat(timespec='minutes')
    except Exception as e:
        print("Error loading: {0}".format(path), e)

    # Returned after the try block rather than from a ``finally`` clause, so
    # KeyboardInterrupt / SystemExit are no longer silently swallowed.
    return data

In [5]:
def create_bill_csv(path):
    """Create (or truncate) the CSV at ``path`` containing only the header row.

    The header uses the module-level ``columns`` list and ';' as delimiter.
    """
    # newline='' lets the csv module control line endings; without it each row
    # ends in '\r\r\n' on Windows, which shows up as blank lines between rows.
    with open(path, 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, delimiter=';', fieldnames=columns)
        dict_writer.writeheader()

def is_bill_cache(id, lastmod, cache):
    """Return the ``lastmod`` value cached for bill ``id``.

    Returns ``None`` when ``cache`` is empty/falsy or has no entry for ``id``.
    The ``lastmod`` argument is accepted but not used by the lookup.
    """
    if not cache or id not in cache:
        return None

    return cache[id]['lastmod']

def get_govinfo_lastmod(path):
    """Return the contents of the lastmod file at ``path``, stripped of
    leading and trailing whitespace.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The former trailing ``return None`` could never execute (the with-block
    # always returns or raises), so it has been removed.
    with open(path, 'r') as f:
        return f.read().strip()

def set_bill_cache(id, cache, entry):
    """Record ``entry`` for bill ``id`` in ``cache`` and return the same cache object."""
    cache.update({id: entry})
    return cache

def get_bill_data(path):
    """Load a semicolon-delimited bill CSV into a dict keyed by ``bill_id``.

    Args:
        path: Path to the CSV file.

    Returns:
        ``{bill_id: row}`` for every row in the file, or an empty dict when
        the file does not exist. Later rows with a duplicate ``bill_id``
        replace earlier ones.
    """
    data = {}

    if os.path.exists(path):
        # newline='' is what the csv module expects for files it reads;
        # otherwise newlines embedded in quoted fields are rewritten.
        with open(path, 'r', newline='') as f:
            for row in csv.DictReader(f, delimiter=';'):
                data[row['bill_id']] = row

    return data

def dump_to_csv(path, data):
    """Write ``data`` to ``path`` as a semicolon-delimited CSV, overwriting it.

    Args:
        path: Destination CSV file.
        data: Iterable of row dicts keyed by the module-level ``columns``
            names. With ``csv.DictWriter`` defaults, missing keys are written
            as empty fields and unknown keys raise ``ValueError``.
    """
    # newline='' lets the csv module control line endings; without it each row
    # ends in '\r\r\n' on Windows, which shows up as blank lines between rows.
    with open(path, 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, delimiter=';', fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(data)
        


In [9]:
# TODO: Test cache 
# TODO: Segment cache into separate bill_type files

def write_bills_to_gcp(force=False):
    """Refresh the per-bill-type CSV files from the bill data directory.

    For every bill-type directory under ``BILL_DATA_ROOT`` the existing CSV is
    loaded, each bill whose lastmod file differs from the cached value (or
    that has no cached value) is re-read with ``load_bill``, and the CSV is
    rewritten. The party cache and bill cache are saved at the very end.
    Within this function all output goes to local CSV/JSON files.

    Args:
        force: When True, every bill is reloaded regardless of the cache.
    """
    # Bill cache maps bill_id -> {'lastmod': <text of the lastmod file>}.
    bill_cache = get_json_cache(BT_BILL_CACHE)

    # Only directories are bill-type roots; stray files are skipped instead of
    # crashing the inner scandir. The handle is closed by the with-block.
    with os.scandir(BILL_DATA_ROOT) as type_entries:
        type_dirs = [entry for entry in type_entries if entry.is_dir()]

    for type_dir in type_dirs:
        bill_root = type_dir.name

        # BillTracker CSV for this bill type; rows start from whatever is
        # already on disk (empty when the file does not exist yet).
        bt_bill_path = os.path.join(BT_DATA_ROOT, f'bills-{CONGRESS}-{bill_root}.csv')
        bill_data = get_bill_data(bt_bill_path)
        print("Processing directory: {0}".format(bill_root))

        # entry.path already includes BILL_DATA_ROOT; joining it onto the root
        # again would duplicate the prefix whenever the root is relative.
        with os.scandir(type_dir.path) as bill_entries:
            for bill_entry in bill_entries:
                data_path = os.path.join(bill_entry.path, 'data.json')
                lastmod_path = os.path.join(bill_entry.path, 'fdsys_billstatus-lastmod.txt')

                if not (os.path.exists(data_path) and os.path.exists(lastmod_path)):
                    continue

                lastmod = get_govinfo_lastmod(lastmod_path)
                bill_id = bill_entry.name
                cached_lastmod = is_bill_cache(bill_id, lastmod, bill_cache)

                # Reload when forced, when the bill was never cached, or when
                # the lastmod on disk differs from the cached one. A matching
                # cache entry means the bill is unchanged and is left alone.
                if force or not cached_lastmod or cached_lastmod != lastmod:
                    print(bill_id)
                    bill_cache = set_bill_cache(bill_id, bill_cache, entry={'lastmod': lastmod})
                    bill_data[bill_id] = load_bill(data_path)

        dump_to_csv(bt_bill_path, list(bill_data.values()))

        print("Success.")

    dump_cache(BT_PARTY_CACHE, party_cache)
    dump_cache(BT_BILL_CACHE, bill_cache)
 
    
# Run the bill export with the default force=False, so only bills whose
# lastmod differs from the cached value are reloaded.
write_bills_to_gcp()

Processing directory: hconres
Success.
Processing directory: hjres
Success.
Processing directory: hr
Success.
Processing directory: hres
Success.
Processing directory: s
Success.
Processing directory: sconres
Success.
Processing directory: sjres
Success.
Processing directory: sres
Success.


In [13]:
def write_bill_sponsors():
    """Build the sponsor cache from every bill under ``BILL_DATA_ROOT``.

    Each bill's ``data.json`` is read with ``load_sponsor_id``; the first
    record seen for a sponsor id not yet in the cache is stored under that
    id. The cache is written to ``BT_SPONSORS`` once all bills are processed.
    """
    sponsor_cache = get_json_cache(BT_SPONSORS)

    # Only directories are bill-type roots; stray files are skipped instead of
    # crashing the inner scandir. The handle is closed by the with-block.
    with os.scandir(BILL_DATA_ROOT) as type_entries:
        type_dirs = [entry for entry in type_entries if entry.is_dir()]

    for type_dir in type_dirs:
        print("Processing directory: {0}".format(type_dir.name))

        # entry.path already includes BILL_DATA_ROOT; joining it onto the root
        # again would duplicate the prefix whenever the root is relative.
        with os.scandir(type_dir.path) as bill_entries:
            for bill_entry in bill_entries:
                data_path = os.path.join(bill_entry.path, 'data.json')

                if not os.path.exists(data_path):
                    continue

                sponsor = load_sponsor_id(data_path)

                # load_sponsor_id returns an empty dict when the bill could
                # not be parsed, so 'bio_id' may be absent; skip such bills
                # instead of raising KeyError and losing the whole run.
                bio_id = sponsor.get('bio_id')
                if bio_id and not is_in_cache(bio_id, sponsor_cache):
                    print("F")
                    sponsor_cache = set_cache_entry(bio_id, sponsor_cache, sponsor)

    dump_cache(BT_SPONSORS, sponsor_cache)
                        
# Build/refresh the sponsor cache file; prints one line per bill-type directory.
write_bill_sponsors()

Processing directory: hconres
Processing directory: hjres
Processing directory: hr
Processing directory: hres
Processing directory: s
Processing directory: sconres
Processing directory: sjres
Processing directory: sres
