In [1]:
import requests
from bs4 import BeautifulSoup

import json
import os

os.environ["PRO_PUBLICA_API_KEY"] = "xwiuchap9kE9Ts4kNKTmBsoCM3XjgO3meCWW7qsx"

import pandas as pd
import numpy as np

import zipfile
import pathlib
import tempfile
import itertools
import shutil

from pymongo import MongoClient

In [None]:
class FileMigrator(object):

    def __init__(self, congress):
        self.dir = tempfile.TemporaryDirectory()
        self.congress = str(congress)
        print(f"{self.dir.name}")
        
    def __enter__(self):
        return self
    
    def __exit__(self, *args):
        self.dir.cleanup()
        
    def extract_file(self, f):
        with zipfile.ZipFile(f,"r") as zip_ref:
            zip_ref.extractall(self.dir.name)

    def get_filetypes(self, ftype=".json"):
        found = []
        for root, dirs, files in os.walk(self.dir.name):
            for file in files:
                if file.endswith(ftype):
                    found.append(os.path.join(root, file))
        return found
    
    def migrate(self, files, loc):
        p = pathlib.Path(loc)
        p.mkdir(parents=True, exist_ok=True)
        for f in files:
            simpleid = f.split("/")[-2]
            shutil.move(f, os.path.join(loc, self.congress + "." + simpleid + ".json"))
    
    def migrate_to_mongo(self, files, db="congress", collection="bills", batchsize=1000):  
        client = MongoClient('localhost', 27017)
        coll = client[db][collection]
        
        batches = np.array_split(files, int(len(files) / batchsize) + 1)
        
        for batch in batches:
            batch_data = []
            for file in batch:
                data = json.load(open(file, "r"))
                batch_data.append(data)
            coll.insert_many(batch_data)
            
        client.close()
            
    def migrate_to_mongo_one_at_time(self, files, db="congress", collection="bills", batchsize=1000):  
        client = MongoClient('localhost', 27017)
        db = client[db]
        coll = db[collection]
        
        nerrors = 0
        try:
            for file in files:
                data = json.load(open(file, "r"))
                res = coll.insert_one(data)
                if not res:
                    nerrors += 1
        except:
            nerrors += 1
            
        print(len(files), nerrors)
            
        client.close()
            
                

In [None]:
for i in range(93, 116):
    with FileMigrator(i) as migrator:
        fname = migrator.dir
        migrator.extract_file(f"/Users/benellis/Downloads/all_bills/{i}.zip")
        files = migrator.get_filetypes()
        migrator.migrate_to_mongo_one_at_time(files)

In [59]:
def fetch_bill_text(bill_type, bill_id, congress):
        ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])
        mapper = {
            "hconres": "house-concurrent-resolution",
            "hjres": "house-joint-resolution",
            "hr": "house-bill",
            "hres": "house-resolution",
            "s": "senate-bill",
            "sconres": "senate-concurrent-resolution",
            "sjres" : "senate-joint-resolution",
            "sres": "senate-resolution"
        }
        if bill_type not in mapper:
            return None
        
        bill_num = bill_id.split("-")[0]
        bill_num = "".join([i for i in bill_num if i.isdigit()])
        
        url = f"https://www.congress.gov/bill/{ordinal(int(congress))}-congress/{mapper[bill_type]}/{bill_num}/text?format=txt"
        
        try:
            p = BeautifulSoup(requests.get(url).text)
            body = p.find(id="billTextContainer").get_text()
        except AttributeError:
            print(bill_type, bill_id, congress, url)
        return body
    
    
def fetch_bodies(congress):
    client = MongoClient('localhost', 27017)
    db = client["congress"]
    coll = db["bills"]
    data = coll.find(
        {"congress": {"$eq": congress}, "bill_id": {"$exists": True}, "bill_body": {"$eq": "NO BODY"}},
        {"bill_type": True, "bill_id": True, "congress": True}
    )
    
    errors = []
    
    for i, entry in enumerate(data):
        try:
            body = fetch_bill_text(entry["bill_type"], entry["bill_id"], entry["congress"])
            if body is None:
                adder = {"bill_body": "NO BODY"}
            else:
                adder = {"bill_body": body}
            coll.update_one({"_id": entry["_id"]}, {"$set": adder})
        except Exception as e:
            print(e)

    client.close()

In [54]:
def test_congress_billbody_avail():
    good_ones = []
    client = MongoClient('localhost', 27017)
    db = client["congress"]
    coll = db["bills"]
    for c in coll.distinct("congress"):
        data = coll.find_one(
            {"congress": {"$eq": str(c)}, "bill_id": {"$exists": True}},  # amendments have no bill_id
            {"bill_type": True, "bill_id": True, "congress": True}        # only need these cols to fetch bodies
        )
        body = fetch_bill_text(data["bill_type"], data["bill_id"], data["congress"])
        if body is not None:
            good_ones.append(str(c))
    client.close()
    return good_ones

In [4]:
good_congresses = test_congress_billbody_avail()

In [5]:
good_congresses

['101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115']

In [65]:
import multiprocessing.dummy as mp

In [68]:
with mp.Pool(4) as pool:
    pool.map(fetch_bodies, good_congresses)

hconres hconres330-101 101 https://www.congress.gov/bill/101st-congress/house-concurrent-resolution/330/text?format=txt
local variable 'body' referenced before assignment
hconres hconres258-102 102 https://www.congress.gov/bill/102nd-congress/house-concurrent-resolution/258/text?format=txt
local variable 'body' referenced before assignment
hr hr419-103 103 https://www.congress.gov/bill/103rd-congress/house-bill/419/text?format=txt
local variable 'body' referenced before assignment
s s1795-104 104 https://www.congress.gov/bill/104th-congress/senate-bill/1795/text?format=txt
local variable 'body' referenced before assignment
hr hr1592-101 101 https://www.congress.gov/bill/101st-congress/house-bill/1592/text?format=txt
local variable 'body' referenced before assignment
hjres hjres265-102 102 https://www.congress.gov/bill/102nd-congress/house-joint-resolution/265/text?format=txt
local variable 'body' referenced before assignment
hr hr2199-101 101 https://www.congress.gov/bill/101st-congres

In [69]:
print("DONE")

DONE


In [101]:
client = MongoClient('localhost', 27017)
db = client["congress"]
coll = db["bills"]
data = coll.find({"bill_body": {"$ne": "NO BODY"}, "bill_body": {"$exists": True}})
data_list = []
for i, d in enumerate(data):
    data_list.append(d)
    if i == 250:
        break
client.close()

In [102]:
sorted(data_list[0].keys())

['_id',
 'actions',
 'amendments',
 'bill_body',
 'bill_id',
 'bill_type',
 'committees',
 'congress',
 'cosponsors',
 'enacted_as',
 'history',
 'introduced_at',
 'number',
 'official_title',
 'popular_title',
 'related_bills',
 'short_title',
 'sponsor',
 'status',
 'status_at',
 'subjects',
 'subjects_top_term',
 'summary',
 'titles',
 'updated_at']

In [118]:
# data_list[2]["popular_title"]
for i in data_list:
    print(i["updated_at"])

2013-02-02T20:09:18-05:00
2013-02-02T20:09:19-05:00
2013-02-02T20:09:21-05:00
2013-02-02T20:09:21-05:00
2013-02-02T20:09:21-05:00
2013-02-02T20:09:21-05:00
2013-02-02T20:09:21-05:00
2013-02-02T20:09:21-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:19-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:19-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:19-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T20:09:22-05:00
2013-02-02T2