In [1]:
import numpy as np
import pandas as pd
import pymongo
import requests
import os 
from bson.json_util import loads, dumps

In [2]:
propublica_token = os.environ['propublica_token']
mongo_user = os.environ['MONGO_INITDB_ROOT_USERNAME']
mongo_pwd= os.environ['MONGO_INITDB_ROOT_PASSWORD']
mongo_initdb = os.environ['MONGO_INITDB_DATABASE']

In [4]:
import get_data

In [6]:
useragent= get_data.get_useragent()
useragent

'python-requests/2.28.1'

In [12]:
bills_list, num_results = get_data.get_bills_pp(propublica_token, useragent, email = 'brc4cb@virginia.edu', offset = 0 )

In [13]:
bills_list

[{'bill_id': 'sres830-117',
  'bill_slug': 'sres830',
  'bill_type': 'sres',
  'number': 'S.RES.830',
  'bill_uri': 'https://api.propublica.org/congress/v1/117/bills/sres830.json',
  'title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'short_title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'sponsor_title': 'Sen.',
  'sponsor_id': 'L000174',
  'sponsor_name': 'Patrick J. Leahy',
  'sponsor_state': 'VT',
  'sponsor_party': 'D',
  'sponsor_uri': 'https://api.propublica.org/congress/v1/members/L000174.json',
  'gpo_pdf_uri': None,
  'congressdotgov_url': 'https://www.congress.gov/bill/117th-congress/senate-resolution/830',
  'govtrack_url': 'https://www.govtrack.us/congress/bills/117/sres830',
  'introduced_date': '2022-11-14',
  'active': False,
  'last_vote': None,
  'house_passage': None,
  'senate_passage': None,
  'enac

In [22]:
myclient = pymongo.MongoClient(f"mongodb://{mongo_user}:{mongo_pwd}@mongo:27017/{mongo_initdb}?authSource=admin")
#@mongo is whatever you named the service in your docker compose file and the port is the port you allocated in the file as well

In [23]:
contrans_db = myclient['contrans']
#connects to contrans if it already exists and if it doesn't it creates it (it being your mongo DB)

In [24]:
#go inside database and define a new collection for the bills 

collist = contrans_db.list_collection_names()
#returns all the collections currently in the DB
if "bills" in collist:
    contrans_db.bills.drop()
#gets rid of bills if it is in collections already (i.e. in the database)

In [25]:
#create bills collection
bills = contrans_db['bills']

In [26]:
#add data to mongodb- pass in bills list
bills_insert = bills.insert_many(bills_list)

## Queries 

In [28]:
#get all records
myquery = bills.find({})
loads(dumps(myquery))
#only difference between this and original list is that it creates the _id object associated with the database

[{'_id': ObjectId('6373b28c984f175941eafba0'),
  'bill_id': 'sres830-117',
  'bill_slug': 'sres830',
  'bill_type': 'sres',
  'number': 'S.RES.830',
  'bill_uri': 'https://api.propublica.org/congress/v1/117/bills/sres830.json',
  'title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'short_title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'sponsor_title': 'Sen.',
  'sponsor_id': 'L000174',
  'sponsor_name': 'Patrick J. Leahy',
  'sponsor_state': 'VT',
  'sponsor_party': 'D',
  'sponsor_uri': 'https://api.propublica.org/congress/v1/members/L000174.json',
  'gpo_pdf_uri': None,
  'congressdotgov_url': 'https://www.congress.gov/bill/117th-congress/senate-resolution/830',
  'govtrack_url': 'https://www.govtrack.us/congress/bills/117/sres830',
  'introduced_date': '2022-11-14',
  'active': False,
  'last_vote': None,
  'house_pa

In [30]:
myquery = bills.find({'sponsor_name': 'Patrick J. Leahy'})
loads(dumps(myquery))
#just bills by patrick leahy

[{'_id': ObjectId('6373b28c984f175941eafba0'),
  'bill_id': 'sres830-117',
  'bill_slug': 'sres830',
  'bill_type': 'sres',
  'number': 'S.RES.830',
  'bill_uri': 'https://api.propublica.org/congress/v1/117/bills/sres830.json',
  'title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'short_title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'sponsor_title': 'Sen.',
  'sponsor_id': 'L000174',
  'sponsor_name': 'Patrick J. Leahy',
  'sponsor_state': 'VT',
  'sponsor_party': 'D',
  'sponsor_uri': 'https://api.propublica.org/congress/v1/members/L000174.json',
  'gpo_pdf_uri': None,
  'congressdotgov_url': 'https://www.congress.gov/bill/117th-congress/senate-resolution/830',
  'govtrack_url': 'https://www.govtrack.us/congress/bills/117/sres830',
  'introduced_date': '2022-11-14',
  'active': False,
  'last_vote': None,
  'house_pa

In [33]:
#find function has two arguments - first filters the rows, second filters the columns by setting them equal to 1 
myquery = bills.find({'sponsor_name': 'Patrick J. Leahy'}, {"_id":0,'number':1, 'sponsor_name':1, 'title':1})
loads(dumps(myquery))
#just bills by patrick leahy and we just want number, sponsor name, and title

[{'number': 'S.RES.830',
  'title': 'A resolution expressing support for the designation of the week of October 24, 2022, to October 31, 2022, as "Bat Week".',
  'sponsor_name': 'Patrick J. Leahy'}]

In [36]:
myquery = bills.find({}, {"_id":0,'number':1, 'sponsor_name':1, 'title':1})
pd.DataFrame.from_records(loads(dumps(myquery)))
#pull those columns for all rows and put in dataframe

Unnamed: 0,number,title,sponsor_name
0,S.RES.830,A resolution expressing support for the design...,Patrick J. Leahy
1,S.5082,A bill to impose sanctions with respect to Gen...,Joshua Hawley
2,S.5077,A bill to amend the Alaska Native Claims Settl...,Dan Sullivan
3,S.5081,A bill to establish an Office of Environmental...,Alex Padilla
4,S.5079,A bill to designate the facility of the United...,Amy Klobuchar
5,S.5076,A bill to require training for employees of Fe...,Joni Ernst
6,S.5080,A bill to direct the Secretary of Transportati...,Richard Blumenthal
7,S.5084,A bill to reprioritize Federal law enforcement...,Bill Hagerty
8,S.5083,A bill to require the Secretary of State to su...,Robert Menendez
9,S.5085,A bill to prohibit the government of the Distr...,Ted Cruz


In [38]:
#see if any bills have more than 0 cosponsors- it pulls all of them - full list of operators is in the textbook
myquery = bills.find({'cosponsors': {'$gt':0}}, {"_id":0,'number':1, 'sponsor_name':1, 'title':1, 'cosponsors':1})
pd.DataFrame.from_records(loads(dumps(myquery)))

Unnamed: 0,number,title,sponsor_name,cosponsors
0,S.RES.830,A resolution expressing support for the design...,Patrick J. Leahy,1
1,S.5081,A bill to establish an Office of Environmental...,Alex Padilla,7
2,S.5079,A bill to designate the facility of the United...,Amy Klobuchar,1
3,S.5076,A bill to require training for employees of Fe...,Joni Ernst,2
4,S.5084,A bill to reprioritize Federal law enforcement...,Bill Hagerty,5
5,S.5083,A bill to require the Secretary of State to su...,Robert Menendez,2
6,S.5078,A bill to designate the facility of the United...,Amy Klobuchar,1
7,S.J.RES.64,A joint resolution disapproving of the rule su...,Roger Marshall,1
8,H.R.9296,To direct the Director of the Bureau of Prison...,Jackie Speier,5
9,H.R.9291,To require a report on the death of Shireen Ab...,André Carson,18


In [41]:
#text search - search in summary column for all text 
bills.create_index(['summary', 'text'])

TypeError: not enough arguments for format string