In [47]:
import numpy as np
import pandas as pd
import os 
import requests
import json
from bs4 import BeautifulSoup
import collections
collections.Callable = collections.abc.Callable
import time

In [2]:
propublica_token=os.environ['propublica_token']

In [3]:
useragent_url = 'https://httpbin.org/user-agent'
#website that exists to help you work with other APIs, generalized code that works in any situation, including a virtual machine
r= requests.get(useragent_url)
useragent= json.loads(r.text)['user-agent']
#standard thing to put in HTTP header to let websites know what kind of system you are using: best practice is to include it even
#though not all websites require it

In [4]:
headers = {'X-API-Key': propublica_token, 
          'User-Agent': useragent, 
          'From': 'brc4cb@virginia.edu'}
#Headers are what we send API about ourselves, adds an email address so they can reach you if you abuse the API

## Goal: Get the text of all the bills sponsored by Bob Good in the 117th Congress
### Step 1: Get Bob Good's ID Number from the Propublica members API

In [5]:
root = "https://api.propublica.org"
congress = '117'
chamber= 'house'
endpoint = "/congress/v1/{congress}/{chamber}/members.json".format(congress=congress, chamber=chamber)

r = requests.get(root + endpoint, 
                 headers = headers)
r

<Response [200]>

In [6]:
useragent

'python-requests/2.28.1'

In [7]:
myjson = json.loads(r.text)
membersdf = pd.json_normalize(myjson, record_path = ['results', 'members'])
membersdf.head(3).T
#Transposes the df and displays the first three observations

Unnamed: 0,0,1,2
id,A000370,A000055,A000371
title,Representative,Representative,Representative
short_title,Rep.,Rep.,Rep.
api_uri,https://api.propublica.org/congress/v1/members...,https://api.propublica.org/congress/v1/members...,https://api.propublica.org/congress/v1/members...
first_name,Alma,Robert,Pete
middle_name,,B.,
last_name,Adams,Aderholt,Aguilar
suffix,,,
date_of_birth,1946-05-27,1965-07-22,1979-06-19
gender,F,M,M


In [8]:
#Pull bob good by deleting all rows except the ones where last name equals good
bobgood = membersdf.query("last_name=='Good'")

In [9]:
#pull Bob Good's member ID
bobgoodid = bobgood.reset_index()['id'][0]

## Step 2: Use BG's ID to query the bills API

In [10]:
endpoint = '/congress/v1/members/{memberid}/bills/{billtype}.json'.format(memberid = bobgoodid, 
                                                                         billtype = 'introduced')
r = requests.get(root + endpoint, 
                 headers = headers)
r
myjson = json.loads(r.text)
# Only gives first 20 bills so we need to run it again and combine them into 1 df

bgbills1 = pd.json_normalize(myjson, record_path = ['results', 'bills'])

In [11]:
r = requests.get(root + endpoint, 
                 headers = headers, 
                params = {'offset': 20})
r
myjson = json.loads(r.text)
#Running again pulling only bills after bill 20

bgbills2 = pd.json_normalize(myjson, record_path = ['results', 'bills'])

In [12]:
bgbills = pd.concat([bgbills1, bgbills2], ignore_index=True)
#combine two DFs with ignore index which resets the row numbers 

In [13]:
bgbills
#Show df

Unnamed: 0,congress,bill_id,bill_type,number,bill_uri,title,short_title,sponsor_title,sponsor_id,sponsor_name,...,enacted,vetoed,cosponsors,committees,primary_subject,summary,summary_short,latest_major_action_date,latest_major_action,cosponsors_by_party.R
0,117,hr8935-117,hr,H.R.8935,https://api.propublica.org/congress/v1/117/bil...,To amend the Labor-Management Reporting and Di...,To amend the Labor-Management Reporting and Di...,Rep.,G000595,Robert Good,...,,,4,House Education and Labor Committee,,,,2022-09-21,Referred to the House Committee on Education a...,4
1,117,hr8767-117,hr,H.R.8767,https://api.propublica.org/congress/v1/117/bil...,To establish a private right of action for par...,Empowering Parents Act,Rep.,G000595,Robert Good,...,,,5,House Education and Labor Committee,Education,,,2022-09-02,Referred to the House Committee on Education a...,5
2,117,hres1297-117,hres,H.RES.1297,https://api.propublica.org/congress/v1/117/bil...,"Designating the week beginning November 7, 202...","Designating the week beginning November 7, 202...",Rep.,G000595,Robert Good,...,,,32,House Energy and Commerce Committee,Health,,,2022-07-28,Referred to the House Committee on Energy and ...,32
3,117,hres1167-117,hres,H.RES.1167,https://api.propublica.org/congress/v1/117/bil...,Providing for the consideration of the bill (H...,Providing for the consideration of the bill (H...,Rep.,G000595,Robert Good,...,,,67,House Rules Committee,Congress,,,2022-06-22,Motion to Discharge Committee filed by Mr. Goo...,67
4,117,hr7743-117,hr,H.R.7743,https://api.propublica.org/congress/v1/117/bil...,To amend the National Labor Relations Act to a...,Small Businesses before Bureaucrats Act,Rep.,G000595,Robert Good,...,,,13,House Education and Labor Committee,Labor and Employment,,,2022-05-12,Referred to the House Committee on Education a...,13
5,117,hres1077-117,hres,H.RES.1077,https://api.propublica.org/congress/v1/117/bil...,Amending the Rules of the House of Representat...,Less is More Resolution,Rep.,G000595,Robert Good,...,,,9,House Ethics Committee,Congress,,,2022-04-29,"Referred to the Committee on Rules, and in add...",9
6,117,hr7058-117,hr,H.R.7058,https://api.propublica.org/congress/v1/117/bil...,To prohibit abuse of the authority of the Secr...,Federal Student Loan Integrity Act,Rep.,G000595,Robert Good,...,,,17,House Education and Labor Committee,Education,,,2022-03-11,Referred to the House Committee on Education a...,17
7,117,hr7024-117,hr,H.R.7024,https://api.propublica.org/congress/v1/117/bil...,To direct the Secretary of State to designate ...,Every Town A Border Town Act,Rep.,G000595,Robert Good,...,,,15,House Judiciary Committee,Crime and Law Enforcement,Every Town A Border Town Act This bill require...,Every Town A Border Town Act This bill require...,2022-03-09,Referred to the House Committee on the Judiciary.,15
8,117,hr6628-117,hr,H.R.6628,https://api.propublica.org/congress/v1/117/bil...,To provide appropriations for a border wall an...,Close Biden’s Open Border Act,Rep.,G000595,Robert Good,...,,,19,House Foreign Affairs Committee,Economics and Public Finance,Close Biden's Open Border Act This bill provid...,Close Biden&#39;s Open Border Act This bill pr...,2022-02-07,"Referred to the Committee on Appropriations, a...",19
9,117,hr6446-117,hr,H.R.6446,https://api.propublica.org/congress/v1/117/bil...,To amend the Help America Vote Act of 2002 to ...,One Citizen One Vote Act,Rep.,G000595,Robert Good,...,,,11,House Committee on House Administration,Government Operations and Politics,One Citizen One Vote Act This bill prohibits t...,One Citizen One Vote Act This bill prohibits t...,2022-01-20,Referred to the House Committee on House Admin...,11


In [14]:
#Shows the text for the tenth most recent bill Bob Good sponsored in this year 
urltoscrape = bgbills['congressdotgov_url'][10] + '/text'

In [70]:
#connect to website
r = requests.get(urltoscrape, 
                headers = {'User-Agent': useragent, 
                          'From': 'brc4cb@virginia.edu'})

myhtml = BeautifulSoup(r.text, 'html.parser')

In [77]:
myhtml.find_all('p', "lbexIndent", style="text-align:left")[0].text
#Find the first new paragraph text aligned left and pull the text to get the whole text of the bill

' Be it enacted by the Senate and House of Representatives of the\n\t\tUnited States of America in Congress assembled,  SECTION 1.  Short title.This Act may be cited as the “Veterans Education is Timeless Act of 2021”. SEC. 2.  Elimination of delimiting dates under Department of Veterans Affairs educational assistance programs. (a) All volunteer force educational assistance program.— (1) IN GENERAL.—Section 3031 of title 38, United States Code, is amended to read as follows: “§ 3031.  No time limitation for use of eligibility and entitlement“Educational assistance benefits shall be afforded an eligible veteran under this chapter at any time and are not subject to any delimiting date.”. (2) CLERICAL AMENDMENT.—The table of sections at the beginning of chapter 30 of such title is amended by striking the item relating to section 3031 and inserting the following new item:“3031. No time limitation for use of eligibility and entitlement.”. (b) Post-Vietnam educational assistance program.—Sec

## Alternative and better way to get to text of bill in plain text file

In [22]:
#Shows the text for the tenth most recent bill Bob Good sponsored in this year in a txt format (plain text)
urltoscrape = bgbills['congressdotgov_url'][11] + '/text?format=txt'
urltoscrape

'https://www.congress.gov/bill/117th-congress/house-bill/5731/text?format=txt'

In [21]:
#connect to website
r = requests.get(urltoscrape, 
                headers = {'User-Agent': useragent, 
                          'From': 'brc4cb@virginia.edu'})

myhtml = BeautifulSoup(r.text, 'html.parser')
r

<Response [200]>

#### To search in source code for the text we want we can grab text from the bill and control f for it

In [23]:
#we want to pull the shown here text in the source code
myhtml.find_all('h3')
#Finds 5 occurrences of h3 tag in HTML 

[<h3>More on This Bill</h3>,
 <h3>Subject — Policy Area:</h3>,
 <h3>Give Feedback on This Bill</h3>,
 <h3 class="cdg-summary-wrapper-header">Text available as:</h3>,
 <h3 class="currentVersion">Shown Here:<br/><span>Introduced in House (10/26/2021)</span></h3>]

In [24]:
#now lets get what we want which is the 5th entry 
myhtml.find_all('h3', "currentVersion")
#pulls only the tags with the current version class 

[<h3 class="currentVersion">Shown Here:<br/><span>Introduced in House (10/26/2021)</span></h3>]

In [25]:
myhtml.find_all('h3', "currentVersion")[0]
#take it out of the list

<h3 class="currentVersion">Shown Here:<br/><span>Introduced in House (10/26/2021)</span></h3>

In [26]:
#now pull what we want from the tag span 
myhtml.find_all('h3', "currentVersion")[0].span

<span>Introduced in House (10/26/2021)</span>

In [27]:
#now pull the text
myhtml.find_all('h3', "currentVersion")[0].text

'Shown Here:Introduced in House (10/26/2021)'

### Getting text of the bill

In [31]:
#look for the doc tag
myhtml.find_all('DOC')
#doesn't work

[]

In [32]:
#try with pre instead
myhtml.find_all('pre')

[<pre id="billTextContainer">[Congressional Bills 117th Congress]
 [From the U.S. Government Publishing Office]
 [H.R. 5731 Introduced in House (IH)]
 
 <doc>
 
 
 
 
 
 
 117th CONGRESS
   1st Session
                                 H. R. 5731
 
  To provide that no Federal funds may be expended to implement certain 
          law enforcement partnerships, and for other purposes.
 
 
 _______________________________________________________________________
 
 
                     IN THE HOUSE OF REPRESENTATIVES
 
                             October 26, 2021
 
 Mr. Good of Virginia (for himself, Mr. Gosar, Mrs. Boebert, Mr. Duncan, 
  Mr. Perry, Mrs. Miller of Illinois, Mr. Cawthorn, Mr. Buck, Mr. Weber 
    of Texas, and Mr. Cloud) introduced the following bill; which was 
                referred to the Committee on the Judiciary
 
 _______________________________________________________________________
 
                                  A BILL
 
 
  
  To provide that no Federal 

In [33]:
#now get just the text
print(myhtml.find_all('pre')[0].text)

[Congressional Bills 117th Congress]
[From the U.S. Government Publishing Office]
[H.R. 5731 Introduced in House (IH)]








117th CONGRESS
  1st Session
                                H. R. 5731

 To provide that no Federal funds may be expended to implement certain 
         law enforcement partnerships, and for other purposes.


_______________________________________________________________________


                    IN THE HOUSE OF REPRESENTATIVES

                            October 26, 2021

Mr. Good of Virginia (for himself, Mr. Gosar, Mrs. Boebert, Mr. Duncan, 
 Mr. Perry, Mrs. Miller of Illinois, Mr. Cawthorn, Mr. Buck, Mr. Weber 
   of Texas, and Mr. Cloud) introduced the following bill; which was 
               referred to the Committee on the Judiciary

_______________________________________________________________________

                                 A BILL


 
 To provide that no Federal funds may be expended to implement certain 
         law enforcement pa

# Now all we need is a list of URLs to go to and all we need to do is change the URL, the bill pulling procedure will work the same

In [34]:
#basic for loop
for i in range(0,10): print(i)

0
1
2
3
4
5
6
7
8
9


In [36]:
for i in range(1,10):
    print('https://www.congress.gov/bill/117th-congress/house-bill/{i}/text?format=txt'.format(i=i))
#print first 10 bills for the 117th congress

https://www.congress.gov/bill/117th-congress/house-bill/1/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/2/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/3/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/4/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/5/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/6/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/7/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt
https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt


In [64]:
#Now we want to create a function to pull text from these urls, with URL as the argument
def scrape_one_bill(url):
    time.sleep(2)
    print('Now getting from ' + url)
    r = requests.get(url, headers = {'User-Agent': useragent, 'From': 'brc4cb@virginia.edu'})
    myhtml = BeautifulSoup(r.text, 'html.parser')
    try:
        billtext = myhtml.find_all('pre')[0].text
        return billtext
    except:
        pass
#scrapes on bill when we feed it a url and gives us the text of the bill
# need to add in a two second delay because of the robots.txt file on bills.gov mentioning you need to wait two seconds
# between scrapes

In [59]:
#list comprehension practice
[......  for i in range(1,10)]
#do a thing for every i in this range and put them into a list 

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1775158221.py, line 2)

In [60]:
#create a url list
urllist = ['https://www.congress.gov/bill/117th-congress/house-bill/{i}/text?format=txt'.format(i=i) for i in range(3,13)]

In [61]:
urllist

['https://www.congress.gov/bill/117th-congress/house-bill/3/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/4/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/5/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/6/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/7/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/10/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/11/text?format=txt',
 'https://www.congress.gov/bill/117th-congress/house-bill/12/text?format=txt']

In [62]:
# all you need for a spider is a list of URLs to go to, we gave it a predetermined list whereas other spiders might
# use code to scrape URLs from a starting webpage (like the coursera code)

### Run the Spider

In [65]:
bills = [scrape_one_bill(u) for u in urllist]
#unleash our spider and store results in a list called bills

Now getting from https://www.congress.gov/bill/117th-congress/house-bill/3/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/4/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/5/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/6/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/7/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/8/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/9/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/10/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/11/text?format=txt
Now getting from https://www.congress.gov/bill/117th-congress/house-bill/12/text?format=txt


In [72]:
print(bills[6])
#It works and prints beautifully!

None
