# Scraping Congress Members

Reference and inspiration: https://github.com/lobodemonte/big-data-for-public-policy/blob/master/congress_gov_scraper/congress_gov_scraper.ipynb

Information about all congress members in the House of Representatives has been scraped from congress.gov using the code below, adapted from the example in the link above to suit the needs of this project.

In [None]:
import json
import re
import requests
import pandas as pd 
import numpy as np
import urllib
import tqdm
from bs4 import BeautifulSoup

The approach is as follows:
* Extract urls with lists of all relevant congress members
* For each page, extract the URL of each congress member's personal page from the link in the list

In [None]:
# Congress numbers to include in the member search (110 through 116).
congress_numbers = [110, 111, 112, 113, 114, 115,116]
# Number of results requested per listing page (sent as the "pageSize" query parameter).
page_size = 250

In [None]:
# Generating urls with lists of congress members
def get_congress_url(congress_numbers, page_size, page):
    """Build the congress.gov member-search URL for one page of results.

    Args:
        congress_numbers: Iterable of congress numbers (e.g. ``[110, 111]``)
            used as the ``congress`` filter of the search query.
        page_size: Number of results per page (``pageSize`` parameter).
        page: Number of the result page to request (``page`` parameter).

    Returns:
        The complete ``https://www.congress.gov/members?...`` URL as a string.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlencode

    quoted_numbers = ",".join('"{0}"'.format(num) for num in congress_numbers)
    congress_q_str = '{"congress":[' + quoted_numbers + "]}"
    params = {
        "q": congress_q_str,
        "pageSize": page_size,
        "page": page,
        "searchResultViewType": "expanded",
        "KWICView": "true",
    }
    # Leave the JSON punctuation unescaped; quotes and commas are still
    # percent-encoded.
    actual_params = urlencode(params, safe="{}:[]")
    return "https://www.congress.gov/members?{}".format(actual_params)

def get_num_pages(soup):
    """Return the page count shown in the pagination element of a results page.

    Reads the text of the ``results-number`` span inside the ``pagination``
    div of the ``basic-search-tune-number`` div and returns the first
    whitespace-separated token that consists only of digits.
    """
    tune_div = soup.find("div", {"class": "basic-search-tune-number"})
    pagination_div = tune_div.find("div", {"class": "pagination"})
    results_text = pagination_div.find("span", {"class": "results-number"}).text
    numbers = [int(token) for token in results_text.split() if token.isdigit()]
    return int(numbers[0])

In [None]:
# Extract individual urls
def extract_member_name_url(entry):
    """Return ``[name, url]`` for one entry of the member result list.

    The name is the text of the entry's ``result-heading`` span. If it
    contains "Representative" that title is removed and the name stripped;
    otherwise the same is done for "Senator". A name containing neither
    title is returned unchanged. The URL is the heading link's ``href``
    prefixed with the congress.gov host.
    """
    heading = entry.find("span", {"class": "result-heading"})
    name = heading.text
    for title in ("Representative", "Senator"):
        if title in name:
            name = name.replace(title, "").strip()
            break  # only the first matching title is removed

    profile_url = "https://www.congress.gov" + heading.a['href']
    return [name, profile_url]

def extract_congress_members(congress_list):
    """Turn result-list entries into ``{"name": ..., "url": ...}`` dicts.

    Each entry is passed to ``extract_member_name_url``; the returned list
    keeps the order of ``congress_list``.
    """
    def _as_record(entry):
        name, url = extract_member_name_url(entry)
        return {"name": name, "url": url}

    return [_as_record(entry) for entry in congress_list]

In [None]:
# Build the URL of the first results page and show it.
congress_url = get_congress_url(congress_numbers, page_size, 1)
print("Congress URL: ", congress_url)

# Download the page and parse its HTML.
response = requests.get(congress_url).text
soup = BeautifulSoup(response, "html.parser")

# Read the number of result pages from the pagination element of that page.
num_pages = get_num_pages(soup)
print("Number of result pages: ", num_pages)

Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22110%22%2C%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true
Number of result pages:  5


We run the functions, generating a long list of dictionaries, each holding the name of a relevant congress member and their corresponding URL.

In [None]:
#Now that we know how many pages we'll scrape, we can start scraping
congress_url = get_congress_url(congress_numbers, page_size, 1)
print("Congress URL: ", congress_url)

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() fails fast instead of parsing an HTTP error page.
http_response = requests.get(congress_url, timeout=60)
http_response.raise_for_status()
response = http_response.text
first_page_soup = BeautifulSoup(response, "html.parser")
soup = first_page_soup

#We need the number of pages we will need to scrape
num_pages = get_num_pages(soup)
print("Number of result pages: ", num_pages)

all_members = []
for page in range(1, num_pages + 1):
    congress_url = get_congress_url(congress_numbers, page_size, page)
    print("Congress URL: ", congress_url)

    if page == 1:
        # Page 1 was already downloaded to read the page count; reuse it.
        soup = first_page_soup
    else:
        http_response = requests.get(congress_url, timeout=60)
        http_response.raise_for_status()
        response = http_response.text
        soup = BeautifulSoup(response, "html.parser")

    results = soup.find("ol", {"class": "basic-search-results-lists"})
    if results is None:
        # Name the page explicitly rather than failing with an AttributeError on None.
        raise RuntimeError("No member list found on page {}: {}".format(page, congress_url))
    congress_list = results.find_all("li", {"class": "expanded"})

    print("Page Size: {}, Members found: {}".format(page_size, len(congress_list)))

    members = extract_congress_members(congress_list)
    print("Extracted {} congressional entries from Page {}".format(len(members), page))
    all_members.extend(members)

Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22110%22%2C%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true
Number of result pages:  5
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22110%22%2C%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 250 congressional entries from Page 1
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22110%22%2C%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=250&page=2&searchResultViewType=expanded&KWICView=true
Page Size: 250, Members found: 250
Extracted 250 congressional entries from Page 2
Congress URL:  https://www.congress.gov/members?q={%22congress%22:[%22110%22%2C%22111%22%2C%22112%22%2C%22113%22%2C%22114%22%2C%22115%22%2C%22116%22]}&pageSize=25

In [None]:
# One row per scraped member; exact duplicate (name, url) rows are dropped
# and the index is renumbered from 0.
congress_members_urls = pd.DataFrame(all_members).drop_duplicates(ignore_index=True)
congress_members_urls

Unnamed: 0,name,url
0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...
1,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...
2,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...
3,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...
4,"Adams, Sandy",https://www.congress.gov/member/sandy-adams/A0...
...,...,...
1084,"Young, David",https://www.congress.gov/member/david-young/Y0...
1085,"Young, Don",https://www.congress.gov/member/don-young/Y000033
1086,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...
1087,"Zeldin, Lee M.",https://www.congress.gov/member/lee-zeldin/Z00...


Finally, for each member, we open their URL and extract the relevant information from the HTML. In this case we are looking for:
* Party
* Chamber (to ensure they are part of the House of Representatives)
* District
* Congress numbers
* Time period

In [None]:
def _parse_house_term(res):
    """Split the text of one House term row into its parts.

    The parsing assumes ``res`` looks like
    ``"<State>, District <n> <first>-<last> (<start>-<end>)"`` or
    ``"<State>, District At Large <first>-<last> (<start>-<end>)"``, where
    the congress tokens carry a two-character ordinal suffix (e.g. ``102nd``).

    Returns:
        ``(state, district, congress_numbers, start, end)``.
    """
    parts = res.split(",")
    state = parts[0]
    rest = parts[1]
    tokens = rest.split(" ")

    district = tokens[2]
    if district == 'At':
        # "At Large" takes up two tokens, so the congress span moves one position.
        district = 'At Large'
        congress_token = tokens[4]
    else:
        congress_token = tokens[3]

    # Drop the two-character ordinal suffix and keep the numeric bounds.
    congress_numbers = [int(s[:-2]) for s in congress_token.split("-") if s[:-2].isdigit()]
    if len(congress_numbers) > 1:
        congress_numbers = list(range(congress_numbers[0], congress_numbers[1] + 1))

    # The year span is the text between the parentheses; a single year is
    # both start and end. Taking the first and last element always binds
    # both values, whatever the number of "-" separated parts.
    years = rest[rest.find("(") + 1:rest.find(")")].split("-")
    start = years[0]
    end = years[-1]
    if end == 'Present':
        end = np.inf
    return state, district, congress_numbers, start, end


def parse_profile(soup):
    """Extract the House terms listed on a member's profile page.

    Args:
        soup: Parsed HTML of the member page. It must contain the
            ``overview-member-column-profile member_profile`` div with a
            ``standard01 nomargin`` table inside it.

    Returns:
        A list with one entry per non-empty House row, each of the form
        ``[party, state, district, chamber, congress_numbers, start, end]``.
        ``start`` and ``end`` are strings, except that an end of "Present"
        is stored as ``np.inf``. ``party`` is the text of the most recent
        party row seen before the term row, or ``None`` if there was none.
        Rows of other chambers are ignored.
    """
    results = []

    profile = soup.find("div", {"class": "overview-member-column-profile member_profile"})
    party_table = profile.find("table", {"class": "standard01 nomargin"})

    # Start with a defined value so a chamber row that precedes the party
    # row does not raise UnboundLocalError.
    party = None
    for row in party_table.find_all("tr"):
        if row.find('th', {"class": "member_party"}):
            party = row.find('td').text.strip()

        chamber_cell = row.find('th', {"class": "member_chamber"})
        if not chamber_cell:
            continue
        chamber = chamber_cell.text.strip()
        if chamber != 'House':
            continue
        res = row.find('td').text.strip()
        if not res:
            continue

        state, district, congress_numbers, start, end = _parse_house_term(res)
        results.append([party, state, district, chamber, congress_numbers, start, end])
    return results

We iterate over the dataframe containing all the congress members, extracting their individual information for each iteration, which takes a bit of time.

In [None]:
columns=['name','url', 'party','state','district','chamber', 'congress_numbers', 'start', 'end']
# Collect plain rows and build the frame once at the end: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and it copied the whole
# frame on every call.
member_rows = []
for index, row in congress_members_urls.iterrows():
    name = row['name']
    url = row['url']
    print(name)
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html.parser")
    terms = parse_profile(soup)
    # One output row per House term; members without House terms add nothing.
    for term in terms:
        member_rows.append([name, url, *term])

congress_member_info = pd.DataFrame(member_rows, columns=columns)

Abercrombie, Neil
Abraham, Ralph Lee
Ackerman, Gary L.
Adams, Alma S.
Adams, Sandy
Aderholt, Robert B.
Adler, John H.
Aguilar, Pete
Akaka, Daniel K.
Akin, W. Todd
Alexander, Lamar
Alexander, Rodney
Allard, Wayne
Allen, Rick W.
Allen, Thomas H.
Allred, Colin Z.
Altmire, Jason
Amash, Justin
Amodei, Mark E.
Andrews, Robert E.
Arcuri, Michael A.
Armstrong, Kelly
Arrington, Jodey C.
Ashford, Brad
Austria, Steve
Axne, Cynthia
Ayotte, Kelly
Babin, Brian
Baca, Joe
Bachmann, Michele
Bachus, Spencer
Bacon, Don
Baird, Brian
Baird, James R.
Baker, Richard H.
Balderson, Troy
Baldwin, Tammy
Banks, Jim
Barber, Ron
Barletta, Lou
Barr, Andy
Barragan, Nanette Diaz
Barrasso, John
Barrett, J. Gresham
Barrow, John
Bartlett, Roscoe G.
Barton, Joe
Bass, Charles F.
Bass, Karen
Baucus, Max
Bayh, Evan
Bean, Melissa L.
Beatty, Joyce
Becerra, Xavier
Begich, Mark
Benishek, Dan
Bennet, Michael F.
Bennett, Robert F.
Bentivolio, Kerry L.
Bera, Ami
Berg, Rick
Bergman, Jack
Berkley, Shelley
Berman, Howard L.
Berry, Mar

Jenkins, Lynn
Jindal, Bobby
Johanns, Mike
Johnson, Bill
Johnson, Dusty
Johnson, Eddie Bernice
Johnson, Henry C. "Hank," Jr.
Johnson, Mike
Johnson, Ron
Johnson, Sam
Johnson, Tim
Johnson, Timothy V.
Jolly, David W.
Jones, Brenda
Jones, Doug
Jones, Stephanie Tubbs
Jones, Walter B., Jr.
Jordan, Jim
Joyce, David P.
Joyce, John
Kagen, Steve
Kaine, Tim
Kanjorski, Paul E.
Kaptur, Marcy
Katko, John
Kaufman, Edward E.
Keating, William R.
Keller, Fred
Keller, Ric
Kelly, Mike
Kelly, Robin L.
Kelly, Trent
Kennedy, Edward M.
Kennedy, John
Kennedy, Joseph P., III
Kennedy, Patrick J.
Kerry, John F.
Khanna, Ro
Kihuen, Ruben J.
Kildee, Dale E.
Kildee, Daniel T.
Kilmer, Derek
Kilpatrick, Carolyn C.
Kilroy, Mary Jo
Kim, Andy
Kind, Ron
King, Angus S., Jr.
King, Peter T.
King, Steve
Kingston, Jack
Kinzinger, Adam
Kirk, Mark Steven
Kirk, Paul Grattan, Jr.
Kirkpatrick, Ann
Kissell, Larry
Klein, Ron
Kline, John
Klobuchar, Amy
Knight, Stephen
Knollenberg, Joe
Kohl, Herb
Kosmas, Suzanne M.
Kratovil, Frank, Jr.
K

Trone, David J.
Trott, David A.
Tsongas, Niki
Turner, Michael R.
Turner, Robert L.
Udall, Mark
Udall, Tom
Underwood, Lauren
Upton, Fred
Valadao, David G.
Van Drew, Jefferson
Van Hollen, Chris
Vargas, Juan
Veasey, Marc A.
Vela, Filemon
Velazquez, Nydia M.
Visclosky, Peter J.
Vitter, David
Voinovich, George V.
Wagner, Ann
Walberg, Tim
Walden, Greg
Walker, Mark
Walorski, Jackie
Walsh, James T.
Walsh, Joe
Walsh, John E.
Walters, Mimi
Waltz, Michael
Walz, Timothy J.
Wamp, Zach
Warner, John
Warner, Mark R.
Warren, Elizabeth
Wasserman Schultz, Debbie
Waters, Maxine
Watkins, Steve
Watson Coleman, Bonnie
Watson, Diane E.
Watt, Melvin L.
Waxman, Henry A.
Webb, Jim
Weber, Randy K., Sr.
Webster, Daniel
Weiner, Anthony D.
Welch, Peter
Weldon, Dave
Weller, Jerry
Wenstrup, Brad R.
West, Allen B.
Westerman, Bruce
Westmoreland, Lynn A.
Wexler, Robert
Wexton, Jennifer
Whitehouse, Sheldon
Whitfield, Ed
Wicker, Roger F.
Wild, Susan
Williams, Roger
Wilson, Charles A.
Wilson, Frederica S.
Wilson, Heather
Wi

In [None]:
# Display the scraped table (one row per member term).
congress_member_info

Unnamed: 0,name,url,party,state,district,chamber,congress_numbers,start,end
0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Democratic,Hawaii,1,House,"[102, 103, 104, 105, 106, 107, 108, 109, 110, ...",1991,2011
1,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Republican,Louisiana,5,House,"[114, 115, 116]",2015,inf
2,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,Democratic,New York,5,House,"[103, 104, 105, 106, 107, 108, 109, 110, 111, ...",1993,2013
3,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...,Democratic,North Carolina,12,House,"[113, 114, 115, 116]",2014,inf
4,"Adams, Sandy",https://www.congress.gov/member/sandy-adams/A0...,Republican,Florida,24,House,[112],2011,2013
...,...,...,...,...,...,...,...,...,...
1062,"Young, David",https://www.congress.gov/member/david-young/Y0...,Republican,Iowa,3,House,"[114, 115]",2015,2019
1063,"Young, Don",https://www.congress.gov/member/don-young/Y000033,Republican,Alaska,At Large,House,"[93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 10...",1973,inf
1064,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...,Republican,Indiana,9,House,"[112, 113, 114]",2011,2017
1065,"Zeldin, Lee M.",https://www.congress.gov/member/lee-zeldin/Z00...,Republican,New York,1,House,"[114, 115, 116]",2015,inf


## Formatting dataframe

In [None]:
# The last name is everything before the first comma of "Last, First ...".
congress_member_info['last_name'] = [
    full_name.split(",")[0] for full_name in congress_member_info['name']
]

In [None]:
#Scraping state abbreviations to create ID sometimes used in the congress.gov site
state_abbreviation_url = "https://docs.omnisci.com/v4.1.1/3_apdx_states.html"
response = requests.get(state_abbreviation_url).text
soup = BeautifulSoup(response, "html.parser")
col_names = ["state", "state_abbreviation"]

state_rows = soup.find("div", {"id":"us-state-abbreviations"}).find("tbody").find_all("tr")
# Gather the stripped cell texts of every table row and build the frame in
# one go: DataFrame.append was removed in pandas 2.0 and copied the frame
# on every call.
state_pairs = [
    [cell.text.strip() for cell in state_row.find_all("td")]
    for state_row in state_rows
]
state_abbreviation_lookup = pd.DataFrame(state_pairs, columns=col_names)
state_abbreviation_lookup.head()

Unnamed: 0,state,state_abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [None]:
congress_member_info = pd.merge(congress_member_info, state_abbreviation_lookup, how='left', on='state')

# Removing delegates from guam, virgin islands, samoa, etc.
# The left merge leaves state_abbreviation as NaN for any state missing from
# the lookup table, so keeping the non-null rows drops them.
has_abbreviation = congress_member_info['state_abbreviation'].notna()
# .copy() makes the filtered frame independent of the merged one, so the
# columns assigned to it later do not trigger pandas' SettingWithCopyWarning.
congress_member_info = congress_member_info.loc[has_abbreviation].copy()

Unnamed: 0.1,Unnamed: 0,name,url,party,state,district,chamber,congress_numbers,start,end,last_name,state_abbreviation
0,0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Democratic,Hawaii,1,House,"[102, 103, 104, 105, 106, 107, 108, 109, 110, ...",1991,2011.0,Abercrombie,HI
1,1,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Republican,Louisiana,5,House,"[114, 115, 116]",2015,inf,Abraham,LA
2,2,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,Democratic,New York,5,House,"[103, 104, 105, 106, 107, 108, 109, 110, 111, ...",1993,2013.0,Ackerman,NY
3,3,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...,Democratic,North Carolina,12,House,"[113, 114, 115, 116]",2014,inf,Adams,NC
4,4,"Adams, Sandy",https://www.congress.gov/member/sandy-adams/A0...,Republican,Florida,24,House,[112],2011,2013.0,Adams,FL
...,...,...,...,...,...,...,...,...,...,...,...,...
1062,1062,"Young, David",https://www.congress.gov/member/david-young/Y0...,Republican,Iowa,3,House,"[114, 115]",2015,2019.0,Young,IA
1063,1063,"Young, Don",https://www.congress.gov/member/don-young/Y000033,Republican,Alaska,At Large,House,"[93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 10...",1973,inf,Young,AK
1064,1064,"Young, Todd",https://www.congress.gov/member/todd-young/Y00...,Republican,Indiana,9,House,"[112, 113, 114]",2011,2017.0,Young,IN
1065,1065,"Zeldin, Lee M.",https://www.congress.gov/member/lee-zeldin/Z00...,Republican,New York,1,House,"[114, 115, 116]",2015,inf,Zeldin,NY


In [None]:
# The representative ID used for later matching is the last path segment of the member URL.
congress_member_info['rep_id'] = [
    member_url.rsplit("/", 1)[-1] for member_url in congress_member_info['url']
]

In [None]:
# Key of the form "<last name> (<state abbreviation>)" for later matching.
congress_member_info['last_name_state'] = (
    congress_member_info['last_name'].str.cat(congress_member_info['state_abbreviation'], sep=" (") + ")"
)

In [None]:
# Preview the first rows of the formatted table.
congress_member_info.head()

Unnamed: 0.1,Unnamed: 0,name,url,party,state,district,chamber,congress_numbers,start,end,last_name,state_abbreviation,rep_id,last_name_state
0,0,"Abercrombie, Neil",https://www.congress.gov/member/neil-abercromb...,Democratic,Hawaii,1,House,"[102, 103, 104, 105, 106, 107, 108, 109, 110, ...",1991,2011.0,Abercrombie,HI,A000014,Abercrombie (HI)
1,1,"Abraham, Ralph Lee",https://www.congress.gov/member/ralph-abraham/...,Republican,Louisiana,5,House,"[114, 115, 116]",2015,inf,Abraham,LA,A000374,Abraham (LA)
2,2,"Ackerman, Gary L.",https://www.congress.gov/member/gary-ackerman/...,Democratic,New York,5,House,"[103, 104, 105, 106, 107, 108, 109, 110, 111, ...",1993,2013.0,Ackerman,NY,A000022,Ackerman (NY)
3,3,"Adams, Alma S.",https://www.congress.gov/member/alma-adams/A00...,Democratic,North Carolina,12,House,"[113, 114, 115, 116]",2014,inf,Adams,NC,A000370,Adams (NC)
4,4,"Adams, Sandy",https://www.congress.gov/member/sandy-adams/A0...,Republican,Florida,24,House,[112],2011,2013.0,Adams,FL,A000366,Adams (FL)


In [None]:
# Persist the member table for the later scraping steps.
# NOTE(review): the DataFrame index is written as an unnamed first column,
# which comes back as "Unnamed: 0" when the file is read with a plain
# pd.read_csv — confirm whether any reader depends on it before dropping it.
congress_member_info.to_csv("congress_110_116.csv")

## Scraping Roll Call Info

Each bill can be voted on a number of times: once for every step of the legislative process, and again whenever amendments are made. Each vote is referred to as a 'roll'.

Congress.gov gives access to all recorded historic votes, which are readily available in a list format in their online database.

Furthermore, the votes of each congress member are also available; these will be scraped later and used to create links.

First, we generate the links for each list of rolls, by looking at the structure of the website.

In [None]:
# One roll-call index page per year, 2007 through 2020 inclusive.
congress_urls = [
    f"https://clerk.house.gov/evs/{year}/index.asp" for year in range(2007, 2021)
]

print(congress_urls)

['https://clerk.house.gov/evs/2007/index.asp', 'https://clerk.house.gov/evs/2008/index.asp', 'https://clerk.house.gov/evs/2009/index.asp', 'https://clerk.house.gov/evs/2010/index.asp', 'https://clerk.house.gov/evs/2011/index.asp', 'https://clerk.house.gov/evs/2012/index.asp', 'https://clerk.house.gov/evs/2013/index.asp', 'https://clerk.house.gov/evs/2014/index.asp', 'https://clerk.house.gov/evs/2015/index.asp', 'https://clerk.house.gov/evs/2016/index.asp', 'https://clerk.house.gov/evs/2017/index.asp', 'https://clerk.house.gov/evs/2018/index.asp', 'https://clerk.house.gov/evs/2019/index.asp', 'https://clerk.house.gov/evs/2020/index.asp']


Then, we extract the embedded links to all the rolls, such that we can later extract the data from them. 

In [None]:
def get_roll_urls(congress_url):
    """Return the URLs of the roll list pages linked from a yearly index page.

    Downloads ``congress_url``, takes every link inside the first
    ``<font size="3">`` element and rebuilds each target as
    ``<congress_url minus its last 9 characters> + <href up to the first "."> + ".asp"``.
    """
    page = BeautifulSoup(requests.get(congress_url).text, "html.parser")

    # The last 9 characters are dropped to get the base URL; 9 is the length
    # of "index.asp", which the index URLs built in this file end with.
    base_url = congress_url[:-9]
    anchors = page.find("font", {"size": "3"}).find_all("a", href=True)
    return [base_url + anchor['href'].split(".")[0] + ".asp" for anchor in anchors]

Finally, the scrape can be run for each roll, extracting the following information:
* Year
* Roll number (starting from 1 every year)
* Issue (bill number)
* Date of the vote
* Question to be voted on
* Result of the vote
* Title/description
* Link to the votes (to be used for further scraping)
* Link to the page

In [None]:
columns = ['Year','Roll', 'Issue','Date','Question','Result', 'Title/Description', 'Vote_link', 'Issue_link']
# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame per call.
roll_rows = []

for congress_url in tqdm.tqdm(congress_urls):
    for roll_url in get_roll_urls(congress_url):
        # The year is the fifth "/"-separated part of the roll list URL.
        year = roll_url.split("/")[4]
        response = requests.get(roll_url).text
        soup = BeautifulSoup(response, "html.parser")
        for row in soup.find("table").find_all("tr"):
            cells = row.find_all("td")
            if not cells:
                # Rows without <td> cells (e.g. header rows) carry no data.
                continue

            # Reset the links for every row. Without this, a row that has no
            # issue link would silently inherit the link of the previous row.
            # Such rows now store None (NaN after a CSV round-trip), so
            # consumers of Issue_link must skip missing values.
            vote_link = None
            issue_link = None
            line = [year]
            for data in cells:
                line.append(data.text.strip())
                link = data.find("a")
                if not link:
                    continue
                if vote_link is None:
                    # The first link of a row points to the vote itself ...
                    vote_link = link['href']
                else:
                    # ... any later link is taken as the issue link.
                    issue_link = link['href']

            line.append(vote_link)
            line.append(issue_link)
            if len(line) != len(columns):
                # Fail loudly rather than let pandas pad a short row with None.
                raise ValueError("Unexpected number of cells in {}: {}".format(roll_url, line))
            roll_rows.append(line)

roll_info = pd.DataFrame(roll_rows, columns=columns)
roll_info['Roll-id'] = roll_info['Year']+"-"+roll_info['Roll']

100%|██████████████████████████████████████████| 14/14 [02:13<00:00,  9.55s/it]


In [None]:
# URL of each roll's vote page: base address followed by the year and the roll number.
roll_info['Vote_link_new'] = (
    "https://clerk.house.gov/Votes/"
    + roll_info['Year'].astype(str)
    + roll_info['Roll'].astype(str)
)

In [None]:
# Persist the roll table; it is read back from this file further down.
# NOTE(review): the index is written as an unnamed first column, which comes
# back as "Unnamed: 0" with a plain pd.read_csv — confirm before changing.
roll_info.to_csv("roll_info.csv")

## Scraping summaries
For each unique issue (bill/resolution) we need to scrape summaries and committees.

There may be many rolls for each issue, as the House needs to vote on each amendment made; thus the summaries are the most recent ones.

In [None]:
#Preparing data
# Reload the roll table from the CSV written above, so this section can be run on its own.
roll_info = pd.read_csv("roll_info.csv")

In [None]:
# One row per distinct issue link. Missing links are dropped first: a NaN
# would pass through the string concatenation below unchanged and end up in
# the list of URLs to download.
unique_issue_links = pd.DataFrame({'Issue_link': roll_info['Issue_link'].dropna().unique()})
# The "all-info" view of an issue page is the one scraped for committees and summaries.
unique_issue_links['Issue_link'] = unique_issue_links['Issue_link'] + "/all-info"
unique_issue_links['Issue_link']

0                      https://www.congress.gov/bill/110th-congress/house-bill/2764/all-info
1                      https://www.congress.gov/bill/110th-congress/house-bill/4040/all-info
2                     https://www.congress.gov/bill/110th-congress/senate-bill/2499/all-info
3                      https://www.congress.gov/bill/110th-congress/house-bill/3996/all-info
4                      https://www.congress.gov/bill/110th-congress/house-bill/3690/all-info
                                                ...                                         
4284                   https://www.congress.gov/bill/116th-congress/house-bill/5078/all-info
4285    https://www.congress.gov/bill/116th-congress/house-concurrent-resolution/83/all-info
4286              https://www.congress.gov/bill/116th-congress/house-resolution/781/all-info
4287                   https://www.congress.gov/bill/116th-congress/house-bill/2881/all-info
4288              https://www.congress.gov/bill/116th-congress/house-r

In [None]:
cols = ['issue_link', 'committees', 'summary']
# Records are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame per call.
issue_rows = []

# Only the issues from position 3000 onwards are processed by this run.
for issue_url in tqdm.tqdm(unique_issue_links['Issue_link'].iloc[3000:]):
    # A missing link stays NaN after the "/all-info" concatenation; there is
    # nothing to download for it.
    if not isinstance(issue_url, str):
        continue

    response = requests.get(issue_url).text
    soup = BeautifulSoup(response, "html.parser")

    relevant = soup.find("div", {"class":"main-wrapper all-info-wrapper"})
    if not relevant:
        continue

    #Getting committees
    committees = []
    if relevant.find("tr", {"class":"committee"}):
        committee_rows = relevant.find("table", {"class":"table_committee"}).find("tbody").find_all("tr", {"class":"committee"})
        committees = [committee_row.find("th").text.strip() for committee_row in committee_rows]

    #Getting latest summary
    latest_summary = relevant.find_next("div", {"id":"bill-summary"})
    if not latest_summary:
        continue

    #Take all paragraph tags and combine to a single string
    summary = " ".join(paragraph.text.strip() for paragraph in latest_summary.select("p"))

    issue_rows.append({'issue_link': issue_url, 'committees': committees, 'summary': summary})

# Alphabetical column order matches the layout of the previously produced output.
issue_info = pd.DataFrame(issue_rows, columns=sorted(cols))

In [None]:
# Display the scraped committees and summaries.
issue_info

Unnamed: 0,committees,issue_link,summary
0,[House Rules],https://www.congress.gov/bill/114th-congress/house-resolution/173/all-info,
1,[House Budget],https://www.congress.gov/bill/114th-congress/house-concurrent-resolution/27/all-info,Establishes the congressional budget for the federal government for FY2016 and sets forth budget...
2,"[House Veterans' Affairs, Senate Veterans' Affairs]",https://www.congress.gov/bill/114th-congress/house-bill/216/all-info,Department of Veterans Affairs Budget Planning Reform Act of 2015 (Sec. 2) Directs the Secretary...
3,[House Rules],https://www.congress.gov/bill/114th-congress/house-resolution/163/all-info,
4,[House Foreign Affairs],https://www.congress.gov/bill/114th-congress/house-resolution/162/all-info,(This measure has not been amended since it was introduced. The summary of that version is repea...
...,...,...,...
1267,"[House Small Business, Senate Small Business and Entrepreneurship]",https://www.congress.gov/bill/116th-congress/house-bill/5078/all-info,"Prison to Proprietorship Act This bill directs the Small Business Administration, in coordinatio..."
1268,"[House Foreign Affairs, Senate Foreign Relations]",https://www.congress.gov/bill/116th-congress/house-concurrent-resolution/83/all-info,This concurrent resolution directs the President to terminate the use of U.S. Armed Forces in ho...
1269,[House Rules],https://www.congress.gov/bill/116th-congress/house-resolution/781/all-info,This resolution sets forth the rule for consideration of H.Con.Res. 83 (directing the President ...
1270,"[House Energy and Commerce, House Foreign Affairs]",https://www.congress.gov/bill/116th-congress/house-bill/2881/all-info,"Secure 5G and Beyond Act of 2020 This bill requires the President, in consultation with relevant..."


In [None]:
# Persist this batch. The "3000" in the file name mirrors the iloc[3000:]
# slice of the scraping loop; keep the two in sync when scraping another range.
# NOTE(review): the index is written as an unnamed first column — confirm
# whether readers of this file expect it.
issue_info.to_csv("issue_info_3000_4288.csv")

## Scraping Roll Call Votes

First, a dataframe is prepared to contain the scraping data with roll-ids as columns and member IDs as index

In [None]:
congress_member_info = pd.read_csv("congress_110_116.csv")
roll_info = pd.read_csv("roll_info.csv")
# Year whose rolls are scraped in this run.
year = 2009

# Columns: the roll ids of the chosen year, sorted in descending order.
rolls_in_year = roll_info.loc[roll_info['Year'] == year, 'Roll-id']
cols = rolls_in_year.sort_values(ascending=False)
# Rows: every distinct representative id.
idx = congress_member_info['rep_id'].unique()

# Empty member-by-roll matrix, to be filled with the scraped votes.
roll_call_vote = pd.DataFrame(index=idx, columns=cols)

In [None]:
# Display the (still empty) vote matrix.
roll_call_vote

Roll-id,2008-99,2008-98,2008-97,2008-96,2008-95,2008-94,2008-93,2008-92,2008-91,2008-90,...,2008-107,2008-106,2008-105,2008-104,2008-103,2008-102,2008-101,2008-100,2008-10,2008-1
A000014,,,,,,,,,,,...,,,,,,,,,,
A000374,,,,,,,,,,,...,,,,,,,,,,
A000022,,,,,,,,,,,...,,,,,,,,,,
A000370,,,,,,,,,,,...,,,,,,,,,,
A000366,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y000066,,,,,,,,,,,...,,,,,,,,,,
Y000033,,,,,,,,,,,...,,,,,,,,,,
Y000064,,,,,,,,,,,...,,,,,,,,,,
Z000017,,,,,,,,,,,...,,,,,,,,,,


Using the links previously extracted for the votes, we extract the vote for each member and store it in the dataframe. 

In [None]:
rolls_for_year = roll_info[roll_info['Year'] == year]
# Passing total lets tqdm show real progress; iterrows() alone has no length.
for _, roll in tqdm.tqdm(rolls_for_year.iterrows(), total=len(rolls_for_year)):
    roll_vote_url = roll['Vote_link_new']
    roll_id = roll['Roll-id']

    response = requests.get(roll_vote_url).text
    soup = BeautifulSoup(response, "html.parser")
    tables = soup.find_all("table")

    # The member votes are read from the body of the second table on the
    # page. Pages without it are skipped explicitly, replacing the bare
    # "except:" that also swallowed KeyboardInterrupt and real bugs.
    if len(tables) < 2:
        continue
    vote_body = tables[1].find("tbody")
    if vote_body is None:
        continue

    for vote_row in vote_body.find_all("tr"):
        member_link = vote_row.find("a", href=True)
        if member_link is None:
            continue  # rows without a member link carry no vote
        # The representative id is the last 7 characters of the member link.
        rep_id = member_link['href'][-7:]

        vote_cell = vote_row.find("td", {"data-label": "vote"})
        if vote_cell is None:
            continue

        # Only members already in the matrix are recorded. Writing through
        # .at avoids chained indexing, which may update a temporary copy.
        if rep_id not in roll_call_vote.index:
            continue
        roll_call_vote.at[rep_id, roll_id] = vote_cell.text.strip()

In [None]:
# Save the vote matrix of the chosen year; the index holds the rep_ids, so it is written along with the data.
roll_call_vote.to_csv(f"roll_call_vote_{year}.csv")