# Scrape web for congressional and judicial hearings

All transcripts pulled from https://www.govinfo.gov

In [38]:
# import requests and beautiful soup for parsing
# run 'pip install requests' or 'pip install bs4' if needed
import requests
import bs4
import json
import pandas

# import required modules and set up environment
import os

# replace file path below with your own local convokit
os.chdir('/Users/marianneaubin/Documents/Classes/CS6742/cs6742-fork')

import convokit
from convokit import Corpus, Parser

## Get list of senate hearings from website

In [2]:
hearings = {}

#should be 2014 and 2020
start_year = 2014
end_year = 2015

import re
from bs4 import BeautifulSoup
for i in range(start_year,end_year):
    #the sitemap contains all the senate hearings
    url = 'https://www.govinfo.gov/sitemap/CHRG_2015_sitemap.xml'
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.text,'lxml')
    
    hearings[i] = soup.find_all("loc")
    
    temp_hearings = []
    for j,hearing in enumerate(hearings[i]):
        hearings[j] = str(hearing).replace('<loc>https://www.govinfo.gov/app/details/', '')
        hearing = hearings[j]
        hearings[j] = str(hearing).replace('</loc>', '')
        hearing = hearings[j]
        temp_hearings.append(hearing)
    hearings[i] = temp_hearings

In [3]:
total_hearings = 0
for i in range(start_year,end_year):
    total_hearings = total_hearings + len(hearings[i])
print("There are in total ", total_hearings, "senate hearings.")

There are in total  1496 senate hearings.


# Use govinfo's API to get the debates that might be relevant to us

In [4]:
# this dictionary will store, for each hearing, its title and its transcript.
# relevant metadata may be added later
# the key for the dict is the slug of that hearing
hearings_dict = {}

In [32]:
counter = 1
for i in range(start_year, end_year):
    for slug in hearings[i]:
        hearings_dict[slug] = {}
        hearings_dict[slug]['year'] = [i]
        url = 'https://api.govinfo.gov/packages/' + slug + '/granules?offset=0&pageSize=10&api_key=FM8E7n2pr9fBPcWhAt0FunieHQcSGjGevkFYQh70'
        resp = requests.get(url)
        if resp.status_code == 200:
            if resp.json()['granules'] != []:
                title = resp.json()['granules'][0]['title']
                hearings_dict[slug]['title'] = title
                #if accessible online
                hearings_dict[slug]['status'] = 1
            else:
                #if not accessible online
                hearings_dict[slug]['status'] = 0
        else:
                hearings_dict[slug]['status'] = 0
        counter = counter + 1
        if counter % 100 == 0:
            print("completed ", counter)

print(hearings_dict)

completed  100
completed  200
completed  300
completed  400
completed  500
completed  600
completed  700
completed  800
completed  900
completed  1000
completed  1100
completed  1200
completed  1300
completed  1400


In [39]:
## save result to file

json = json.dumps(hearings_dict)
f = open("hearings_dict.json","w")
f.write(json)
f.close()

## Filter titles based on relevant words

In [42]:
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

keywords = ["gun", "weapon", "rifle", "arm", "gunman", "gunmen", "shooting", "of"]
keywords = {stemmer.stem(word) for word in keywords}

keep = []

for i in range(start_year, end_year):
    for hearing in hearings_dict:
        if hearings_dict[hearing]["status"] != 0:
            h_title = hearings_dict[hearing]["title"]
            h_title = nltk.word_tokenize(h_title)
            #print(h_title)
            if (len(keywords.intersection(h_title)) > 0):
                keep.append(hearing)
                
print("number of relevant debates is ", len(keep))
print(keep)

['NIH', ':', 'Investing', 'in', 'A', 'Healthier', 'Future']
['Promoting', 'U.S.', 'Commerce', 'in', 'the', 'Middle', 'East', 'and', 'North', 'Africa']
['GAO', "'s", 'High-Risk', 'List', 'and', 'the', 'Veterans', 'Health', 'Administration']
['[', 'H.A.S.C', '.', 'No', '.', '114-62', ']', 'Transition', 'Assistance', 'Program', '--', 'A', 'Unity', 'of', 'Effort']
['Dead', 'End', ',', 'No', 'Turn', 'Around', ',', 'Danger', 'Ahead', ':', 'Challenges', 'to', 'the', 'Future', 'of', 'Highway', 'Funding']
['Opm', 'Data', 'Breach', ':', 'Part', 'II']
['Departments', 'of', 'Labor', ',', 'Health', 'and', 'Human', 'Services', ',', 'and', 'Education', ',', 'and', 'Related', 'Agencies', 'Appropriations', 'for', 'Fiscal', 'Year', '2016']
['Hearing', 'on', 'Pending', 'Health', 'Care', 'and', 'Benefits', 'Legislation']
['Agriculture', ',', 'Rural', 'Development', ',', 'Food', 'and', 'Drug', 'Administration', ',', 'and', 'Related', 'Agencies', 'Appropriations', 'for', '2016', 'Wednesday', ',', 'February'

['The', 'Development', 'and', 'Potential', 'Implementation', 'of', 'the', 'Office', 'of', 'Surface', 'Mining', ',', 'Reclamation', ',', 'and', 'Enforcement', "'s", 'Proposed', 'Stream', 'Protection', 'Rule']
['Military', 'Construction', ',', 'Veterans', 'Affairs', ',', 'and', 'Related', 'Agencies', 'Appropriations', 'for', '2017']
['Legislative', 'Hearing', 'on', 'H.r', '.', '1157', ',', 'to', 'Authorize', 'the', 'Secretary', 'of', 'the', 'Interior', 'to', 'Take', 'Land', 'into', 'Trust', 'for', 'the', 'Benefit', 'of', 'the', 'Santa', 'Ynez', 'Band', 'of', 'Chumash', 'Mission', 'Indians', ',', 'and', 'for', 'Other', 'Purposes', ',', '``', 'santa', 'Ynez', 'Band', 'of', 'Chumash', 'Mission', 'Indians', 'Land', 'Transfer', 'Act', 'of', '2015', "''", ';', 'H.r', '.', '2386', ',', 'to', 'Provide', 'for', 'the', 'Recognition', 'of', 'Certain', 'Native', 'Communities', 'and', 'the', 'Settlement', 'of', 'Certain', 'Claims', 'Under', 'the', 'Alaska', 'Native', 'Claims', 'Settlement', 'Act', ',

## Download transcript from each hearing and save to memory

In [None]:
counter = 1

# format of txt transcript: https://www.govinfo.gov/content/pkg/CHRG-114shrg21644/html/CHRG-114shrg21644.htm

for slug in hearings[2014]:
    url = "https://www.govinfo.gov/content/pkg/" + str(slug) + "/html/" + str(slug) + ".htm"
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.text,'lxml')
    if resp.status_code != 200:
        print("error")
        print(resp.status_code)
        print(i)
        print("slug is " ,slug)
    hearings_dict[slug]['raw_text'] = resp.text
    body = soup.find("body")
    
    if counter % 50 == 0:
        print("completed scraping", counter, " hearings")

In [None]:
print(hearings_dict)