# Scrape web for congressional and judicial hearings

All transcripts pulled from https://www.govinfo.gov

In [115]:
# import requests and beautiful soup for parsing
# run 'pip install requests' or 'pip install bs4' if needed
import requests
import bs4
import json
import pandas

# import required modules and set up environment
import os

# Directory of the local ConvoKit checkout. Set the CONVOKIT_DIR environment
# variable to point at your own copy; the default is the original author's path,
# kept only so existing setups keep working.
CONVOKIT_DIR = os.environ.get(
    'CONVOKIT_DIR',
    '/Users/marianneaubin/Documents/Classes/CS6742/cs6742-fork',
)
# convokit is imported from the working directory, so switch to it first.
# Note that this also changes where relative output files are written.
os.chdir(CONVOKIT_DIR)

import convokit
from convokit import Corpus, Parser

## Get the list of congressional hearings (House and Senate) from the govinfo sitemap

In [103]:
# Maps each scraped year to the list of hearing slugs (govinfo package ids)
# found in that year's sitemap.
hearings = {}

# Years to scrape: start_year is inclusive, end_year is exclusive.
# NOTE: the original author's note says these should be 2014 and 2020.
start_year = 2013
end_year = 2014

import re
from bs4 import BeautifulSoup

# Every <loc> entry in the sitemap is a details-page URL; what follows this
# prefix is the hearing's slug.
DETAILS_PREFIX = 'https://www.govinfo.gov/app/details/'

for i in range(start_year, end_year):
    # the yearly sitemap lists all the hearings (CHRG collection) for year i
    url = 'https://www.govinfo.gov/sitemap/CHRG_' + str(i) + '_sitemap.xml'
    resp = requests.get(url)
    # fail loudly instead of silently recording an empty list for this year
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    # Keep only the slug of each entry. The result is stored under the year
    # key alone, so the dict never picks up stray integer-index keys.
    hearings[i] = [
        loc.get_text().strip().replace(DETAILS_PREFIX, '')
        for loc in soup.find_all("loc")
    ]

    # progress: year that has just been scraped
    print(i)

2013


In [104]:
# Add up the number of scraped slugs over every year in [start_year, end_year).
total_hearings = sum(
    len(hearings[year]) for year in range(start_year, end_year)
)
print("There are in total ", total_hearings, "senate hearings.")

There are in total  1484 senate hearings.


# Use govinfo's API to fetch each hearing's title and online availability, so that relevant hearings can be selected later

In [105]:
# hearings_dict is keyed by a hearing's slug; each value holds that hearing's
# title and transcript. Further metadata may be added to the entries later.
hearings_dict = dict()

In [106]:
# For every scraped slug, query the govinfo API for the package's granules and
# record in hearings_dict[slug]:
#   'year'   - one-element list holding the year whose sitemap listed the slug
#   'title'  - title of the first granule (only set when granules were returned)
#   'status' - 1 if the hearing is accessible online, 0 otherwise

# The API key is read from the environment so no credential lives in the notebook.
api_key = os.environ.get('GOVINFO_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the GOVINFO_API_KEY environment variable to your govinfo API key"
    )

counter = 0
for i in range(start_year, end_year):
    for slug in hearings[i]:
        entry = {'year': [i]}
        hearings_dict[slug] = entry

        url = 'https://api.govinfo.gov/packages/' + slug + '/granules'
        query = {'offset': 0, 'pageSize': 10, 'api_key': api_key}
        try:
            # a timeout keeps one stalled connection from hanging the whole run
            resp = requests.get(url, params=query, timeout=30)
        except requests.RequestException as err:
            # a network failure is treated like an inaccessible hearing
            print("request failed for", slug, ":", err)
            resp = None

        accessible = False
        if resp is not None and resp.status_code == 200:
            # parse the response body once and tolerate a missing/empty list
            granules = resp.json().get('granules') or []
            if granules:
                entry['title'] = granules[0].get('title')
                accessible = True
        # 1 if accessible online, 0 if not
        entry['status'] = 1 if accessible else 0

        # count after the work is done so the message reflects completed items
        counter = counter + 1
        if counter % 100 == 0:
            print("completed ", counter)

#print(hearings_dict)

completed  100
completed  200
completed  300
completed  400
completed  500
completed  600
completed  700
completed  800
completed  900
completed  1000
completed  1100
completed  1200
completed  1300
completed  1400


In [116]:
## save result to file

# Serialize straight into the file. The serialized text is deliberately not
# bound to the name `json`: doing so would shadow the json module and make this
# cell (and any later json call) fail when re-run. The context manager closes
# the file even if serialization raises.
with open("hearings_dict.json", "w") as f:
    json.dump(hearings_dict, f)

## Filter titles based on relevant words

In [112]:
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

print(len(hearings_dict.keys()))

# Stem the keywords so that they match the stemmed title tokens below.
keywords = ["gun", "rifle", "gunman", "gunmen", "shooting", "semi-automatic"]
keywords = {stemmer.stem(word) for word in keywords}

# Slugs of hearings whose title contains at least one keyword stem.
keep = []

# hearings_dict already covers every scraped year, so it is walked exactly once;
# looping over the years as well would append each match once per year.
for hearing, info in hearings_dict.items():
    # skip hearings that are not accessible online (or have no status yet)
    if info.get("status", 0) == 0:
        continue
    h_title = info.get("title")
    if h_title is None:
        continue
    # lower-case, tokenize and stem the title, then test for any shared stem
    title_stems = {stemmer.stem(word) for word in nltk.word_tokenize(h_title.lower())}
    if keywords & title_stems:
        keep.append(hearing)


print("number of relevant debates is ", len(keep))
for k in keep:
    print(hearings_dict[k])

1484
number of relevant debates is  5
{'year': [2013], 'title': 'What Should America Do About Gun Violence?', 'status': 1}
{'year': [2013], 'title': 'Facility Protection: Implications of the Navy Yard Shooting on Homeland Security', 'status': 1}
{'year': [2013], 'title': "Tsa's Spot Program and Initial Lessons from the Lax Shooting", 'status': 1}
{'year': [2013], 'title': "Legislative Hearing on H.r. 1825, to Direct Federal Public Land Management Officials to Exercise Their Authority Under Existing Law to Facilitate Use of and Access to Federal Public Lands for Fishing, Sport Hunting, and Recreational Shooting, and for Other Purposes. ``recreational Fishing and Hunting Heritage and Opportunities Act''; H.r. 586, to Provide for Certain Improvements to the Denali National Park and Preserve in the State of Alaska, and for Other Purposes. ``denali National Park Improvement Act''; H.r. 995, to Establish A Monument in Dona Ana County, New Mexico, and for Other Purposes. ``organ Mountains Nat

## Download transcript from each hearing and save to memory

In [114]:
# Download the HTML transcript of every scraped hearing and store the raw page
# text under hearings_dict[slug]['raw_text'].
counter = 0

# format of txt transcript: https://www.govinfo.gov/content/pkg/CHRG-114shrg21644/html/CHRG-114shrg21644.htm

for year in range(start_year, end_year):
    for slug in hearings[year]:
        url = "https://www.govinfo.gov/content/pkg/" + str(slug) + "/html/" + str(slug) + ".htm"
        resp = requests.get(url)
        if resp.status_code != 200:
            # report the failure and do not store an error page as a transcript
            print("error")
            print(resp.status_code)
            print(year)
            print("slug is " ,slug)
        else:
            # setdefault guards against slugs that have no metadata entry yet
            hearings_dict.setdefault(slug, {})['raw_text'] = resp.text

        # count every attempted hearing so the progress message actually fires
        counter = counter + 1
        if counter % 50 == 0:
            print("completed scraping", counter, " hearings")

KeyboardInterrupt: 

In [None]:
# Print a bounded summary instead of the whole dict: an entry may hold a full
# HTML transcript under 'raw_text', which would flood the notebook output.
n_with_text = sum('raw_text' in entry for entry in hearings_dict.values())
print(len(hearings_dict), "hearings,", n_with_text, "with a downloaded transcript")
for slug in list(hearings_dict)[:5]:
    # show the metadata of the first few entries, leaving out the raw transcript
    preview = {key: value for key, value in hearings_dict[slug].items() if key != 'raw_text'}
    print(slug, preview)