# Scraping the Speeches from the Federal Reserve Website

In [1]:
import json
import math
import random
import re
import time

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

I obtained the request details by copying the browser's request to the `/json/ne-speeches.json` endpoint as cURL and then using [curlconverter.com](https://curlconverter.com/) to convert it into a Python `requests` call.

In [3]:
# Cookies sent with the request for the speeches JSON index.
# NOTE(review): entries such as `cf_clearance`, `__cf_bm` and the analytics counters look
# like short-lived, session-specific values captured from a browser — TODO confirm the
# request still succeeds once they expire (or with no cookies at all).
cookies = {
    'google-analytics_v4_7d17__engagementStart': '1699382345044',
    'google-analytics_v4_7d17__counter': '377',
    'google-analytics_v4_7d17__session_counter': '32',
    'google-analytics_v4_7d17__ga4': '20e77a5d-205a-42f6-8a19-0bd0e82b9f9a',
    'google-analytics_v4_7d17__let': '1699382421247',
    'cf_clearance': 'BzBM.S_65AbLlNh4OCc4ASoJQYYBHpFte6hyoUZrEsw-1699382102-0-1-1a686270.b107120b.4d693d90-0.2.1699382102',
    'google-analytics_v4_7d17__engagementPaused': '1699382421247',
    '_cfuvid': 'ZP4rPMFRXHyfqp2Of7OxyC06wR0ULECDDdiMFmudfqs-1699131072732-0-604800000',
    '__utma': '197852984.794505660.1699131073.1699131073.1699131073.1',
    '__utmb': '197852984.1.10.1699131073',
    '__utmc': '197852984',
    '__utmz': '197852984.1699131073.1.1.utmcsr=ideas.repec.org|utmccn=(referral)|utmcmd=referral|utmcct=/',
    '__cf_bm': '04uG.qUDKAeXwWUu76SR4QkaBe.JaGPPlRIVNxLp66k-1699382086-0-AXqir9G6ifK5a10aO0gDRulBH9bVJ12LmbmFr2wwMY91K6qw5LUBy4Qm83BBTTMaZseZOCXIZu2oJhYUABOa218=',
    'google-analytics_v4_7d17__ga4sid': '2027065144',
    'BIGipServerwww.federalreserve.gov_hsts.app~www.federalreserve.gov_hsts_pool': '!Y9yC73q6/uK6A9rjGVpQzzWrPrLO+0rNrJymKFw85wbB7O9OAQrn8qBGAO6SakmNp8aadZDpFfKTsw==',
}

# Browser-like request headers sent alongside the cookies.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding' is deliberately left unset here (it was 'gzip, deflate, br' in the
    # captured request).
    'Connection': 'keep-alive',
    'Referer': 'https://www.federalreserve.gov/newsevents/speeches.htm',
    # No explicit 'Cookie' header: the same values are supplied through the `cookies`
    # dict defined above and passed to requests separately.
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Fetch the JSON index of speeches. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces an HTTP error right here instead of
# as a confusing JSON parse failure in a later cell.
response = requests.get(
    'https://www.federalreserve.gov/json/ne-speeches.json',
    cookies=cookies,
    headers=headers,
    timeout=30,
)
response.raise_for_status()

The response text is wrapped in three extraneous characters at each end, which break JSON parsing, so I strip them before handing the text to the `json` parser.

In [5]:
# Drop the three leading and three trailing characters of the response body, then parse
# what remains as JSON.
payload = response.text[3:-3]
speeches = json.loads(payload)

In [16]:
# Site root that the relative speech links are appended to.
base_url = "https://www.federalreserve.gov"

Thankfully, all of the Federal Reserve speeches during this time interval are on the website in the exact same HTML format, making scraping pretty easy. I use BeautifulSoup to grab the relevant elements, removing references to footnotes and the like, ending with a list of paragraphs.

In [118]:
def get_article_text(rel_url):
    """Download one speech page and return its body paragraphs.

    Parameters
    ----------
    rel_url : str
        Site-relative link of the speech; it is appended to the notebook-level
        ``base_url`` to form the page address.

    Returns
    -------
    list of dict
        One ``{'text': str, 'index': int}`` entry per ``<p>`` element found before the
        first ``<hr>`` of the article body, in document order. The text is stripped and
        ASCII-only: non-ASCII characters are spelled out by the ``namereplace`` codec
        error handler. Links whose ``title`` mentions "footnote" are emptied first, so
        footnote markers do not appear in the text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps one stalled page from blocking the whole scrape; raise_for_status()
    # stops an error page from being parsed as if it were a speech.
    speech_page = requests.get(base_url + rel_url, timeout=30)
    speech_page.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup picks whichever parser happens
    # to be installed, so the positional `find_all('div')[2]` lookup below could resolve
    # differently from one environment to the next.
    soup = BeautifulSoup(speech_page.text, 'html.parser')
    article_body = soup.find('div', {'id': 'article'}).find_all('div')[2]

    paragraphs = []
    for node in article_body:
        # Direct children that are not tags (e.g. bare strings between elements) are ignored.
        if not isinstance(node, bs4.element.Tag):
            continue
        # Everything after the first horizontal rule is dropped.
        if node.name == 'hr':
            break
        if node.name != 'p':
            continue
        for link in node.find_all('a'):
            title = link.get('title')
            if title and 'footnote' in title:
                link.clear()
        paragraphs.append(node.text)

    contents = []
    for index, paragraph in enumerate(paragraphs):
        ascii_text = paragraph.strip().encode('ascii', 'namereplace').decode('ascii')
        # NOTE(review): this escape sequence is what a UTF-8 em dash (bytes E2 80 94) looks
        # like after being decoded as Latin-1 and then name-replaced. Correcting the
        # response encoding instead would change the escapes the later clean-up cell
        # expects, so the substitution is kept as is.
        ascii_text = ascii_text.replace('\\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}\\x80\\x94', '--')
        contents.append({'text': ascii_text, 'index': index})
    return contents

In [168]:
def get_speech_range(min, max):
    """Scrape the speeches at positions ``min`` (inclusive) to ``max`` (exclusive).

    Parameters
    ----------
    min, max : int
        Bounds of the slice of the notebook-level ``speeches`` list to process. The
        names shadow the built-ins inside this function; they are kept so existing
        callers continue to work.

    Returns
    -------
    list of dict
        One record per paragraph: the ``text`` / ``index`` pair produced by
        ``get_article_text`` merged with the speech's ``speaker``, ``date`` (the part of
        ``speech['d']`` before the first space) and ``title``. An empty range yields an
        empty list.
    """
    records = []
    for position in range(min, max):
        speech = speeches[position]
        # Entries with a truthy 'updateDate' field are skipped
        # (presumably a metadata row rather than a speech — verify against the feed).
        if speech.get('updateDate'):
            continue

        # Randomised 5-10 second pause before every page request.
        time.sleep(5 + random.random() * 5)
        print(speech['l'])

        paras = get_article_text(speech['l'])
        meta = {
            'speaker': speech['s'],
            'date': speech['d'].split(' ')[0],
            'title': speech['t']
        }
        # Every paragraph record carries the speech-level metadata.
        records.extend({**para, **meta} for para in paras)
    return records

In [None]:
# Scrape the speeches in batches of 100 and write each batch to its own CSV file, named
# after the half-open index range it covers.
total = len(speeches)
for start in range(0, total, 100):
    end = min(start + 100, total)
    records = get_speech_range(start, end)
    df = pd.DataFrame.from_records(records)
    df.to_csv(f'speeches_{start}-{end}.csv')

Combine CSV files

In [171]:
final_fname = 'speeches.csv'

# Read every per-batch file first, then write the combined file in ONE call.
# Appending each batch with to_csv(mode='a') writes a header line per batch into the
# middle of the output, and re-running the cell would append everything a second time.
chunk_frames = []
for i in range(0, math.ceil(len(speeches) / 100)):
    start = i * 100
    end = min(i * 100 + 100, len(speeches))
    file = f'speeches_{start}-{end}.csv'
    chunk_frames.append(pd.read_csv(file))

# pd.concat raises on an empty list, so only write when there is something to combine.
if chunk_frames:
    df = pd.concat(chunk_frames, ignore_index=True)
    df.to_csv(final_fname, index=False)

In [124]:
# Load the combined paragraph-level dataset.
# (`re` is imported with the other modules in the import cell at the top of the notebook.)
sdf = pd.read_csv('speeches.csv')

In [125]:
# Pull the paragraph text column out as a plain Python list.
texts = sdf['text'].tolist()

In [147]:
# Literal escape sequences and their plain-ASCII replacements, applied in this order.
_LITERAL_REPLACEMENTS = (
    ('\\N{REGISTERED SIGN}', ''),
    ('\\N{NO-BREAK SPACE}', ' '),
    ('\\N{LATIN CAPITAL LETTER A WITH RING ABOVE}', 'a'),
    ('\\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}', 'o'),
    ('\\N{LATIN SMALL LETTER E WITH GRAVE}', 'e'),
    ('\\N{BROKEN BAR}', ' '),
    ('\\N{INVERTED EXCLAMATION MARK}', ''),
    ('\\N{NON-BREAKING HYPHEN}', '-'),
    ('\\N{SOFT HYPHEN}', '-'),
    ('\\N{HYPHEN}', '-'),
    ('\\N{VULGAR FRACTION ONE HALF}', '1/2'),
    ('\\N{VULGAR FRACTION ONE QUARTER}', '1/4'),
    ('\\N{EURO SIGN}', 'euros'),
    ('\\x80', ''),
    ('\\x93141', ''),
)

# A two-digit hexadecimal byte escape such as \x9c or \x94.
_HEX_ESCAPE = re.compile(r'\\x[0-9a-f]{2}')

# A named accented Latin letter whose modifier is a single word, e.g.
# \N{LATIN SMALL LETTER E WITH ACUTE}; group 1 captures the base letter.
_ACCENTED_LETTER = re.compile(
    r'\\N\{LATIN (?:SMALL|CAPITAL) LETTER ([UENIAOCS]) WITH [A-Z]+\}'
)


def replace_uc(raw_string):
    """Replace spelled-out character escapes in scraped text with plain ASCII.

    The input is text in which non-ASCII characters appear as literal backslash
    escapes (a backslash, ``N`` and a braced Unicode name, or a backslash, ``x`` and
    two hex digits).

    Parameters
    ----------
    raw_string : str
        Text containing such escapes.

    Returns
    -------
    str
        The text with known symbols substituted or removed, every remaining hex
        escape deleted, and accented U/E/N/I/A/O/C/S letters reduced to the
        lower-case base letter (capitals are lower-cased too).
    """
    result = raw_string
    for escape, replacement in _LITERAL_REPLACEMENTS:
        result = result.replace(escape, replacement)

    # Strip exactly two hex digits per escape. Matching a run of decimal digits instead
    # would leave the letter of escapes such as \x9c / \x9d behind and would also swallow
    # real digits that directly follow an escape.
    result = _HEX_ESCAPE.sub('', result)

    # One pass covers all eight base letters; the replacement is a single lower-case
    # letter, so it can never create a new match.
    result = _ACCENTED_LETTER.sub(lambda match: match.group(1).lower(), result)
    return result

In [148]:
# Coerce every entry to str, then run it through replace_uc.
texts = list(map(replace_uc, map(str, texts)))

In [128]:
# Print every named-character escape still present in the cleaned text.
# The pattern is a raw string (a plain literal containing `\{` is an invalid escape
# sequence) and stops at the first closing brace, so several leftovers in one paragraph
# are reported individually rather than as one span running to the last brace.
for text in texts:
    for leftover in re.findall(r'\\N\{[^}]*\}', str(text)):
        print(leftover)

\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}
\N{CURRENCY SIGN}


In [149]:
# Print any paragraph that still contains a hex byte escape, together with the escape.
# The escape is a backslash, `x` and two HEX digits, so the pattern must accept a-f as
# well; a decimal-only pattern would report \x9c as just \x9.
for text in texts:
    result = re.search(r'\\x[0-9a-f]{2}', str(text))
    if result:
        print(result.group())
        print(text)