# Scrap speeches from the FED website

In [1]:
#import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Scrap information

In [2]:
#website only works from 1996 to 2005
years = range(1996,2006)
dfs_1996_2005 = []

for year in years:
    df_one_year = pd.DataFrame()
    page = requests.get(f'https://www.federalreserve.gov/newsevents/speech/{year}speech.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    title = soup.select(".title")
    speakers = soup.select(".speaker")
    locations = soup.select(".location")
    for i in range(len(title)):
        df_one_year.at[i,'link'] = 'https://www.federalreserve.gov'+title[i].find_all('a', href=True)[0]['href']
        df_one_year.at[i,'title'] = title[i].text.split('\n')[1]
        df_one_year.at[i,'speaker'] = speakers[i].text.split('\n')[1].strip()
        df_one_year.at[i,'event'] = locations[i].text.split('\n')[1].strip()
        df_one_year.at[i,'year'] = year
    dfs_1996_2005.append(df_one_year)

df_1996_2005 = pd.concat(dfs_1996_2005, ignore_index=True)

In [3]:
years = range(2006,2011)
dfs_2006_2010 = []

for year in years:
    df_one_year = pd.DataFrame()
    page = requests.get(f'https://www.federalreserve.gov/newsevents/speech/{year}speech.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    events = soup.select(".eventlist__event")
    for i,speech in enumerate(events):
        df_one_year.at[i,'link'] = 'https://www.federalreserve.gov'+events[i].find_all('a', href=True)[0]['href']
        df_one_year.at[i,'title'] = events[i].text.split('\n')[2]
        if events[i].text.split('\n')[3]=='Watch Live' or events[i].text.split('\n')[3]=='Video':
            df_one_year.at[i,'speaker'] = events[i].text.split('\n')[4]
            df_one_year.at[i,'event'] = events[i].text.split('\n')[5]
            df_one_year.at[i,'year'] = year
        else:
            df_one_year.at[i,'speaker'] = events[i].text.split('\n')[3]
            df_one_year.at[i,'event'] = events[i].text.split('\n')[4]
            df_one_year.at[i,'year'] = year
    dfs_2006_2010.append(df_one_year)

df_2006_2010 = pd.concat(dfs_2006_2010, ignore_index=True)

In [4]:
years = range(2011,2023)
dfs_2011_2022 = []

for year in years:
    df_one_year = pd.DataFrame()
    page = requests.get(f'https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm')
    soup = BeautifulSoup(page.text, 'html.parser')
    events = soup.select(".eventlist__event")
    for i,speech in enumerate(events):
        df_one_year.at[i,'link'] = 'https://www.federalreserve.gov'+events[i].find_all('a', href=True)[0]['href']
        df_one_year.at[i,'title'] = events[i].text.split('\n')[2]
        if events[i].text.split('\n')[3]=='Watch Live' or events[i].text.split('\n')[3]=='Video':
            df_one_year.at[i,'speaker'] = events[i].text.split('\n')[4]
            df_one_year.at[i,'event'] = events[i].text.split('\n')[5]
            df_one_year.at[i,'year'] = year
        else:
            df_one_year.at[i,'speaker'] = events[i].text.split('\n')[3]
            df_one_year.at[i,'event'] = events[i].text.split('\n')[4]
            df_one_year.at[i,'year'] = year
    dfs_2011_2022.append(df_one_year)

df_2011_2022 = pd.concat(dfs_2011_2022, ignore_index=True)

In [5]:
df_1996_2022 = pd.concat([df_1996_2005,df_2006_2010,df_2011_2022], ignore_index=True)

In [6]:
df_1996_2022

Unnamed: 0,link,title,speaker,event,year
0,https://www.federalreserve.gov/boarddocs/speec...,Supervision of bank risk-taking,Vice Chair Alice M. Rivlin,At the The Brookings Institution National Issu...,1996.0
1,https://www.federalreserve.gov/boarddocs/speec...,Social security,Chairman Alan Greenspan,At the Abraham Lincoln Award Ceremony of the U...,1996.0
2,https://www.federalreserve.gov/boarddocs/speec...,The challenge of central banking in a democrat...,Chairman Alan Greenspan,At the Annual Dinner and Francis Boyer Lecture...,1996.0
3,https://www.federalreserve.gov/boarddocs/speec...,Clearinghouses and risk management,"Governor Edward W. Kelley, Jr.","At the 1996 Payments System Risk Conference, W...",1996.0
4,https://www.federalreserve.gov/boarddocs/speec...,Supervisory and regulatory responses to financ...,Governor Susan M. Phillips,At the BAI Seminar on Regulatory Policy Change...,1996.0
...,...,...,...,...,...
1602,https://www.federalreserve.gov/newsevents/spee...,Welcoming Remarks,Governor Michelle W. Bowman,At Fed Listens: Helping Youth ThriveâA Discu...,2022.0
1603,https://www.federalreserve.gov/newsevents/spee...,Fighting Inflation with Rate Hikes and Balance...,Governor Christopher J. Waller,"At the Economic Forecast Project, University o...",2022.0
1604,https://www.federalreserve.gov/newsevents/spee...,High Inflation and the Outlook for Monetary Po...,Governor Michelle W. Bowman,At the American Bankers Association Community ...,2022.0
1605,https://www.federalreserve.gov/newsevents/spee...,Preparing for the Financial System of the Future,Governor Lael Brainard,"At the 2022 U.S. Monetary Policy Forum, New Yo...",2022.0


# Scrape speech text

In [7]:
old_site_version_length = sum(df_1996_2022['year']<1999)

j = 1
for i in range(old_site_version_length):
    page = requests.get(df_1996_2022.loc[i,'link'])
    soup = BeautifulSoup(page.text, 'html.parser')
    text_list = [i for i in soup.find('p').getText().split('\n') if i] 
    text_list=text_list[:-8]
    text_list = ' '.join(text_list)
    text_list = text_list.replace('--', ' ')
    text_list = text_list.replace('\r', '')
    text_list = text_list.replace('\t', '')
    text_list = text_list.replace("\'", '')
    df_1996_2022.loc[i,'text'] = text_list
    j+=1
    print(j, end="\r")

122

In [8]:
for i in range(len(df_1996_2022)):
    if ((df_1996_2022.loc[i,'year']>1998) & (df_1996_2022.loc[i,'year']<2006)):
        page = requests.get(df_1996_2022['link'].iloc[i])
        soup = BeautifulSoup(page.text, 'html.parser')
        events = soup.select("table")
        if len(str(events[0].text))>600:
            text_list = [i for i in events[0].text if i] 
        else:
            text_list = [i for i in events[1].text if i]
        text_list = ''.join(text_list)
        text_list = text_list.replace('--', '')
        text_list = text_list.replace('\r', '')
        text_list = text_list.replace('\t', '')
        text_list = text_list.replace('\n        ', '')
        text_list = text_list.replace('\n ', '')
        text_list = text_list.replace('\n', '')
        text_list = text_list.replace("\'", '')
        text_list = text_list.replace("\\", "")
        text_list = text_list.split('Appendix', 1)[0]
        if ((i>=383) & (i<=536)):
            text_list = text_list.replace('     ', ' ')
            text_list = text_list.replace('    ', ' ')
        df_1996_2022.loc[i,'text'] = text_list
        j+=1
        print(j, end="\r")

646

In [9]:
for i in range(1,len(df_1996_2022)):
    if ((df_1996_2022.loc[i,'year']>2005) and (i not in [743, 748])):
        page = requests.get(df_1996_2022.loc[i,'link'])
        soup = BeautifulSoup(page.text, 'html.parser')
        events = soup.select(".col-md-8")
        text_list = events[1].text
        text_list = text_list.replace("\'", '')
        text_list = text_list.replace('\n', ' ')
        text_list = text_list.replace('â\x80\x94', '')
        df_1996_2022.loc[i,'text'] = text_list
        j+=1
        print(j, end="\r")

1606

# Normalize Data

In [10]:
df_1996_2022.drop([743,748], axis=0, inplace=True)

In [11]:
df_1996_2022['text_len'] = df_1996_2022['text'].apply(len)

In [12]:
df_1996_2022.loc[df_1996_2022['speaker']=='Chairman  Ben S. Bernanke','speaker'] = 'Chairman Ben S. Bernanke'
df_1996_2022.loc[df_1996_2022['speaker']=='Governor Ben S. Bernanke and Vincent R. Reinhart, Director, Division of Monetary Affairs','speaker'] = 'Governor Ben S. Bernanke'
df_1996_2022.loc[df_1996_2022['speaker']=='Governor Donald L. Kohn and Brian P. Sack, Senior Economist','speaker'] = 'Governor Donald L. Kohn'
df_1996_2022.loc[df_1996_2022['speaker']=='Governor Susan Schmidt Bies','speaker'] = 'Governor Susan S. Bies'
df_1996_2022.loc[df_1996_2022['speaker']=='Vice Chair for Supervision and Chair of the Financial Stability Board Randal K. Quarles','speaker'] = 'Vice Chair for Supervision Randal K. Quarles'
df_1996_2022.loc[df_1996_2022['speaker']=='Vice Chairman for Supervision and Chair of the Financial Stability Board Randal K. Quarles','speaker'] = 'Vice Chair for Supervision Randal K. Quarles'
df_1996_2022.loc[df_1996_2022['speaker']=='Vice Chairman for Supervision Randal K. Quarles','speaker'] = 'Vice Chair for Supervision Randal K. Quarles'
df_1996_2022.loc[df_1996_2022['speaker']=='Vice Chairman Roger W. Ferguson, Jr','speaker'] = 'Vice Chairman Roger W. Ferguson'
df_1996_2022.loc[df_1996_2022['speaker']=='Vice Chairman Roger W. Ferguson, Jr.','speaker'] = 'Vice Chairman Roger W. Ferguson'
df_1996_2022.loc[df_1996_2022['speaker']=='Chair Jerome H. Powell','speaker'] = 'Chairman Jerome H. Powell'
df_1996_2022.loc[df_1996_2022['speaker']=='Vice Chair Richard H. Clarida','speaker'] = 'Vice Chairman Richard H. Clarida'

In [13]:
df_1996_2022['year'] = df_1996_2022['year'].astype(int)

In [14]:
df_1996_2022 = df_1996_2022.drop(df_1996_2022[df_1996_2022['text_len']<1000].index, axis=0)

In [15]:
df_1996_2022['date'] = df_1996_2022['link'].str.extract('(\d\d\d\d\d\d\d\d)')
df_1996_2022['date'] = pd.to_datetime(df_1996_2022['date'])

In [16]:
df_1996_2022 = df_1996_2022.reset_index(drop=True)

In [18]:
df_1996_2022.speaker = df_1996_2022.speaker.apply(lambda x: x.replace('Chair ', 'Chairman '))

In [19]:
for index, frame in df_1996_2022.iterrows():
    if 'Chairman' in df_1996_2022['speaker'].iloc[index] and 'Vice Chairman' not in df_1996_2022['speaker'].iloc[index]:
         df_1996_2022.loc[index, 'speaker_title'] = 'Chairman'
    elif 'Vice Chairman' in df_1996_2022['speaker'].iloc[index]:
        df_1996_2022.loc[index, 'speaker_title'] = 'Vice Chairman'
    elif 'Governor' in df_1996_2022['speaker'].iloc[index]:
        df_1996_2022.loc[index, 'speaker_title'] = 'Governor'
    else:
        df_1996_2022.loc[index, 'speaker_title'] = 'Other'

In [20]:
df_1996_2022

Unnamed: 0,link,title,speaker,event,year,text,text_len,date,speaker_title
0,https://www.federalreserve.gov/boarddocs/speec...,Supervision of bank risk-taking,Vice Chairman Alice M. Rivlin,At the The Brookings Institution National Issu...,1996,I discovered when I joined the Board of Govern...,23431,1996-12-19,Vice Chairman
1,https://www.federalreserve.gov/boarddocs/speec...,Social security,Chairman Alan Greenspan,At the Abraham Lincoln Award Ceremony of the U...,1996,I am privileged to accept the Union League of...,16604,1996-12-06,Chairman
2,https://www.federalreserve.gov/boarddocs/speec...,The challenge of central banking in a democrat...,Chairman Alan Greenspan,At the Annual Dinner and Francis Boyer Lecture...,1996,The Challenge of Central Banking in a Democrat...,27372,1996-12-05,Chairman
3,https://www.federalreserve.gov/boarddocs/speec...,Clearinghouses and risk management,"Governor Edward W. Kelley, Jr.","At the 1996 Payments System Risk Conference, W...",1996,It is a pleasure to be with you this morning ...,16703,1996-12-03,Governor
4,https://www.federalreserve.gov/boarddocs/speec...,Supervisory and regulatory responses to financ...,Governor Susan M. Phillips,At the BAI Seminar on Regulatory Policy Change...,1996,Supervisory and Regulatory Responses to Financ...,14888,1996-11-25,Governor
...,...,...,...,...,...,...,...,...,...
1588,https://www.federalreserve.gov/newsevents/spee...,Restoring Price Stability,Chairman Pro Tempore Jerome H. Powell,"At ""Policy Options for Sustainable and Inclusi...",2022,Thank you for the opportunity to speak with y...,17562,2022-03-21,Chairman
1589,https://www.federalreserve.gov/newsevents/spee...,Welcoming Remarks,Governor Michelle W. Bowman,At Fed Listens: Helping Youth ThriveâA Discu...,2022,"When we started the Fed Listens initiative, w...",2795,2022-03-18,Governor
1590,https://www.federalreserve.gov/newsevents/spee...,Fighting Inflation with Rate Hikes and Balance...,Governor Christopher J. Waller,"At the Economic Forecast Project, University o...",2022,"Thank you Peter, and thank you to the UCSB Ec...",22710,2022-02-24,Governor
1591,https://www.federalreserve.gov/newsevents/spee...,High Inflation and the Outlook for Monetary Po...,Governor Michelle W. Bowman,At the American Bankers Association Community ...,2022,Before we get to our conversation on communit...,9460,2022-02-21,Governor


In [21]:
df_1996_2022.to_csv('fed_speeches_1996_2022.csv', index=False)