In [1]:
!pip install PyPDF2



### Parsing meeting minutes

In [2]:
# Import necessary libraries
import requests                      # For making HTTP requests
from bs4 import BeautifulSoup        # For parsing HTML content
import os                            # For interacting with the operating system
import re                            # For using regular expressions
import pandas as pd                  # For data handling
from PyPDF2 import PdfReader         # For reading PDF files

# Base URL for FOMC documents
fed_base_url = 'https://www.federalreserve.gov'

# URL for the page listing FOMC meeting dates and minutes
meeting_page_url = f'{fed_base_url}/monetarypolicy/fomccalendars.htm'

# Fetch the calendar page. The timeout keeps the cell from hanging forever, and
# raise_for_status() fails loudly instead of silently parsing an error page that
# contains no minutes links.
page_response = requests.get(meeting_page_url, timeout=30)
page_response.raise_for_status()
parsed_page = BeautifulSoup(page_response.text, 'html.parser')

# One {'date', 'text'} record per successfully parsed minutes PDF
meeting_data = []

# Find all links on the page
page_links = parsed_page.find_all('a', href=True)

# Loop through all links to find meeting minutes PDF files
for item in page_links:
    if 'fomcminutes' in item['href'] and item['href'].endswith('.pdf'):  # Process only PDF links
        # Construct the full URL for the PDF file (hrefs are site-relative paths)
        pdf_url = fed_base_url + item['href']
        print(f'Downloading PDF: {pdf_url}')

        # Download the PDF file; skip it on a non-200 reply so that an HTML error
        # body is never written to disk under a .pdf name.
        pdf_content = requests.get(pdf_url, timeout=30)
        if pdf_content.status_code != 200:
            print(f'Download failed: {pdf_url} (HTTP Status: {pdf_content.status_code})')
            continue

        # Define a filename based on the URL
        pdf_filename = pdf_url.split('/')[-1]
        with open(pdf_filename, 'wb') as file:
            file.write(pdf_content.content)
        print(f'Saved PDF as: {pdf_filename}')

        # Extract the date (YYYYMMDD) from the filename using a regular expression
        date_match = re.search(r'(\d{8})', pdf_filename)
        if date_match:
            meeting_date_str = date_match.group(1)
            meeting_date = pd.to_datetime(meeting_date_str, format='%Y%m%d')  # Convert to datetime

            # Attempt to read and extract text from the PDF
            try:
                pdf_reader = PdfReader(pdf_filename)
                pdf_text = ''
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    # Guard against pages that yield no text (None / empty string),
                    # mirroring the check used for the press-conference PDFs.
                    if page_text:
                        pdf_text += page_text  # Accumulate text from each page

                # Append the date and text content to the list
                meeting_data.append({'date': meeting_date, 'text': pdf_text})
            except Exception as error:
                print(f"Error reading PDF {pdf_filename}: {error}")

# Create a DataFrame with the extracted data. Explicit columns keep the schema
# stable (so later sorting by 'date' works) even when nothing was scraped.
meetings_df = pd.DataFrame(meeting_data, columns=['date', 'text'])

# Display the first few rows of the DataFrame
print(meetings_df.head())

Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20240131.pdf
Saved PDF as: fomcminutes20240131.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20240320.pdf
Saved PDF as: fomcminutes20240320.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20240501.pdf
Saved PDF as: fomcminutes20240501.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20240612.pdf
Saved PDF as: fomcminutes20240612.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20240731.pdf
Saved PDF as: fomcminutes20240731.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20240918.pdf
Saved PDF as: fomcminutes20240918.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20230201.pdf
Saved PDF as: fomcminutes20230201.pdf
Downloading PDF: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes202

### Parsing press conferences

In [3]:
# Import necessary libraries
import requests                      # For making HTTP requests
import os                            # For handling file operations
import pandas as pd                  # For data handling and manipulation
from PyPDF2 import PdfReader         # For reading PDF files

# Function to download and extract text from FOMC press conference PDFs, then create a DataFrame
def fetch_and_process_fomc_press_conferences(dates_list, timeout=30):
    """Download FOMC press conference PDFs and return their text as a DataFrame.

    For every date string the PDF is requested from the Federal Reserve media
    center, saved in the current working directory as
    ``FOMCpresconf<date>.pdf`` and its page text is concatenated (one newline
    after each non-empty page).

    Args:
        dates_list: Iterable of date strings in ``YYYYMMDD`` form.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with columns ``date`` (datetime) and ``text`` (str),
        one row per PDF that was downloaded and parsed. The frame is empty,
        but still has both columns, when nothing could be processed.
    """
    # Base URL pattern for FOMC press conference PDFs
    press_conf_url_template = 'https://www.federalreserve.gov/mediacenter/files/FOMCpresconf{}.pdf'

    # List to store data extracted from PDFs
    extracted_data = []

    # Loop through each date to fetch corresponding PDF
    for conf_date in dates_list:
        # Construct the URL for each press conference
        pdf_url = press_conf_url_template.format(conf_date)
        print(f"Trying to download: {pdf_url}")

        # A network-level failure on one date is reported and skipped, the same
        # way a non-200 reply is, so it does not discard the rest of the batch.
        try:
            response = requests.get(pdf_url, timeout=timeout)
        except requests.RequestException as request_error:
            print(f"Download failed: {pdf_url} ({request_error})")
            continue

        if response.status_code != 200:
            print(f"Download failed: {pdf_url} (HTTP Status: {response.status_code})")
            continue

        # Define the filename based on the date
        file_name = f'FOMCpresconf{conf_date}.pdf'

        # Save the downloaded PDF
        with open(file_name, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded and saved as: {file_name}")

        # Extract text content from the PDF
        try:
            pdf_reader = PdfReader(file_name)
            extracted_text = ''

            for pdf_page in pdf_reader.pages:
                page_text = pdf_page.extract_text()
                if page_text:
                    extracted_text += page_text + '\n'

            # Append the date and extracted text to the data list
            extracted_data.append({'date': conf_date, 'text': extracted_text})
            print(f"Text extracted from: {file_name}")

        except Exception as extract_error:
            print(f"Failed to extract text from {file_name}: {extract_error}")

    # Explicit columns: without them an empty result has no 'date' column and
    # the conversion below raises KeyError.
    fomc_df = pd.DataFrame(extracted_data, columns=['date', 'text'])

    # Convert the 'date' column to datetime format for better handling
    fomc_df['date'] = pd.to_datetime(fomc_df['date'], format='%Y%m%d')

    return fomc_df

# List of FOMC press conference dates to process
conference_dates = [
    # 2012
    '20120125', '20120425', '20120620', '20120913', '20121212',
    # 2013
    '20130320', '20130619', '20130918', '20131218',
    # 2014
    '20140319', '20140618', '20140917', '20141217',
    # 2015
    '20150318', '20150617', '20150917', '20151216',
    # 2016
    '20160316', '20160615', '20160921', '20161214',
    # 2017
    '20170315', '20170614', '20170920', '20171213',
    # 2018
    '20180321', '20180613', '20180926', '20181219',
    # 2019
    '20190130', '20190320', '20190501', '20190619',
    '20190731', '20190918', '20191030', '20191211',
    # 2020
    '20200129', '20200303', '20200315', '20200429', '20200610',
    '20200729', '20200916', '20201105', '20201216',
    # 2021
    '20210127', '20210317', '20210428', '20210616',
    '20210728', '20210922', '20211103', '20211215',
    # 2022
    '20220126', '20220316', '20220504', '20220615',
    '20220727', '20220921', '20221102', '20221214',
    # 2023
    '20230201', '20230322', '20230503', '20230614',
    '20230726', '20230920', '20231101', '20231213',
    # 2024
    '20240131', '20240320', '20240501', '20240612', '20240731', '20240918',
]

# Download every transcript listed above and collect the text into one DataFrame
press_conferences_df = fetch_and_process_fomc_press_conferences(conference_dates)

# Preview the first rows as a sanity check
print(press_conferences_df.head())


Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20120125.pdf
Downloaded and saved as: FOMCpresconf20120125.pdf
Text extracted from: FOMCpresconf20120125.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20120425.pdf
Downloaded and saved as: FOMCpresconf20120425.pdf
Text extracted from: FOMCpresconf20120425.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20120620.pdf
Downloaded and saved as: FOMCpresconf20120620.pdf
Text extracted from: FOMCpresconf20120620.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20120913.pdf
Downloaded and saved as: FOMCpresconf20120913.pdf
Text extracted from: FOMCpresconf20120913.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20121212.pdf
Downloaded and saved as: FOMCpresconf20121212.pdf
Text extracted from: FOMCpresconf20121212.pdf
Trying to download: https://www.federalreserve.gov

Text extracted from: FOMCpresconf20200916.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20201105.pdf
Downloaded and saved as: FOMCpresconf20201105.pdf
Text extracted from: FOMCpresconf20201105.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20201216.pdf
Downloaded and saved as: FOMCpresconf20201216.pdf
Text extracted from: FOMCpresconf20201216.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20210127.pdf
Downloaded and saved as: FOMCpresconf20210127.pdf
Text extracted from: FOMCpresconf20210127.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20210317.pdf
Downloaded and saved as: FOMCpresconf20210317.pdf
Text extracted from: FOMCpresconf20210317.pdf
Trying to download: https://www.federalreserve.gov/mediacenter/files/FOMCpresconf20210428.pdf
Downloaded and saved as: FOMCpresconf20210428.pdf
Text extracted from: FOMCpresconf20210428.pdf
Tryi

### Parsing speech transcripts

In [4]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from http.client import HTTPSConnection
import requests

def create_url_list(start_year, end_year, prefix, suffix):
    """Build one speech-index path per year in ``[start_year, end_year]``.

    Each path is ``prefix + <year> + tail``. The tail is the literal
    ``'speech.htm'`` for years up to and including 2010, and ``suffix`` for
    later years. Returns an empty list when ``start_year > end_year``.
    """
    def _tail_for(year):
        # Years through 2010 use a fixed page name instead of the caller's suffix.
        if year > 2010:
            return suffix
        return 'speech.htm'

    return [
        prefix + str(year) + _tail_for(year)
        for year in range(start_year, end_year + 1)
    ]

def find_speeches_by_year(host, this_url, print_test=False):
    """Scrape one yearly speech index page into parallel metadata lists.

    Args:
        host: Host name to open the HTTPS connection to.
        this_url: Path of the yearly index page on that host.
        print_test: When True, print the length of each collected list.

    Returns:
        Tuple ``(dates, speakers, titles, links)`` of lists. All four are empty
        when the server does not answer with status 200 or when the page has no
        ``div`` with class ``row eventlist``.

    NOTE(review): links are gathered in a separate pass over the anchors of the
    event list, not per row, so ``links`` is not guaranteed to have the same
    length as the other three lists. Confirm against the live page markup.
    """
    conn = HTTPSConnection(host)
    try:
        conn.request(method='GET', url=this_url)
        resp = conn.getresponse()
        body = resp.read()
        status = resp.status
    finally:
        # Always release the socket; the original left the connection open.
        conn.close()

    if status != 200:
        print(f'Error from website! Response code: {status}')
        return [], [], [], []

    soup = BeautifulSoup(body, 'html.parser')
    event_list = soup.find('div', class_='row eventlist')

    date_lst, title_lst, speaker_lst, link_lst = [], [], [], []

    if event_list:
        # For each row take the first <time>, speaker paragraph and <em> text,
        # or None when the row has no such element.
        for row in event_list.find_all('div', class_='row'):
            tmp_date = [x.text for x in row.find_all('time')]
            date_lst.append(tmp_date[0] if tmp_date else None)

            tmp_speaker = [x.text for x in row.find_all('p', class_='news__speaker')]
            speaker_lst.append(tmp_speaker[0] if tmp_speaker else None)

            tmp_title = [x.text for x in row.find_all('em')]
            title_lst.append(tmp_title[0] if tmp_title else None)

        # Collect hrefs, filtering on the anchor's class so 'watchLive' links are skipped.
        for link in event_list.find_all('a', href=True, class_=lambda x: x != 'watchLive'):
            link_lst.append(link['href'])

    if print_test:
        print(f'Length of dates: {len(date_lst)}')
        print(f'Length of speakers: {len(speaker_lst)}')
        print(f'Length of titles: {len(title_lst)}')
        print(f'Length of hrefs: {len(link_lst)}')

    return date_lst, speaker_lst, title_lst, link_lst

def create_speech_df(host, annual_htm_list):
    all_dates, all_speakers, all_titles, all_links = [], [], [], []
    
    for url in annual_htm_list:
        dates, speakers, titles, links = find_speeches_by_year(host, url)
        all_dates.extend(dates)
        all_speakers.extend(speakers)
        all_titles.extend(titles)
        all_links.extend(links)

    df = pd.DataFrame({
        'date': all_dates,
        'speaker': all_speakers,
        'title': all_titles,
        'link': all_links
    })

    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df.dropna(subset=['date'], inplace=True)
    df['text'] = np.nan
    df = df[~df['link'].str.startswith('/pubs/feds')]

    return df

def retrieve_docs(host, df):
    """Fill ``df['text']`` with the document fetched for each row's ``link``.

    The frame is modified in place, row by row, and the same object is returned.

    Args:
        host: Host name passed through to ``get_one_doc``.
        df: DataFrame with a ``link`` column.

    Returns:
        The same DataFrame, with ``text`` set for every row.
    """
    # An all-NaN 'text' column is float64; writing strings into it element by
    # element is a deprecated dtype upcast in recent pandas. Cast to object first.
    if 'text' in df.columns:
        df['text'] = df['text'].astype(object)

    for index, link in df['link'].items():
        df.at[index, 'text'] = get_one_doc(host, link)
    return df

def get_one_doc(host, this_url, timeout=30):
    """Fetch one page and return the text of its article paragraphs.

    Args:
        host: Host name; the request goes to ``https://<host><this_url>``.
        this_url: Path of the document on that host.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The text of every ``<p>`` inside the ``div`` with class
        ``col-xs-12 col-sm-8 col-md-8``, joined with single spaces. Returns
        ``''`` when the request fails, the status is not 200, or no such
        ``div`` is found.
    """
    url = f'https://{host}{this_url}'

    # Treat network-level errors like HTTP failures: report and return '' so a
    # single bad document does not abort a long retrieval loop.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as request_error:
        print(f'Failed to retrieve document from {url}. Error: {request_error}')
        return ''

    if response.status_code != 200:
        print(f'Failed to retrieve document from {url}. Status code: {response.status_code}')
        return ''

    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('div', class_='col-xs-12 col-sm-8 col-md-8')
    if not article:
        return ''

    paragraphs = [p.text for p in article.find_all('p')]
    return ' '.join(paragraphs)


# Scrape settings: site host, the pieces of the yearly index path, and the year range
host = 'www.federalreserve.gov'
prefix = '/newsevents/speech/'
suffix = '-speeches.htm'
start_year, end_year = 2012, 2024

# One index page path per year
annual_htm_list = create_url_list(start_year, end_year, prefix, suffix)
print('Below is the annual_htm_list')
print(annual_htm_list)

# Collect the speech metadata, fetch each speech's text, then keep only date and text
df_speech = retrieve_docs(host, create_speech_df(host, annual_htm_list))
df_speech = df_speech.loc[:, ["date", "text"]]

print(df_speech.head())


Below is the annual_htm_list
['/newsevents/speech/2012-speeches.htm', '/newsevents/speech/2013-speeches.htm', '/newsevents/speech/2014-speeches.htm', '/newsevents/speech/2015-speeches.htm', '/newsevents/speech/2016-speeches.htm', '/newsevents/speech/2017-speeches.htm', '/newsevents/speech/2018-speeches.htm', '/newsevents/speech/2019-speeches.htm', '/newsevents/speech/2020-speeches.htm', '/newsevents/speech/2021-speeches.htm', '/newsevents/speech/2022-speeches.htm', '/newsevents/speech/2023-speeches.htm', '/newsevents/speech/2024-speeches.htm']


       Thanks very much. It's a pleasure to be part of this panel on the future of financial globalization.  I will focus my remarks on one important aspect of this issue--namely, the growing use of wholesale dollar funding by global financial institutions.  I'll begin by briefly discussing research I've been doing, along with my coauthors Victoria Ivashina and David Scharfstein, which examines some of the consequences of this funding model during times of market stress.1   I'll then touch on the policy implications of this and related work.  But first, the usual disclaimer:  The views that follow are my own and do not necessarily reflect the thinking of my colleagues on the Federal Open Market Committee.
     
       By way of background, the dollar liabilities of foreign banks have grown rapidly in the past two decades and now stand at about $8 trillion, roughly on par with those of U.S. banks.2   A significant proportion of foreign banks' dollar liabilities are raised via U.S. bra

        date                                               text
0 2012-12-17  \r\n       Thanks very much. It's a pleasure t...
1 2012-12-04  \r\n       It is a pleasure to be at Brookings...
2 2012-11-30  \r\n       Given that the conference theme is ...
3 2012-11-28  \r\n       In the aftermath of the financial c...
4 2012-11-20  \r\n       Good afternoon. I am pleased to joi...


### Combining all of the data into a single DataFrame

In [5]:
# Stack minutes and press conference rows, then order them chronologically
df_min_press = (
    pd.concat([meetings_df, press_conferences_df], ignore_index=True)
    .sort_values(by='date')
    .reset_index(drop=True)
)
df_min_press

Unnamed: 0,date,text
0,2012-01-25,"January 25, 2012 Chairman Bernanke’s Press Co..."
1,2012-04-25,"April 25, 2012 Chairman Bernanke’s Press Confe..."
2,2012-06-20,"June 20, 2012 Chairman Bernanke’s Press Confe..."
3,2012-09-13,"September 13, 2012 Chairman Bernanke’s Pres..."
4,2012-12-12,"December 12, 2012 Chairman Bernanke’s Press C..."
...,...,...
117,2024-06-12,"June 12, 2024 Chair Powell’s Press Conferenc..."
118,2024-07-31,1 \n \nMinutes of the Federal Open Market \nCo...
119,2024-07-31,"July 31, 202 4 Chair Powell’s Press Conferen..."
120,2024-09-18,1 \n \nMinutes of the Federal Open Market \nCo...


In [6]:
# Add the speeches to the minutes/press-conference rows and re-sort by date;
# reset_index replaces the overlapping index labels left by concat.
df_min_press_speech = (
    pd.concat([df_min_press, df_speech])
    .sort_values(by='date')
    .reset_index(drop=True)
)
df_min_press_speech

Unnamed: 0,date,text
0,2012-01-06,"It is the start of a new year, the traditional..."
1,2012-01-06,\r\n Thank you for the opportunity to sp...
2,2012-01-07,\r\n Thank you and happy New Year. It is...
3,2012-01-13,\r\n It's a pleasure to be here this mor...
4,2012-01-16,\r\n It is certainly a pleasure to be he...
...,...,...
884,2024-10-08,"Thank you, Isabel, and thank you for the oppor..."
885,2024-10-08,"Thank you, President Hicks and Tara Boehmler, ..."
886,2024-10-09,"Thank you, Steve, for that kind introduction a..."
887,2024-10-10,"Thank you for the kind introduction, Jennet.1 ..."


#### Save the final DataFrame as a CSV file

In [7]:
# Write the combined corpus to disk without the integer row index
output_csv = 'df_min_press_speech.csv'
df_min_press_speech.to_csv(output_csv, index=False)
