In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
# import packages
import hashlib
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By



In [4]:
# Location of the chromedriver binary. The CHROMEDRIVER_PATH environment
# variable, when set, takes precedence so the notebook can run on another
# machine without editing this cell; otherwise the original path is used.
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/anthony_ning/Downloads/chromedriver_mac_arm64/chromedriver',
)

# Both browsers run headless (no visible window) with the same options.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

# `driver` browses the speeches listing; `driver_2` opens each individual
# speech page so the listing's pagination state is never disturbed.
# Each browser is given its own Service instance so that each one starts and
# owns its own chromedriver process, instead of two sessions sharing a single
# Service object.
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)
driver_2 = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)

In [6]:
# Load the Federal Reserve speeches listing in the primary browser session;
# this is the page the scraping loop parses and paginates with "Next".
url = "https://www.federalreserve.gov/newsevents/speeches.htm"
driver.get(url=url)

In [11]:
# Parallel result lists: index i of each list describes the same speech.
titles = []
speakers = []
dates = []
articles = []


def _extract_article_text(content):
    """Return the text of one speech as a single space-separated string.

    Parameters
    ----------
    content : bs4.element.Tag or None
        The container element of the speech page. ``None`` (container not
        found on the page) yields an empty string instead of raising.

    Returns
    -------
    str
        The stripped text of every kept ``<p>``, each followed by one space.
        Collection stops at a paragraph whose ``<strong>`` text is exactly
        ``'References'`` or at the first paragraph that holds a footnote
        definition anchor (``name`` containing ``'fn'``).
    """
    if content is None:
        return ''

    # Remove the nested pull-right block, when present, so its paragraphs are
    # not mixed into the speech text.
    pull_right = content.find('div', class_='col-xs-12 col-md-7 pull-right')
    if pull_right:
        pull_right.decompose()

    article_paragraphs = ''
    for paragraph in content.find_all('p'):
        # Stop processing when the "References" section starts.
        heading = paragraph.find('strong')
        if heading and heading.get_text(strip=True) == 'References':
            break

        anchors = paragraph.find_all('a')
        if anchors:
            # Reset for every paragraph: the flag must never leak in from an
            # earlier paragraph or an earlier speech, and must exist even when
            # none of the anchors is footnote-related.
            skip = False
            for a in anchors:
                if a.get('title') and 'footnote' in a.get('title'):
                    # In-text footnote marker: drop the marker, keep the text.
                    a.decompose()
                    skip = False
                elif a.get('name') and 'fn' in a.get('name'):
                    # Footnote definition anchor: the footnotes have started.
                    a.decompose()
                    skip = True
            if skip:
                break
            article_paragraphs += paragraph.get_text(strip=True)
            article_paragraphs += ' '
        elif paragraph.find('div'):
            # Paragraphs wrapping a <div> are skipped entirely.
            continue
        else:
            article_paragraphs += paragraph.get_text(strip=True)
            article_paragraphs += ' '

    return article_paragraphs


def _wait_for_page_change(browser, previous_hash, timeout=5.0, poll_interval=0.25):
    """Poll ``browser.page_source`` until its MD5 differs from ``previous_hash``.

    Returns ``True`` as soon as a different hash is observed and ``False`` if
    the hash is still identical once ``timeout`` seconds have elapsed.
    """
    deadline = time.monotonic() + timeout
    while True:
        current_hash = hashlib.md5(browser.page_source.encode('utf-8')).hexdigest()
        if current_hash != previous_hash:
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


while True:

    # Parse the currently displayed listing page using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    speeches = soup.find_all('div', class_='row ng-scope')

    # The first matched row is skipped (presumably not a speech entry --
    # TODO confirm against the live page markup).
    for speech in speeches[1:]:
        link = speech.find('a')

        # title, speaker and date of the speech as shown in the listing
        title = link.get_text(strip=True)
        speaker = speech.find('p', class_='news__speaker ng-binding').get_text(strip=True)
        date = speech.find('time').get_text(strip=True)

        # Open the speech itself in the second browser so the listing browser
        # keeps its current pagination position.
        speech_url = 'https://www.federalreserve.gov' + link['href']
        driver_2.get(speech_url)
        speech_soup = BeautifulSoup(driver_2.page_source, 'html.parser')
        content = speech_soup.find('div', class_='col-xs-12 col-sm-8 col-md-8')
        article_paragraphs = _extract_article_text(content)

        # Append all four fields together, only after everything succeeded,
        # so the parallel lists can never get out of step with each other.
        titles.append(title)
        speakers.append(speaker)
        dates.append(date)
        articles.append(article_paragraphs)

    # Handle the pagination by clicking "Next"
    try:
        # Save a hash of the current page content
        page_hash_before = hashlib.md5(driver.page_source.encode('utf-8')).hexdigest()

        # Try to locate and click the "Next" button
        next_button = driver.find_element(By.XPATH, '//a[text()="Next"]')
        next_button.click()

    except NoSuchElementException:
        # No "Next" button found, assume we've reached the last page
        print("No 'Next' button found. Exiting loop.")
        break

    except StaleElementReferenceException:
        # Handle potential stale element issues
        print("Stale element encountered. Exiting loop.")
        break

    else:
        # Actually wait for the listing to update. (implicitly_wait() only
        # configures the element-lookup timeout; it does not pause execution.)
        if not _wait_for_page_change(driver, page_hash_before):
            print("Page content did not change. Reached the last page.")
            break  # Exit the loop if the page content hasn't changed

Page content did not change. Reached the last page.


In [23]:
# Assemble the four scraped lists into one table: one row per speech,
# one column per field.
dic = dict(zip(('title', 'speaker', 'date', 'article'),
               (titles, speakers, dates, articles)))
data = pd.DataFrame(dic)

# Replace the scraped date strings with parsed pandas datetimes
data['date'] = pd.to_datetime(data['date'])

# Keep only the speeches whose year is 2020 or later
is_recent = data['date'].dt.year >= 2020
data_filtered = data[is_recent]

In [33]:
# Write the filtered speeches to a CSV file, leaving out the DataFrame index
data_filtered.to_csv(path_or_buf='FED_speech.csv', index=False)