In [1]:
import calendar
import os
import re
import time

import fitz  # PyMuPDF for PDF handling
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

In [2]:
# Zero-padded day ("01".."31") and month ("01".."12") strings, plus the
# years 2012-2024 as strings. These are the building blocks for YYYYMMDD dates.

days_temp = list(range(1, 32))
days = [f"{day_number:02d}" for day_number in days_temp]

months_temp = list(range(1, 13))
months = [f"{month_number:02d}" for month_number in months_temp]

years = list(map(str, range(2012, 2025)))

In [3]:
# Combine year, month and day strings into YYYYMMDD date strings, in
# chronological order. Day values that do not exist in a given month
# (e.g. 20120230 or 20120431) are skipped, so the download loops that iterate
# over `dates` do not issue requests for impossible dates.

dates = []
for year in years:
    for month in months:
        # monthrange returns (weekday_of_first_day, number_of_days_in_month)
        days_in_month = calendar.monthrange(int(year), int(month))[1]
        dates.extend(
            year + month + day for day in days if int(day) <= days_in_month
        )

In [4]:
# Download a PDF from a URL and save its text content as a .txt file

def convert_pdf_to_text(pdf_url, output_file, parent_dir, new_dir, timeout=30):
    """Download a PDF and write its extracted text to a UTF-8 text file.

    The directory ``parent_dir/new_dir`` is created if it does not exist.
    The function is best-effort: a non-200 response, a network error, an
    unreadable PDF or a write failure is reported on stdout and the function
    returns normally, so a caller looping over many candidate URLs keeps going.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to download.
    output_file : str
        File name (not a full path) of the text file to write.
    parent_dir : str
        Directory under which ``new_dir`` lives.
    new_dir : str
        Sub-directory of ``parent_dir`` that receives ``output_file``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
    """
    # Make sure the output directory exists.
    path = os.path.join(parent_dir, new_dir)
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Directory {new_dir} created at {path}")

    try:
        response = requests.get(pdf_url, timeout=timeout)
        if response.status_code != 200:
            # Nothing is published at this URL; skip it without trying to
            # parse the error page as a PDF.
            print('Running Next File.')
            return

        # Parse the downloaded bytes in memory: no shared temp file in the
        # working directory, and the document is closed when the block exits.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)

        file_path = os.path.join(path, output_file)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"Text successfully extracted to {output_file}")

    except Exception as e:
        # Keep going, but say why this URL was skipped.
        print(f'Running Next File. ({type(e).__name__}: {e})')



In [5]:
# Collect the links to individual FOMC speeches listed on a speeches index page

def scrape_speech_links(url, timeout=30):
    """Return the speech links found on an index page.

    Fetches ``url``, parses it as HTML and collects the ``href`` of every
    ``<a>`` tag whose href contains ``'/newsevents/speech/'``. Each href is
    returned once, in the order it first appears on the page.

    Parameters
    ----------
    url : str
        URL of the page to scan.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        Matching href values exactly as they appear in the page. The list is
        empty if the page could not be retrieved or contains no matching link.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))
    # dict.fromkeys drops repeated hrefs while keeping first-seen order, so the
    # same speech is not downloaded more than once.
    return list(dict.fromkeys(h for h in hrefs if '/newsevents/speech/' in h))

In [6]:
# Redundant: not called by the cells below, which download the PDF versions via convert_pdf_to_text instead

def convert_htm_to_text(htm_url, output_file, parent_dir, new_dir, timeout=30):
    """Download an HTML page and save its visible text to a UTF-8 text file.

    The directory ``parent_dir/new_dir`` is created if it does not exist.
    If the page cannot be retrieved (network error or non-200 status), a
    message is printed and no file is written.

    Parameters
    ----------
    htm_url : str
        URL of the HTML page to download.
    output_file : str
        File name (not a full path) of the text file to write.
    parent_dir : str
        Directory under which ``new_dir`` lives.
    new_dir : str
        Sub-directory of ``parent_dir`` that receives ``output_file``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
    """
    path = os.path.join(parent_dir, new_dir)

    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Directory {new_dir} created at {path}")

    try:
        response = requests.get(htm_url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve {htm_url}: {e}")
        return

    if response.status_code != 200:
        # Report the failure instead of printing an empty line.
        print(f"Failed to retrieve {htm_url}: HTTP {response.status_code}")
        return

    # Pass raw bytes so BeautifulSoup can detect the page encoding itself.
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    file_path = os.path.join(path, output_file)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)
    print(f"Converted {htm_url} to {output_file}")


In [7]:
# FOMC press conferences: try the transcript URL for every candidate date.
# Most dates have no press conference, so most attempts are expected to fail;
# only dates with a published transcript produce a text file.

parent_dir = "/Users/pranaymatalia/Desktop/FRE-GY 7871/HW 2/Data"
fomc_press_conf_dir = "FOMC_Press_Conference"

start_time_press_conf = time.time()
for date in dates:
    output_file = f"FOMCpresconf{date}.txt"
    base_url = f"https://www.federalreserve.gov/mediacenter/files/FOMCpresconf{date}.pdf"
    convert_pdf_to_text(base_url, output_file, parent_dir, fomc_press_conf_dir)
end_time_press_conf = time.time()

# Wall-clock duration of the whole scan.
elapsed_time_press_conf = end_time_press_conf - start_time_press_conf
print(f"Execution time for all Press Conferences: {elapsed_time_press_conf:.4f} seconds")



Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Text successfully extracted to FOMCpresconf20120125.txt
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next 

In [8]:
# FOMC speeches: for each yearly index page (2012-2024), collect the speech
# links and download the PDF version of every speech as text.

all_speeches_url = [
    f"https://www.federalreserve.gov/newsevents/speech/{i}-speeches.htm"
    for i in range(2012, 2025)
]
fomc_speech_dir = "FOMC_Speech"

start_time_speech = time.time()
for speech_year in all_speeches_url:
    year_speech_url_ = scrape_speech_links(speech_year)
    for speech in year_speech_url_:
        # Last path segment of the link, e.g. "stein20121217a.htm"
        name = speech.rsplit('/', 1)[-1]
        # Everything before the first dot, e.g. "stein20121217a"
        output = name.partition('.')[0]
        output_file = f"{output}.txt"
        base_url = f"https://www.federalreserve.gov/newsevents/speech/files/{output}.pdf"
        convert_pdf_to_text(base_url, output_file, parent_dir, fomc_speech_dir)
end_time_speech = time.time()

# Wall-clock duration of the whole run.
elapsed_time_speech = end_time_speech - start_time_speech
print(f"Execution time for Speech: {elapsed_time_speech:.4f} seconds")
      

Text successfully extracted to stein20121217a.txt
Text successfully extracted to tarullo20121204a.txt
Text successfully extracted to stein20121130a.txt
Text successfully extracted to tarullo20121128a.txt
Text successfully extracted to bernanke20121120a.txt
Text successfully extracted to bernanke20121115a.txt
Text successfully extracted to yellen20121113a.txt
Text successfully extracted to duke20121109a.txt
Text successfully extracted to bernanke20121014a.txt
Text successfully extracted to stein20121011a.txt
Text successfully extracted to yellen20121011a.txt
Text successfully extracted to tarullo20121010a.txt
Text successfully extracted to duke20121005a.txt
Text successfully extracted to bernanke20121001a.txt
Text successfully extracted to bernanke20120831a.txt
Text successfully extracted to bernanke20120807a.txt
Text successfully extracted to bernanke20120806a.txt
Text successfully extracted to bernanke20120724a.txt
Text successfully extracted to raskin20120723a.txt
Text successfully e

In [9]:
# FOMC minutes: try the minutes URL for every candidate date.
# Most dates have no published minutes, so most attempts are expected to fail;
# only dates for which minutes exist produce a text file.

parent_dir = "/Users/pranaymatalia/Desktop/FRE-GY 7871/HW 2/Data"
# Use a dedicated name for the minutes directory so this cell no longer
# overwrites the press-conference directory variable.
fomc_minutes_dir = "FOMC_Minutes"

start_time_minutes = time.time()
for date in dates:
    base_url = f'https://www.federalreserve.gov/monetarypolicy/files/fomcminutes{date}.pdf'
    output_file = f"FOMCminutes{date}.txt"
    convert_pdf_to_text(base_url, output_file, parent_dir, fomc_minutes_dir)
end_time_minutes = time.time()

# Wall-clock duration of the whole scan.
elapsed_time_minutes = end_time_minutes - start_time_minutes
print(f"Execution time for all FOMC Minutes: {elapsed_time_minutes:.4f} seconds")



Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Text successfully extracted to FOMCminutes20120125.txt
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next File.
Running Next F