<a href="https://colab.research.google.com/github/abagnard/266FinalProject/blob/main/W266_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os, sys, re, csv, json, time, unittest
import itertools, collections
from importlib import reload

import numpy as np
from scipy import stats
import pandas as pd

import nltk



# FOMC Data Scraper

### Links to Data:
**Start-2014:** https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm

**2015-2020:** https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm

### 2010-2020 General Link Format:
**Statement/Press Release:** https://www.federalreserve.gov/newsevents/pressreleases/monetaryYYYYMMDDa.htm

**Minutes:** https://www.federalreserve.gov/monetarypolicy/fomcminutesYYYYMMDD.htm

In [58]:
from   bs4    import BeautifulSoup
from   urllib.request import urlopen

def getFOMCStatementURL(date):
  """Build the URL of the FOMC statement (press release) for a meeting date.

  Args:
    date: Meeting date in YYYYMMDD form, as an int or a string of digits.

  Returns:
    The federalreserve.gov press-release URL for that date.

  Raises:
    ValueError: If the date is before 2010. Only the 2010-and-later link
      format is implemented, so there is no URL to return for older dates.
  """
  # int() lets string dates through; comparing a str to an int would raise TypeError.
  if int(date) < 20100000:
    raise ValueError("No FOMC statement URL format is defined for dates before 2010: " + str(date))
  return "https://www.federalreserve.gov/newsevents/pressreleases/monetary" + str(date) + "a.htm"

def getFOMCMinutesURL(date):
  """Build the URL of the FOMC meeting minutes for a meeting date.

  Args:
    date: Meeting date in YYYYMMDD form, as an int or a string of digits.

  Returns:
    The federalreserve.gov minutes URL for that date.

  Raises:
    ValueError: If the date is before 2010. Only the 2010-and-later link
      format is implemented, so there is no URL to return for older dates.
  """
  # int() lets string dates through; comparing a str to an int would raise TypeError.
  if int(date) < 20100000:
    raise ValueError("No FOMC minutes URL format is defined for dates before 2010: " + str(date))
  return "https://www.federalreserve.gov/monetarypolicy/fomcminutes" + str(date) + ".htm"

def getFOMCStatementData(date):
    """Download an FOMC statement and return the body text between two markers.

    The page is fetched from the URL given by getFOMCStatementURL, flattened to
    plain text, and sliced from the opening phrase of the statement up to (but
    not including) the "Voting for" paragraph.

    Args:
        date: Meeting date in YYYYMMDD form (int or string of digits).

    Returns:
        The statement text as a single string.

    Raises:
        ValueError: If either marker phrase is missing from the page text.
    """
    # Close the HTTP response deterministically instead of leaving it open.
    with urlopen(getFOMCStatementURL(date)) as response:
        html = response.read()
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # kept as-is so the extracted text does not change.
    soup = BeautifulSoup(html)
    text = soup.get_text(" ")

    # Pages dated before 20200400 are searched for a different opening phrase
    # than later ones.
    if int(date) < 20200400:
        startMarker = "Information received since"
    else:
        startMarker = "The Federal Reserve is committed"
    endMarker = "Voting for"

    startLocation = text.find(startMarker)
    endLocation = text.find(endMarker)
    if startLocation == -1:
        raise ValueError("Start marker '" + startMarker + "' not found in statement for " + str(date))
    if endLocation == -1:
        raise ValueError("End marker '" + endMarker + "' not found in statement for " + str(date))

    statementText = text[startLocation : endLocation]
    return statementText

def getFOMCMinutesData(date):
  """Download FOMC minutes and return the body text between two markers.

  The page is fetched from the URL given by getFOMCMinutesURL, flattened to
  plain text, and sliced from just after the title phrase "Minutes of the
  Federal Open Market Committee" up to (but not including) the first
  occurrence of "notation vote".

  Args:
    date: Meeting date in YYYYMMDD form (int or string of digits).

  Returns:
    The minutes text as a single string.

  Raises:
    ValueError: If either marker phrase is missing from the page text.
  """
  # Close the HTTP response deterministically instead of leaving it open.
  with urlopen(getFOMCMinutesURL(date)) as response:
    html = response.read()
  # NOTE(review): no parser is named, so bs4 picks whichever is installed;
  # kept as-is so the extracted text does not change.
  soup = BeautifulSoup(html)
  text = soup.get_text(" ")

  startMarker = "Minutes of the Federal Open Market Committee"
  endMarker = "notation vote"

  titleLocation = text.find(startMarker)
  endLocation = text.find(endMarker)
  if titleLocation == -1:
    raise ValueError("Start marker '" + startMarker + "' not found in minutes for " + str(date))
  if endLocation == -1:
    raise ValueError("End marker '" + endMarker + "' not found in minutes for " + str(date))

  # Skip past the title itself so the returned text starts right after it.
  startLocation = titleLocation + len(startMarker)

  minutesText = text[startLocation : endLocation]
  return minutesText


Pull Dates from Google Sheet

In [61]:
# Code Source
# https://medium.com/analytics-vidhya/colab-and-google-sheets-surprisingly-powerful-combination-for-data-science-part-1-bbbb11cbd8e

# from google.colab import auth
# auth.authenticate_user()
# import gspread
# from oauth2client.client import GoogleCredentials
# gc = gspread.authorize(GoogleCredentials.get_application_default())

In [62]:
# date_google_sheet_url = gc.open_by_url("https://docs.google.com/spreadsheets/d/1Q9_JimVaDffFe8U4SYQ_VLU9-1aeuFXlDZhvTsFD0kg/edit#gid=1023419829")
# date_google_sheet = date_google_sheet_url.worksheet("Dates")
# date_data_raw = date_google_sheet.get_all_values()

# df_date = pd.DataFrame(date_data_raw)
# df_date.columns = df_date.iloc[0]
# df_date = df_date.iloc[1:]

Pull Dates from CSV File (in Github)

In [70]:
# df_date = pd.read_csv("data/FOMC_data/FOMC_meeting_dates.csv")
# df_date.head()
# dates = df_date["Date"].values.tolist()
# dates

# Read the FOMC meeting-date table from the raw CSV hosted in the project's GitHub repo.
df_date = pd.read_csv("https://raw.githubusercontent.com/abagnard/266FinalProject/main/data/FOMC_data/FOMC_meeting_dates.csv")
# NOTE(review): this is not the last expression of the cell, so the notebook does not display it.
df_date.head()
# Plain Python list of the "Date" column values; the create*TextFile functions loop over it.
# presumably YYYYMMDD integers (later code compares this column to int(date)) - verify against the CSV
dates = df_date["Date"].values.tolist()
# dates

In [None]:
def createFOMCStatementTextFile():
  """Scrape the FOMC statement for every date in `dates` and save each to a text file.

  Reads the module-level `dates` list, fetches each statement with
  getFOMCStatementData, and writes it to
  data/FOMC_data/FOMC_data_raw/FOMCStatement_<date>.txt.
  """
  # file_path = "/content/drive/My Drive/W266 - NLP/Final Project/data/FOMC_data/FOMC_Raw/"
  file_path = "data/FOMC_data/FOMC_data_raw/"
  # open(..., "w") does not create missing directories, so make sure it exists.
  os.makedirs(file_path, exist_ok=True)
  for date in dates:
    print("Creating Text File: Statement - ", date)
    data = getFOMCStatementData(date)
    filename = "FOMCStatement_" + str(date) + ".txt"
    # The with-block closes (and flushes) the file even if the write fails.
    with open(file_path + filename, "w") as file:
      file.write(data)

        
def createFOMCMinutesTextFile():
  """Scrape the FOMC minutes for every date in `dates` and save each to a text file.

  Reads the module-level `dates` list, fetches each set of minutes with
  getFOMCMinutesData, and writes it to
  data/FOMC_data/FOMC_data_raw/FOMCMinutes_<date>.txt.
  """
  # file_path = "/content/drive/My Drive/W266 - NLP/Final Project/data/FOMC_data/FOMC_Raw/"
  file_path = "data/FOMC_data/FOMC_data_raw/"
  # open(..., "w") does not create missing directories, so make sure it exists.
  os.makedirs(file_path, exist_ok=True)
  for date in dates:
    print("Creating Text File: Minute - ", date)
    data = getFOMCMinutesData(date)
    filename = "FOMCMinutes_" + str(date) + ".txt"
    # The with-block closes (and flushes) the file even if the write fails.
    with open(file_path + filename, "w") as file:
      file.write(data)

        
# createFOMCStatementTextFile()
# createFOMCMinutesTextFile()

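# Clean Data
Lowercases each raw document and replaces line breaks, tabs, commas, hyphens, semicolons, and colons with spaces (periods are kept).

Writes the cleaned text to new files.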

In [71]:
def cleanFOMCDocument(raw_file_name):
    """Read a raw FOMC text file and return a lowercased, de-punctuated copy.

    Line breaks, tabs, commas, hyphens, semicolons and colons are each replaced
    by a space, and runs of spaces are then collapsed to a single space.
    Periods are kept.

    Args:
        raw_file_name: Path of the raw text file to clean.

    Returns:
        The cleaned text as a single string.
    """
    # The with-block closes the file instead of leaking the handle.
    with open(raw_file_name, "r") as raw_file:
        clean_data = raw_file.read().lower()

    # remove line breaks, tabs and punctuation except for periods ('.')
    for todelete in ["\r\n", "\n", "\t", ",", "-", ";", ":"]:
        clean_data = clean_data.replace(todelete, ' ')

    # A single replace("  ", " ") pass leaves double spaces behind wherever
    # three or more spaces occur, so repeat until none remain.
    while "  " in clean_data:
        clean_data = clean_data.replace("  ", " ")
    return clean_data

import glob
# Paths of every .txt file in the raw-data folder. The list is built when this
# cell runs, so files written afterwards are not included until it is re-run.
raw_file_names = glob.glob('data/FOMC_data/FOMC_data_raw/*.txt')

def createCleanFOMCTextFiles(raw_file_names):
    """Clean each raw FOMC text file and write the result under the same file name.

    Args:
        raw_file_names: Iterable of paths to raw text files. Each one is passed
            through cleanFOMCDocument and written to
            data/FOMC_data/FOMC_data_clean/<same file name>.
    """
    file_path = 'data/FOMC_data/FOMC_data_clean/'
    # open(..., "w") does not create missing directories, so make sure it exists.
    os.makedirs(file_path, exist_ok=True)

    for raw_file_name in raw_file_names:
        # basename handles the platform's path separator, unlike split('/').
        file_name = os.path.basename(raw_file_name)
        print("Creating Text File: ", file_name)
        data = cleanFOMCDocument(raw_file_name)

        # The with-block closes (and flushes) the file even if the write fails.
        with open(file_path + file_name, "w") as file:
            file.write(data)
        
# createCleanFOMCTextFiles(raw_file_names)

# Create DataFrame with FOMC document data

In [None]:
# Build one record per cleaned document, then assemble them into a DataFrame.
clean_file_names = glob.glob('data/FOMC_data/FOMC_data_clean/*.txt')

FOMC_records = []
for clean_file_name in clean_file_names:
    # basename handles the platform's path separator, unlike split('/').
    base_name = os.path.basename(clean_file_name)
    # File names end in "_<date>.txt"; take the part after the last underscore.
    date = os.path.splitext(base_name)[0].split('_')[-1]
    # Test the file name only, so a folder name containing "Minutes" cannot mislabel a statement.
    file_type = 'minutes' if 'Minutes' in base_name else 'statement'
    # Look up the meeting's row in df_date; .iloc[0] raises IndexError if the date is not listed.
    press_conference = df_date[df_date['Date'] == int(date)].iloc[0]['PressConference']
    # The with-block closes the file instead of leaking one handle per document.
    with open(clean_file_name) as clean_file:
        data = clean_file.read()

    FOMC_record = {'FOMC_date': date, 'document_type': file_type, 'press_conference': press_conference, 'document_data': data}
    FOMC_records.append(FOMC_record)


df_FOMC = pd.DataFrame(FOMC_records)
# Bracket assignment is the unambiguous way to replace a column.
df_FOMC['FOMC_date'] = pd.to_datetime(df_FOMC['FOMC_date'], format='%Y%m%d')
df_FOMC.head()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df_FOMC.describe(include='all')

Fed Note


Fed Release

Financial News Sources