<a href="https://colab.research.google.com/github/abagnard/266FinalProject/blob/main/W266_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import os, sys, re, csv, json, time, unittest
import itertools, collections
from importlib import reload

import pandas as pd
import numpy as np
from scipy import stats

# Data Scraper Packages
from   bs4    import BeautifulSoup
from   urllib.request import urlopen

# Clean Data Packages
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import glob

# FOMC Data Loader

In [23]:
# Pull Dates of FOMC meetings
df_date = pd.read_csv("data/FOMC_data/FOMC_meeting_dates.csv")
meeting_dates = df_date["MeetingDate"].values.tolist()

## Create DataFrame with Clean Data
- Includes press_conference flag that denotes whether a press conference was held for that FOMC meeting

In [24]:
clean_file_names = glob.glob('data/FOMC_data/FOMC_data_clean/*.txt')

FOMC_records = []
for clean_file_name in clean_file_names:
    meeting_date = clean_file_name.split('/')[-1].split('_')[-1][:-4]
    press_conference = df_date[df_date['MeetingDate'] == int(meeting_date)].iloc[0]['PressConference']
    data = open(clean_file_name).read()
    
    if 'Minutes' in clean_file_name:
        file_type = 'minutes'
        release_date = df_date[df_date['MeetingDate'] == int(meeting_date)].iloc[0]['MinuteNotesReleaseDate']
        # Pull data without stopwords
        file_name = clean_file_name[-24:]
        data_without_stopwords = open('data/FOMC_data/FOMC_data_clean/FOMC_data_clean_without_stopwords/' + file_name).read()
    
    else:
        file_type = 'statement'
        release_date = meeting_date
        # Pull data without stopwords        
        file_name = clean_file_name[-26:]
        data_without_stopwords = open('data/FOMC_data/FOMC_data_clean/FOMC_data_clean_without_stopwords/' + file_name).read()
    
    FOMC_record = {'FOMC_date': meeting_date, 'release_date': release_date, 'document_type': file_type, 'press_conference': press_conference, 'document_data': data, 'document_data_wo_stopwords': data_without_stopwords}
    FOMC_records.append(FOMC_record)


df_FOMC_raw = pd.DataFrame(FOMC_records)

In [25]:
df_FOMC_raw.FOMC_date = pd.to_datetime(df_FOMC_raw.FOMC_date, format='%Y%m%d')
df_FOMC_raw.release_date = pd.to_datetime(df_FOMC_raw.release_date, format='%Y%m%d')

df_FOMC_raw.head()

Unnamed: 0,FOMC_date,document_data,document_data_wo_stopwords,document_type,press_conference,release_date
0,2006-01-31,minutes of the FOMC january 31 2006 a meeting ...,minutes FOMC january 31 2006 meeting FOMC held...,minutes,0,2006-02-21
1,2000-10-03,for immediate release the FOMC at its meeting ...,immediate release FOMC meeting today decided m...,statement,0,2000-10-03
2,2000-03-21,for immediate release the FOMC voted today to ...,immediate release FOMC voted today raise targe...,statement,0,2000-03-21
3,2011-01-26,information received since the FOMC met in dec...,information received since FOMC met december c...,statement,0,2011-01-26
4,2002-11-06,for immediate release the FOMC decided today t...,immediate release FOMC decided today lower tar...,statement,0,2002-11-06


In [26]:
df_FOMC_512_raw = df_FOMC_raw

def pull_middle_512_words(data):
    data = data.split(' ')
    midpoint = len(data)//2
    start_idx = midpoint - 200
    end_idx = midpoint + 200
    data_512 = ' '.join(data[start_idx:end_idx])
    return data_512

df_FOMC_512_raw['document_data_512'] = df_FOMC_512_raw['document_data'].apply(lambda data: pull_middle_512_words(data))
df_FOMC_512_raw['document_data_wo_stopwords_512'] = df_FOMC_512_raw['document_data_wo_stopwords'].apply(lambda data: pull_middle_512_words(data))


In [27]:
df_FOMC_512_raw.head()

Unnamed: 0,FOMC_date,document_data,document_data_wo_stopwords,document_type,press_conference,release_date,document_data_512,document_data_wo_stopwords_512
0,2006-01-31,minutes of the FOMC january 31 2006 a meeting ...,minutes FOMC january 31 2006 meeting FOMC held...,minutes,0,2006-02-21,account messrs oliner and slifman associate di...,messrs moskow poole hoenig alternate members F...
1,2000-10-03,for immediate release the FOMC at its meeting ...,immediate release FOMC meeting today decided m...,statement,0,2000-10-03,utilization of the pool of available workers r...,immediate release FOMC meeting today decided m...
2,2000-03-21,for immediate release the FOMC voted today to ...,immediate release FOMC voted today raise targe...,statement,0,2000-03-21,growth and of the information currently availa...,raise target federal funds rate 25 basis point...
3,2011-01-26,information received since the FOMC met in dec...,information received since FOMC met december c...,statement,0,2011-01-26,levels consistent with its mandate,committee regularly review pace securities pur...
4,2002-11-06,for immediate release the FOMC decided today t...,immediate release FOMC decided today lower tar...,statement,0,2002-11-06,in productivity is providing important ongoing...,immediate release FOMC decided today lower tar...


### Rename Column Names for 512 subselection

In [28]:
# df_FOMC_512_raw = df_FOMC_512_raw.drop(['document_data', 'document_data_wo_stopwords'], axis = 1)
# df_FOMC_512_raw = df_FOMC_512_raw.rename(columns = {'document_data_512': 'document_data', 'document_data_wo_stopwords_512': 'document_data_wo_stopwords'})

In [29]:
# df_FOMC_512_raw.head()

## Add Lables to DataFrame
- Label_Data file includes code on how we caluclate our data lables

#### Create df with labels

In [30]:
# df_market_data_labels = pd.read_csv('data/Market_data/market_data_labels_by_date.csv', parse_dates=['Date'])
df_market_data_labels = pd.read_csv('data/Market_data/label_data_v2.csv', parse_dates=['Date'])
df_market_data_labels.head()


Unnamed: 0.1,Unnamed: 0,Date,Press Release,symbol,open,high,low,close,volume,day_before_close,...,prev_fed_rate,pct_change_in_fed_rate,10_yr,one_day_before_10,two_day_before_10,three_day_before_10,prev_days_avg_treas,pct_change_in_treas,label_market,label_FED
0,0,1993-02-03,0,SP500,442.56,447.35,442.56,447.2,345410000,409.53,...,3.08,-15.584416,6.45,7.36,9.01,8.21,8.193333,-27.028424,0,-1
1,1,1993-03-23,0,SP500,448.88,449.8,448.3,448.76,232730000,409.91,...,2.92,-0.342466,5.91,7.62,8.52,9.43,8.523333,-44.218838,0,-1
2,2,1993-05-18,0,SP500,440.39,441.26,437.95,440.32,264300000,412.81,...,2.94,-1.020408,6.15,7.28,8.75,8.78,8.27,-34.471545,0,-1
3,3,1993-07-07,0,SP500,441.4,443.63,441.4,442.83,253170000,409.16,...,3.09,-9.708738,5.8,6.87,8.02,8.95,7.946667,-37.011494,0,-1
4,4,1993-08-17,0,SP500,452.38,453.7,451.96,453.13,261320000,420.74,...,3.02,-2.649007,5.7,6.56,8.8,8.18,7.846667,-37.660819,0,-1


#### Update FED and market labels to be positive
- BERT models will throw errors if labels are negative

In [31]:
# df_market_data_labels['label_market_raw'] = df_market_data_labels['label_market']
# df_market_data_labels['label_FED_raw'] = df_market_data_labels['label_FED']

# df_market_data_labels['label_market'] = df_market_data_labels['label_market_raw'] + 1
# df_market_data_labels['label_FED'] = df_market_data_labels['label_FED_raw'] + 1

print(df_market_data_labels[['label_market', 'label_FED']].describe())

       label_market   label_FED
count    442.000000  442.000000
mean      -0.303167   -0.047511
std        0.566197    0.796822
min       -1.000000   -1.000000
25%       -1.000000   -1.000000
50%        0.000000    0.000000
75%        0.000000    1.000000
max        1.000000    1.000000


In [32]:
df_FOMC_labeled =  pd.merge(df_FOMC_512_raw, df_market_data_labels, left_on='FOMC_date', right_on='Date')
df_FOMC_labeled.head()

Unnamed: 0.1,FOMC_date,document_data,document_data_wo_stopwords,document_type,press_conference,release_date,document_data_512,document_data_wo_stopwords_512,Unnamed: 0,Date,...,prev_fed_rate,pct_change_in_fed_rate,10_yr,one_day_before_10,two_day_before_10,three_day_before_10,prev_days_avg_treas,pct_change_in_treas,label_market,label_FED
0,2006-01-31,minutes of the FOMC january 31 2006 a meeting ...,minutes FOMC january 31 2006 meeting FOMC held...,minutes,0,2006-02-21,account messrs oliner and slifman associate di...,messrs moskow poole hoenig alternate members F...,104,2006-01-31,...,4.47,0.0,4.53,4.14,4.0,5.07,4.403333,2.796174,0,0
1,2006-01-31,for immediate release share the FOMC decided t...,immediate release share FOMC decided today rai...,statement,0,2006-01-31,for immediate release share the FOMC decided t...,immediate release share FOMC decided today rai...,104,2006-01-31,...,4.47,0.0,4.53,4.14,4.0,5.07,4.403333,2.796174,0,0
2,2000-10-03,for immediate release the FOMC at its meeting ...,immediate release FOMC meeting today decided m...,statement,0,2000-10-03,utilization of the pool of available workers r...,immediate release FOMC meeting today decided m...,61,2000-10-03,...,6.51,0.768049,5.87,6.01,6.61,6.13,6.25,-6.473595,-1,1
3,2000-10-03,minutes of the FOMC october 3 2000 a meeting o...,minutes FOMC october 3 2000 meeting FOMC held ...,minutes,0,2000-11-16,indexes of stock market prices declined somewh...,though previous policy tightening actions yet ...,61,2000-10-03,...,6.51,0.768049,5.87,6.01,6.61,6.13,6.25,-6.473595,-1,1
4,2000-03-21,for immediate release the FOMC voted today to ...,immediate release FOMC voted today raise targe...,statement,0,2000-03-21,growth and of the information currently availa...,raise target federal funds rate 25 basis point...,57,2000-03-21,...,6.02,3.488372,6.13,6.74,6.28,7.16,6.726667,-9.733551,-1,1


In [33]:
# df_FOMC_labeled_small = df_FOMC_labeled[['FOMC_date', 'document_type', 'document_data', 'document_data_wo_stopwords', 'document_data_512', 'document_data_wo_stopwords_512', 'press_conference', 'release_date','label_market', 'label_FED']].copy()

df_FOMC_labeled_small = df_FOMC_labeled[['FOMC_date', 'document_type', 'document_data_512', 'document_data_wo_stopwords_512', 'press_conference', 'release_date','label_market', 'label_FED']].copy()
df_FOMC = df_FOMC_labeled_small
df_FOMC.head()


Unnamed: 0,FOMC_date,document_type,document_data_512,document_data_wo_stopwords_512,press_conference,release_date,label_market,label_FED
0,2006-01-31,minutes,account messrs oliner and slifman associate di...,messrs moskow poole hoenig alternate members F...,0,2006-02-21,0,0
1,2006-01-31,statement,for immediate release share the FOMC decided t...,immediate release share FOMC decided today rai...,0,2006-01-31,0,0
2,2000-10-03,statement,utilization of the pool of available workers r...,immediate release FOMC meeting today decided m...,0,2000-10-03,-1,1
3,2000-10-03,minutes,indexes of stock market prices declined somewh...,though previous policy tightening actions yet ...,0,2000-11-16,-1,1
4,2000-03-21,statement,growth and of the information currently availa...,raise target federal funds rate 25 basis point...,0,2000-03-21,-1,1


#### Save FOMC Labeled Data to .csv File

In [34]:
df_FOMC.to_csv("data/FOMC_labeled_data.csv")