In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
!pip install xgboost
import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Multi-modal learning
The way to approach this problem is to separately process the text and quantitative data. <br> For the text data, we will need to transform it into numerical form using an embedding of some sort. <br> The quantitative data can be pre-processed using the standard pre-processing techniques such as normalization. <br>The two sets of data are then concate to put into one single classifier ML algorithm.
<br><br> To tackle this challenge, we will utilize basic algorithms first before stepping up to transformers, GPT embeddings etc.

In [5]:
fomc_statements = pd.read_excel('FOMC Statements 1997-2023.xlsx')
fomc_statements.rename(columns={"Meeting Date":"Meeting_Date", "Unnamed: 1":"Statement"}, inplace=True)
fomc_statements

Unnamed: 0,Meeting_Date,Statement
0,1997-03-25,_x000D_\n_x000D_\n\tThe Federal Open Market Co...
1,1998-09-29,_x000D_\nThe Federal Open Market Committee dec...
2,1998-10-15,_x000D_\nThe Federal Reserve today announced t...
3,1998-11-17,_x000D_\nThe Federal Reserve today announced t...
4,1999-05-18,_x000D_\nThe Federal Reserve released the foll...
...,...,...
201,2022-09-21,\nRecent indicators point to modest growth in ...
202,2022-11-02,\nRecent indicators point to modest growth in ...
203,2022-12-14,\nRecent indicators point to modest growth in ...
204,2023-02-01,\nRecent indicators point to modest growth in ...


In [6]:
econs_metrics = pd.read_excel('FOMC_econometrics_v1.xlsx')
econs_metrics.rename(columns={'Dates':'Meeting_Date'}, inplace=True)
econs_metrics

Unnamed: 0,Meeting_Date,Label,Fed_fund_rate,Unemployment_rate,Core_CPI,US_10-2_Spread
0,1997-03-25,1,5.47,5.2,2.5,0.4530
1,1998-09-29,0,5.55,4.5,2.5,0.1200
2,1998-10-15,0,5.33,4.6,2.5,0.5820
3,1998-11-17,0,5.45,4.5,2.3,0.3100
4,1999-05-18,0,5.01,4.3,2.2,0.3160
...,...,...,...,...,...,...
201,2022-09-21,1,2.33,3.7,6.3,-0.5185
202,2022-11-02,1,3.08,3.7,6.3,-0.5192
203,2022-12-14,0,3.83,3.6,6.0,-0.7321
204,2023-02-01,0,4.33,3.4,5.6,-0.6897


In [15]:
combined_df = pd.merge(fomc_statements, econs_metrics, on='Meeting_Date') #Merging the dataframes, this is to ensure that the metrics matches the statements and labels.
y_labels = combined_df['Label']
x_text = combined_df['Statement']
x_metrics = combined_df[['Fed_fund_rate', 'Unemployment_rate', 'Core_CPI', 'US_10-2_Spread']]

# We will first utilize the simple TfidfVectorizer

In [52]:
DT_model_tfidf = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=3407))
RF_model_tfidf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=3407))
XGB_model_tfidf = make_pipeline(StandardScaler(), xgb.XGBClassifier(objective='binary:logistic', random_state=3407))

In [53]:
tfidf_x_text = x_text.apply(lambda x: re.sub('[^\w\s]', '', x.lower())) #To lowercase all text and remove punctuations
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=600) #Calling the Tfidf vectorizer object
tfidf_x_text = vectorizer.fit_transform(tfidf_x_text).toarray() #Convert the text data to numerical data
tfidf_X = np.concatenate((tfidf_x_text, x_metrics.values), axis=1) #Concate the converted text data with the metrics data

In [55]:
tfidf_X_train, tfidf_X_test, y_train, y_test = train_test_split(tfidf_X, y_labels, test_size=0.2, random_state=3407) #Split the data into a training set and a test set

In [56]:
### Understanding the features ###
print(f"There are {len(tfidf_X_train[0])} features in the train dataset.")

There are 604 features in the train dataset.


**Using the Decision Tree model**

In [57]:
DT_model_tfidf.fit(tfidf_X_train, y_train)
DT_y_pred = DT_model_tfidf.predict(tfidf_X_test)
print(f"Accuracy: {accuracy_score(y_test, DT_y_pred)}")

Accuracy: 0.47619047619047616


**Using the RandomForest model**

In [58]:
RF_model_tfidf.fit(tfidf_X_train, y_train)
RF_y_pred = RF_model_tfidf.predict(tfidf_X_test)
print(f"Accuracy: {accuracy_score(y_test, RF_y_pred)}")

Accuracy: 0.6428571428571429


**Using the XGBoost model**

In [59]:
XGB_model_tfidf.fit(tfidf_X_train, y_train)
XGB_y_pred = XGB_model_tfidf.predict(tfidf_X_test)
print(f"Accuracy: {accuracy_score(y_test, XGB_y_pred)}")

Accuracy: 0.5952380952380952


# Previously, we tried using a simple Tfidf vectorizer, here we will use the more advanced Glove word embedding

GloVe (Global Vectors for Word Representation) is a word embedding technique that represents words as dense vectors in a high-dimensional space. GloVe embeddings capture semantic and syntactic relationships between words based on their co-occurrence statistics in a corpus.

In [60]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

In [61]:
DT_model_glove = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=3407))
RF_model_glove = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=3407))
XGB_model_glove = make_pipeline(StandardScaler(), xgb.XGBClassifier(objective='binary:logistic', random_state=3407))

In [62]:
### To uncomment below the first time you run this to download the file from Stanford ###
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip glove*.zip

--2023-06-06 04:43:34--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-06-06 04:46:14 (5.17 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [63]:
# Convert the GloVe file to word2vec format
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [64]:
# Load the converted GloVe vectors
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

def get_vector(word):
    if word in glove_model:
        return glove_model[word]
    else:
        return np.zeros(glove_model.vector_size)

In [65]:
glove_combined_df = combined_df.copy()
glove_combined_df['Statement'] = glove_combined_df['Statement'].apply(word_tokenize) #Tokenize the text
glove_combined_df['Vector'] = glove_combined_df['Statement'].apply(lambda x: np.mean([get_vector(w) for w in x], axis=0)) #Convert the text to vectors
glove_temp = np.array(glove_combined_df['Vector'].to_list())

In [66]:
glove_X = np.concatenate((glove_temp, x_metrics.values), axis=1) #Concate the converted text data with the metrics data
glove_X_train, glove_X_test, y_train, y_test = train_test_split(glove_X, y_labels, test_size=0.2, random_state=3407) #Split the data into a training set and a test set

In [67]:
### Understanding the features ###
print(f"There are {len(glove_X_train[0])} features in the train dataset.")

There are 604 features in the train dataset.


**Using the Decision Tree model**

In [68]:
DT_model_glove.fit(glove_X_train, y_train)
DT_y_pred_glove = DT_model_glove.predict(glove_X_test)
print(f"Accuracy: {accuracy_score(y_test, DT_y_pred_glove)}")

Accuracy: 0.47619047619047616


**Using the Random Forest model**

In [69]:
RF_model_glove.fit(glove_X_train, y_train)
RF_y_pred_glove = RF_model_glove.predict(glove_X_test)
print(f"Accuracy: {accuracy_score(y_test, RF_y_pred_glove)}")

Accuracy: 0.5476190476190477


**Using the XGBoost model**

In [70]:
XGB_model_glove.fit(glove_X_train, y_train)
XGB_y_pred_glove = XGB_model_glove.predict(glove_X_test)
print(f"Accuracy: {accuracy_score(y_test, XGB_y_pred_glove)}")

Accuracy: 0.6190476190476191


# In this section, we will try the Doc2Vec embedding model.

It can be a good choice if the meaning of the statement is derived from the sequence of words or the overall context rather than the presence of individual words.

In [49]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [50]:
DT_model_D2V = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=3407))
RF_model_D2V = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=3407))
XGB_model_D2V = make_pipeline(StandardScaler(), xgb.XGBClassifier(objective='binary:logistic', random_state=3407))

In [71]:
### Represent each document as a TaggedDocument object
tagged_documents = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(combined_df['Statement'])] 

In [72]:
### Train a Doc2Vec model on the Tagged docmuments ###
D2V_model = Doc2Vec(tagged_documents, vector_size=100, window=2, min_count=1, workers=4)

In [73]:
### Transform each document into a vector using the trained D2V_model ###
combined_df_D2V = combined_df.copy() 
combined_df_D2V['doc_vector'] = combined_df_D2V['Statement'].apply(lambda x: D2V_model.infer_vector(word_tokenize(x.lower())))

D2V_temp = np.array(combined_df_D2V['doc_vector'].to_list())
D2V_X = np.concatenate((D2V_temp, x_metrics.values), axis=1) #Concate the converted text data with the metrics data
D2V_X_train, D2V_X_test, y_train, y_test = train_test_split(D2V_X, y_labels, test_size=0.2, random_state=3407) #Split the data into a training set and a test set

In [77]:
### Understanding the features ###
print(f"There are {len(D2V_X_train[0])} features in the train dataset.")

There are 104 features in the train dataset.



**Using the Decision Tree model**

In [74]:
DT_model_D2V.fit(D2V_X_train, y_train)
DT_y_pred_D2V = DT_model_D2V.predict(D2V_X_test)
print(f"Accuracy: {accuracy_score(y_test, DT_y_pred_D2V)}")

Accuracy: 0.5238095238095238


**Using the Random Forest model**

In [75]:
RF_model_D2V.fit(D2V_X_train, y_train)
RF_y_pred_D2V = RF_model_D2V.predict(D2V_X_test)
print(f"Accuracy: {accuracy_score(y_test, RF_y_pred_D2V)}")

Accuracy: 0.5952380952380952


**Using the XGBoost model**

In [76]:
XGB_model_D2V.fit(D2V_X_train, y_train)
XGB_y_pred_D2V = XGB_model_D2V.predict(D2V_X_test)
print(f"Accuracy: {accuracy_score(y_test, XGB_y_pred_D2V)}")

Accuracy: 0.5952380952380952


# We will try using a pre-trained BERT model to extract features from the text data

In [31]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m109.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m119.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [32]:
from transformers import pipeline, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [33]:
X_text_BERT = combined_df['Statement'].values #Get the text data into a np array
X_quant_BERT = combined_df[['Fed_fund_rate', 'Unemployment_rate', 'Core_CPI', 'US_10-2_Spread']].values #Get the numeric data into a np array

### Initiate a transformer model for feature extraction
feature_extraction = pipeline('feature-extraction', model='bert-base-uncased', tokenizer='bert-base-uncased', device=0, truncation=True, max_length=512) # device=0 to run on GPU
X_text_BERT_embeddings = np.array([np.mean(feature_extraction(text), axis=1)[0] for text in X_text_BERT])

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [44]:
# Combine the text embedding with the metrics data #
X_BERT = np.concatenate((X_text_BERT_embeddings, X_quant_BERT), axis=1)
y_BERT = combined_df['Label'].values

### Split the data into training and test set ###
X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(X_BERT, y_BERT, test_size=0.2, random_state=3407)

In [41]:
### Understanding the features ###
print(f"There are {len(X_train_BERT[0])} features in the train dataset.")

There are 772 features in the train dataset.


In [45]:
DT_model_BERT = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=3407))
RF_model_BERT = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=3407))
XGB_model_BERT = make_pipeline(StandardScaler(), xgb.XGBClassifier(objective='binary:logistic', random_state=3407))

**Using the Decision Tree model**

In [46]:
DT_model_BERT.fit(X_train_BERT, y_train_BERT)
DT_y_pred_BERT = DT_model_BERT.predict(X_test_BERT)
print(f"Accuracy: {accuracy_score(y_test_BERT, DT_y_pred_BERT)}")

Accuracy: 0.5


**Using the Random Forest model**

In [47]:
RF_model_BERT.fit(X_train_BERT, y_train_BERT)
RF_y_pred_BERT = RF_model_BERT.predict(X_test_BERT)
print(f"Accuracy: {accuracy_score(y_test_BERT, RF_y_pred_BERT)}")

Accuracy: 0.5714285714285714


**Using the XGBoost model**

In [48]:
XGB_model_BERT.fit(X_train_BERT, y_train_BERT)
XGB_y_pred_BERT = XGB_model_BERT.predict(X_test_BERT)
print(f"Accuracy: {accuracy_score(y_test_BERT, XGB_y_pred_BERT)}")

Accuracy: 0.5476190476190477


# We will now use the text embedding from OpenAI

In [None]:
!pip install openai

In [2]:
import openai 
import os

In [7]:
os.environ['OPENAI_API_KEY'] = "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"
openai.api_key = "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"

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

In [10]:
fomc_statements_GPT = fomc_statements.copy()

In [11]:
fomc_statements_GPT.head()

Unnamed: 0,Meeting_Date,Statement
0,1997-03-25,_x000D_\n_x000D_\n\tThe Federal Open Market Co...
1,1998-09-29,_x000D_\nThe Federal Open Market Committee dec...
2,1998-10-15,_x000D_\nThe Federal Reserve today announced t...
3,1998-11-17,_x000D_\nThe Federal Reserve today announced t...
4,1999-05-18,_x000D_\nThe Federal Reserve released the foll...


In [12]:
fomc_statements_GPT['ada_embedding'] = fomc_statements_GPT.Statement.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))

In [22]:
X_text_GPT = np.array(fomc_statements_GPT['ada_embedding'].to_list())
X_quant_GPT = combined_df[['Fed_fund_rate', 'Unemployment_rate', 'Core_CPI', 'US_10-2_Spread']].values #Get the numeric data into a np array

# Combine the text embedding with the metrics data #
X_GPT = np.concatenate((X_text_GPT, X_quant_GPT), axis=1)
y_GPT = combined_df['Label'].values

### Split the data into training and test set ###
X_train_GPT, X_test_GPT, y_train_GPT, y_test_GPT = train_test_split(X_GPT, y_GPT, test_size=0.2, random_state=42)

In [30]:
### Understanding the features ###
print(f"There are {len(X_train_GPT[0])} features in the train dataset.") #Seems like there are way too many parameters for the DTs to handle

There are 1540 features in the train dataset.


In [23]:
DT_model_GPT = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
RF_model_GPT = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
XGB_model_GPT = make_pipeline(StandardScaler(), xgb.XGBClassifier(objective='binary:logistic', random_state=42))

**Using the Decision Tree model**

In [24]:
DT_model_GPT.fit(X_train_GPT, y_train_GPT)
DT_y_pred_GPT = DT_model_GPT.predict(X_test_GPT)
print(f"Accuracy: {accuracy_score(y_test_GPT, DT_y_pred_GPT)}")

Accuracy: 0.30952380952380953


**Using the Random Forest model**

In [25]:
RF_model_GPT.fit(X_train_GPT, y_train_GPT)
RF_y_pred_GPT = RF_model_GPT.predict(X_test_GPT)
print(f"Accuracy: {accuracy_score(y_test_GPT, RF_y_pred_GPT)}")

Accuracy: 0.5


**Using the XGBoost model**

In [28]:
XGB_model_GPT.fit(X_train_GPT, y_train_GPT)
XGB_y_pred_GPT = XGB_model_GPT.predict(X_test_GPT)
print(f"Accuracy: {accuracy_score(y_test_GPT, XGB_y_pred_GPT)}")

Accuracy: 0.4523809523809524


# We replicate all of the above but use delta values rather than absolute in the econometrics dataset