In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Multi-modal learning
The way to approach this problem is to separately process the text and quantitative data. <br> For the text data, we will need to transform it into numerical form using an embedding of some sort. <br> The quantitative data can be pre-processed using the standard pre-processing techniques such as normalization. <br>The two sets of data are then concate to put into one single classifier ML algorithm.
<br><br> To tackle this challenge, we will utilize basic algorithms first before stepping up to transformers, GPT embeddings etc.

In [2]:
fomc_statements = pd.read_excel('FOMC Statements 1997-2023.xlsx')
fomc_statements.rename(columns={"Meeting Date":"Meeting_Date", "Unnamed: 1":"Statement"}, inplace=True)
fomc_statements

Unnamed: 0,Meeting_Date,Statement
0,1997-03-25,_x000D_\n_x000D_\n\tThe Federal Open Market Co...
1,1998-09-29,_x000D_\nThe Federal Open Market Committee dec...
2,1998-10-15,_x000D_\nThe Federal Reserve today announced t...
3,1998-11-17,_x000D_\nThe Federal Reserve today announced t...
4,1999-05-18,_x000D_\nThe Federal Reserve released the foll...
...,...,...
201,2022-09-21,\nRecent indicators point to modest growth in ...
202,2022-11-02,\nRecent indicators point to modest growth in ...
203,2022-12-14,\nRecent indicators point to modest growth in ...
204,2023-02-01,\nRecent indicators point to modest growth in ...


In [3]:
econs_metrics = pd.read_excel('FOMC_econometrics_v1.xlsx')
econs_metrics.rename(columns={'Dates':'Meeting_Date'}, inplace=True)
econs_metrics

Unnamed: 0,Meeting_Date,Label,Fed_fund_rate,Unemployment_rate,Core_CPI,US_10-2_Spread
0,1997-03-25,1,5.47,5.2,2.5,0.4530
1,1998-09-29,0,5.55,4.5,2.5,0.1200
2,1998-10-15,0,5.33,4.6,2.5,0.5820
3,1998-11-17,0,5.45,4.5,2.3,0.3100
4,1999-05-18,0,5.01,4.3,2.2,0.3160
...,...,...,...,...,...,...
201,2022-09-21,1,2.33,3.7,6.3,-0.5185
202,2022-11-02,1,3.08,3.7,6.3,-0.5192
203,2022-12-14,0,3.83,3.6,6.0,-0.7321
204,2023-02-01,0,4.33,3.4,5.6,-0.6897


In [4]:
combined_df = pd.merge(fomc_statements, econs_metrics, on='Meeting_Date') #Merging the dataframes, this is to ensure that the metrics matches the statements and labels.
y_labels = combined_df['Label']
x_text = combined_df['Statement']
x_metrics = combined_df[['Fed_fund_rate', 'Unemployment_rate', 'Core_CPI', 'US_10-2_Spread']]

In [5]:
### Calling each of the classifier models first ###
from sklearn.tree import DecisionTreeClassifier
DT_model = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))

from sklearn.ensemble import RandomForestClassifier
RF_model = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

!pip install xgboost
import xgboost as xgb
XGB_model = make_pipeline(StandardScaler(), xgb.XGBClassifier(objective='binary:logistic', random_state=42))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# We will first utilize the simple TfidfVectorizer

In [6]:
tfidf_x_text = x_text.apply(lambda x: re.sub('[^\w\s]', '', x.lower())) #To lowercase all text and remove punctuations
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=600) #Calling the Tfidf vectorizer object
tfidf_x_text = vectorizer.fit_transform(tfidf_x_text).toarray() #Convert the text data to numerical data
tfidf_X = np.concatenate((tfidf_x_text, x_metrics.values), axis=1) #Concate the converted text data with the metrics data

In [7]:
tfidf_X_train, tfidf_X_test, y_train, y_test = train_test_split(tfidf_X, y_labels, test_size=0.2, random_state=42) #Split the data into a training set and a test set

**Using the Decision Tree model**

In [8]:
DT_model.fit(tfidf_X_train, y_train)
DT_y_pred = DT_model.predict(tfidf_X_test)
print(f"Accuracy: {accuracy_score(y_test, DT_y_pred)}")

Accuracy: 0.40476190476190477


**Using the RandomForest model**

In [9]:
RF_model.fit(tfidf_X_train, y_train)
RF_y_pred = RF_model.predict(tfidf_X_test)
print(f"Accuracy: {accuracy_score(y_test, RF_y_pred)}")

Accuracy: 0.5476190476190477


**Using the XGBoost model**

In [10]:
XGB_model.fit(tfidf_X_train, y_train)
XGB_y_pred = XGB_model.predict(tfidf_X_test)
print(f"Accuracy: {accuracy_score(y_test, XGB_y_pred)}")

Accuracy: 0.5


# Previously, we tried using a simple Tfidf vectorizer, here we will use the more advanced Glove word embedding

GloVe (Global Vectors for Word Representation) is a word embedding technique that represents words as dense vectors in a high-dimensional space. GloVe embeddings capture semantic and syntactic relationships between words based on their co-occurrence statistics in a corpus.

In [11]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

In [12]:
### To uncomment below the first time you run this to download the file from Stanford ###
# !wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
# !unzip glove*.zip

In [34]:
# Convert the GloVe file to word2vec format
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [35]:
# Load the converted GloVe vectors
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

def get_vector(word):
    if word in glove_model:
        return glove_model[word]
    else:
        return np.zeros(glove_model.vector_size)

In [36]:
glove_combined_df = combined_df.copy()
glove_combined_df['Statement'] = glove_combined_df['Statement'].apply(word_tokenize) #Tokenize the text
glove_combined_df['Vector'] = glove_combined_df['Statement'].apply(lambda x: np.mean([get_vector(w) for w in x], axis=0)) #Convert the text to vectors
glove_temp = np.array(glove_combined_df['Vector'].to_list())

In [37]:
glove_X = np.concatenate((glove_temp, x_metrics.values), axis=1) #Concate the converted text data with the metrics data
glove_X_train, glove_X_test, y_train, y_test = train_test_split(glove_X, y_labels, test_size=0.2, random_state=42) #Split the data into a training set and a test set

**Using the Decision Tree model**

In [38]:
DT_model.fit(glove_X_train, y_train)
DT_y_pred_glove = DT_model.predict(glove_X_test)
print(f"Accuracy: {accuracy_score(y_test, DT_y_pred_glove)}")

Accuracy: 0.6428571428571429


**Using the Random Forest model**

In [39]:
RF_model.fit(glove_X_train, y_train)
RF_y_pred_glove = RF_model.predict(glove_X_test)
print(f"Accuracy: {accuracy_score(y_test, RF_y_pred_glove)}")

Accuracy: 0.6190476190476191


**Using the XGBoost model**

In [40]:
XGB_model.fit(glove_X_train, y_train)
XGB_y_pred_glove = XGB_model.predict(glove_X_test)
print(f"Accuracy: {accuracy_score(y_test, XGB_y_pred_glove)}")

Accuracy: 0.5952380952380952


# In this section, we will try the Doc2Vec embedding model.

It can be a good choice if the meaning of the statement is derived from the sequence of words or the overall context rather than the presence of individual words.

In [20]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [21]:
### Represent each document as a TaggedDocument object
tagged_documents = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(combined_df['Statement'])] 

In [22]:
### Train a Doc2Vec model on the Tagged docmuments ###
D2V_model = Doc2Vec(tagged_documents, vector_size=100, window=2, min_count=1, workers=4)

In [23]:
### Transform each document into a vector using the trained D2V_model ###
combined_df_D2V = combined_df.copy() 
combined_df_D2V['doc_vector'] = combined_df_D2V['Statement'].apply(lambda x: D2V_model.infer_vector(word_tokenize(x.lower())))

D2V_temp = np.array(combined_df_D2V['doc_vector'].to_list())
D2V_X = np.concatenate((D2V_temp, x_metrics.values), axis=1) #Concate the converted text data with the metrics data
D2V_X_train, D2V_X_test, y_train, y_test = train_test_split(D2V_X, y_labels, test_size=0.2, random_state=42) #Split the data into a training set and a test set

**Using the Decision Tree model**

In [24]:
DT_model.fit(D2V_X_train, y_train)
DT_y_pred_D2V = DT_model.predict(D2V_X_test)
print(f"Accuracy: {accuracy_score(y_test, DT_y_pred_D2V)}")

Accuracy: 0.5952380952380952


**Using the Random Forest model**

In [25]:
RF_model.fit(D2V_X_train, y_train)
RF_y_pred_D2V = RF_model.predict(D2V_X_test)
print(f"Accuracy: {accuracy_score(y_test, RF_y_pred_D2V)}")

Accuracy: 0.5714285714285714


**Using the XGBoost model**

In [26]:
XGB_model.fit(D2V_X_train, y_train)
XGB_y_pred_D2V = XGB_model.predict(D2V_X_test)
print(f"Accuracy: {accuracy_score(y_test, XGB_y_pred_D2V)}")

Accuracy: 0.5476190476190477


# We will try using a pre-trained BERT model to extract features from the text data