In [4]:
# connect to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
FOLDERNAME = 'Colab Notebooks'
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


### Import packages

In [24]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

In [25]:
from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

In [26]:
from transformers import AutoModelForSequenceClassification

from finbert import *
import utils as tools

In [27]:
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split

### Build finbert model
Reference: https://github.com/ProsusAI/finBERT/blob/master/notebooks/finbert_training.ipynb

In [9]:
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.ERROR)

In [10]:
project_dir = Path.cwd().parent
cl_path = project_dir/'Colab Notebooks'/'finbert-sentiment'
cl_data_path = project_dir/'Colab Notebooks'

In [28]:
# model
bertmodel = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', cache_dir=None, num_labels=3)


config = Config(   data_dir=cl_data_path,
                   bert_model=bertmodel,
                   num_train_epochs=4,
                   model_dir=cl_path,
                   max_seq_length = 48,
                   train_batch_size = 32,
                   learning_rate = 2e-5,
                   output_mode='classification',
                   warm_up_proportion=0.2,
                   local_rank=-1,
                   discriminate=True,
                   gradual_unfreeze=True)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
finbert = FinBert(config)
finbert.base_model = 'bert-base-uncased'
finbert.config.discriminate=True
finbert.config.gradual_unfreeze=True

In [33]:
# label
finbert.prepare_model(label_list=['positive','negative','neutral'])

ValueError: Output directory (/content/drive/MyDrive/Colab Notebooks/finbert-sentiment) already exists and is not empty.

### Split data into train, validation and test datasets

In [31]:
data = pd.read_csv(os.path.join(cl_data_path, 'sentences_1.txt'), sep='.@', names=['text','label'], encoding_errors='ignore')

train, test = train_test_split(data, test_size=0.2, random_state=0)
train, valid = train_test_split(train, test_size=0.1, random_state=0)

train.to_csv('train.csv',sep='\t')
test.to_csv('test.csv',sep='\t')
valid.to_csv('validation.csv',sep='\t')

  data = pd.read_csv(os.path.join(cl_data_path, 'sentences_1.txt'), sep='.@', names=['text','label'], encoding_errors='ignore')


### Train the finbert model

In [32]:
# Get the training examples
train_data = finbert.get_data('train')

AttributeError: 'FinBert' object has no attribute 'processor'

In [None]:
model = finbert.create_the_model()



In [None]:
trained_model = finbert.train(train_examples=train_data, model=model)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871]
No best model found


Epoch:  25%|██▌       | 1/4 [12:56<38:50, 777.00s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154]


Epoch:  50%|█████     | 2/4 [31:54<32:57, 988.86s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154, 0.33707844523283154]


Epoch:  75%|███████▌  | 3/4 [57:38<20:42, 1242.55s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154, 0.33707844523283154, 0.33707844523283154]


Epoch: 100%|██████████| 4/4 [1:26:26<00:00, 1296.56s/it]
  checkpoint = torch.load(self.config.model_dir / ('temporary' + str(best_model)))


### Read in Fed speeches and yield spread

In [11]:
fed = pd.read_csv('FED_speech.csv')
spread = pd.read_csv('yield_spread.csv')

# Ensure 'DATE' and 'date' are of the same type
fed['date'] = pd.to_datetime(fed['date'])
spread['DATE'] = pd.to_datetime(spread['DATE'])

# replace the null value with its previous row value
spread['T10Y2Y'] = spread['T10Y2Y'].replace('.', pd.NA).fillna(method='ffill')
spread['T10Y2Y'] = spread['T10Y2Y'].astype(float)

# Filter rows where the 'DATE' in spread matches the 'dates' in fed_sample
spread_filtered = spread[spread['DATE'].isin(fed['date'])]

# Filter rows in 'fed_sample' that have dates matching 'spread_filtered'
fed_filtered = fed[fed['date'].isin(spread_filtered['DATE'])]

  fed['date'] = pd.to_datetime(fed['date'])
  spread['T10Y2Y'] = spread['T10Y2Y'].replace('.', pd.NA).fillna(method='ffill')


In [12]:
merged_df = pd.merge(fed_filtered, spread_filtered, left_on='date', right_on='DATE', how='inner')
merged_df

Unnamed: 0,title,speaker,date,article,DATE,T10Y2Y
0,Thoughts on the Economy and Policy Rules at th...,Governor Christopher J. Waller,2024-10-14,"Thank you, Athanasios, and thank you for the o...",2024-10-14,0.13
1,Challenges to the Community Banking Model,Governor Michelle W. Bowman,2024-10-11,"Good afternoon, I'd like to begin by thanking ...",2024-10-11,0.13
2,"Entrepreneurs, Innovation, and Participation",Governor Lisa D. Cook,2024-10-10,"Thank you for the kind introduction, Jennet.Le...",2024-10-10,0.11
3,The Fed's Discount Window: 1990 to the Present,Vice Chair Philip N. Jefferson,2024-10-09,"Thank you, Steve, for that kind introduction a...",2024-10-09,0.07
4,A History of the Fed's Discount Window: 1913–2000,Vice Chair Philip N. Jefferson,2024-10-08,"Thank you, President Hicks and Tara Boehmler, ...",2024-10-08,0.06
...,...,...,...,...,...,...
335,The Digitalization of Payments and Currency: S...,Governor Lael Brainard,2020-02-05,I want to thank Darrell Duffie for inviting me...,2020-02-05,0.22
336,"Spontaneity and Order: Transparency, Accountab...",Vice Chair for Supervision Randal K. Quarles,2020-01-17,It's a great pleasure to be with you today at ...,2020-01-17,0.26
337,The Outlook for Housing,Governor Michelle W. Bowman,2020-01-16,Few sectors are as central to the success of o...,2020-01-16,0.23
338,U.S. Economic Outlook and Monetary Policy,Vice Chair Richard H. Clarida,2020-01-09,Thank you for the opportunity to join you brig...,2020-01-09,0.27


In [17]:
# speeches
fed_speeches = fed_filtered.article

In [15]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [38]:
model = AutoModelForSequenceClassification.from_pretrained(cl_path, cache_dir=None, num_labels=3)

In [None]:
# results
model_results = []
model_sentiments = []

# predict the speeches based on the model we trained earlier
for speech in fed_speeches:
  result = predict(speech, model)

  average_score = round(result.sentiment_score.mean(), 2)
  model_results.append(average_score)

  if average_score >= 0.05:
    model_sentiments.append('positive')
  elif average_score <= -0.05:
    model_sentiments.append('negative')
  else:
    model_sentiments.append('neutral')

  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_resu

### Regression Analysis

In [54]:
import statsmodels.api as sm

In [None]:
X = sm.add_constant(model_results)
Y = list(merged_df['T10Y2Y'])
tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()
tf_est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,4.525
Date:,"Thu, 17 Oct 2024",Prob (F-statistic):,0.0341
Time:,23:52:10,Log-Likelihood:,-355.42
No. Observations:,340,AIC:,714.8
Df Residuals:,338,BIC:,722.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0295,0.046,0.640,0.522,-0.061,0.120
x1,0.5451,0.256,2.127,0.034,0.041,1.049

0,1,2,3
Omnibus:,40.379,Durbin-Watson:,0.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.48
Skew:,0.663,Prob(JB):,8.85e-08
Kurtosis:,2.268,Cond. No.,6.92


### textblob model

In [11]:
from textblob import TextBlob

In [17]:
speech_avg_score = []

for speech in fed_speeches:
  sentence_count = 0
  score = 0

  blob = TextBlob(speech)

  for sentence in blob.sentences:
    sentence_count += 1
    score += sentence.sentiment.polarity

  speech_avg_score.append(score/sentence_count)

In [20]:
X = sm.add_constant(speech_avg_score)
Y = list(merged_df['T10Y2Y'])
tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()
tf_est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.814
Date:,"Tue, 22 Oct 2024",Prob (F-statistic):,0.179
Time:,14:33:47,Log-Likelihood:,-356.77
No. Observations:,340,AIC:,717.5
Df Residuals:,338,BIC:,725.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2336,0.115,2.025,0.044,0.007,0.461
x1,-1.2909,0.958,-1.347,0.179,-3.176,0.594

0,1,2,3
Omnibus:,50.456,Durbin-Watson:,0.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.103
Skew:,0.624,Prob(JB):,1.07e-07
Kurtosis:,2.158,Cond. No.,25.8


In [49]:
from transformers import pipeline

In [56]:
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')
candidate_labels = ['hawkish', 'dovish', 'neutral']

speech_avg_score = []
speech_sentiment = []

for speech in fed_speeches:

  sentences = sent_tokenize(speech)
  total_score = 0
  count = 0

  for sentence in sentences:
    result = classifier(sentence, candidate_labels)
    index_h = result['labels'].index('hawkish')
    score_h = result['scores'][index_h]
    index_d = result['labels'].index('dovish')
    score_d = result['scores'][index_d]
    score = score_h - score_d
    total_score += score
    count += 1

  speech_avg_score.append(total_score/count)

  if total_score/count > 0.05:
    speech_sentiment.append('hawkish')
  elif total_score/count < -0.05:
    speech_sentiment.append('dovish')
  else:
    speech_sentiment.append('neutral')

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [55]:
X = sm.add_constant(speech_avg_score)
Y = list(merged_df['T10Y2Y'])
tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()
tf_est2.summary()

ValueError: endog and exog matrices are different sizes