In [1]:
# connect to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
FOLDERNAME = 'Colab Notebooks'
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


### Import packages

In [2]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

In [3]:
from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

In [4]:
from transformers import AutoModelForSequenceClassification

from finbert import *
import utils as tools

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [5]:
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split

### Build finbert model
Reference: https://github.com/ProsusAI/finBERT/blob/master/notebooks/finbert_training.ipynb

In [6]:
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.ERROR)

In [7]:
project_dir = Path.cwd().parent
cl_path = project_dir/'Colab Notebooks'/'finbert-sentiment'
cl_data_path = project_dir/'Colab Notebooks'

In [None]:
# model
bertmodel = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', cache_dir=None, num_labels=3)


config = Config(   data_dir=cl_data_path,
                   bert_model=bertmodel,
                   num_train_epochs=4,
                   model_dir=cl_path,
                   max_seq_length = 48,
                   train_batch_size = 32,
                   learning_rate = 2e-5,
                   output_mode='classification',
                   warm_up_proportion=0.2,
                   local_rank=-1,
                   discriminate=True,
                   gradual_unfreeze=True)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
finbert = FinBert(config)
finbert.base_model = 'bert-base-uncased'
finbert.config.discriminate=True
finbert.config.gradual_unfreeze=True

In [None]:
# label
finbert.prepare_model(label_list=['positive','negative','neutral'])

ValueError: Output directory (/content/drive/MyDrive/Colab Notebooks/finbert-sentiment) already exists and is not empty.

### Split data into train, validation and test datasets

In [None]:
data = pd.read_csv(os.path.join(cl_data_path, 'sentences_1.txt'), sep='.@', names=['text','label'], encoding_errors='ignore')

train, test = train_test_split(data, test_size=0.2, random_state=0)
train, valid = train_test_split(train, test_size=0.1, random_state=0)

train.to_csv('train.csv',sep='\t')
test.to_csv('test.csv',sep='\t')
valid.to_csv('validation.csv',sep='\t')

  data = pd.read_csv(os.path.join(cl_data_path, 'sentences_1.txt'), sep='.@', names=['text','label'], encoding_errors='ignore')


### Train the finbert model

In [None]:
# Get the training examples
train_data = finbert.get_data('train')

AttributeError: 'FinBert' object has no attribute 'processor'

In [None]:
model = finbert.create_the_model()



In [None]:
trained_model = finbert.train(train_examples=train_data, model=model)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871]
No best model found


Epoch:  25%|██▌       | 1/4 [12:56<38:50, 777.00s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154]


Epoch:  50%|█████     | 2/4 [31:54<32:57, 988.86s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154, 0.33707844523283154]


Epoch:  75%|███████▌  | 3/4 [57:38<20:42, 1242.55s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154, 0.33707844523283154, 0.33707844523283154]


Epoch: 100%|██████████| 4/4 [1:26:26<00:00, 1296.56s/it]
  checkpoint = torch.load(self.config.model_dir / ('temporary' + str(best_model)))


### Read in Fed speeches and yield spread

In [8]:
fed = pd.read_csv('FED_speech.csv')
spread = pd.read_csv('yield_spread.csv')

# Ensure 'DATE' and 'date' are of the same type
fed['date'] = pd.to_datetime(fed['date'])
spread['DATE'] = pd.to_datetime(spread['DATE'])

# replace the null value with its previous row value
spread['T10Y2Y'] = spread['T10Y2Y'].replace('.', pd.NA).fillna(method='ffill')
spread['T10Y2Y'] = spread['T10Y2Y'].astype(float)

# Filter rows where the 'DATE' in spread matches the 'dates' in fed_sample
spread_filtered = spread[spread['DATE'].isin(fed['date'])]

# Filter rows in 'fed_sample' that have dates matching 'spread_filtered'
fed_filtered = fed[fed['date'].isin(spread_filtered['DATE'])]

  fed['date'] = pd.to_datetime(fed['date'])
  spread['T10Y2Y'] = spread['T10Y2Y'].replace('.', pd.NA).fillna(method='ffill')


In [9]:
merged_df = pd.merge(fed_filtered, spread_filtered, left_on='date', right_on='DATE', how='inner')
merged_df

Unnamed: 0,title,speaker,date,article,DATE,T10Y2Y
0,Thoughts on the Economy and Policy Rules at th...,Governor Christopher J. Waller,2024-10-14,"Thank you, Athanasios, and thank you for the o...",2024-10-14,0.13
1,Challenges to the Community Banking Model,Governor Michelle W. Bowman,2024-10-11,"Good afternoon, I'd like to begin by thanking ...",2024-10-11,0.13
2,"Entrepreneurs, Innovation, and Participation",Governor Lisa D. Cook,2024-10-10,"Thank you for the kind introduction, Jennet.Le...",2024-10-10,0.11
3,The Fed's Discount Window: 1990 to the Present,Vice Chair Philip N. Jefferson,2024-10-09,"Thank you, Steve, for that kind introduction a...",2024-10-09,0.07
4,A History of the Fed's Discount Window: 1913–2000,Vice Chair Philip N. Jefferson,2024-10-08,"Thank you, President Hicks and Tara Boehmler, ...",2024-10-08,0.06
...,...,...,...,...,...,...
335,The Digitalization of Payments and Currency: S...,Governor Lael Brainard,2020-02-05,I want to thank Darrell Duffie for inviting me...,2020-02-05,0.22
336,"Spontaneity and Order: Transparency, Accountab...",Vice Chair for Supervision Randal K. Quarles,2020-01-17,It's a great pleasure to be with you today at ...,2020-01-17,0.26
337,The Outlook for Housing,Governor Michelle W. Bowman,2020-01-16,Few sectors are as central to the success of o...,2020-01-16,0.23
338,U.S. Economic Outlook and Monetary Policy,Vice Chair Richard H. Clarida,2020-01-09,Thank you for the opportunity to join you brig...,2020-01-09,0.27


In [10]:
# speeches
fed_speeches = fed_filtered.article

In [11]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(cl_path, cache_dir=None, num_labels=3)

In [None]:
# results
model_results = []
model_sentiments = []

# predict the speeches based on the model we trained earlier
for speech in fed_speeches:
  result = predict(speech, model)

  average_score = round(result.sentiment_score.mean(), 2)
  model_results.append(average_score)

  if average_score >= 0.05:
    model_sentiments.append('positive')
  elif average_score <= -0.05:
    model_sentiments.append('negative')
  else:
    model_sentiments.append('neutral')

  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_resu

### Regression Analysis

In [15]:
import statsmodels.api as sm

In [None]:
X = sm.add_constant(model_results)
Y = list(merged_df['T10Y2Y'])
tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()
tf_est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,4.525
Date:,"Thu, 17 Oct 2024",Prob (F-statistic):,0.0341
Time:,23:52:10,Log-Likelihood:,-355.42
No. Observations:,340,AIC:,714.8
Df Residuals:,338,BIC:,722.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0295,0.046,0.640,0.522,-0.061,0.120
x1,0.5451,0.256,2.127,0.034,0.041,1.049

0,1,2,3
Omnibus:,40.379,Durbin-Watson:,0.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.48
Skew:,0.663,Prob(JB):,8.85e-08
Kurtosis:,2.268,Cond. No.,6.92


### textblob model

In [None]:
from textblob import TextBlob

In [None]:
speech_avg_score = []

for speech in fed_speeches:
  sentence_count = 0
  score = 0

  blob = TextBlob(speech)

  for sentence in blob.sentences:
    sentence_count += 1
    score += sentence.sentiment.polarity

  speech_avg_score.append(score/sentence_count)

In [None]:
X = sm.add_constant(speech_avg_score)
Y = list(merged_df['T10Y2Y'])
tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()
tf_est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.814
Date:,"Tue, 22 Oct 2024",Prob (F-statistic):,0.179
Time:,14:33:47,Log-Likelihood:,-356.77
No. Observations:,340,AIC:,717.5
Df Residuals:,338,BIC:,725.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2336,0.115,2.025,0.044,0.007,0.461
x1,-1.2909,0.958,-1.347,0.179,-3.176,0.594

0,1,2,3
Omnibus:,50.456,Durbin-Watson:,0.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.103
Skew:,0.624,Prob(JB):,1.07e-07
Kurtosis:,2.158,Cond. No.,25.8


In [12]:
from transformers import pipeline

In [13]:
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')
candidate_labels = ['hawkish', 'dovish', 'neutral']

speech_score = []
speech_sentiment = []
i = 0
for speech in fed_speeches:

  result = classifier(speech, candidate_labels)
  index_h = result['labels'].index('hawkish')
  score_h = result['scores'][index_h]
  index_d = result['labels'].index('dovish')
  score_d = result['scores'][index_d]
  score = score_h - score_d

  speech_score.append(score)

  if score > 0.05:
    speech_sentiment.append('hawkish')
  elif score < -0.05:
    speech_sentiment.append('dovish')
  else:
    speech_sentiment.append('neutral')

  i += 1
  print(i)

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [17]:
X = sm.add_constant(speech_score)
Y = list(merged_df['T10Y2Y'])
tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()
tf_est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,5.658
Date:,"Wed, 23 Oct 2024",Prob (F-statistic):,0.0179
Time:,13:48:17,Log-Likelihood:,-354.86
No. Observations:,340,AIC:,713.7
Df Residuals:,338,BIC:,721.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1410,0.044,3.220,0.001,0.055,0.227
x1,0.9580,0.403,2.379,0.018,0.166,1.750

0,1,2,3
Omnibus:,38.736,Durbin-Watson:,0.045
Prob(Omnibus):,0.0,Jarque-Bera (JB):,31.734
Skew:,0.657,Prob(JB):,1.29e-07
Kurtosis:,2.284,Cond. No.,10.8
