In [1]:
# connect to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
FOLDERNAME = 'Colab Notebooks'
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


### Import packages

In [2]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

In [3]:
from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

In [4]:
from transformers import AutoModelForSequenceClassification

from finbert import *
import utils as tools

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [5]:
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split

### Build the FinBERT model
Reference: https://github.com/ProsusAI/finBERT/blob/master/notebooks/finbert_training.ipynb

In [6]:
# Root logger: report errors only, each record prefixed with a timestamp,
# the level name and the logger name.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)

In [10]:
# All paths are resolved from the parent of the current working directory.
project_dir = Path.cwd().parent
# Folder that holds the data files (passed to Config as data_dir).
cl_data_path = project_dir.joinpath('Colab Notebooks')
# Folder for the sentiment model (passed to Config as model_dir).
cl_path = cl_data_path.joinpath('finbert-sentiment')

In [93]:
# Clean the cl_path so a previous run's model directory is not reused.
try:
    shutil.rmtree(cl_path)
except FileNotFoundError:
    # Nothing to remove on a first run. Any other failure (e.g. permissions)
    # is deliberately not swallowed, because a stale directory would otherwise
    # go unnoticed.
    pass

# model: pretrained 'bert-base-uncased' with a 3-label sequence-classification head
bertmodel = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    cache_dir=None,
)

# Training configuration consumed by FinBert(config).
config = Config(
    # locations and model
    data_dir=cl_data_path,
    model_dir=cl_path,
    bert_model=bertmodel,
    # task
    output_mode='classification',
    max_seq_length=48,
    # optimisation schedule
    num_train_epochs=4,
    train_batch_size=32,
    learning_rate=2e-5,
    warm_up_proportion=0.2,
    local_rank=-1,
    # fine-tuning strategy flags
    discriminate=True,
    gradual_unfreeze=True,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [94]:
# Build the FinBert helper from the configuration defined above.
finbert = FinBert(config)
# Same checkpoint name that `bertmodel` was loaded from.
finbert.base_model = 'bert-base-uncased'
# These repeat the values already passed to Config; they are set again directly
# on the instance's config. NOTE(review): likely redundant — confirm whether
# FinBert copies or alters the config before removing.
finbert.config.discriminate=True
finbert.config.gradual_unfreeze=True

In [95]:
# label
# Hand the three sentiment classes to FinBert.
# NOTE(review): the list order presumably fixes the label-to-index mapping —
# confirm in finbert.prepare_model before reordering.
finbert.prepare_model(label_list=['positive','negative','neutral'])



### Split data into train, validation and test datasets

In [96]:
# Load the labelled sentences. The separator '.@' is interpreted as a regular
# expression, which only the python parsing engine supports; naming the engine
# explicitly gives the same parse without the ParserWarning that pandas emits
# when it has to fall back from the C engine on its own.
data = pd.read_csv(os.path.join(cl_data_path, 'sentences_1.txt'), sep='.@', names=['text','label'],
                   encoding_errors='ignore', engine='python')

# Hold out 20% for testing, then 10% of the remainder for validation.
# random_state is fixed so the split is reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=0)
train, valid = train_test_split(train, test_size=0.1, random_state=0)

# Write the splits into the directory passed to Config as data_dir instead of
# relying on the current working directory happening to be the same folder.
# NOTE(review): finbert.get_data presumably reads '<phase>.csv' from data_dir — confirm.
train.to_csv(os.path.join(cl_data_path, 'train.csv'), sep='\t')
test.to_csv(os.path.join(cl_data_path, 'test.csv'), sep='\t')
valid.to_csv(os.path.join(cl_data_path, 'validation.csv'), sep='\t')

  data = pd.read_csv(os.path.join(cl_data_path, 'sentences_1.txt'), sep='.@', names=['text','label'], encoding_errors='ignore')


### Train the FinBERT model

In [97]:
# Get the training examples
# NOTE(review): presumably loaded from the 'train' split file in config.data_dir — confirm in finbert.get_data.
train_data = finbert.get_data('train')

In [98]:
# Create the model object that is passed to finbert.train below.
model = finbert.create_the_model()



In [100]:
# Run training on the prepared examples; the returned model is kept as `trained_model`.
trained_model = finbert.train(train_examples=train_data, model=model)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871]
No best model found


Epoch:  25%|██▌       | 1/4 [12:56<38:50, 777.00s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154]


Epoch:  50%|█████     | 2/4 [31:54<32:57, 988.86s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154, 0.33707844523283154]


Epoch:  75%|███████▌  | 3/4 [57:38<20:42, 1242.55s/it]

Iteration:   0%|          | 0/109 [00:00<?, ?it/s]

Validating:   0%|          | 0/13 [00:00<?, ?it/s]

Validation losses: [0.3569155748073871, 0.33707844523283154, 0.33707844523283154, 0.33707844523283154]


Epoch: 100%|██████████| 4/4 [1:26:26<00:00, 1296.56s/it]
  checkpoint = torch.load(self.config.model_dir / ('temporary' + str(best_model)))


### Read in Fed speeches and yield spread

In [101]:
# Load the Fed speeches and the yield-spread series (column 'T10Y2Y').
fed = pd.read_csv('FED_speech.csv')
spread = pd.read_csv('yield_spread.csv')

# Parse both date columns so that 'date' and 'DATE' compare as timestamps.
fed['date'] = pd.to_datetime(fed['date'])
spread['DATE'] = pd.to_datetime(spread['DATE'])

# A '.' entry marks a missing value: replace it with the previous row's value,
# then convert the column to float. Series.ffill() is the supported spelling;
# fillna(method='ffill') is deprecated and raises a FutureWarning.
spread['T10Y2Y'] = spread['T10Y2Y'].replace('.', pd.NA).ffill().astype(float)

# Keep only the spread rows whose DATE also appears among the speech dates.
spread_filtered = spread[spread['DATE'].isin(fed['date'])]

# Keep only the speeches whose date has a matching row in spread_filtered.
fed_filtered = fed[fed['date'].isin(spread_filtered['DATE'])]

  fed['date'] = pd.to_datetime(fed['date'])
  spread['T10Y2Y'] = spread['T10Y2Y'].replace('.', pd.NA).fillna(method='ffill')


In [102]:
# Inner-join each speech with the spread row of the same date, then display the result.
merged_df = fed_filtered.merge(
    spread_filtered,
    how='inner',
    left_on='date',
    right_on='DATE',
)
merged_df

Unnamed: 0,title,speaker,date,article,DATE,T10Y2Y
0,Thoughts on the Economy and Policy Rules at th...,Governor Christopher J. Waller,2024-10-14,"Thank you, Athanasios, and thank you for the o...",2024-10-14,0.13
1,Challenges to the Community Banking Model,Governor Michelle W. Bowman,2024-10-11,"Good afternoon, I'd like to begin by thanking ...",2024-10-11,0.13
2,"Entrepreneurs, Innovation, and Participation",Governor Lisa D. Cook,2024-10-10,"Thank you for the kind introduction, Jennet.Le...",2024-10-10,0.11
3,The Fed's Discount Window: 1990 to the Present,Vice Chair Philip N. Jefferson,2024-10-09,"Thank you, Steve, for that kind introduction a...",2024-10-09,0.07
4,A History of the Fed's Discount Window: 1913–2000,Vice Chair Philip N. Jefferson,2024-10-08,"Thank you, President Hicks and Tara Boehmler, ...",2024-10-08,0.06
...,...,...,...,...,...,...
335,The Digitalization of Payments and Currency: S...,Governor Lael Brainard,2020-02-05,I want to thank Darrell Duffie for inviting me...,2020-02-05,0.22
336,"Spontaneity and Order: Transparency, Accountab...",Vice Chair for Supervision Randal K. Quarles,2020-01-17,It's a great pleasure to be with you today at ...,2020-01-17,0.26
337,The Outlook for Housing,Governor Michelle W. Bowman,2020-01-16,Few sectors are as central to the success of o...,2020-01-16,0.23
338,U.S. Economic Outlook and Monetary Policy,Vice Chair Richard H. Clarida,2020-01-09,Thank you for the opportunity to join you brig...,2020-01-09,0.27


In [103]:
import nltk
# Download the 'punkt' tokenizer data (a no-op when it is already up to date).
# NOTE(review): sent_tokenize is not called in the cells shown here — presumably
# the sentence splitting happens inside finbert.predict; confirm.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [104]:
# Load a 3-label classifier from the model directory (cl_path); this rebinds
# `model`, replacing the object created by finbert.create_the_model().
# NOTE(review): assumes finbert.train saved the fine-tuned weights there — confirm.
model = AutoModelForSequenceClassification.from_pretrained(cl_path, cache_dir=None, num_labels=3)

In [105]:
# speeches
# Take the articles from merged_df (not fed_filtered): the regression below
# uses merged_df['T10Y2Y'] as its target, so scoring the rows of the same
# frame guarantees that model_results[i] belongs to merged_df row i.
fed_speeches = merged_df['article']

# results
model_results = []      # average sentiment score, one entry per speech
model_sentiments = []   # 'positive' / 'negative' / 'neutral', one entry per speech

# predict the speeches based on the model we trained earlier
for speech in fed_speeches:
  # NOTE(review): predict presumably returns one row per sentence with a
  # 'sentiment_score' column — confirm in finbert.predict.
  result = predict(speech, model)

  # Speech-level score: mean of the returned scores, rounded to 2 decimals.
  average_score = round(result.sentiment_score.mean(), 2)
  model_results.append(average_score)

  # Scores strictly inside (-0.05, 0.05) are labelled neutral.
  if average_score >= 0.05:
    model_sentiments.append('positive')
  elif average_score <= -0.05:
    model_sentiments.append('negative')
  else:
    model_sentiments.append('neutral')

  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_result], ignore_index=True)
  result = pd.concat([result, batch_resu

### Regression Analysis

In [106]:
import statsmodels.api as sm

In [107]:
# OLS of the yield spread (Y) on the average speech sentiment (X) plus an intercept.
Y = merged_df['T10Y2Y'].tolist()
X = sm.add_constant(model_results)

tf_est = sm.OLS(Y, X)
tf_est2 = tf_est.fit()

# Last expression of the cell: render the regression summary table.
tf_est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,4.525
Date:,"Thu, 17 Oct 2024",Prob (F-statistic):,0.0341
Time:,23:52:10,Log-Likelihood:,-355.42
No. Observations:,340,AIC:,714.8
Df Residuals:,338,BIC:,722.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0295,0.046,0.640,0.522,-0.061,0.120
x1,0.5451,0.256,2.127,0.034,0.041,1.049

0,1,2,3
Omnibus:,40.379,Durbin-Watson:,0.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.48
Skew:,0.663,Prob(JB):,8.85e-08
Kurtosis:,2.268,Cond. No.,6.92
