#### Setup

##### Import Libraries

In [1]:
from datetime import datetime, timedelta

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification, pipeline
import matplotlib.pyplot as plt
import statsmodels.api as sm
import yfinance as yf
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

##### Set up tokenizer and model for FinBERT and a dictionary of topics

In [2]:
token = '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'

In [3]:
topic_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification", token=token)
topic_model = AutoModelForSequenceClassification.from_pretrained("nickmuchi/finbert-tone-finetuned-finance-topic-classification", token=token)
topics = {
   0: "Analyst Update",
   1: "Fed | Central Banks",
   2: "Company | Product News",
   3: "Treasuries | Corporate Debt",
   4: "Dividend",
   5: "Earnings",
   6: "Energy | Oil",
   7: "Financials",
   8: "Currencies",
   9: "General News | Opinion",
   10: "Gold | Metals | Materials",
   11: "IPO",
   12: "Legal | Regulation",
   13: "M&A | Investments",
   14: "Macro",
   15: "Markets",
   16: "Politics",
   17: "Personnel Change",
   18: "Stock Commentary",
   19: "Stock Movement",
}

inv_topics = {v: k for k, v in topics.items()}

In [4]:
sentiment_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
sentiment_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)

sentiments = {
   -1: "Negative",
   0: "Neutral",
   1: "Positive"
}

inv_sentiments = {v: k for k, v in sentiments.items()}

##### Functions

In [5]:
# get_stock_price(ticker, start, end):
#  Gets the stock price of a company in a given timeframe
#  ticker: str    --> ticker of a company
#  date: timedate --> day to get stock price
#
#  ==> DataFrame(
#        index: str  --> date (YYYY-MM-DD)
#        col1:  float --> adjusted closing price

def get_stock_price(ticker, date):
   return yf.download(ticker, start=date, end=date + timedelta(days=7))['Adj Close'].values[0]

In [6]:
# calculate_price_difference(row)
#  Calculates the price difference between now and a year from now
#
#  ==> float

def calculate_price_difference(row):
   start_date = pd.to_datetime(row['date'])
   end_date = start_date + timedelta(days=365)
   
   start_value = get_stock_price(row['ticker'], start_date)
   end_value = get_stock_price(row['ticker'], end_date)
   
   return (end_value - start_value) / start_value

In [7]:
# linear_reg(X, y):
#  Performs linear regression and gives the parameters of the fitted line
#  X: array_like  --> input
#  y: array_like  --> output
#
#  ==> array_like --> parameter values

def linear_reg(X, y):
    X = sm.add_constant(X) 
    return sm.OLS(y, X).fit().params

In [8]:
# nlp_label_topic(texts):
#  Gives text a label
#  texts: str or List(str) --> Texts that needs labeling
#
#  ==> Dict() --> label and score

def nlp_label_topic(texts):
   if type(texts) == str:
      texts = [texts]
   nlp = pipeline("text-classification", model=topic_model, tokenizer=topic_tokenizer)
   
   # added for text with too many tokens (max of 512)
   try:
      return nlp(texts)
   except Exception as e:
      if "must match the size of tensor b (512) at non-singleton dimension 1" in str(e):
         return nlp_label_topic([text.rsplit(' ', 1)[0] for text in texts])
      else:
         print(e)

In [9]:
# assign_topics(column):
#  Gives text a label and reformat it to create columns
#  column: Series --> Column of text that needs labeling
#
#  ==> label number, score (confidence)

def assign_topics(column):
   data = nlp_label_topic(column.to_list())
   labels = [inv_topics[d['label']] for d in data]
   scores = [d['score'] for d in data]
   
   return labels, scores

In [10]:
# nlp_label_sentiment(texts):
#  Gives text a label
#  texts: str or List(str) --> Texts that needs labeling
#
#  ==> Dict() --> sentiment and score

def nlp_label_sentiment(texts):
   if type(texts) == str:
      texts = [texts]
   nlp = pipeline("text-classification", model=sentiment_model, tokenizer=sentiment_tokenizer)

   # added for text with too many tokens (max of 512)
   try:
      return nlp(texts)
   except Exception as e:
      if "must match the size of tensor b (512) at non-singleton dimension 1" in str(e):
         return nlp_label_sentiment([text.rsplit(' ', 1)[0] for text in texts])
      else:
         print(e)

In [11]:
# assign_topics(column):
#  Gives text a label and reformat it to create columns
#  column: Series --> Column of text that needs sentiment labeling
#
#  ==> label number, score (confidence)

def assign_sentiment(column):
   data = nlp_label_sentiment(column.to_list())
   labels = [inv_sentiments[d['label']] for d in data]
   scores = [d['score'] for d in data]
   
   return labels, scores

##### Get company dataset file names and their tickers

In [12]:
datasets = []

for filename in os.listdir('./datasets'):
   datasets.append(('datasets/' + filename, filename[:filename.index("_")]))

#### Analysis

##### Concatenate all the datasets together and clean them

###### I randomly sampled them because having 40,000+ rows was extremely slow
###### Subsequently, I cannot generate a 20 dimensional vector with the presence or absence of each topic

In [16]:
final = None
for dataset in datasets:
   df = pd.read_csv(dataset[0])
   df['date'] = pd.to_datetime(df['date'])
   df.drop(["id", "sequence", "label", "quarter", "year", "speaker", "title"], inplace=True, axis=1)
   df = df.drop_duplicates(subset=['text'], keep='first')
   df.dropna(inplace=True)
   df['ticker'] = dataset[1]
   df = df.sample(frac=0.01)
   
   if final is None:
      final = df.copy()
   else:
      final = pd.concat([final, df], axis=0, ignore_index=True)

In [17]:
final

Unnamed: 0,date,text,mystery indicator,VADER sentiment,ticker
0,2021-07-29,It's hard to do quickly. but we're moving as q...,0.348765,0.126,AMZN
1,2018-07-26,"Yeah. And Brian, this is Dave. Just on the acc...",0.787084,0.735,AMZN
2,2020-01-30,So -- and despite that we give AWS customers a...,0.274922,0.700,AMZN
3,2018-04-26,Sure. Let me wax eloquently. Try to anyway. So...,1.195399,0.896,AMZN
4,2019-07-25,"So, we think there's a lot of shared purpose t...",1.917637,0.912,AMZN
...,...,...,...,...,...
403,2022-01-26,"So I think when you look at the others, they h...",0.138736,0.913,MKTX
404,2021-10-20,Got it. Thanks. Second question is just on cap...,2.181152,0.779,MKTX
405,2021-07-21,"Sure, Alex. I'll take that one. So, our Rates ...",0.109863,0.772,MKTX
406,2020-04-29,I would just say it's actually similar to Marc...,1.404304,0.708,MKTX


##### Assign topics and sentiment based on the text

In [19]:
final['topic'], final['topic_confidence'] = assign_topics(final['text'])

In [20]:
final['sentiment'], final['sentiment_confidence'] = assign_sentiment(final['text'])

##### Count topics and sentiment

In [21]:
final['topic'].value_counts()

topic
2     156
9     122
14     53
7      49
5      10
13      5
3       5
4       2
18      2
16      2
17      1
19      1
Name: count, dtype: int64

In [22]:
final['sentiment'].value_counts()

sentiment
 1    247
 0    131
-1     30
Name: count, dtype: int64

##### Calculate percentage difference from 1 year after the text

In [23]:
final['percent_difference'] = final.apply(calculate_price_difference, axis=1)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [24]:
final.head()

Unnamed: 0,date,text,mystery indicator,VADER sentiment,ticker,topic,topic_confidence,sentiment,sentiment_confidence,percent_difference
0,2021-07-29,It's hard to do quickly. but we're moving as q...,0.348765,0.126,AMZN,2,0.570482,-1,0.986976,-0.250261
1,2018-07-26,"Yeah. And Brian, this is Dave. Just on the acc...",0.787084,0.735,AMZN,9,0.34417,0,0.942233,0.074696
2,2020-01-30,So -- and despite that we give AWS customers a...,0.274922,0.7,AMZN,2,0.924503,1,0.999958,0.713922
3,2018-04-26,Sure. Let me wax eloquently. Try to anyway. So...,1.195399,0.896,AMZN,9,0.748478,0,0.894177,0.285034
4,2019-07-25,"So, we think there's a lot of shared purpose t...",1.917637,0.912,AMZN,2,0.527522,1,0.999937,0.524409


##### Perform linear regression

In [25]:
linear_reg(final[["mystery indicator", "VADER sentiment", "topic", "topic_confidence", "sentiment", "sentiment_confidence"]], final["percent_difference"])

const                   0.210728
mystery indicator      -0.047221
VADER sentiment         0.001720
topic                  -0.004246
topic_confidence       -0.010916
sentiment               0.013414
sentiment_confidence    0.012913
dtype: float64

##### Try and improve regression by converting categorical variables to dummy

In [26]:
temp = pd.get_dummies(final, columns=["topic", "sentiment"], dtype=int)
temp 

Unnamed: 0,date,text,mystery indicator,VADER sentiment,ticker,topic_confidence,sentiment_confidence,percent_difference,topic_2,topic_3,...,topic_9,topic_13,topic_14,topic_16,topic_17,topic_18,topic_19,sentiment_-1,sentiment_0,sentiment_1
0,2021-07-29,It's hard to do quickly. but we're moving as q...,0.348765,0.126,AMZN,0.570482,0.986976,-0.250261,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2018-07-26,"Yeah. And Brian, this is Dave. Just on the acc...",0.787084,0.735,AMZN,0.344170,0.942233,0.074696,0,0,...,1,0,0,0,0,0,0,0,1,0
2,2020-01-30,So -- and despite that we give AWS customers a...,0.274922,0.700,AMZN,0.924503,0.999958,0.713922,1,0,...,0,0,0,0,0,0,0,0,0,1
3,2018-04-26,Sure. Let me wax eloquently. Try to anyway. So...,1.195399,0.896,AMZN,0.748478,0.894177,0.285034,0,0,...,1,0,0,0,0,0,0,0,1,0
4,2019-07-25,"So, we think there's a lot of shared purpose t...",1.917637,0.912,AMZN,0.527522,0.999937,0.524409,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,2022-01-26,"So I think when you look at the others, they h...",0.138736,0.913,MKTX,0.772005,0.990421,0.121538,0,0,...,1,0,0,0,0,0,0,0,0,1
404,2021-10-20,Got it. Thanks. Second question is just on cap...,2.181152,0.779,MKTX,0.831506,0.634236,-0.424378,0,0,...,0,0,0,0,0,1,0,0,1,0
405,2021-07-21,"Sure, Alex. I'll take that one. So, our Rates ...",0.109863,0.772,MKTX,0.715819,0.999984,-0.401721,0,0,...,1,0,0,0,0,0,0,0,0,1
406,2020-04-29,I would just say it's actually similar to Marc...,1.404304,0.708,MKTX,0.459401,0.933989,0.090483,0,0,...,1,0,0,0,0,0,0,0,1,0


In [27]:
linear_reg(temp[[col for col in temp.columns if col not in ['date', 'text', 'ticker', 'topic', 'sentiment', 'percent_difference']]], temp["percent_difference"])

const                   0.085412
mystery indicator      -0.046445
VADER sentiment        -0.014416
topic_confidence        0.014049
sentiment_confidence    0.003928
topic_2                 0.077553
topic_3                -0.075865
topic_4                -0.235719
topic_5                 0.153547
topic_7                 0.019973
topic_9                 0.021865
topic_13                0.006187
topic_14                0.061116
topic_16               -0.231153
topic_17               -0.182219
topic_18                0.030807
topic_19                0.439320
sentiment_-1           -0.049148
sentiment_0             0.076507
sentiment_1             0.058054
dtype: float64

##### Conclusion

The model performs significantly better when we separate out categorical variables with an alpha of 0.085 instead of 0.21. The variable with the highest correlation is topic_5 or earnings, which is interesting. It could be that if they are mentioning earnings, it was really good (and is more tied into sentiment). The least correlated thing was topic_4 or dividends. I think it makes sense since if they are increasing dividends, the stock price takes a hit (the stock decreases by how much the dividend was) and vice versa. If they decrease dividends, then there could be underlying issues with the company.