<a href="https://colab.research.google.com/github/kkt86/quant-notebooks/blob/master/news_based_sentiment_indicator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building financial news sentiment

In [103]:
import pandas as pd

data = pd.read_table("drive/My Drive/colab data/analyst_ratings_processed.csv", sep=",", nrows=100000, usecols=[1, 2, 3])

data.head()

Unnamed: 0,title,date,stock
0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A
1,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A
2,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A
3,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A
4,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A


## Fine tune Word2Vec with gensim

In [104]:
from torchtext.data import Field
import spacy

def tokenize(sentence):
  sentence = sentence.strip()
  sentence = sentence.lower()
  sentence = sentence.replace("\n", " ")
  return [tok.text for tok in en.tokenizer(sentence)]

en = spacy.load("en_core_web_sm")
TEXT = Field(tokenize=tokenize, lower=True, init_token="<sos>", eos_token="<eos>")


In [105]:
import gensim

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 10
W2V_MIN_COUNT = 50

# Collect corpus for training word embeddings
documents = [tokenize(text) for text in data["title"]]

# train word embeddings and save
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT)
w2v_model.build_vocab(documents)
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print(f"Vocab size: {vocab_size}")

w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)
w2v_model.save("embeddings.txt")

Vocab size: 2376


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Load embeddings

In [106]:
import torchtext.vocab as vocab
from tqdm import tqdm_notebook

# build vocab
TEXT.build_vocab(data["title"], min_freq=W2V_MIN_COUNT)

w2v_model_new = gensim.models.word2vec.Word2Vec.load("embeddings.txt")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [113]:
positive_term = "opportunity"
negative_term = "bearish"

positive_dict = [term[0] for term in w2v_model.most_similar(positive_term)]
negative_dict = [term[0] for term in w2v_model.most_similar(negative_term)]

print(f"Positive terms: {positive_dict}")
print(f"Negative terms: {negative_dict}")

Positive terms: ['attractive', 'opportunities', 'upside', 'valuation', "'", 'investing', 'even', 'our', 'recovery', 'position']
Negative terms: ['negative', 'cautious', 'bullish', 'comments', 'stable', 'positive', 'pressure', 'optimistic', 'downgrade', 'recent']


  after removing the cwd from sys.path.
  if np.issubdtype(vec.dtype, np.int):
  """


In [114]:
def compute_index(sentences, dictionary):
  titles = [tokenize(title) for title in sentences]
  total_number_of_words = sum([len(title) for title in titles])
  total_number_of_dictionary_terms = 0
  for title in titles:
    for word in title:
      if word in dictionary:
        total_number_of_dictionary_terms += 1
  return float(total_number_of_dictionary_terms)/total_number_of_words*1000 if total_number_of_words > 0 else 0.0

for stock in data["stock"].unique()[:10]:
  sentences = data["title"][(data["stock"] == stock)]
  positive_sentiment_index = compute_index(sentences, positive_dict)
  negative_sentiment_index = compute_index(sentences, negative_dict)
  print(f"Stock: {stock}, positive index: {positive_sentiment_index}, negative index: {negative_sentiment_index}")


Stock: A, positive index: 3.469319511787327, negative index: 1.5047650894499247
Stock: AAMC, positive index: 0.0, negative index: 0.0
Stock: AAME, positive index: 2.472187886279357, negative index: 0.0
Stock: AAN, positive index: 3.5194368900975843, negative index: 0.7998720204767237
Stock: nan, positive index: 0.0, negative index: 0.0
Stock: AAOI, positive index: 6.801592568015925, negative index: 2.9860650298606504
Stock: AAON, positive index: 3.1965903036760785, negative index: 0.0
Stock: AAP, positive index: 7.027276930189552, negative index: 1.3869625520110958
Stock: AAPL, positive index: 25.16309412861137, negative index: 1.9970709625882042
Stock: AAU, positive index: 4.3383947939262475, negative index: 0.0


In [123]:
# create aggregated data for result summary
agg_data = data[["date", "stock"]].groupby("stock", as_index=False).agg([min, max])
agg_data.head()

Unnamed: 0_level_0,date,date
Unnamed: 0_level_1,min,max
stock,Unnamed: 1_level_2,Unnamed: 2_level_2
A,2009-04-29 08:48:00-04:00,2020-06-05 10:30:00-04:00
AAMC,2014-02-26 16:49:00-05:00,2020-05-11 07:47:00-04:00
AAME,2010-04-28 15:15:00-04:00,2020-05-11 11:05:00-04:00
AAN,2009-08-10 07:27:00-04:00,2020-06-10 10:36:00-04:00
AAOI,2013-09-26 10:16:00-04:00,2020-05-11 05:14:00-04:00


In [127]:
# compute positive and negative sentiment for each stock
from tqdm import tqdm_notebook
agg_data["ps"] = None
agg_data["ns"] = None

for stock in tqdm_notebook(data["stock"].unique()):
  sentences = data["title"][(data["stock"] == stock)]
  positive_sentiment_index = compute_index(sentences, positive_dict)
  negative_sentiment_index = compute_index(sentences, negative_dict)
  agg_data.loc[stock, "ps"] = positive_sentiment_index
  agg_data.loc[stock, "ns"] = negative_sentiment_index

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=402.0), HTML(value='')))




In [129]:
!pip install yfinance

Collecting yfinance
  Downloading https://files.pythonhosted.org/packages/c2/31/8b374a12b90def92a4e27d0fc595fc43635f395984e36a075244d98bd265/yfinance-0.1.54.tar.gz
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.54-py2.py3-none-any.whl size=22409 sha256=accde910c10d6d77c2695fa7e9e32cf5e78424fd6da8e0320879fe4b9cab693f
  Stored in directory: /root/.cache/pip/wheels/f9/e3/5b/ec24dd2984b12d61e0abf26289746c2436a0e7844f26f2515c
Successfully built yfinance
Installing collected packages: yfinance
Successfully installed yfinance-0.1.54


In [149]:
# compute relative returns for the stocks
import yfinance as yf

agg_data["pct_return"] = None

for symbol, row in agg_data.iterrows():
  try:
    stock_data = yf.download(symbol, start=row["date"]["min"].split(" ")[0], end=row["date"]["max"].split(" ")[0])
    adj_close = stock_data["Adj Close"]
    pct_return = (adj_close[-1] - adj_close[0])/adj_close[0]*100
    agg_data.loc[symbol, "pct_return"] = pct_return
  except Exception as e:
    print(f"Problems for symbol: {symbol}")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAV: No data found for this date range, symbol may be delisted
Problems for symbol: AAV
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAVL: Data doesn't exist for startDate = 1405900800, endDate = 1462492800
Problems for symbol: AAVL
[********

In [159]:
final_data = agg_data.dropna()
final_data.sort_values("pct_return", axis=0, inplace=True)

worst_companies = final_data.iloc[:20,:]
best_companies = final_data.iloc[:-20,:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [160]:
worst_companies.mean()

ps               2.547325
ns               1.421732
pct_return     -96.348059
dtype: float64

In [161]:
best_companies.mean()

ps               3.447184
ns               1.382041
pct_return      68.891746
dtype: float64