In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from pytz import timezone
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Pipeline to calculate sentiment scores, targets, and aggregate news

In [None]:
# Attach Google Drive inside the Colab runtime; every data path used in this
# notebook lives under the '/content/drive' mount point.
from google.colab import drive

# force_remount=False: do not force a remount when this cell is re-run.
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [None]:
# All project data lives under one Drive folder; every path below is built
# from that root so the location only has to be changed in one place.
DATA_ROOT = '/content/drive/MyDrive/abnormal-distribution-project-data'
COMPONENTS_DIR = f'{DATA_ROOT}/components'
DICTIONARY_DIR = f'{DATA_ROOT}/loughran_mcdonald'
NEWS_REDUCED_DIR = f'{DATA_ROOT}/news_reduced'
LEAN_DIR = f'{DATA_ROOT}/lean'

# Trading calendar
CALENDAR_PATH = f'{COMPONENTS_DIR}/trading_calendar.csv'

# Sentiment dictionaries
DICT_PATH = f'{DICTIONARY_DIR}/loughran_mcdonald.csv'
EXTRA_DICT_PATH = f'{DICTIONARY_DIR}/sestm.csv'

# Returns and factor data
STOCKS_SP1500_PATH = f'{COMPONENTS_DIR}/returns_sp1500.csv'
FACTORS_SP1500_PATH = f'{COMPONENTS_DIR}/factors_sp1500.csv'

# Raw (reduced) news pickles
FINNHUB_SP1500_PATH = f'{NEWS_REDUCED_DIR}/finnhub_sp1500.p'
NEWS_SP1500_PATH = f'{NEWS_REDUCED_DIR}/news_sp1500.p'

# Scored news, one pickle per source
TIINGO_LEAN_PATH = f'{LEAN_DIR}/tiingo.p'
FINNHUB_LEAN_PATH = f'{LEAN_DIR}/finnhub.p'

# Targets and the merged master table
TARGETS_SP1500_PATH = f'{DICTIONARY_DIR}/targets_sp1500.p'
TARGETS_SP1500_CSV_PATH = f'{DICTIONARY_DIR}/targets_sp1500.csv'
MASTER_PATH = f'{LEAN_DIR}/master.p'

# Train / test / validation splits
TRAIN_PATH = f'{LEAN_DIR}/train.p'
TEST_PATH = f'{LEAN_DIR}/test.p'
VALID_PATH = f'{LEAN_DIR}/valid.p'

# Informative-news subset and its plain-text export
INFO_PATH = f'{LEAN_DIR}/info.p'
TRAIN_TXT_PATH = f'{LEAN_DIR}/train.txt'


In [None]:
# Trading calendar: a frame indexed by calendar date (datetime.date objects)
# whose `market_date` column is also stored as datetime.date objects.

calendar_raw = pd.read_csv(CALENDAR_PATH)
calendar = calendar_raw.drop(columns=['date'])
calendar.index = pd.to_datetime(calendar_raw['date']).dt.date
calendar['market_date'] = pd.to_datetime(calendar['market_date']).dt.date


## Targets

In [None]:
# Create targets: per-date quantile labels for the close return, the open
# return and the factor-model residual of every (date, ticker) row.

# Only these numeric columns are bucketed. Selecting them explicitly keeps the
# string `ticker` column out of the groupby transform; older pandas silently
# dropped such columns, pandas >= 2.0 raises instead.
return_cols = ['1D_ret_close', '1D_ret_open', 'residual_close']

targets = pd.read_csv(STOCKS_SP1500_PATH)
targets.date = pd.to_datetime(targets.date).dt.date
targets.ticker = targets.ticker.str.lower()

factors = pd.read_csv(FACTORS_SP1500_PATH)
factors.date = pd.to_datetime(factors.date).dt.date
factors.ticker = factors.ticker.str.lower()

# The residuals used to be attached purely by row position. That is only
# correct when both files list the same (date, ticker) rows in the same order,
# so check it and fall back to a key-based join when they differ.
rows_aligned = (
    len(factors) == len(targets)
    and (factors.date.values == targets.date.values).all()
    and (factors.ticker.values == targets.ticker.values).all()
)
targets = targets[['date', 'ticker', '1D_ret_close', '1D_ret_open']].copy()
if rows_aligned:
  targets['residual_close'] = factors.Residuals.values
else:
  residuals = (
      factors[['date', 'ticker', 'Residuals']]
      .drop_duplicates(subset=['date', 'ticker'])
      .rename(columns={'Residuals': 'residual_close'})
  )
  # Left join: every row of `targets` is kept, unmatched residuals become NaN.
  targets = targets.merge(residuals, on=['date', 'ticker'], how='left')

# Quantile labels, computed separately within each date:
#   *_2 -> 0 / 1 split at the median
#   *_3 -> 0 / 1 / 2 split at the 25% and 75% quantiles
by_date = targets.groupby('date')[return_cols]
targets_2means = by_date.transform(lambda x: pd.qcut(x, q=[0, 0.5, 1], labels=range(0,2)))
targets_3means = by_date.transform(lambda x: pd.qcut(x, q=[0, 0.25, 0.75, 1], labels=range(0,3)))

targets['ret_close_2'] = targets_2means['1D_ret_close']
targets['ret_open_2'] = targets_2means['1D_ret_open']
targets['res_close_2'] = targets_2means['residual_close']
targets['ret_close_3'] = targets_3means['1D_ret_close']
targets['ret_open_3'] = targets_3means['1D_ret_open']
targets['res_close_3'] = targets_3means['residual_close']

# The raw return columns are deliberately kept next to the labels.

# Save the targets both as a pickle (reloaded below) and as a CSV.
with open(TARGETS_SP1500_PATH,'wb') as pkl_file:
  pickle.dump(targets, pkl_file)

targets.to_csv(TARGETS_SP1500_CSV_PATH, index=False)

## Loughran-McDonald

In [None]:
# Restore the targets frame saved by the cell above, so the expensive
# computation does not have to be repeated.
with open(TARGETS_SP1500_PATH, 'rb') as targets_file:
  targets = pickle.load(targets_file)

In [None]:
def loughran_mcdonald(text, pos_vocab, neg_vocab):
  """
  Score documents with a positive and a negative word list.

  For every document, the TF-IDF weights of the words in `pos_vocab` are
  summed and the summed TF-IDF weights of the words in `neg_vocab` are
  subtracted. The two vectorizers are fitted independently on `text`.

  Parameters
  ----------
  text : re-iterable collection of str (e.g. list or array)
      The documents to score. It is iterated twice, so a one-shot generator
      will not work.
  pos_vocab, neg_vocab : collections of str without duplicate terms
      Passed as the fixed `vocabulary` of each TfidfVectorizer.

  Returns
  -------
  numpy.ndarray of shape (len(text),)
      One score per document. The result is always 1-D, including for a
      single document.
  """

  def _tfidf_mass(vocab):
    # With a fixed vocabulary only the listed words receive a weight, so the
    # row sum is the total TF-IDF weight of dictionary words in each document.
    weights = TfidfVectorizer(vocabulary=vocab).fit_transform(text)
    # The row sum of a sparse matrix is an (n, 1) matrix. ravel() flattens it
    # to (n,) without collapsing n == 1 to a 0-d array as np.squeeze would.
    return np.asarray(weights.sum(axis=1)).ravel()

  return _tfidf_mass(pos_vocab) - _tfidf_mass(neg_vocab)
  

In [None]:


# Extra word lists (columns `pos_words` / `neg_words`) merged into the
# Loughran-McDonald vocabularies below.
d2 = pd.read_csv(EXTRA_DICT_PATH)

sentiment_categories = ['negative', 'positive']
sentiment_df = pd.read_csv(DICT_PATH) # Load Loughran-McDonald sentiment data
sentiment_df.columns = sentiment_df.columns.str.lower() # Convert column names to lowercase
# .copy() so the assignments below write to an independent frame, not a slice.
sentiment_df = sentiment_df[['word'] + sentiment_categories].copy() # Use only columns related to sentiment
sentiment_df[sentiment_categories] = 1 * sentiment_df[sentiment_categories].astype(bool) # Convert to 1 or 0
# `axis` must be passed by keyword: pandas >= 2.0 rejects the positional form.
sentiment_df = sentiment_df[sentiment_df[sentiment_categories].any(axis=1)].copy() # Only use vocabulary that has sentiment ranking
sentiment_df.word = sentiment_df.word.str.lower() # Convert words to lowercase
sentiment_df.reset_index(drop=True, inplace=True)

# Positive and negative vocabularies. dropna() removes missing entries (for
# example the NaN padding of a shorter CSV column) so that only real words
# reach the vectorizer; set() removes duplicate terms.
pos_vocab = sentiment_df[sentiment_df.positive==1].word.dropna()
pos_vocab = pos_vocab.to_list() + d2.pos_words.dropna().to_list() + ['beats']
pos_vocab = list(set(pos_vocab))
neg_vocab = sentiment_df[sentiment_df.negative==1].word.dropna()
neg_vocab = neg_vocab.to_list() + d2.neg_words.dropna().to_list()
neg_vocab = list(set(neg_vocab))


In [None]:
# Load the Tiingo news. The context manager closes the file handle, which the
# bare open() call used here before never did.
with open(NEWS_SP1500_PATH,'rb') as pkl_file:
  tiingo_df = pickle.load(pkl_file)
tiingo_df = tiingo_df.reset_index()
# Concatenate description and title
tiingo_df['text'] = tiingo_df.description + ' ' + tiingo_df.title
tiingo_df = tiingo_df.drop_duplicates(subset=['text'])
tiingo_df = tiingo_df[['id','text','publishedDate','tickers']]
tiingo_df = tiingo_df.rename(columns={'publishedDate':'date', 'tickers':'ticker'})
tiingo_df['source'] = 'tiingo'
# Remove \n characters (a literal replacement, not a regular expression)
tiingo_df.text = tiingo_df.text.str.replace('\n', ' ', regex=False)
# Convert date to the US/Eastern time zone
tiingo_df.date = pd.to_datetime(tiingo_df.date).dt.tz_convert('US/Eastern')
# Calculate Loughran-McDonald scores. The column is created directly from the
# float scores; creating it as int 0 and then overwriting it through .loc is
# an incompatible-dtype assignment that recent pandas versions warn about.
tiingo_df['score'] = loughran_mcdonald(tiingo_df.text.values, pos_vocab, neg_vocab)

# Save the scored Tiingo news so later cells can reload it
with open(TIINGO_LEAN_PATH,'wb') as pkl_file:
  pickle.dump(tiingo_df, pkl_file)
  

In [None]:
# Load the Finnhub news
with open(FINNHUB_SP1500_PATH,'rb') as pkl_file:
  finnhub_df = pickle.load(pkl_file)
# Concatenate headline and summary
finnhub_df['text'] = finnhub_df.headline + ' ' + finnhub_df.summary
finnhub_df = finnhub_df.drop_duplicates(subset=['text'])
finnhub_df = finnhub_df[['id', 'text', 'iso_time', 'related']]
finnhub_df = finnhub_df.rename(columns={'iso_time':'date', 'related':'ticker'})
finnhub_df.ticker = finnhub_df.ticker.str.lower()
finnhub_df['source'] = 'finnhub'
# Remove \n characters (a literal replacement, not a regular expression)
finnhub_df.text = finnhub_df.text.str.replace('\n', ' ', regex=False)
# Convert date to the US/Eastern time zone
finnhub_df.date = pd.to_datetime(finnhub_df.date).dt.tz_convert('US/Eastern')
# Calculate Loughran-McDonald scores. The column is created directly from the
# float scores; creating it as int 0 and then overwriting it through .loc is
# an incompatible-dtype assignment that recent pandas versions warn about.
finnhub_df['score'] = loughran_mcdonald(finnhub_df.text.values, pos_vocab, neg_vocab)

# Save the scored Finnhub news so later cells can reload it
with open(FINNHUB_LEAN_PATH,'wb') as pkl_file:
  pickle.dump(finnhub_df, pkl_file)


In [None]:
# Reload the scored news frames written by the two cells above.
with open(FINNHUB_LEAN_PATH, 'rb') as finnhub_file:
  finnhub_df = pickle.load(finnhub_file)

with open(TIINGO_LEAN_PATH, 'rb') as tiingo_file:
  tiingo_df = pickle.load(tiingo_file)
  

In [None]:
# Key-development events: attach the FinBERT baseline class and a dictionary
# sentiment class to every event text (`situation`).
key_dev = pd.read_csv("/content/drive/MyDrive/abnormal-distribution-project-data/compustat_keydev/final_keydev_data.csv")
df_outputs = pd.read_pickle("/content/drive/MyDrive/abnormal-distribution-project-data/compustat_keydev/FinBertOutput_HS_Sentiment.pkl")
df_outputs = df_outputs.drop('bert_features',axis=1)
# The neutral score is whatever is left after the negative and positive scores.
df_outputs['bert_neutral_sentiment'] = 1 - df_outputs['bert_neg_sentiment'] - df_outputs['bert_pos_sentiment']

# Class encoding used throughout the notebook: 0 = negative, 1 = neutral, 2 = positive.
finbert_class = {'bert_neutral_sentiment':1, 'bert_neg_sentiment':0,'bert_pos_sentiment':2}
# The highest-scoring FinBERT column decides the class. The values are attached
# by row position -- assumes df_outputs rows are in key_dev order, TODO confirm.
key_dev['finbert_baseline'] = df_outputs[['bert_neutral_sentiment', 'bert_neg_sentiment','bert_pos_sentiment']].idxmax(axis=1).apply(lambda x:finbert_class[x]).values

# Timestamps are compared against tz-naive bounds, i.e. no time-zone conversion
# is applied to the key-development data.
key_dev.datetime = pd.to_datetime(key_dev.datetime)
in_sample = (key_dev.datetime >= pd.to_datetime('2010-12-30')) & (key_dev.datetime <= pd.to_datetime('2020-09-03'))
# .copy() so the new column is written to an independent frame, not a slice.
key_dev = key_dev[in_sample].copy()

# Dictionary class: sign of the Loughran-McDonald score shifted to 0 / 1 / 2
# (negative / zero / positive score), stored as floats.
lm_scores = loughran_mcdonald(key_dev.situation.values, pos_vocab, neg_vocab)
key_dev['score'] = np.sign(lm_scores) + 1


## Master dataset with scores and news

In [None]:
# Save the labelled key-development events for the final-set cells further down.
with open('/content/drive/MyDrive/abnormal-distribution-project-data/lean/keydev.p','wb') as keydev_file:
  pickle.dump(key_dev, keydev_file)

In [None]:
# Build the master table: combine both news sources, map every article to the
# market dates used for its open and close labels, attach the targets and
# derive the dictionary-filtered labels.

# Keep only Finnhub rows whose UTC timestamp is not exactly 00:00:00
# (presumably date-only stamps without a usable time of day -- TODO confirm).
finnhub_utc = finnhub_df.date.dt.tz_convert('UTC')
has_time_of_day = (finnhub_utc.dt.hour != 0) | (finnhub_utc.dt.minute != 0) | (finnhub_utc.dt.second != 0)

# ignore_index: both inputs carry their own row labels, so a plain concat
# would produce duplicate index values. The index is rebuilt by the merges
# below anyway.
news_df = pd.concat([tiingo_df, finnhub_df[has_time_of_day]], ignore_index=True)

# `_id` is the article id with Finnhub ids negated, so that equal ids from the
# two sources stay distinguishable.
news_df['_id'] = news_df.id
is_finnhub = news_df.source == 'finnhub'
news_df.loc[is_finnhub, '_id'] = -news_df.loc[is_finnhub, '_id']
news_df = news_df.rename(columns={'score':'dict_score'})

# Shuffle, then sort by timestamp. The shuffle is seeded so that re-running
# the notebook reproduces the same master file.
news_df = news_df.sample(frac=1, random_state=0).sort_values(by='date')
first_ts = pd.to_datetime('2010-12-30').tz_localize('US/Eastern')
last_ts = pd.to_datetime('2020-09-03').tz_localize('US/Eastern')
news_df = news_df[(news_df.date >= first_ts) & (news_df.date <= last_ts)].copy()

# Articles stamped at hour 10 or later (US/Eastern) are assigned to the next
# day's open, those at hour 17 or later to the next day's close.
# NOTE(review): these cut-offs do not coincide with a 09:30 open / 16:00 close;
# confirm they are intended.
# The day shift is applied to the local calendar date. Adding a 24h timedelta
# to the tz-aware timestamp can land on the wrong date across a DST change.
local_day = news_df.date.dt.tz_localize(None).dt.normalize()
next_day = local_day + pd.Timedelta(days=1)
local_hour = news_df.date.dt.hour
news_df['open_date'] = local_day.where(local_hour <= 9, next_day).dt.date
news_df['close_date'] = local_day.where(local_hour <= 16, next_day).dt.date

# Translate calendar dates into market dates via the trading calendar.
# .loc raises KeyError if a date is missing from the calendar.
news_df['open_date'] = calendar.loc[news_df.open_date, 'market_date'].values
news_df['close_date'] = calendar.loc[news_df.close_date, 'market_date'].values

news_df.ticker = news_df.ticker.str.lower()
targets['close_date'] = targets.date
targets['open_date'] = targets.date

# Inner joins: articles without a target row for their ticker/date are dropped.
close_labelled = news_df.merge(targets[['ret_close_2', 'ret_close_3', 'res_close_2', 'res_close_3', 'ticker', 'close_date']], on=['ticker','close_date'])
open_labelled = news_df.merge(targets[['ret_open_2', 'ret_open_3', 'ticker', 'open_date']], on=['ticker','open_date'])[['_id', 'ticker', 'ret_open_2', 'ret_open_3']]

# Join on (_id, ticker) rather than _id alone. If one article id ever appears
# for several tickers, an _id-only join would pair every close row with every
# open row of that article, mixing labels across tickers.
news_df = close_labelled.merge(open_labelled, on=['_id', 'ticker'])

# Cast the quantile labels to int before doing arithmetic on them; the stored
# labels come out of pd.qcut and may not be plain integers.
label_cols = ['ret_close_2', 'ret_close_3', 'res_close_2', 'res_close_3', 'ret_open_2', 'ret_open_3']
news_df[label_cols] = news_df[label_cols].astype(int)

# Dictionary-filtered labels: 0 / 2 from the binary return label, overridden
# with 1 wherever the dictionary score is exactly zero.
dictionary_neutral = news_df.dict_score == 0
for dict_col, binary_col in [('dict_ret_close', 'ret_close_2'),
                             ('dict_ret_open', 'ret_open_2'),
                             ('dict_res_close', 'res_close_2')]:
  news_df[dict_col] = (2 * news_df[binary_col]).where(~dictionary_neutral, 1)

news_df = news_df.sort_values(by='date').reset_index(drop=True)

with open(MASTER_PATH,'wb') as pkl_file:
  pickle.dump(news_df, pkl_file)


### Train / Valid / Test Split

In [None]:
# Reload the master news table saved above.
with open(MASTER_PATH, 'rb') as master_file:
  news_df = pickle.load(master_file)

In [None]:
# Last timestamps (midnight, US/Eastern) that belong to the training and
# validation periods respectively; anything later is test data.
train_date = pd.Timestamp('2017-12-29', tz='US/Eastern')
valid_date = pd.Timestamp('2018-12-31', tz='US/Eastern')


In [None]:
# Chronological split on the article timestamp:
#   train: date <= train_date, rows shuffled
#   valid: train_date < date <= valid_date, sorted by time
#   test:  date > valid_date, sorted by time
# The training shuffle is seeded so that re-running the notebook produces the
# same train.p and the same files derived from it.
train_df = news_df[news_df.date <= train_date].sample(frac=1, random_state=0).reset_index(drop=True)
valid_df = news_df[(news_df.date > train_date ) & (news_df.date <= valid_date)].sort_values(by='date').reset_index(drop=True)
test_df = news_df[(news_df.date > valid_date )].sort_values(by='date').reset_index(drop=True)


In [None]:
# Write each split to its own pickle file.
for split_path, split_df in ((TRAIN_PATH, train_df), (VALID_PATH, valid_df), (TEST_PATH, test_df)):
  with open(split_path, 'wb') as split_file:
    pickle.dump(split_df, split_file)
  

In [None]:
# Reload the training split saved above.
with open(TRAIN_PATH, 'rb') as train_file:
  train_df = pickle.load(train_file)

In [None]:
# Save the `_id` of every article whose dictionary-filtered residual label is 1.
dict_neutral_ids = news_df.loc[news_df.dict_res_close == 1, '_id']
with open('/content/drive/MyDrive/abnormal-distribution-project-data/lean/ml_filter.p','wb') as filter_file:
  pickle.dump(dict_neutral_ids, filter_file)


## Informative news


In [None]:
# Informative news: training rows where neither the 3-class residual label nor
# the dictionary-filtered residual label equals 1.
middle_label = (train_df.res_close_3 == 1) | (train_df.dict_res_close == 1)
info_df = train_df[~middle_label]

with open(INFO_PATH, 'wb') as info_file:
  pickle.dump(info_df, info_file)


In [None]:

# Export the informative texts as plain text, one document per line.
with open(TRAIN_TXT_PATH, 'w') as txt_file:
  txt_file.writelines("%s\n" % item for item in info_df.text.to_list())


In [None]:
# Reload the full master table (all periods) for the FinBERT comparison below.
with open(MASTER_PATH, 'rb') as master_file:
  news_df = pickle.load(master_file)

In [None]:
# Join the FinBERT baseline predictions onto the master news table.
with open(MASTER_PATH, 'rb') as master_file:
  news_df = pickle.load(master_file)

scores = pd.read_csv('/content/drive/MyDrive/abnormal-distribution-project-data/lean/scores_master.csv')
# Same id convention as the master table: Finnhub ids are negated.
from_finnhub = scores.source == 'finnhub'
scores['_id'] = scores['id'].where(~from_finnhub, -scores['id'])
# Shift the baseline by +1 (presumably from -1/0/1 to 0/1/2 -- TODO confirm).
scores['finbert_baseline'] = scores['finbert_baseline'] + 1

mm = scores[['_id', 'finbert_baseline']].merge(news_df, on='_id')

In [None]:
# Rows where all five return/dictionary labels are 0 but the FinBERT baseline
# is not 0.
return_label_cols = ['dict_ret_close', 'dict_ret_open', 'dict_res_close', 'ret_close_3', 'ret_open_3']
labels_all_zero = (mm[return_label_cols] == 0).all(axis=1)
finbert_not_zero = mm['finbert_baseline'] != 0
ss = mm[labels_all_zero & finbert_not_zero]

In [None]:
# Keep only the news on which every label source agrees, then bring the
# key-development events into the same (date, text, score) layout.
with open(MASTER_PATH, 'rb') as master_file:
  news_df = pickle.load(master_file)

scores = pd.read_csv('/content/drive/MyDrive/abnormal-distribution-project-data/lean/scores_master.csv')
# Same id convention as the master table: Finnhub ids are negated.
from_finnhub = scores.source == 'finnhub'
scores['_id'] = scores['id'].where(~from_finnhub, -scores['id'])
scores['finbert_baseline'] = scores['finbert_baseline'] + 1

mm = scores[['_id', 'finbert_baseline']].merge(news_df, on='_id')

# A row is kept only if all six label columns carry the same class.
vote_cols = ['dict_ret_close', 'dict_ret_open', 'dict_res_close', 'ret_close_3', 'ret_open_3', 'finbert_baseline']
votes = mm[vote_cols]
idx_neg = (votes == 0).all(axis=1)
idx_pos = (votes == 2).all(axis=1)
idx_neu = (votes == 1).all(axis=1)

# score: 0 where all vote 0, 2 where all vote 2, otherwise 1.
mm['score'] = np.select([idx_neg, idx_pos], [0, 2], default=1)
kept_cols = ['close_date', 'text', 'score']
mm = pd.concat([mm.loc[unanimous, kept_cols] for unanimous in (idx_neg, idx_pos, idx_neu)])

with open('/content/drive/MyDrive/abnormal-distribution-project-data/lean/keydev.p','rb') as keydev_file:
  key_dev = pickle.load(keydev_file)

key_dev = key_dev[['situation','finbert_baseline','score','datetime']].rename(columns={'situation':'text', 'datetime':'date'})
key_dev['date'] = pd.to_datetime(key_dev['date']).dt.date

# For news, the close market date serves as the date.
mm = mm.rename(columns={'close_date':'date'})
mm['date'] = pd.to_datetime(mm['date']).dt.date

In [None]:
# Final set: the unanimous news rows plus the key-development events on which
# the FinBERT baseline and the dictionary score agree (classes 2, 1, then 0).
agreeing_events = [
    key_dev.loc[(key_dev.finbert_baseline == label) & (key_dev.score == label), ['date','text','score']]
    for label in (2, 1, 0)
]
final_set = pd.concat([mm] + agreeing_events)

In [None]:
# Split boundaries as plain datetime.date objects, because `final_set.date`
# holds dates rather than timestamps. This rebinds train_date / valid_date.
train_date = pd.Timestamp('2017-12-29').date()
valid_date = pd.Timestamp('2018-12-31').date()

In [None]:
# The concatenated parts keep their own row labels; replace them with a fresh
# 0..n-1 index.
final_set = final_set.reset_index(drop=True)

In [None]:
# Chronological split of the final set on `date`:
#   train: date <= train_date, rows shuffled
#   valid: train_date < date <= valid_date, sorted by date
#   test:  date > valid_date, sorted by date
# The training shuffle is seeded so that re-running the notebook reproduces
# the same final_train.p.
final_train_df = final_set[final_set.date <= train_date].sample(frac=1, random_state=0).reset_index(drop=True)
final_valid_df = final_set[(final_set.date > train_date ) & (final_set.date <= valid_date)].sort_values(by='date').reset_index(drop=True)
final_test_df = final_set[(final_set.date > valid_date )].sort_values(by='date').reset_index(drop=True)


In [None]:
# Rebalance the training classes: keep every class-0 row and down-sample
# classes 1 and 2 to at most the sizes below.
max_class_1_rows = 40000
max_class_2_rows = 40844

train_class_1 = final_train_df[final_train_df.score==1]
train_class_2 = final_train_df[final_train_df.score==2]

# min(...) keeps the cell working when a class has fewer rows than requested
# (sample() raises in that case). random_state makes the draw repeatable.
final_train_df =pd.concat([final_train_df[final_train_df.score==0],
                           train_class_1.sample(n=min(max_class_1_rows, len(train_class_1)), random_state=0),
                           train_class_2.sample(n=min(max_class_2_rows, len(train_class_2)), random_state=0)])

In [None]:
# Shuffle the rebalanced training set and keep the result. The shuffled frame
# used to be displayed only, so the frame saved below stayed grouped by class
# with its old row labels.
final_train_df = final_train_df.sample(frac=1, random_state=0).reset_index(drop=True)
final_train_df

Unnamed: 0,date,text,score
30427,2011-03-16,BofA Merrill Lynch has named Bob Elfring as th...,1.0
17928,2014-02-25,JPMorgan Chase & Co. will cut 136 mortgage ban...,0.0
69126,2012-04-26,"Dominion Resources, Inc. announced that it is ...",2.0
64630,2015-02-06,Growth stocks can be some of the most exciting...,2.0
95343,2013-09-18,"On September 18, 2013 DISH was granted a victo...",2.0
...,...,...,...
74421,2011-02-15,Broadcom Corporation announced a new family of...,2.0
15054,2012-08-07,"Office Depot, Inc. reported unaudited consolid...",0.0
44300,2015-01-12,PRNewswire Lennox International Inc NYSE LII...,1.0
43755,2016-08-22,"Bonnie Baha, a U.S. bond portfolio manager who...",1.0


In [None]:
# Rebalance the validation classes: keep every class-0 and class-2 row and
# down-sample class 1 to at most the size below.
max_valid_class_1_rows = 7595
valid_class_1 = final_valid_df[final_valid_df.score==1]

# min(...) keeps the cell working when class 1 has fewer rows than requested
# (sample() raises in that case). random_state makes the draw repeatable.
final_valid_df =pd.concat([final_valid_df[final_valid_df.score==0],
                           valid_class_1.sample(n=min(max_valid_class_1_rows, len(valid_class_1)), random_state=0),
                           final_valid_df[final_valid_df.score==2]])

In [None]:
# Save the rebalanced training and validation sets.
final_outputs = (
    ('/content/drive/MyDrive/abnormal-distribution-project-data/lean/final_train.p', final_train_df),
    ('/content/drive/MyDrive/abnormal-distribution-project-data/lean/final_valid.p', final_valid_df),
)
for final_path, final_df in final_outputs:
  with open(final_path, 'wb') as final_file:
    pickle.dump(final_df, final_file)