# Calculate staleness factor of news

In [57]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import torch
import yaml
from dotmap import DotMap
from transformers import BertModel
from transformers import BertTokenizerFast
from src.model.data_loading import get_data_loader_from_dataset
from numpy import dot
from numpy.linalg import norm
from src.model.neural_network import predict, predict_cls
import numpy as np 

from src.model.neural_network import (
    TRANSFORMER_HF_ID,
    train,
)

# Load the project configuration. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open("src/config.yaml") as config_file:
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)
# Column names read from the model section of the config.
input_col_name = config.model.input_col_name
target_col_name = config.model.target_col_name

# batch_size is handed to get_data_loader_from_dataset further down.
batch_size = 4
epochs = 3
# Tokenizer matching the transformer checkpoint identified by TRANSFORMER_HF_ID.
tokenizer = BertTokenizerFast.from_pretrained(TRANSFORMER_HF_ID)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# The un-finetuned (baseline) BERT weights are used on purpose, to avoid look-ahead bias.
model = BertModel.from_pretrained(TRANSFORMER_HF_ID)
model.eval()

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using GPU." if device.type == "cuda" else "No GPU available, using the CPU instead.")
model.to(device)

Using GPU.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30873, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [116]:
# Load the cleaned Benzinga dataset from the parquet location stored in the project config.
dataset = pd.read_parquet(config.data.benzinga.cleaned)

In [117]:
# Inspect the index name: it is passed as label_col to the data loader and used
# to re-index the per-ticker frames in the staleness loop further down.
dataset.index.name

'__null_dask_index__'

In [118]:
# Restrict the run to the first 300 rows (presumably to keep iteration fast).
# Take an explicit copy: later cells add columns to this frame, and writing to
# a bare slice of the full dataset is chained assignment (SettingWithCopyWarning).
dataset = dataset.iloc[:300].copy()

In [119]:
# To determine the freshness of news, I compare each article with all articles on the same stock published in the previous three days; its staleness is the highest cosine similarity between their embeddings.

In [120]:
# Build an unshuffled data loader over the parsed article bodies.
# Nothing is trained here, so the label column is only a placeholder.
dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    tokenizer=tokenizer,
    split=None,
    batch_size=batch_size,
    data_loader_kwargs={"shuffle": False},
    text_col="parsed_body",
    label_col="__null_dask_index__",
)

func:'embed_inputs' took: 1.3484 sec


  input_tensor = torch.tensor(inputs)
  mask_tensor = torch.tensor(masks)


In [122]:
# Embed every article with the baseline model. predict_cls is assumed to return
# one CLS vector per dataset row, in data-loader order — TODO confirm.
cls_tokens = predict_cls(model, dataloader, device)
# Attach the vectors positionally. A bare pd.Series carries a 0..n-1 RangeIndex
# and would be aligned by label against dataset's own index, leaving NaN
# wherever the labels differ.
dataset["cls_token"] = pd.Series(list(cls_tokens), index=dataset.index)
# Float placeholder, so the cosine similarities written later need no dtype upcast.
dataset["staleness"] = 0.0

In [155]:
# NOTE(review): in the visible notebook `ticker_news` is only assigned inside the
# per-ticker staleness loop further down, so this cell raises NameError on a
# fresh top-to-bottom run; it shows whatever ticker the loop processed last.
ticker_news

Unnamed: 0_level_0,stocks,title,channels,body,author,company_name,short_name,parsed_body,cls_token,staleness
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
75365,NOC,Northrop Grumman to Move Corporate Office to W...,[],"LOS ANGELES, Jan. 4 /PRNewswire-FirstCall/ -- ...",PRNewswire,Northrop Grumman Corporation,Northrop Grumman,FirstCall the company announced a decis...,"[0.45929235219955444, -2.080160140991211, -0.5...",
75909,NOC,Northrop Grumman and the U.S. Army Achieve Maj...,[],"EGLIN AIR FORCE BASE, Fla., Jan. 5, 2010 (GLOB...",Globe Newswire,Northrop Grumman Corporation,Northrop Grumman,the company and the U.S.Army recently comple...,"[-0.457493394613266, 0.08568877726793289, -1.0...",0.0


In [165]:
# Staleness of an article = highest cosine similarity between its CLS embedding
# and the embeddings of the same ticker's articles published in the preceding
# three days (bounds inclusive). Articles with no predecessor in that window get 0.
LOOKBACK_WINDOW = pd.DateOffset(days=3)

# Similarities are floats; make the target column float before writing into it.
dataset["staleness"] = dataset["staleness"].astype(float)

# dropna(): a missing ticker would select no rows and has nothing to score.
for ticker in dataset.stocks.dropna().unique():
    # Per-ticker frame ordered by publication time. reset_index() keeps the
    # original row label as the "__null_dask_index__" column for the write-back.
    # A stable sort keeps same-timestamp articles in their original order.
    ticker_news = (
        dataset[dataset.stocks == ticker]
        .reset_index()
        .set_index("time")
        .sort_index(kind="stable")
    )

    published = ticker_news.index
    # One row per article; norms are computed once instead of once per pair.
    embeddings = np.stack(ticker_news["cls_token"].to_numpy())
    embedding_norms = norm(embeddings, axis=1)
    staleness = np.zeros(len(ticker_news), dtype=float)

    # Work by position rather than by timestamp label, so duplicate timestamps
    # cannot make a lookup ambiguous. Same-timestamp articles that sort earlier
    # count as "previous".
    for pos, published_at in enumerate(published):
        window_start = published.searchsorted(published_at - LOOKBACK_WINDOW, side="left")
        if window_start >= pos:
            # No earlier article inside the window: staleness stays 0.
            continue
        previous = slice(window_start, pos)
        cosine_sims = (embeddings[previous] @ embeddings[pos]) / (
            embedding_norms[previous] * embedding_norms[pos]
        )
        staleness[pos] = cosine_sims.max()

    ticker_news["staleness"] = staleness
    # Restore the original row labels so the assignment aligns with dataset.
    ticker_news = ticker_news.set_index("__null_dask_index__")
    dataset.loc[ticker_news.index, "staleness"] = ticker_news["staleness"]

In [171]:
# Parsed text of the first THG article, for manual inspection.
dataset.loc[dataset.stocks == "THG", "parsed_body"].iloc[0]

'FirstCall      the company  , a leading property and casualty insurance provider, today announced that it has named Tony de Padua as a member of the company\'s executive leadership team.Additionally, he will have direct responsibility for The Hanover\'s growing marine and umbrella businesses, driving continued profitable growth and building new niche products for lucrative markets, such as the technology industry.He will be based in The Hanover\'s Worcester headquarters."We are extremely pleased that Tony has joined our team," said Marita Zuraitis, president of The Hanover\'s property and casualty companies."As we continue to expand our commercial lines organization, it is important that our senior leadership team is aligned to effectively manage our risk management capabilities as well as our continued expansion.Tony\'s tremendous knowledge and skill will help us achieve our goal of becoming one of the best commercial lines businesses in the industry." De Padua brings more than 30 ye

In [180]:
# All THG rows, including the computed staleness column.
dataset.loc[dataset.stocks == "THG"]

Unnamed: 0_level_0,time,stocks,title,channels,body,author,company_name,short_name,parsed_body,cls_token,staleness
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
75302,2010-01-04 16:02:58-04:00,THG,The Hanover Names Tony de Padua To Lead Corpor...,[],"WORCESTER, Mass. Jan. 4 /PRNewswire-FirstCall/...",PRNewswire,"The Hanover Insurance Group, Inc.",The Hanover Insurance Group,"FirstCall the company , a leading proper...","[-0.1773327738046646, -1.3504993915557861, -0....",0.0
76981,2010-01-05 16:17:38-04:00,THG,The Hanover Names Andrejs Krutainis to Lead Ha...,[],"WORCESTER, Mass., Jan. 5 /PRNewswire-FirstCall...",PRNewswire,"The Hanover Insurance Group, Inc.",The Hanover Insurance Group,"FirstCall the company , a leading proper...","[-0.1505945324897766, -1.544845700263977, -0.6...",0.985288


In [181]:
from IPython.display import display

# Raw (unparsed) body of the first THG article, to compare against parsed_body.
display(dataset.loc[dataset.stocks == "THG", "body"].iloc[0])


'WORCESTER, Mass. Jan. 4 /PRNewswire-FirstCall/ -- The Hanover Insurance Group,\nInc. (NYSE: THG), a leading property and casualty insurance provider, today\nannounced that it has named Tony de Padua as a member of the company\'s\nexecutive leadership team.\n\n(Logo: http://www.newscom.com/cgi-bin/prnh/20051031/NEM023LOGO )\n\nIn this role, de Padua will lead the company\'s corporate commercial lines\nproduct and underwriting functions. Additionally, he will have direct\nresponsibility for The Hanover\'s growing marine and umbrella businesses,\ndriving continued profitable growth and building new niche products for\nlucrative markets, such as the technology industry. He will be based in The\nHanover\'s Worcester headquarters.\n\n"We are extremely pleased that Tony has joined our team," said Marita\nZuraitis, president of The Hanover\'s property and casualty companies. "As we\ncontinue to expand our commercial lines organization, it is important that our\nsenior leadership team is align

# Distribution of staleness factors