# Calculate stalens factor of news

In [4]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import torch
from transformers import BertModel
from transformers import BertTokenizerFast
from src.model.data_loading import get_data_loader_from_dataset
from src.config import config, MODEL_CONFIG
from numpy import dot
from numpy.linalg import norm
from src.model.neural_network import predict_cls

TRANSFORMER_HF_ID = MODEL_CONFIG.transformer_hugface_id

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
batch_size = 16
tokenizer = BertTokenizerFast.from_pretrained(TRANSFORMER_HF_ID)

In [5]:
# Use baseline bert model to avoid look-ahead bias 
model = BertModel.from_pretrained(TRANSFORMER_HF_ID)
model.eval()

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print("Using GPU.")
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
    
model.to(device)

Using GPU.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30873, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [52]:
dataset = pd.read_parquet(config.data.benzinga.cleaned)

In [53]:
dataset.index.name

'__null_dask_index__'

In [55]:
# To determine the freshness of news, I compare the similarity of each news article with all articles published in the previous three days.

In [56]:
dataset["text_length"] = dataset["parsed_body"].map(lambda x: len(x))

In [57]:
dataloader = get_data_loader_from_dataset(dataset=dataset,
                                          tokenizer=tokenizer,
                                          split=None,
                                          batch_size=batch_size,
                                          data_loader_kwargs=dict(shuffle=False),
                                          text_col="parsed_body",
                                          label_col="text_length") # label col is just dummy variable... we dont train the model 

func:'embed_inputs' took: 14.4095 sec



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



In [58]:
cls_tokens = predict_cls(model, dataloader, device)
cls_tokens = pd.Series(list(cls_tokens))
# Otherwise index alignment leads to problems
cls_tokens.index = dataset.index 
dataset["cls_token"] = cls_tokens
dataset["staleness"] = 0

In [87]:
n_of_sametime_news = 0

for ticker in set(dataset.stocks):
    
    ticker_news = dataset[dataset.stocks == ticker].reset_index()
    ticker_news = ticker_news.set_index("time").sort_index(ascending=True)
    # Set staleness of first news message to 0 
    ticker_news.at[ticker_news.index[0], "staleness"] = 0

    for time in  ticker_news.index:
        previous_news = ticker_news.loc[(time-pd.DateOffset(days=3)):time, "cls_token"]
        if len(previous_news) == 1:
            ticker_news.at[time, "staleness"] = 0
        else:
            try:
                current_cls = previous_news.pop(time)
            except IndexError as e:
                n_of_sametime_news += 1
                current_cls = previous_news.iloc[-1]
                previous_news = previous_news.iloc[:-1]
                
            cosine_sims = previous_news.apply(lambda x: dot(current_cls, x) / (norm(current_cls)*norm(x)))
            ticker_news.at[time, "staleness"] = cosine_sims.max()
            
    ticker_news.set_index("__null_dask_index__", inplace=True)
    # Add entries to data set
    dataset.loc[ticker_news.index, "staleness"] = ticker_news.loc[:, "staleness"]
    
print(n_of_sametime_news)

22


In [None]:
dataset.to_parquet(config.data.benzinga.cleaned)

# Distribution of staleness factors

In [89]:
import plotly.express as px 

In [90]:
px.histogram(dataset.loc[dataset["staleness"]!=0, "staleness"])

In [None]:
from google.colab import runtime
runtime.unassign()