# Calculate staleness factor of news

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import torch
from transformers import BertModel
from transformers import BertTokenizerFast
from src.model.data_loading import get_data_loader_from_dataset
from src.config import config, MODEL_CONFIG
from numpy import dot
from numpy.linalg import norm
from src.model.neural_network import predict_cls

# Identifier of the transformer checkpoint, taken from the project's model
# config; it is passed to `from_pretrained` for both the tokenizer and the model.
TRANSFORMER_HF_ID = MODEL_CONFIG.transformer_hugface_id

In [None]:
# Tokenizer belonging to the same checkpoint that is used for the model.
tokenizer = BertTokenizerFast.from_pretrained(TRANSFORMER_HF_ID)
# Number of articles per batch when building the data loader.
batch_size = 16
# Bare expression: the notebook shows this config entry as the cell output.
# NOTE(review): other cells read `config.data.benzinga.*`; confirm that
# `config.data.bzg` is the intended key here.
config.data.bzg.encoding_matrix

In [None]:
# The baseline BERT checkpoint is loaded here (rather than a model trained in
# this project) to avoid look-ahead bias; it is put into eval mode right away.
model = BertModel.from_pretrained(TRANSFORMER_HF_ID).eval()

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda") if gpu_available else torch.device("cpu")
print(device)
model.to(device)

In [None]:
# Load the cleaned Benzinga data from the parquet path stored in the project
# config. Later cells read its `parsed_body`, `stocks` and `time` columns.
dataset = pd.read_parquet(config.data.benzinga.cleaned)

In [None]:
# Display the name of the dataset's index as the cell output.
dataset.index.name

In [None]:
# To determine the freshness of news, I compare the similarity of each news article with all articles published in the previous three days.

In [None]:
# Length of every article body; this column is handed to the data loader as
# its label column further down.
dataset["text_length"] = dataset["parsed_body"].map(len)

In [None]:
# Shuffling is disabled: the embeddings are later re-attached to the dataset
# by position, which presumably requires the loader to keep the row order.
loader_options = {"shuffle": False}
dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    split=None,
    batch_size=batch_size,
    # NOTE(review): `text_length` looks like a stand-in label, since only the
    # embeddings are used afterwards -- confirm against the loader.
    label_col="text_length",
    data_loader_kwargs=loader_options,
)

In [None]:
# Run the model over the whole loader. The result is presumably one CLS
# embedding per article (TODO confirm against `predict_cls`); wrapping it in
# `list` makes every Series element hold one complete embedding.
cls_embeddings = predict_cls(model, dataloader, device)
cls_tokens = pd.Series(list(cls_embeddings))

In [None]:

# Give the embeddings the dataset's own index before attaching them.
# Otherwise index alignment leads to problems: the fresh Series carries a
# default 0..n-1 index, and column assignment aligns on index labels.
cls_tokens.index = dataset.index
dataset["cls_token"] = cls_tokens
# Initialise as float: the scores written later are cosine similarities, and
# writing floats into an integer column relies on an implicit upcast that
# pandas has deprecated.
dataset["staleness"] = 0.0

In [None]:
import numpy as np

# Look-back horizon: every article is compared with the same ticker's other
# articles published within the three days up to (and including) its timestamp.
STALENESS_WINDOW = pd.DateOffset(days=3)


def _max_similarity_in_window(times, embeddings, window=STALENESS_WINDOW):
    """Return, per article, the highest cosine similarity to its window peers.

    For the article at position ``k`` the window holds all articles whose
    timestamp lies in ``[times[k] - window, times[k]]``; the article itself is
    excluded by position, so articles that share a timestamp are compared
    with each other instead of being mistaken for the article itself.

    Args:
        times: Chronologically sorted ``pd.DatetimeIndex``, one entry per
            article.
        embeddings: 2-D array-like whose row ``k`` is the embedding of
            article ``k``.
        window: Offset subtracted from each timestamp to get the window start.

    Returns:
        Float ``np.ndarray`` aligned with ``times``. Articles without any
        other article in their window get 0.0.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    scores = np.zeros(len(times), dtype=float)
    if len(times) < 2:
        return scores

    # Normalise once so that each cosine similarity is a plain dot product.
    # All-zero embeddings stay zero, giving similarity 0 rather than NaN.
    lengths = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit = np.divide(
        embeddings, lengths, out=np.zeros_like(embeddings), where=lengths > 0
    )

    # Positional window bounds. On a sorted index these select the same rows
    # as the label slice `.loc[t - window : t]` (both ends inclusive).
    starts = times.searchsorted(times - window, side="left")
    stops = times.searchsorted(times, side="right")

    for row, (start, stop) in enumerate(zip(starts, stops)):
        if stop - start < 2:
            # The window holds only the article itself.
            continue
        similarities = unit[start:stop] @ unit[row]
        # Drop the self-comparison, which would always be 1.
        similarities[row - start] = -np.inf
        scores[row] = similarities.max()
    return scores


# The scores are floats; make sure the target column can hold them without an
# implicit dtype upcast.
dataset["staleness"] = dataset["staleness"].astype(float)
staleness_col = dataset.columns.get_loc("staleness")

# Number of articles that share their timestamp with at least one other
# article of the same ticker (diagnostic only).
n_of_sametime_news = 0

for i, ticker in enumerate(set(dataset.stocks)):
    print(f"{i}, {ticker=}")
    # Row positions of this ticker's news. Working with positions keeps the
    # write-back independent of the index name and of index uniqueness.
    rows = np.flatnonzero(
        (dataset.stocks == ticker).to_numpy(dtype=bool, na_value=False)
    )
    if rows.size == 0:
        # A missing ticker (NaN) never compares equal to itself.
        continue

    ticker_news = (
        dataset.iloc[rows][["time", "cls_token"]]
        .assign(_row=rows)
        .sort_values("time", kind="stable")
    )
    times = pd.DatetimeIndex(ticker_news["time"])
    n_of_sametime_news += int(times.duplicated(keep=False).sum())

    # Assumes every `cls_token` entry is a 1-D vector of the same length.
    embeddings = np.stack(ticker_news["cls_token"].to_list())
    # Add entries to data set
    dataset.iloc[ticker_news["_row"].to_numpy(), staleness_col] = (
        _max_similarity_in_window(times, embeddings)
    )

print(n_of_sametime_news)

In [None]:
# Write the dataset, now including the `text_length`, `cls_token` and
# `staleness` columns, back to the same path it was loaded from. This
# overwrites the cleaned input file.
dataset.to_parquet(config.data.benzinga.cleaned)

# Distribution of staleness factors

In [None]:
import plotly.express as px 

In [None]:
# Histogram of the staleness scores. Rows whose score is exactly 0 (the
# initial value) are left out.
has_score = dataset["staleness"] != 0
px.histogram(dataset.loc[has_score, "staleness"])

In [None]:
# Release the Google Colab runtime once the notebook has finished.
# NOTE(review): the import only succeeds when running inside Colab, and
# `unassign()` presumably discards all in-memory state -- run this cell last.
from google.colab import runtime
runtime.unassign()