# Calculate staleness factor of news

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import yaml
from dotmap import DotMap
from transformers import BertModel
from transformers import BertTokenizerFast
from src.model.data_loading import get_data_loader_from_dataset
from numpy import dot
from numpy.linalg import norm
from src.model.neural_network import predict

from src.model.neural_network import (
    TRANSFORMER_HF_ID,
    train,
)

# Load the project configuration. The file is opened in a context manager so
# the handle is closed as soon as parsing finishes instead of being leaked.
with open("src/config.yaml", encoding="utf-8") as config_file:
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)

# Column names read from the model section of the configuration.
input_col_name = config.model.input_col_name
target_col_name = config.model.target_col_name

# Notebook-level settings; batch_size is passed to the data loader below.
batch_size = 4
epochs = 3

# Tokenizer matching the pretrained transformer identified by TRANSFORMER_HF_ID.
tokenizer = BertTokenizerFast.from_pretrained(TRANSFORMER_HF_ID)

In [None]:
# The untuned baseline BERT checkpoint is used on purpose to avoid look-ahead
# bias; it is switched to evaluation mode right after loading.
model = BertModel.from_pretrained(TRANSFORMER_HF_ID).eval()

# Pick the GPU when one is available, otherwise fall back to the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")
print("Using GPU." if gpu_available else "No GPU available, using the CPU instead.")

# Move the model to the selected device (last expression, so the cell shows it).
model.to(device)

In [None]:
# Read the parquet file whose path is configured at config.data.benzinga.cleaned.
dataset = pd.read_parquet(path=config.data.benzinga.cleaned)


In [None]:
# Run the baseline model through predict and keep the result as cls_tokens.
# NOTE(review): `train` here is the training function imported from
# src.model.neural_network, not a data loader; the later call in this notebook
# passes `dataloader` in this position. This argument looks wrong — confirm
# what predict should iterate over.
cls_tokens = predict(model, train, device)

In [None]:
# To determine the freshness of news, I compare the similarity of each news article with all articles published in the previous three days.

In [None]:
# Restrict the analysis to a single ticker.
ticker = "AAPL"

# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `dataset` (avoids chained-assignment warnings).
ticker_news = dataset[dataset.ticker == ticker].copy()

# Placeholder columns: cls_token holds one embedding object per article, and
# staleness is a float because it later receives cosine similarities.
ticker_news["cls_token"] = None
ticker_news["cls_token"] = ticker_news["cls_token"].astype(object)
ticker_news["staleness"] = 0.0

# The loader must NOT shuffle: the embeddings it produces are assigned back to
# ticker_news row by row, so batch order has to follow row order.
# NOTE(review): assumes data_loader_kwargs is forwarded to torch's DataLoader,
# where the keyword is spelled `shuffle` — confirm in get_data_loader_from_dataset.
dataloader = get_data_loader_from_dataset(dataset=ticker_news,
                                          tokenizer=tokenizer,
                                          batch_size=batch_size,
                                          data_loader_kwargs=dict(shuffle=False,
                                                                  pin_memory=True))

In [None]:
# Embed every article of the selected ticker with the baseline model.
# NOTE(review): assumes predict returns one embedding vector per row of
# ticker_news, in row order (i.e. the data loader does not shuffle) — confirm.
cls_tokens = predict(model, dataloader, device)
ticker_news["cls_token"] = cls_tokens

# Positional copies, so the loop does not depend on the frame's index being
# unique or time-based.
publish_times = pd.to_datetime(ticker_news["time"]).reset_index(drop=True)
embeddings = ticker_news["cls_token"].reset_index(drop=True)
lookback = pd.Timedelta(days=3)

# Staleness of an article = highest cosine similarity between its embedding
# and the embedding of any article published in the preceding three days.
staleness = []
for position, current_time in enumerate(publish_times):
    # Only strictly earlier articles count: including the article itself
    # would force the maximum similarity to 1, and looking forward in time
    # would leak future information.
    in_window = (publish_times >= current_time - lookback) & (publish_times < current_time)
    previous_cls = embeddings[in_window]
    if previous_cls.empty:
        # Nothing to compare against, so the article is treated as fresh.
        staleness.append(0.0)
        continue
    current_cls = embeddings.iloc[position]
    cosine_sims = previous_cls.apply(
        lambda x: dot(current_cls, x) / (norm(current_cls) * norm(x))
    )
    staleness.append(cosine_sims.max())

# Assign the whole column at once; values are in the same order as the rows.
ticker_news["staleness"] = staleness

In [None]:
# NOTE(review): "base_bert_" looks like a truncated column name, and nothing
# earlier in this notebook adds such a column to `dataset` — confirm the
# intended column before relying on this cell.
dataset["base_bert_"]

# Distribution of staleness factors