In [None]:
# Work from the project root. The absolute path is specific to the author's cluster account.
# Presumably needed so that the `src.config` import below resolves — TODO confirm.
%cd /gxfs_work/cau/sunms534/trading_bot/

In [None]:
import pandas as pd 
import numpy as np
from src.config import config, MODEL_CONFIG, PREP_CONFIG
from transformers import AutoTokenizer

In [None]:
# 1. Taking a look at cleaned dataset

In [None]:
# Load the news table stored at the path configured as `config.data.news.stripped`.
stripped = pd.read_parquet(config.data.news.stripped)

In [None]:
# List the available columns.
stripped.columns

In [None]:
# Show the last rows of the title, ticker (`stocks`) and article body columns.
stripped[["title", "stocks", "parsed_body"]].tail()

In [None]:
# 2. Take a look at input_ids and if tokenizer works correctly

In [None]:
# Token ids of the title-only inputs; rows are looked up by the learning dataset's index labels below.
title_inputs_ids = pd.read_parquet(config.data.news.title_only.input_ids)

In [None]:
# The learning dataset; its `stocks` and `parsed_body` columns are read below.
ld = pd.read_parquet(config.data.learning_dataset)

In [None]:
# Alternative source, left disabled by the author because it needs more memory:
#cleaned = ld = pd.read_parquet(config.data.merged) # requires more memory

In [None]:
# Tokenizer named in the preprocessing config; used below to decode token ids back to text.
tokenizer = AutoTokenizer.from_pretrained(PREP_CONFIG.tokenizer)

In [None]:
# Flag articles whose body contains the literal phrase "today announced".
# `.str.contains(..., regex=False, na=False)` is the vectorised form of the per-row `in` test
# and treats missing bodies as non-matches instead of raising a TypeError.
mask = ld.parsed_body.str.contains("today announced", regex=False, na=False)

In [None]:
# Keep only the flagged rows. This rebinds `ld`, so the unfiltered frame is no longer
# available under this name until it is reloaded.
ld = ld[mask]

In [None]:
# Spot-check one randomly chosen article: print its index label, its ticker, the decoded
# title token ids and the article body. No seed is set, so each run picks a different row.
n = np.random.randint(0, ld.shape[0])
index = ld.index[n]
print(index)
entry = title_inputs_ids.loc[index, :].values

print(ld.loc[index, "stocks"])
print(f"{tokenizer.decode(entry)} \n")
print(ld.loc[index, "parsed_body"])

# Decode id by id to see how the text was split into individual tokens.
for x in entry:
    print(f"{x} --- {tokenizer.decode(x)}")


# Average number of tokens in news

In [None]:
# NOTE: despite the variable name, this loads `config.data.news.input_ids` (not the
# `title_only` ids loaded earlier) and replaces the previous `title_inputs_ids`.
title_inputs_ids = pd.read_parquet(config.data.news.input_ids)


In [None]:
# Count the padding tokens in each row. Comparing first and summing the boolean mask counts
# the matches; summing the masked values themselves (all zeros) would always give 0.
# Assumes the padding token id is 0 — TODO confirm against the tokenizer.
n_padding = (title_inputs_ids == 0).sum(axis=1)

# Predictions from Model

In [None]:
# Reload only the columns of the learning dataset that are used in this section,
# together with the title-only token ids and their masks.
ld = pd.read_parquet(config.data.learning_dataset, columns=["z_score", "z_score_class", "parsed_body"])
title_inputs_ids = pd.read_parquet(config.data.news.title_only.input_ids)
# Passed as the second model input in the inference loop below.
masks = pd.read_parquet(config.data.news.title_only.masks)

In [None]:
# Restrict the evaluation to the last 10,000 rows of the learning dataset.
ld_mini = ld.iloc[-10000:,:]

In [None]:
# First index label of `ld`; the inference loop below reassigns `index` on every iteration.
index = ld.index[0]

In [None]:
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm


In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification, BertTokenizer
from transformers import pipeline

# Load the locally stored FinBERT sequence classifier with three output classes.
# The absolute path is specific to the author's cluster account.
finbert = BertForSequenceClassification.from_pretrained("/gxfs_work/cau/sunms534/trading_bot/data/models/ProsusAI/finbert", num_labels=3, local_files_only=True)

# Per-article results, appended in the order of `ld_mini.index`.
prob = []     # highest class probability of each prediction
preds = []    # predicted class collapsed to 0/1 (see the mapping note in the loop)
actuals = []  # `z_score_class` of the same row in the learning dataset
with torch.no_grad():
    for index in tqdm(ld_mini.index):
        # One article per forward pass: wrap the row in a list to get a batch of size 1.
        input_id = torch.tensor([title_inputs_ids.loc[index, :].tolist()], dtype=torch.int32)
        mask = torch.tensor([masks.loc[index, :].tolist()])
        result = finbert(input_ids=input_id, attention_mask=mask)
        # Normalise over the class axis explicitly; calling softmax without `dim` is
        # deprecated and emits a warning. For these 2-D logits the result is unchanged.
        probs = F.softmax(result.logits, dim=-1).numpy()
        prob.append(probs.max())
        # Collapse the three classes to a binary label: class index 0 -> 1, 1 -> 0, 2 -> 1.
        # NOTE(review): this mapping only reads as "up/down" if index 0 is the positive and
        # index 1 the negative class; the label order noted in the comment below says otherwise.
        # Verify against `finbert.config.id2label` before trusting the accuracy figures.
        preds.append((probs.argmax() + 1) % 2)
        actuals.append(ld.loc[index, "z_score_class"])
        #print(result)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

In [None]:
# Predicted class index for the last article processed by the loop above.
# `dim=-1` normalises over the class axis; omitting `dim` is deprecated and warns.
F.softmax(result.logits, dim=-1).numpy().argmax()

In [None]:
# Decode the title of one evaluated article. The ids are passed as a NumPy array, as in the
# spot-check cell above, rather than as a pandas Series.
# NOTE(review): position 532 is hard-coded, presumably from an earlier session — confirm.
tokenizer.decode(title_inputs_ids.loc[ld_mini.index[532]].values)

In [None]:
# Number of predictions collected by the inference loop.
len(preds)

In [None]:
# One row per evaluated article; columns are 0 = prediction, 1 = actual class, 2 = top-class probability.
df = pd.DataFrame(zip(preds,actuals, prob))

In [None]:
# Distribution of the top-class probabilities.
df.loc[:, 2].describe()

In [None]:
# Keep only predictions whose top-class probability exceeds 0.95. This rebinds `df`;
# the filtered frame keeps the original row labels.
df = df[df.iloc[:, 2] > 0.95]

In [None]:
# Number of high-confidence predictions left.
len(df)

In [None]:
# Number of high-confidence predictions that equal the actual class.
(df.iloc[:, 0] == df.iloc[:, 1]).sum()

In [None]:
# Class balance of the actual labels among the high-confidence rows.
df.iloc[:, 1].value_counts()

In [None]:
# Class balance of the predicted labels among the high-confidence rows.
df.iloc[:, 0].value_counts()

In [None]:
df.iloc[:, 2].idxmax()

In [None]:
df.loc[532, :]

In [None]:
ld.loc[ld_mini.index[532], "parsed_body"]

## Analyze impact of correction for standard deviation 

In [None]:
import pandas as pd
from src.config import config, MODEL_CONFIG
import numpy as np

In [None]:
# Load the merged dataset from the path configured as `config.data.merged`.
dat: pd.DataFrame = pd.read_parquet(path=config.data.merged)

In [None]:
# List the available columns.
dat.columns

In [None]:
X = dat[['r', 'r_spy', 'std_252', 'stocks', 'entry_time']]

In [None]:
X.loc[:, 'mkt_adj'] = X['r'] - X['r_spy']

In [None]:
X = X.dropna()

In [None]:
X.describe()

In [None]:
X.loc[:, 'z_score'] = X['mkt_adj'] / X['std_252']

In [None]:
X.sort_values('z_score', inplace=True)

In [None]:
X

In [None]:
X['z_score'].describe()

In [None]:
X.loc[:, "z_score"] = X["z_score"].clip(lower=X["z_score"].quantile(0.05), upper=X["z_score"].quantile(0.95))

In [None]:
X['z_score'].describe()

After winsorizing at the 5th and 95th percentiles, the maximum is 1.6 and the minimum is -1.6.

The problem with standardizing by `std_252` is that there are securities (stocks?) with very low volatility. And because we take the standard deviation of `r` rather than of `r - r_spy`, we get nonsensical, far too large values. -> Filter out the smallest `std_252` values.

In [None]:
# Distribution of the standard deviation used as the denominator of the z-score.
X['std_252'].describe()

In [None]:
# Display the rows ordered by `std_252` (the result is not assigned back to `X`).
X.sort_values('std_252')

In [None]:
# Load the learning dataset to compare its stored `z_score` with the values computed above.
learndat: pd.DataFrame = pd.read_parquet(path=config.data.learning_dataset)

In [None]:
# Display the rows ordered by the stored z-score.
learndat.sort_values("z_score")

In [None]:
learndat.z_score.describe()

In [None]:
# Distribution of the stored `z_score` divided by `std_252`.
(learndat.z_score/learndat.std_252).describe()

## Staleness analysis

In [None]:
# Reload the merged dataset from `config.data.merged`; this rebinds `dat` to a freshly read frame.
dat: pd.DataFrame = pd.read_parquet(path=config.data.merged)

def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated word sets of two strings.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        ``len(A & B) / len(A | B)`` for the word sets ``A`` and ``B``, a float in
        ``[0, 1]``. If neither string contains a word, the union is empty and 0.0 is
        returned instead of raising ``ZeroDivisionError``.
    """
    set1 = set(str1.split())
    set2 = set(str2.split())
    union = set1 | set2
    if not union:
        # Both texts are empty or whitespace-only: the ratio is undefined, report no overlap.
        return 0.0
    return len(set1 & set2) / len(union)

dataset = dat  # alias, not a copy: the `jaccard2` column written below is added to `dat` as well

from tqdm.notebook import tqdm

# To determine the freshness of news, I compare the similarity of each news article with all articles published in the previous three days.
# Fall back to "index", the column name `reset_index()` gives an unnamed index.
original_index_name = dataset.index.name or "index"

# Analyse a fixed subset of 99 tickers. Slicing a `set` of strings can yield a different
# subset in every kernel session (string hashing is randomised), so sort the tickers first.
tickers = sorted(dataset.stocks.dropna().unique())[:99]

for ticker in tqdm(tickers, desc="stocks"):
    orig_sort_ticker_news = dataset[dataset.stocks == ticker]

    # Using time sorted df makes for easier splicing later
    ticker_news = orig_sort_ticker_news.reset_index().set_index("est_entry_time").sort_index(ascending=True)

    for idx in orig_sort_ticker_news.index:
        time = orig_sort_ticker_news.at[idx, 'est_entry_time']
        # Label slices on the time index include both ends: everything from three days before
        # up to one minute before the current article.
        window_start = time - pd.DateOffset(days=3)
        window_end = time - pd.DateOffset(minutes=1)
        previous_news = ticker_news.loc[window_start:window_end, "parsed_body"]
        if len(previous_news) == 0:
            # Nothing was published in the window, so the article cannot be stale.
            max_similarity = 0
        else:
            current_str = orig_sort_ticker_news.at[idx, 'parsed_body']
            # Staleness is the similarity to the closest earlier article.
            max_similarity = previous_news.apply(lambda x: jaccard_similarity(current_str, x)).max()
        # Always write through the original row label, so both cases address exactly one article
        # even when several articles share a timestamp.
        ticker_news.loc[ticker_news[original_index_name] == idx, "jaccard"] = max_similarity

    ticker_news.set_index(original_index_name, inplace=True)
    # Add entries to data set
    dataset.loc[ticker_news.index, "jaccard2"] = ticker_news.loc[:, "jaccard"]

In [None]:
# Drop every row with a missing value in any column. `jaccard2` is only filled for the tickers
# processed by the loop above, so this also restricts `tmp` to those tickers.
tmp = dataset.dropna()

In [None]:
# Compare the `jaccard` and `staleness` columns already present in the data with the
# `jaccard2` values computed in this notebook.
dataset[["jaccard", "staleness", 'jaccard2']].describe()

In [None]:
import plotly.express as px

In [None]:
# Histogram of the `jaccard` column read from the dataset.
px.histogram(dataset["jaccard"])

In [None]:
# Histogram of the `staleness` column read from the dataset.
px.histogram(dataset["staleness"])

In [None]:
# Rows of ticker "ACCO" from index label 39023 onwards.
# NOTE(review): the label is hard-coded, presumably from an earlier session — confirm.
dataset[dataset.stocks == "ACCO"].loc[39023:]

In [None]:
# Articles whose recomputed similarity is exactly 1, i.e. whose word set equals that of an
# article found in the look-back window.
jacc_df = tmp[tmp["jaccard2"] == 1]

In [None]:
# The same selection restricted to ticker "ACCO".
jacc_df[jacc_df.stocks == "ACCO"]