In [None]:
%cd /gxfs_work/cau/sunms534/trading_bot/

In [None]:
import pandas as pd 
import numpy as np
from src.config import config, ClassificationConfig, PREP_CONFIG
from transformers import AutoTokenizer

In [None]:
# 1. Taking a look at cleaned dataset

In [None]:
stripped = pd.read_parquet(config.data.news.stripped)

In [None]:
stripped.columns

In [None]:
stripped[["title", "stocks", "parsed_body"]].tail()

In [None]:
# 2. Take a look at input_ids and if tokenizer works correctly

In [None]:
title_inputs_ids = pd.read_parquet(config.data.news.title_only.input_ids)

In [None]:
ld = pd.read_parquet(config.data.learning_dataset)

In [None]:
#cleaned = ld = pd.read_parquet(config.data.merged) # requires more memory

In [None]:
tokenizer = AutoTokenizer.from_pretrained(PREP_CONFIG.tokenizer)

In [None]:
mask = ld.parsed_body.apply(lambda x: "today announced" in x)

In [None]:
ld = ld[mask]

In [None]:
n = np.random.randint(0, ld.shape[0])
index = ld.index[n]
print(index)
entry = title_inputs_ids.loc[index, :].values

print(ld.loc[index, "stocks"])
print(f"{tokenizer.decode(entry)} \n")
print(ld.loc[index, "parsed_body"])

for x in entry:
    print(f"{x} --- {tokenizer.decode(x)}")


# Average number of token in news

In [None]:
title_inputs_ids = pd.read_parquet(config.data.news.input_ids)


In [None]:
n_padding = title_inputs_ids[title_inputs_ids == 0].sum(axis=1)

# Predictions from Model

In [None]:
ld = pd.read_parquet(config.data.learning_dataset, columns=["z_score", "z_score_class", "parsed_body"])
title_inputs_ids = pd.read_parquet(config.data.news.title_only.input_ids)
masks = pd.read_parquet(config.data.news.title_only.masks)

In [None]:
ld_mini = ld.iloc[-10000:,:]

In [None]:
index = ld.index[0]

In [None]:
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm


In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification, BertTokenizer
from transformers import pipeline

finbert = BertForSequenceClassification.from_pretrained("/gxfs_work/cau/sunms534/trading_bot/data/models/ProsusAI/finbert", num_labels=3, local_files_only=True)

prob = []
preds = []
actuals = []
with torch.no_grad():
    for index in tqdm(ld_mini.index):
        input_id = torch.tensor([title_inputs_ids.loc[index, :].tolist()], dtype=torch.int32)
        mask = torch.tensor([masks.loc[index, :].tolist()])
        result = finbert(input_id, mask)
        probs = F.softmax(result.logits).numpy()
        prob.append(probs.max())
        preds.append((probs.argmax()  + 1) % 2)
        actuals.append(ld.loc[index, "z_score_class"])
        #print(result)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

In [None]:
F.softmax(result.logits).numpy().argmax()

In [None]:
tokenizer.decode(title_inputs_ids.loc[ld_mini.index[532]])

In [None]:
len(preds)

In [None]:
df = pd.DataFrame(zip(preds,actuals, prob))

In [None]:
df.loc[:, 2].describe()

In [None]:
df = df[df.iloc[:, 2] > 0.95]

In [None]:
len(df)

In [None]:
(df.iloc[:, 0] == df.iloc[:, 1]).sum()

In [None]:
df.iloc[:, 1].value_counts()

In [None]:
df.iloc[:, 0].value_counts()

In [None]:
df.iloc[:, 2].idxmax()

In [None]:
df.loc[532, :]

In [None]:
ld.loc[ld_mini.index[532], "parsed_body"]

## Analyze impact of correction for standard deviation 

In [None]:
import pandas as pd
from src.config import config, MODEL_CONFIG
import numpy as np

In [None]:
dat: pd.DataFrame = pd.read_parquet(path=config.data.merged)

In [None]:
dat.columns

In [None]:
X = dat[['r', 'r_spy', 'std_252', 'stocks', 'entry_time']]

In [None]:
X.loc[:, 'mkt_adj'] = X['r'] - X['r_spy']

In [None]:
X = X.dropna()

In [None]:
X.describe()

In [None]:
X.loc[:, 'z_score'] = X['mkt_adj'] / X['std_252']

In [None]:
X.sort_values('z_score', inplace=True)

In [None]:
X

In [None]:
X['z_score'].describe()

In [None]:
X.loc[:, "z_score"] = X["z_score"].clip(lower=X["z_score"].quantile(0.05), upper=X["z_score"].quantile(0.95))

In [None]:
X['z_score'].describe()

Maximum ist 1.6, minimum ist -1.6 nach winsorisieren...

Problem, wenn wir mit std_252 standardisieren ist, dass es Wertpapiere (Aktien?) gibt mit sehr geringer Vola. Und weil wir die std von r nehmen, und nicht von r - r_spy bekommen wir unsinnige, viel zu große Werte heraus. -> Filtere minimale std_252 heraus.

In [None]:
X['std_252'].describe()

In [None]:
X.sort_values('std_252')

In [None]:
learndat: pd.DataFrame = pd.read_parquet(path=config.data.learning_dataset)

In [None]:
learndat.sort_values("z_score")

In [None]:
learndat.z_score.describe()

In [None]:
(learndat.z_score/learndat.std_252).describe()

## Staleness analysis

In [None]:
dat: pd.DataFrame = pd.read_parquet(path=config.data.merged)

def jaccard_similarity(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

dataset = dat

from tqdm.notebook import tqdm

# To determine the freshness of news, I compare the similarity of each news article with all articles published in the previous three days.
original_index_name = dataset.index.name
for ticker in tqdm(list(set(dataset.stocks))[1:100], desc="stocks"):
    orig_sort_ticker_news = dataset[dataset.stocks == ticker]

    # Using time sorted df makes for easier splicing later
    ticker_news = orig_sort_ticker_news.reset_index().set_index("est_entry_time").sort_index(ascending=True)
    
    # Set staleness of first news message to 0 
    ticker_news.at[ticker_news.index[0], "jaccard"] = 0

    for idx in orig_sort_ticker_news.index:
        time = orig_sort_ticker_news.at[idx, 'est_entry_time']
        previous_news = ticker_news.loc[(time-pd.DateOffset(days=3)):time-pd.DateOffset(minutes=1), "parsed_body"]
        if len(previous_news) == 0:
            ticker_news.at[time, "jaccard"] = 0
        else:
            current_str = orig_sort_ticker_news.at[idx, 'parsed_body']
            previous_news = previous_news
            jaccards = previous_news.apply(lambda x: jaccard_similarity(current_str, x))
            ticker_news.loc[ticker_news[original_index_name] == idx, "jaccard"] = jaccards.max()
            
    ticker_news.set_index(original_index_name, inplace=True)
    # Add entries to data set
    dataset.loc[ticker_news.index, "jaccard2"] = ticker_news.loc[:, "jaccard"]

In [None]:
tmp = dataset.dropna()

In [None]:
dataset[["jaccard", "staleness", 'jaccard2']].describe()

In [None]:
import plotly.express as px

In [None]:
px.histogram(dataset["jaccard"])

In [None]:
px.histogram(dataset["staleness"])

In [None]:
dataset[dataset.stocks == "ACCO"].loc[39023:]

In [None]:
jacc_df = tmp[tmp["jaccard2"] == 1]

In [None]:
jacc_df[jacc_df.stocks == "ACCO"]

## Keyword Analysis

In [3]:
from transformers import AutoTokenizer, BertForSequenceClassification, BertTokenizer
from transformers import pipeline
from src.config import PREP_CONFIG

In [4]:
dat: pd.DataFrame = pd.read_parquet(path=config.data.merged)

In [33]:
inputs_ids = pd.read_parquet(config.data.news.input_ids)

In [34]:
ld: pd.DataFrame = pd.read_parquet(path=config.data.learning_dataset)

In [35]:
tokenizer = AutoTokenizer.from_pretrained(PREP_CONFIG.tokenizer)

In [36]:
dat.shape[0]

2508859

In [37]:
ld.shape

(628215, 48)

In [38]:
text = "announced"
tmp = ld[ld.parsed_body.apply(lambda x: text in x)]

In [39]:
tmp.shape[0]

340421

In [40]:
tmp[["r", "r_intra_(t-1)", "r_mkt_adj"]].describe()

Unnamed: 0,r,r_intra_(t-1),r_mkt_adj
count,340421.0,340421.0,340421.0
mean,0.000909,0.000142,0.000714
std,0.04094,0.030532,0.039588
min,-0.882565,-0.660274,-0.884877
25%,-0.012174,-0.010412,-0.010852
50%,0.000705,0.000133,0.00031
75%,0.013414,0.01042,0.011653
max,2.954802,3.164384,2.978776


In [41]:
n=0

In [42]:
from IPython.display import Markdown, HTML

In [54]:
n += 1
idx = tmp.parsed_body.index[n]
inputs_ids.loc[idx]
entry = inputs_ids.loc[idx, :].values
display_text = tokenizer.decode(entry).replace(text, f'<span style="color: #ff0000">{text}</span>').strip("<s></s>") + (f" \n\n r={tmp.iloc[n].r}")
display(Markdown(display_text))

 the company  declares quarterly dividend the company  <span style="color: #ff0000">announced</span> today that its board of directors has declared a regular quarterly cash dividend of  cents on each outstanding share of its common and class b common stock the cash dividend is payable to shareholders of record at the close of business the company improves indoor living and working environments with air conditioning and heating solutions that provide comfort regardless of the outdoor climate our solutions also promote healthier indoor spaces by removing pollutants from the indoor air that can lead to asthma allergies and reductions in productivity furthermore since heating and cooling accounts for approximately  of the energy consumed in a typical united states home we offer consumers the greatest opportunity to save money on energy by replacing existing air conditioning and heating systems with more energy efficient and environmentally friendly solutions there are approximately  million central air conditioning and heating systems installed in the united states that have been in service for more than  years older systems often operate below government mandated energy efficiency and environmental standards the company has an opportunity to accelerate the replacement of these systems at a scale greater than our competitors as the movement toward reducing energy consumption and its environmental impact continues as the industry leader with over  locations in the united states and puerto rico with additional market coverage on an export basis to parts of latin america and the caribbean significant growth potential remains given that our current 

 r=0.008650519031141668

In [55]:
dat.loc[idx].parsed_body

'Watsco Declares Quarterly Dividend. the company , announced today that its Board of Directors has declared a regular quarterly cash dividend of 52 cents on each outstanding share of its Common and Class B common stock. The cash dividend is payable to shareholders of record at the close of business. the company improves indoor living and working environments with air conditioning and heating solutions that provide comfort regardless of the outdoor climate. Our solutions also promote healthier indoor spaces by removing pollutants from the indoor air that can lead to asthma, allergies and reductions in productivity. Furthermore, since heating and cooling accounts for approximately 56% of the energy consumed in a typical United States home, we offer consumers the greatest opportunity to save money on energy by replacing existing air conditioning and heating systems with more energy efficient and environmentally friendly solutions. There are approximately 74 million central air conditionin

In [146]:
tmp.iloc[n]

news_time                                            2011-01-03 07:04:06-05:00
stocks                                                                      MD
parsed_body                  MEDNAX Announces the Acquisition of Long Beach...
est_entry_time                                       2011-01-03 09:31:00-05:00
est_exit_time                                        2011-01-03 16:01:00-05:00
entry_time                                           2011-01-03 09:31:00-05:00
open_entry                                                              33.845
high_entry                                                              33.875
low_entry                                                               33.845
close_entry                                                             33.845
volume_entry                                                            4082.0
cum_split_ratio_entry                                                      0.5
exit_time                                           

## Make overnight news indicator

In [5]:
dat.parsed_body.iloc[0].split(".")[0]

'AA Quantitative Stock Analysis'

In [20]:
dat["has_intraday_time"] = ~((dat.news_time.dt.hour == 0) & (dat.news_time.dt.minute == 0))

In [27]:
# Set is_overnight_news to 1... These should not contain as much unprocessed information as real time news
dat["is_overnight_news"] = (dat.news_time.dt.hour >= 16) | (dat.news_time.dt.hour <= 9) | ((dat.news_time.dt.hour == 9) & ((dat.news_time.dt.minute <= 30)))

In [28]:
dat.is_overnight_news.value_counts()

is_overnight_news
True     2253527
False     255332
Name: count, dtype: int64

## Check if company title appears in title 

In [29]:
dat

Unnamed: 0_level_0,news_time,stocks,parsed_body,est_entry_time,est_exit_time,entry_time,open_entry,high_entry,low_entry,close_entry,...,unadj_entry_open,r_spy,entry_is_too_far_apart,exit_is_too_far_apart,std_252,dollar_volume,r_intra_(t-1),unadj_open,cond_vola,jaccard
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
380,2023-12-15 00:00:00-05:00,AA,AA Quantitative Stock Analysis. Below is Valid...,2023-12-15 09:31:00-05:00,2023-12-15 16:01:00-05:00,2023-12-15 09:31:00-05:00,29.9300,29.960,29.9300,29.960,...,29.9300,0.000064,False,False,0.032401,1.072004e+06,0.105769,29.93,,0.055825
381,2023-12-14 00:00:00-05:00,AA,"Notable Friday Option Activity: CPE, AA, TGT. ...",2023-12-14 09:31:00-05:00,2023-12-14 16:01:00-05:00,2023-12-14 09:31:00-05:00,27.0400,27.100,26.9400,27.000,...,27.0400,-0.001797,False,False,0.032309,1.050999e+06,0.080283,27.04,0.041552,0.000000
382,2023-12-07 00:00:00-05:00,AA,"Notable Thursday Option Activity: AA, CIEN, PA...",2023-12-07 09:31:00-05:00,2023-12-07 16:01:00-05:00,2023-12-07 09:31:00-05:00,25.1600,25.160,25.1600,25.160,...,25.1600,0.002757,False,False,0.030531,3.525135e+05,-0.018750,25.16,0.029424,0.075758
383,2023-12-07 00:00:00-05:00,AA,HSBC Initiates Coverage of Alcoa (AA) with Hol...,2023-12-07 09:31:00-05:00,2023-12-07 16:01:00-05:00,2023-12-07 09:31:00-05:00,25.1600,25.160,25.1600,25.160,...,25.1600,0.002757,False,False,0.030531,3.525135e+05,-0.018750,25.16,0.029424,0.126524
384,2023-12-06 00:00:00-05:00,AA,Here's Why Alcoa (AA) Fell More Than Broader M...,2023-12-06 09:31:00-05:00,2023-12-06 16:01:00-05:00,2023-12-06 09:31:00-05:00,25.6000,25.700,25.6000,25.690,...,25.6000,-0.008503,False,False,0.030635,2.212592e+05,-0.047547,25.60,0.030008,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4578111,2023-08-23 21:22:00-04:00,CAN,"ROSEN, LEADING TRIAL ATTORNEYS, Encourages Can...",2023-08-24 09:31:00-04:00,2023-08-24 16:01:00-04:00,2023-08-24 09:31:00-04:00,2.1500,2.150,2.1200,2.120,...,2.1500,-0.017811,False,False,0.052552,1.472945e+04,0.054455,2.15,0.063720,0.940789
4578113,2023-08-23 21:37:00-04:00,CYBN,Cybin Announces Renewed At-The-Market Equity P...,2023-08-24 09:31:00-04:00,2023-08-24 16:01:00-04:00,2023-08-24 09:31:00-04:00,0.3014,0.304,0.3014,0.303,...,0.3014,-0.017811,False,False,,,,,,0.000000
4578114,2023-08-23 16:05:27-04:00,TM,Environmental Icon Dr. Jane Goodall's Roots & ...,2023-08-24 09:31:00-04:00,2023-08-24 16:01:00-04:00,2023-08-24 09:31:00-04:00,164.5200,164.520,164.1600,164.520,...,164.5200,-0.017811,False,False,0.013989,9.378859e+04,0.002902,164.52,0.012968,0.074097
4578116,2023-08-23 22:43:00-04:00,CD,Chindata Group Unveils Pioneering Full-Stack S...,2023-08-24 09:31:00-04:00,2023-08-24 16:01:00-04:00,2023-08-24 09:31:00-04:00,8.3300,8.340,8.3200,8.330,...,8.3300,-0.017811,False,False,0.036512,3.280190e+04,0.002418,8.33,0.018377,0.000000


## Merge multiple overnight news

In [13]:
tmp = dat.loc[dat['stocks'] == 'AA']

In [22]:
news_count = tmp.loc[:, ['est_entry_time', 'r']].groupby('est_entry_time').count()

In [27]:
news_count.sort_values('r', ascending=False, inplace=True)
news_count.head()

Unnamed: 0_level_0,r
est_entry_time,Unnamed: 1_level_1
2013-07-08 09:31:00-04:00,13
2014-04-08 09:31:00-04:00,11
2014-04-09 09:31:00-04:00,11
2013-10-08 09:31:00-04:00,11
2012-01-10 09:31:00-05:00,11


In [28]:
tmp[tmp.est_entry_time == news_count.index[0]]

Unnamed: 0_level_0,news_time,stocks,parsed_body,est_entry_time,est_exit_time,entry_time,open_entry,high_entry,low_entry,close_entry,...,unadj_entry_open,r_spy,entry_is_too_far_apart,exit_is_too_far_apart,std_252,dollar_volume,r_intra_(t-1),unadj_open,cond_vola,jaccard
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1349,2013-07-08 00:00:00-04:00,AA,"Benzinga Market Primer: Monday, July 8: Earnin...",2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.149691
1350,2013-07-08 00:00:00-04:00,AA,Stocks at Technical Inflection Point as Earnin...,2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.139442
1351,2013-07-08 00:00:00-04:00,AA,July 8: Earnings in the Limelight - Economic H...,2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.159879
1352,2013-07-08 00:00:00-04:00,AA,Stock Downgrades: Unpleasant Surprise Inside f...,2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.128698
1353,2013-07-08 00:00:00-04:00,AA,Pre-Market Primer: Dell Inc. Offer Gets ISS Bl...,2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.146341
1354,2013-07-08 00:00:00-04:00,AA,"The Case for Alcoa. Traditionally, today’s ear...",2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.141892
1355,2013-07-08 00:00:00-04:00,AA,Alcoa Beats on the Top Line - Analyst Blog. Tr...,2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.166144
1356,2013-07-08 00:00:00-04:00,AA,"After-Hours Earnings Report for July 8, 2013 :...",2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.141732
1357,2013-07-08 00:00:00-04:00,AA,Stock Upgrades: Priceline.com Inc Is Going Pla...,2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.136213
1358,2013-07-08 00:00:00-04:00,AA,"After Hours Most Active for Jul 8, 2013 : S, N...",2013-07-08 09:31:00-04:00,2013-07-08 16:01:00-04:00,2013-07-08 09:31:00-04:00,17.790098,17.835308,17.790098,17.812703,...,7.87,-6.100533e-07,False,False,0.014299,416582.314066,0.005148,7.87,0.015414,0.094937
