In [1]:
import torch
from src.model.bert_classifier import BERTClassifier
from src.model.regr_transformer import NNRegressor
from src.config import config
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from transformers import AutoModel
import plotly.express as px
from src.model.data_loading import CustomDataModule

In [2]:
# Settings
# Identifier that is interpolated into the predictions parquet filename when the data is loaded.
model_id = '7zdo79pv'
# Name of the label column.
# NOTE(review): no later cell in this notebook reads `target_col_name` — confirm it is still needed.
target_col_name = 'z_score_class'

In [44]:
# Read the stored predictions of the run selected via `model_id`.
# NOTE(review): the file is taken from `regression_dir` although the frame carries
# class predictions (`class_preds`) — confirm this is the intended directory.
predictions_path = f"{config.data.predictions.regression_dir}/{model_id}.parquet"
df = pd.read_parquet(predictions_path)

In [45]:
# List the columns available in the predictions frame.
df.columns

Index(['est_entry_time', 'est_exit_time', 'entry_time', 'exit_time', 'r',
       'r_spy', 'unadj_entry_open', 'entry_is_too_far_apart',
       'exit_is_too_far_apart', 'parsed_body', 'staleness', 'stocks',
       'std_252', 'dollar_volume', 'r_intra_(t-1)', 'unadj_open', 'cond_vola',
       'r_mkt_adj', 'z_score', 'z_score_class', 'sample_weights', 'jaccard',
       'split', 'max_probs', 'class_preds'],
      dtype='object')

In [10]:
def accuracy(
    df: pd.DataFrame,
    target_col: str = 'z_score_class',
    pred_col: str = 'class_preds',
) -> float:
    """Return the share of rows whose predicted class equals the true class.

    Args:
        df: Frame containing both the label column and the prediction column.
        target_col: Name of the column holding the true class.
        pred_col: Name of the column holding the predicted class.

    Returns:
        The fraction of matching rows, or NaN when ``df`` has no rows.
    """
    # An empty selection has no defined accuracy; return NaN explicitly instead
    # of dividing 0 by 0 (which also emits a RuntimeWarning).
    if len(df) == 0:
        return float("nan")
    # The mean of the boolean match mask is matches / number of rows.
    return (df[target_col] == df[pred_col]).mean()

In [148]:
# Accuracy computed separately for every value of the `split` column.
accuracy_by_split = df.groupby("split", observed=True).apply(accuracy)
accuracy_by_split

split
testing       0.397442
training      0.497842
validation    0.398595
dtype: float64

In [154]:
# Quantile levels of `max_probs` that are evaluated for each split.
q_list = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# One row per split, one column per level in q_list.
grouped_by_split = df.groupby("split", observed=True)
quantiles = grouped_by_split.apply(lambda part: part["max_probs"].quantile(q_list))

# Add a second index level ('value') that tags these rows as quantile thresholds.
row_labels = [(split_name, 'quantile') for split_name in quantiles.index]
quantiles.index = pd.MultiIndex.from_tuples(row_labels, names=['split', 'value'])

In [155]:
# Show the per-split `max_probs` thresholds for each quantile level.
quantiles

Unnamed: 0_level_0,max_probs,0.50,0.60,0.70,0.80,0.90,0.95,0.99
split,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
testing,quantile,0.415739,0.434663,0.45866,0.491804,0.55297,0.627647,0.822722
training,quantile,0.43955,0.462055,0.489729,0.530984,0.605869,0.688538,0.862935
validation,quantile,0.421747,0.44167,0.465514,0.498387,0.560097,0.644715,0.835017


In [156]:
# Result tables that reuse the index and columns of `quantiles`; they start out
# as independent copies of the threshold values.
accuracies = quantiles.copy()
lengths = quantiles.copy()

In [119]:
# For every split and quantile level, keep only the rows of that split whose
# `max_probs` lies strictly above the split's threshold, then record the
# accuracy and the row count of that subset.
for idx, thresholds in quantiles.iterrows():
    split = idx[0]
    # The split filter does not depend on the quantile level, so it is built
    # once per split instead of once per (split, quantile) pair.
    split_rows = df.loc[df['split'] == split]
    for quantile, threshold in thresholds.items():
        subframe = split_rows.loc[split_rows['max_probs'] > threshold]
        accuracies.at[idx, quantile] = accuracy(subframe)
        # `lengths` was created as a copy of a float table, so the count ends
        # up stored as a float regardless of any int cast here.
        lengths.at[idx, quantile] = subframe.shape[0]

In [120]:
# Relabel the second index level so the rows of each table remain
# distinguishable once the tables are concatenated.
# Each table derives its new labels from its OWN index, so the result does not
# depend on the two tables happening to share the same row order.
accuracies.index = pd.MultiIndex.from_tuples(
    [(split_name, 'accuracy') for split_name in accuracies.index.get_level_values('split')],
    names=['split', 'value'],
)
lengths.index = pd.MultiIndex.from_tuples(
    [(split_name, 'len') for split_name in lengths.index.get_level_values('split')],
    names=['split', 'value'],
)

In [121]:
# Stack the three tables, then sort the (split, value) index so that all rows
# belonging to one split sit next to each other.
summary_df = pd.concat([quantiles, accuracies, lengths])
summary_df = summary_df.sort_index()

Unnamed: 0_level_0,max_probs,0.50,0.60,0.70,0.80,0.90,0.95,0.99
split,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
testing,accuracy,0.438187,0.451152,0.468707,0.497091,0.562909,0.649576,0.837576
testing,len,82499.0,66000.0,49500.0,33000.0,16500.0,8250.0,1650.0
testing,quantile,0.415739,0.434663,0.45866,0.491804,0.55297,0.627647,0.822722
training,accuracy,0.592599,0.620931,0.65725,0.705245,0.776954,0.830176,0.908409
training,len,686179.0,548934.0,411708.0,274472.0,137236.0,68618.0,13724.0
training,quantile,0.43955,0.462055,0.489729,0.530984,0.605869,0.688538,0.862935
validation,accuracy,0.440017,0.453452,0.472203,0.502584,0.576211,0.678507,0.834837
validation,len,144140.0,115311.0,86484.0,57656.0,28828.0,14414.0,2882.0
validation,quantile,0.421747,0.44167,0.465514,0.498387,0.560097,0.644715,0.835017


In [147]:
# Accuracy rows of the summary, transposed so that each quantile level is a row
# and each split is a column.
plot_df = (
    summary_df
    .xs(key='accuracy', level='value')
    .T
    # rename_axis returns a new object rather than renaming the existing Index
    # in place, which may also rename the column axis of frames that share it.
    .rename_axis(index='quantile')
)
px.line(
    plot_df,
    labels={'value': 'accuracy'},
    title='Accuracy of rows above the max_probs quantile threshold, by split',
)

In [157]:
# Show the thresholds again; the testing / 0.9 entry is used for the spot check that follows.
quantiles

Unnamed: 0_level_0,max_probs,0.50,0.60,0.70,0.80,0.90,0.95,0.99
split,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
testing,quantile,0.415739,0.434663,0.45866,0.491804,0.55297,0.627647,0.822722
training,quantile,0.43955,0.462055,0.489729,0.530984,0.605869,0.688538,0.862935
validation,quantile,0.421747,0.44167,0.465514,0.498387,0.560097,0.644715,0.835017


In [161]:
# Spot check: rows of the testing split whose `max_probs` lies strictly above
# the testing split's 0.9-quantile threshold.
testing_cutoff = quantiles.at[('testing', 'quantile'), 0.9]
is_testing = df['split'] == 'testing'
above_cutoff = df['max_probs'] > testing_cutoff
subframe = df.loc[is_testing & above_cutoff]

In [163]:
# Count how often each true class occurs among the selected rows.
subframe['z_score_class'].value_counts()

z_score_class
2    6010
1    5460
0    5030
Name: count, dtype: int64

In [None]:
# Preview only the first rows instead of asking for the whole selection.
subframe.head()

# Change Over Time

# Analysis of a Single Forecast

In [None]:
idx = 11
# NOTE(review): `test_dat`, `pred_margin_mask` and `stocks` are not created by
# any earlier cell shown in this notebook — they must already exist in the
# kernel for this cell to run.
tmp = test_dat.loc[pred_margin_mask]
row = tmp.iloc[idx, :]
print(row)

# Print the first 750 elements of `row.body` (presumably characters of the text — confirm).
print(row.body[:750])
pr_time, ticker, fcst = row[["Date", "ID", "Fcst"]]
# Up to 30 rows of `stocks` for the same ID, starting at the row's Date.
# Kept under its own name so the predictions frame `df` loaded at the top of
# the notebook is not overwritten by this cell.
price_window = stocks.query("(Date >= @pr_time) & (ID == @ticker)").head(30)
fig = go.Figure(data=[go.Candlestick(x=price_window['Date'],
                open=price_window['Open'],
                high=price_window['High'],
                low=price_window['Low'],
                close=price_window['Close'])])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.show()

In [None]:
# Print the selected row again on its own.
print(row)

# Trading Performance

In [None]:
# Rows selected by `pred_margin_mask`, with every row that contains a missing value dropped.
# NOTE(review): `test_dat` and `pred_margin_mask` are not defined in any cell shown in this notebook.
tmp = test_dat.loc[pred_margin_mask].dropna()

In [None]:
# Preview the first rows of the selection.
tmp.head()

In [None]:
# Per-row result: the close-to-close return multiplied by the sign of the forecast.
forecast_direction = np.sign(tmp["Fcst"])
trades = forecast_direction * tmp["CloseToCloseReturn"]

In [None]:
# Average signed return over all selected rows.
trades.mean()

In [None]:
# Scatter of the forecast against the target column.
# NOTE(review): `MODEL_CONFIG` is not defined or imported in any cell shown in this
# notebook; the settings cell defines a plain `target_col_name` instead — confirm which is intended.
px.scatter(tmp, x=MODEL_CONFIG.target_col_name, y="Fcst")