In [101]:
from src.config import config
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from transformers import AutoModel
from src.model.bert_classifier import BERTClassifier

In [102]:
# Settings
model_id = '7zdo79pv'
target_col_name = 'z_score_class'

In [103]:
df = pd.read_parquet(f"{config.data.predictions.regression_dir}/{model_id}.parquet")
df = df[df.est_entry_time.dt.hour != 9]


In [51]:
def accuracy(df: pd.DataFrame):
    N = len(df)
    acc = (df['z_score_class'] == df['class_preds']).sum() / N
    return acc

In [52]:
df.groupby("split", observed=True).apply(accuracy)

split
testing       0.355102
training      0.467156
validation    0.368246
dtype: float64

In [53]:
q_list = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
quantiles = df.groupby("split", observed=True).apply(lambda x: x["max_probs"].quantile(q_list))
quantiles.index = pd.MultiIndex.from_tuples([(i, 'quantile') for i in quantiles.index], names=['split', 'value'])

In [54]:
quantiles

Unnamed: 0_level_0,max_probs,0.50,0.60,0.70,0.80,0.90,0.95,0.99
split,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
testing,quantile,0.429393,0.446544,0.468057,0.493696,0.539201,0.583723,0.668375
training,quantile,0.433935,0.45443,0.478374,0.514317,0.570011,0.622834,0.719574
validation,quantile,0.428339,0.446638,0.468273,0.498177,0.540965,0.581718,0.672474


In [55]:
accuracies = quantiles.copy(deep=True)
lengths = quantiles.copy(deep=True)

In [56]:
for quantile in quantiles.columns:
    for idx in quantiles.index:
        split = idx[0]
        subframe = df.loc[
            (df['split'] == split) & 
            (df['max_probs'] > quantiles.at[idx, quantile])
            ]
        acc = accuracy(subframe)
        accuracies.at[idx, quantile] = acc
        lengths.at[idx, quantile] = int(subframe.shape[0]) 

In [57]:
accuracies.index = pd.MultiIndex.from_tuples([(i, 'accuracy') for i in accuracies.index.droplevel('value')], names=['split', 'value'])
lengths.index = pd.MultiIndex.from_tuples([(i, 'len') for i in accuracies.index.droplevel('value')], names=['split', 'value'])

In [58]:
summary_df = pd.concat([quantiles, accuracies, lengths]).sort_index()

In [59]:
plot_df = summary_df.xs(key='accuracy', level='value').T
plot_df.index.name = 'quantile'
px.line(plot_df, labels={'value':'accuracy'})

In [94]:
summary_df

Unnamed: 0_level_0,quantile,0.50,0.60,0.70,0.80,0.90,0.95,0.99
split,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
testing,accuracy,0.388338,0.387026,0.387755,0.386297,0.390671,0.370588,0.257143
testing,len,1715.0,1372.0,1029.0,686.0,343.0,170.0,35.0
testing,quantile,0.429393,0.446544,0.468057,0.493696,0.539201,0.583723,0.668375
training,accuracy,0.542937,0.564047,0.592726,0.630129,0.689839,0.741275,0.854362
training,len,69334.0,55467.0,41601.0,27734.0,13867.0,6934.0,1387.0
training,quantile,0.433935,0.45443,0.478374,0.514317,0.570011,0.622834,0.719574
validation,accuracy,0.376676,0.370648,0.372712,0.364623,0.356322,0.337165,0.356688
validation,len,7829.0,6262.0,4698.0,3132.0,1566.0,783.0,157.0
validation,quantile,0.428339,0.446638,0.468273,0.498177,0.540965,0.581718,0.672474


In [95]:
subframe = df.loc[
    (df['split'] == 'validation') & 
    (df['max_probs'] > quantiles.at[('testing', 'quantile'), 0.9])
    ]

In [96]:
subframe['class_preds'].value_counts()

class_preds
0    1163
2     434
1      19
Name: count, dtype: int64

In [97]:
subframe['z_score_class'].value_counts()

z_score_class
2    629
1    558
0    429
Name: count, dtype: int64

In [98]:
subframe.groupby('z_score_class')['r'].describe().T

z_score_class,0,1,2
count,429.0,558.0,629.0
mean,0.000913,0.029559,-0.025686
std,0.012964,0.068346,0.041679
min,-0.038817,-0.034091,-0.5135
25%,-0.00725,0.005935,-0.039164
50%,0.000624,0.017846,-0.016596
75%,0.009865,0.03863,-0.003204
max,0.044496,1.346154,0.031494


In [100]:
tmp = subframe[["parsed_body", "z_score_class", "class_preds", "max_probs", "stocks"]].sort_values("max_probs", ascending=False)
tmp = tmp[tmp['class_preds'] == 1]
tmp

Unnamed: 0_level_0,parsed_body,z_score_class,class_preds,max_probs,stocks
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4469477,Health Net Awards $1.5 Million Grant to Keck S...,2,1,0.685178,CNC
4434843,Morgan Stanley at Work Announces 2022 End of Y...,1,1,0.595549,MS
4508526,"Polaris Donates More Than $135,000 to Off-Road...",0,1,0.593507,PII
4520099,"More Than 5,000 Nonprofits, Schools Benefit fr...",0,1,0.591979,PCG
4342630,Winmark Corporation Announces Second Quarter R...,1,1,0.57391,WINA
4505765,Class Action Lawsuit Filed on Behalf of Vertex...,2,1,0.572249,VTNR
4329077,Winmark Corporation Announces Addition of Play...,2,1,0.56789,WINA
4501661,"ROSEN, A LEADING LAW FIRM, Encourages TAL Educ...",1,1,0.566307,TAL
4482071,Class Action Lawsuit Filed on Behalf of Vertex...,1,1,0.563102,VTNR
4527501,"Dotdash Meredith Launches D/Cipher, a Transfor...",0,1,0.55889,IAC


In [91]:
tmp.iloc[1]["parsed_body"]

'Winmark Corporation Announces Second Quarter Results. the company announced today net income for the quarter ended $10,368,800 or $2.85 per share diluted compared to net income of $9,027,200 or $2.54 per share diluted in 2022. "Our year to date results reflect positive franchisee performance and the company \'s continued emphasis on providing exceptional operational support," commented Brett D. Heffes, Chair and Chief Executive Officer. the company — the Resale Company®, is a nationally recognized franchising business focused on sustainability and small business formation. We champion and guide entrepreneurs interested in operating one of our award winning resale franchises: Plato\'s Closet®, Once Upon A Child®, Play It Again Sports®, Style Encore® and Music Go Round®. , there were 1,303 franchises in operation and over 2,800 available territories. An additional 70 franchises have been awarded but are not open. This press release contains forward looking statements within the meaning 

# Change Over Time

# Analysis of Single Forecast: 

In [None]:
idx = 11
tmp = test_dat.loc[pred_margin_mask]
row = tmp.iloc[idx, :]
print(row)
# print(f"Fcst: {row.Fcst}")
# print(f"Target: {row.IntradayReturn}")

print(row.body[:750])
pr_time, ticker, fcst = row[["Date", "ID", "Fcst"]]
df = stocks.query("(Date >= @pr_time) & (ID == @ticker)").head(30)
fig = go.Figure(data=[go.Candlestick(x=df['Date'],
                open=df['Open'],
                high=df['High'],
                low=df['Low'],
                close=df['Close'])])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.show()

In [None]:
print(row)

# Trading Performance

In [None]:
tmp = test_dat.loc[pred_margin_mask].dropna()

In [None]:
tmp.head()

In [None]:
trades = np.sign(tmp["Fcst"])*tmp["CloseToCloseReturn"]

In [None]:
trades.mean()

In [None]:
px.scatter(tmp, x=MODEL_CONFIG.target_col_name, y="Fcst")