In [138]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
from interpret import show
from interpret.perf import ROC

from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())

In [151]:
from ml.data import load_spy_sample, load_vix, add_last_close_pct
from ml.utils import by_date, train_holdout_dates

class DataLoader:
    def __init__(self, months=[1], sample_fraction=None):
        df = load_spy_sample(months=months)

        # Cutoff
        #df = df.filter(pl.col('minute_index') <= 300)

        if sample_fraction:
            df = df.sample(fraction=sample_fraction)

        print(f"Loaded {len(df):,} samples ({df['symbol'].n_unique():,} series)")

        self.df = df
        self.pre_process()

    def pre_process(self):
        df = self.df

        df = df.with_columns(df['option_type'].cast(pl.Categorical))

        df = df.with_columns(
            (pl.col("date").dt.week().alias("week_index"))
        )

        df = add_last_close_pct(df)

        # Moneyness is close / strike e.g. 0.95 is 5% otm
        df = df.filter((pl.col('close_moneyness') > 0.97) & (pl.col('close_moneyness') < 1.01))

        # Low amount of data here
        df = df.filter(pl.col('close') < 3)

        # Doesn't hit stop
        df = df.with_columns((pl.col('max_forward_return') < 1.0).alias('stop_not_hit'))

        # Is worth less than sell price at close
        df = df.with_columns((pl.col('last_close_pct') < 0.0).alias('close_decreased'))

        df = df.with_columns(((pl.col('stop_not_hit') & pl.col('close_decreased')).alias('target')))

        # Add atm straddle
        # atm_straddle = pl.read_parquet('./data/atm_straddle.parquet')
        # self.df = self.df.join(atm_straddle['date', 'minute_index', 'atm_straddle_pct_ma'], on=['date', 'minute_index'], how='left')
        # Add daily vix
        # vix = load_vix()
        # self.df = self.df.join(vix, on='date', how='left')
        self.df = df

    def get_splits(self):

        # Split by date
        train_dates, test_dates = train_holdout_dates(self.df, test_size=0.25)

        X = self.df.drop('symbol', 'max_forward_return', 'close_moneyness', 'stop_not_hit', 'close_decreased', 'last_close_pct')

        print(X.columns)
        X = X['date', 'option_type', 'minute_index', 'close', 'target']

        X_train = by_date(X, train_dates).drop('date')
        X_test = by_date(X, test_dates).drop('date')

        y_train = X_train.drop_in_place('target')
        y_test = X_test.drop_in_place('target')

        self.X = X
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        return X_train, X_test, y_train, y_test

 # 1,2,3,4,5,6,7,8,9,10,11,12
data = DataLoader(months=[1,2,3,4,5,6,7,8,9,10,11,12], sample_fraction=0.10)
#data = DataLoader(months=[1], sample_fraction=0.2)
X_train, X_test, y_train, y_test = data.get_splits()

X_train.head(3)

Loaded 283,427 samples (17,408 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']


option_type,minute_index,close
cat,i32,f64
"""C""",277,0.2425
"""P""",146,1.386
"""P""",281,1.656


In [None]:
from interpret.data import ClassHistogram

hist = ClassHistogram().explain_data(X_train.to_pandas(), y_train.to_pandas(), name='Train Data')
show(hist)

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier(interactions=[(1,2)], max_interaction_bins=256)
ebm.fit(X_train.to_pandas(), y_train.to_pandas())

# post-process monotonize
#ebm.monotonize("minute_index", increasing=True)

In [None]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [154]:

zmin = 0.
zmax = 0.

heatmaps = []

for i in range(1, 53):
    # Load data
    data = DataLoader(months=[1,2,3,4,5,6,7,8,9,10,11,12], sample_fraction=0.25)

    # Current week only
    data.df = data.df.filter(pl.col("week_index") == i)

    X_train, X_test, y_train, y_test = data.get_splits()

    # Fit
    ebm = ExplainableBoostingClassifier(interactions=[(1,2)], max_interaction_bins=64)
    ebm.fit(X_train.to_pandas(), y_train.to_pandas())

    # Get interaction data
    ebm_global = ebm.explain_global(name='EBM')
    key = 3 # index of interaction
    print(i, ebm_global.feature_names[key])
    data_dict = ebm_global.data(key)

    zmin = min(zmin, data_dict["scores_range"][0])
    zmax = max(zmax, data_dict["scores_range"][1])

    bin_vals = data_dict["scores"]
    bin_vals_t = np.ascontiguousarray(np.transpose(bin_vals, (1, 0)))

    heatmaps.append(bin_vals_t)


xtitle = ebm_global.feature_names[key].split(" & ")[0]
ytitle = ebm_global.feature_names[key].split(" & ")[1]

# made this with my new toys, each frame is 1 week of 2024, basically just checking if the blue blobs stay in the same place
# the fancy term for this is "subperiod consistency"


Loaded 708,568 samples (20,661 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,636 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,762 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,644 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,641 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,697 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,730 series)
['date', 'option_type', 'close', 'minute_index', 'week_index', 'target']
minute_index & close
Loaded 708,568 samples (20,708 series)
['date', 'option_type', 'close

In [None]:
# print(ebm_global.feature_names)
# ebm_global.feature_types
# key=3
# data_dict = ebm_global.data(key)
# ebm_global.visualize(3)



# xtitle = ebm_global.feature_names[key].split(" & ")[0]
# ytitle = ebm_global.feature_names[key].split(" & ")[1]
# data_dict

# bin_labels_left = data_dict["left_names"]
# bin_labels_right = data_dict["right_names"]
# bin_vals = data_dict["scores"]
# bin_vals
# len(bin_labels_left)

In [155]:
import plotly.graph_objects as go
import numpy as np

fig = go.Figure(data=go.Heatmap(z=heatmaps[0], zmin=zmin,zmax=zmax),
               frames=[go.Frame(data=go.Heatmap(z=heatmaps[i], zmin=zmin,zmax=zmax)) for i in range(len(heatmaps))])
fig.update_layout(
    height=1000,
    title="close & minute_index interaction by week | dark is bad",
    xaxis={"title": xtitle}, yaxis={"title": ytitle},
    updatemenus=[
        dict(type="buttons", visible=True,
        buttons=[dict(label="Play", method="animate", args=[None])]
            )])
fig.show()

In [None]:
import plotly.graph_objects as go

bin_vals_t = np.ascontiguousarray(np.transpose(bin_vals, (1, 0)))

heatmaps = [bin_vals_t, bin_vals]

zmin = data_dict["scores_range"][0]
zmax = data_dict["scores_range"][1]

heatmap = go.Heatmap(z=bin_vals_t, x=bin_labels_left, y=bin_labels_right)
if data_dict.get("scores_range", None) is not None:
    heatmap["zmin"] = data_dict["scores_range"][0]
    heatmap["zmax"] = data_dict["scores_range"][1]

layout = go.Layout(title="title", xaxis={"title": xtitle}, yaxis={"title": ytitle})
go.Figure(data=[heatmap], layout=layout)

In [None]:
import plotly.graph_objects as go

bin_vals_t = np.ascontiguousarray(np.transpose(bin_vals, (1, 0)))

heatmaps = [bin_vals_t, bin_vals]

heatmap = go.Heatmap(z=bin_vals_t, x=bin_labels_left, y=bin_labels_right)
if data_dict.get("scores_range", None) is not None:
    heatmap["zmin"] = data_dict["scores_range"][0]
    heatmap["zmax"] = data_dict["scores_range"][1]

layout = go.Layout(title="title", xaxis={"title": xtitle}, yaxis={"title": ytitle})
go.Figure(data=[heatmap], layout=layout)

In [None]:
import plotly.express as px


bin_vals_t = np.ascontiguousarray(np.transpose(bin_vals, (1, 0)))

fig_px = px.imshow(
    bin_vals_t,
    #x=bin_labels_right,
    #y=bin_labels_left,
    #color_continuous_scale="RdBu",
    zmin = data_dict["scores_range"][0],
    zmax = data_dict["scores_range"][1],
    aspect="auto",
    labels=dict(x=ytitle, y=xtitle, color="Score"),
    title="Interaction Heatmap"
)


fig_px.show()

In [None]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(
    z=bin_vals,
    x=bin_labels_right,
    y=bin_labels_left,
    colorbar=dict(title="Score"),
    #colorscale="RdBu"
))
fig.update_layout(
    title="Interaction Heatmap",
    xaxis_title=ytitle,
    yaxis_title=xtitle
)
fig.show()

In [None]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import roc_auc_score

# Predict on test set
y_pred = ebm.predict(X_test)

y_pred_proba = ebm.predict_proba(X_test)[:,1]

#y_pred = y_pred_proba > 0.9

#y_pred_proba[:] = True # 0.5
#y_pred_proba[:] = False # 0.5

#y_pred[:] = True #
#y_pred[:] = False #


# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate F1 and accuracy
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"ROC: {roc_auc}")
# better for imbalanced data, doesn't consider tn
print(f"F1 Score: {f1}")
# proportion of tp+tn to all predictions (misleading on imbalanced sets)
print(f"Accuracy: {accuracy}")

# Close and type only
# ROC: 0.637524657368759
# F1 Score: 0.8399778389293089
# Accuracy: 0.7410475463645592

In [None]:
ebm.score(X_test, y_test) # mean accuracy

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test, y_pred_proba > 0.5)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[1, 0])
disp.plot()

In [None]:
ebm_perf = ROC(ebm).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

In [None]:
# ebm_local = ebm.explain_local(X_test[:5], y_test[:5], name='EBM')
# show(ebm_local, 0)

In [None]:
import gamchanger as gc
gc.visualize(ebm, X_test.to_pandas(), y_test.to_pandas())