In [1]:
%reload_ext autoreload
%autoreload 2

import polars as pl
import plotly.express as px

from ml.data import load_spy_sample, load_vix
from ml.utils import by_date, train_holdout_dates

from interpret import show
from interpret.perf import ROC
from interpret import set_visualize_provider
from interpret.data import ClassHistogram
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())

In [None]:
class DataLoader:
    def __init__(self, months=[1], sample_fraction=None, call_put=None, name='Train Data') -> None:
        if call_put is not None:
            assert call_put in ("C", "P"), "call_put must be 'C' or 'P'"

        self.name = name
        self.call_put = call_put

        df = load_spy_sample(months=months)

        # Use sampling
        if sample_fraction:
            df = df.sample(fraction=sample_fraction, seed=42)

        print(f"Loaded {len(df):,} samples ({df['symbol'].n_unique():,} series)")

        self.df = df

        self.pre_process()
        self.make_splits()

    def pre_process(self) -> None:
        df = self.df

        # Round close price
        data.df.with_columns(pl.col("close").round(2).alias("close"))

        # Add backest returns
        vbt_returns = pl.read_parquet("C:/Dev/zero/data/vbt_returns_sl_none.parquet")
        df = df.join(vbt_returns, on=['symbol', 'minute_index'], how='left')

        # Filter calls or puts only
        if self.call_put:
            df = df.filter(pl.col('option_type') == self.call_put)

        df = df.with_columns(df['option_type'].cast(pl.Categorical))

        df = df.with_columns(
            (pl.col("close") / pl.col("strike")).alias("close_div_strike"),
            (pl.col("close") / pl.col("moneyness")).alias("close_div_moneyness"),
            (pl.col("close") / pl.col("close_underlying")).alias("close_div_underlying")
        )

        # Assume we don't sell over this price (low data)
        df = df.filter(pl.col('close') < 5)

        # Assume we don't buy under this price (hard to fill)
        df = df.filter(pl.col('close') >= 0.02)

        # Assume we don't sell this far OTM (low data)
        df = df.filter(pl.col('moneyness') >= -3)

        # Target is positive return
        df = df.with_columns((pl.col('return') > 0. ).alias('target'))

        # This happens on the last bar
        df = df.filter(pl.col('target').is_not_null())

        # Add daily vix
        # vix = load_vix()
        # self.df = self.df.join(vix, on='date', how='left')

        self.df = df

    def make_splits(self) -> None:

        X = self.df

        # Features
        X = X['date', 'option_type', 'close', 'minute_index','target']

        # Train/test split by date
        train_dates, test_dates = train_holdout_dates(self.df, test_size=0.25)

        X_train = by_date(X, train_dates).drop('date')
        X_test = by_date(X, test_dates).drop('date')

        y_train = X_train.drop_in_place('target')
        y_test = X_test.drop_in_place('target')

        self.X = X
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test


data = DataLoader(months=[1], sample_fraction=0.2) # ,2,3,4,5,6,7,8,9,10,11,12
data.X_train.head(3)

Loaded 41,045 samples (1,681 series)


option_type,close,minute_index
cat,f64,i32
"""P""",0.267,199
"""C""",1.704545,300
"""C""",2.14,109


symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,entry_price,exit_price,pnl,return,close_div_strike,close_div_moneyness,close_div_underlying,target
str,date,cat,f64,f64,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool
"""SPY 240105P00466000""",2024-01-05,"""P""",466.0,0.27,199,-0.320856,467.5,0.267,0.01,0.224,0.838951,0.000573,-0.83215,0.000571,true
"""SPY 240110C00475000""",2024-01-10,"""C""",475.0,1.7,300,0.345263,476.64,1.704545,1.44,0.231545,0.13584,0.003589,4.936946,0.003576,true
"""SPY 240124P00490000""",2024-01-24,"""P""",490.0,1.92,143,0.366646,488.21,1.915,4.97,-3.088,-1.612533,0.003908,5.223029,0.003922,false
"""SPY 240109C00471000""",2024-01-09,"""C""",471.0,2.14,109,0.428875,473.02,2.14,2.82,-0.713,-0.333178,0.004544,4.989802,0.004524,false
"""SPY 240112P00469000""",2024-01-12,"""P""",469.0,0.02,155,-1.3815,475.57,0.02,0.01,-0.023,-1.15,0.000043,-0.014477,0.000042,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SPY 240102C00476000""",2024-01-02,"""C""",476.0,0.05,38,-0.821429,472.09,0.05,0.01,0.007,0.14,0.000105,-0.06087,0.000106,true
"""SPY 240129C00489000""",2024-01-29,"""C""",489.0,0.23,167,-0.167689,488.18,0.234,2.12,-1.919,-8.200855,0.000479,-1.395439,0.000479,false
"""SPY 240117C00477000""",2024-01-17,"""C""",477.0,0.03,36,-1.067086,471.91,0.035,0.01,-0.008,-0.228571,0.000073,-0.0328,0.000074,false
"""SPY 240131P00487000""",2024-01-31,"""P""",487.0,1.22,211,-0.081043,487.395,1.22,3.363333,-2.176333,-1.78388,0.002505,-15.053719,0.002503,false


In [15]:
hist = ClassHistogram().explain_data(data.X_train.to_pandas(), data.y_train.to_pandas(), name=data.name)
show(hist)

In [41]:
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier(feature_types=['nominal', 'uniform', 'uniform'])
ebm.fit(data.X_train.to_pandas(), data.y_train.to_pandas())

#ebm.monotonize("minute_index", increasing=True)

In [29]:
ebm.term_names_

['option_type',
 'close',
 'minute_index',
 'option_type & close',
 'option_type & minute_index',
 'close & minute_index']

In [31]:
ebm.feature_types_in_

['nominal', 'continuous', 'continuous']

In [42]:
ebm.bins_[2]

[array([3.78669276e-01, 7.57338552e-01, 1.13600783e+00, ...,
        3.85863992e+02, 3.86242661e+02, 3.86621331e+02], shape=(1021,)),
 array([  6.24193548,  12.48387097,  18.72580645,  24.96774194,
         31.20967742,  37.4516129 ,  43.69354839,  49.93548387,
         56.17741935,  62.41935484,  68.66129032,  74.90322581,
         81.14516129,  87.38709677,  93.62903226,  99.87096774,
        106.11290323, 112.35483871, 118.59677419, 124.83870968,
        131.08064516, 137.32258065, 143.56451613, 149.80645161,
        156.0483871 , 162.29032258, 168.53225806, 174.77419355,
        181.01612903, 187.25806452, 193.5       , 199.74193548,
        205.98387097, 212.22580645, 218.46774194, 224.70967742,
        230.9516129 , 237.19354839, 243.43548387, 249.67741935,
        255.91935484, 262.16129032, 268.40322581, 274.64516129,
        280.88709677, 287.12903226, 293.37096774, 299.61290323,
        305.85483871, 312.09677419, 318.33870968, 324.58064516,
        330.82258065, 337.06451613

In [44]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [8]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import roc_auc_score

# Predict on test set
y_pred = ebm.predict(data.X_test)

y_pred_proba = ebm.predict_proba(data.X_test)[:,1]

#y_pred = y_pred_proba > 0.9

#y_pred_proba[:] = True # 0.5
#y_pred_proba[:] = False # 0.5

#y_pred[:] = True #
#y_pred[:] = False #


# Calculate ROC AUC score
roc_auc = roc_auc_score(data.y_test, y_pred_proba)

# Calculate F1 and accuracy
f1 = f1_score(data.y_test, y_pred)
accuracy = accuracy_score(data.y_test, y_pred)

print(f"ROC: {roc_auc:.3f}")
# better for imbalanced data, doesn't consider tn
print(f"F1 Score: {f1:.3f}")
# proportion of tp+tn to all predictions (misleading on imbalanced sets)
print(f"Accuracy: {accuracy:.3f}")


# ebm.score(X_test, y_test) # mean accuracy

# Calls and puts 0.15 sample
# ROC: 0.690
# F1 Score: 0.125
# Accuracy: 0.646

# Calls only
# ROC: 0.701
# F1 Score: 0.140
# Accuracy: 0.652

# puts only
# ROC: 0.676
# F1 Score: 0.124
# Accuracy: 0.642



ROC: 0.827
F1 Score: 0.803
Accuracy: 0.743


In [None]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# cm = confusion_matrix(data.y_test, y_pred_proba > 0.5)

# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[1, 0])
# disp.plot()

In [None]:
# ebm_perf = ROC(ebm).explain_perf(data.X_test, data.y_test, name='EBM')
# show(ebm_perf)

In [None]:
# ebm_local = ebm.explain_local(X_test[:5], y_test[:5], name='EBM')
# show(ebm_local, 0)

In [None]:
import gamchanger as gc

# Confusion Matrix
# https://developers.google.com/machine-learning/crash-course/classification/thresholding

gc.visualize(ebm, data.X_test.to_pandas(), data.y_test.to_pandas())

6it [00:00, 2999.14it/s]


In [12]:
import pickle

with open("./data/ebm.pkl", "wb") as f:
    pickle.dump(ebm, f)

In [13]:
with open("./data/ebm.pkl", "rb") as f:
    ebm_loaded = pickle.load(f)
ebm_loaded