In [32]:
import pickle

import polars as pl
import vectorbtpro as vbt

from ml.data import load_spy_sample
from ml.utils import by_date_str

In [33]:
# Unpickle the trained model so it can be inspected and called directly in later cells.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load ./data/ebm.pkl if it comes from a trusted source.
with open("./data/ebm.pkl", "rb") as f:
    ebm = pickle.load(f)
# Bare expression: display the model's repr as the cell output.
ebm

In [None]:
class EBMPredictor:
    """Score option rows with a pickled classifier and flag entry candidates.

    The model is unpickled once at construction time and stored on ``self.ebm``.
    It must expose ``predict_proba`` and ``classes_``.
    """

    # Columns handed to the model.
    FEATURE_COLUMNS = ("option_type", "close", "minute_index")
    # The best eligible row is chosen separately for each combination of these columns.
    GROUP_COLUMNS = ("date", "minute_index", "option_type")

    def __init__(self, model_path: str = "./data/ebm.pkl") -> None:
        """Load the model stored at ``model_path``.

        Args:
            model_path: Location of the pickled model file.
        """
        self.model_path = model_path
        self._load()

    def _load(self) -> None:
        """Unpickle the model at ``self.model_path`` into ``self.ebm``."""
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only point model_path at files from a trusted source.
        with open(self.model_path, "rb") as f:
            self.ebm = pickle.load(f)

    def predict(
        self,
        df: pl.DataFrame,
        *,
        minimum_price: float = 0.2,
        maximum_price: float = 1.5,
        minimum_proba: float = 0.5,
    ) -> pl.DataFrame:
        """Add the model probability and a boolean ``entry`` flag to ``df``.

        A row is *eligible* when ``minimum_price <= close <= maximum_price`` and
        its probability is at least ``minimum_proba``. Within each
        (date, minute_index, option_type) group, ``entry`` is True for the
        eligible row(s) with the highest probability; exact ties are all flagged.

        Args:
            df: Frame containing at least the feature and group columns.
            minimum_price: Lowest ``close`` (inclusive) allowed for an entry.
            maximum_price: Highest ``close`` (inclusive) allowed for an entry.
            minimum_proba: Lowest probability (inclusive) allowed for an entry.

        Returns:
            ``df`` with two extra columns: ``proba_<classes_[1]>`` (lower-cased,
            e.g. ``proba_true``) and ``entry``.
        """
        # Features (kept on the instance, as before, so callers can inspect them)
        self.X = df.select(list(self.FEATURE_COLUMNS))

        # Predict
        proba = self.ebm.predict_proba(self.X)

        # Name the probability column once and reuse it everywhere below, so the
        # filter never refers to a column that was not actually created.
        # NOTE(review): assumes classes_[1] is the positive class — confirm for new models.
        proba_col = f"proba_{self.ebm.classes_[1]}".lower()

        # Add probability of the second class
        df = (
            df.with_columns(pl.Series("proba", proba))
            .with_columns(pl.col("proba").arr.get(1).alias(proba_col))
            .drop("proba")
        )

        # Price and probability conditions
        eligible = (
            (pl.col("close") >= minimum_price)
            & (pl.col("close") <= maximum_price)
            & (pl.col(proba_col) >= minimum_proba)
        )

        # Highest probability among eligible rows of each group; null when the
        # group has no eligible row (max ignores the nulls produced by the mask).
        best_eligible_proba = (
            pl.when(eligible)
            .then(pl.col(proba_col))
            .otherwise(None)
            .max()
            .over(*self.GROUP_COLUMNS)
        )

        # Entry is true for eligible rows that hold their group's best probability.
        # fill_null keeps the flag strictly boolean when inputs contain nulls.
        return df.with_columns(
            (eligible & (pl.col(proba_col) == best_eligible_proba))
            .fill_null(False)
            .alias("entry")
        )



In [124]:
# January sample with option prices rounded to cents
df = load_spy_sample(months=[1]).with_columns(pl.col("close").round(2))

# Feature frame handed to the model
X = df.select("option_type", "close", "minute_index")
X

option_type,close,minute_index
str,f64,i32
"""C""",2.0,0
"""P""",6.6,0
"""P""",0.01,0
"""P""",0.04,0
"""C""",0.43,0
…,…,…
"""C""",0.01,389
"""C""",2.82,389
"""C""",0.01,389
"""P""",8.61,389


In [125]:
# Give the wrapper its own name: `ebm` is the raw unpickled model, and later cells
# read `ebm.classes_` / `ebm.predict_proba`, neither of which EBMPredictor defines.
predictor = EBMPredictor()
df = predictor.predict(df)
df

symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,entry
str,date,str,f64,f64,i32,f64,f64,f64,bool
"""SPY 240102C00471000""",2024-01-02,"""C""",471.0,2.0,0,0.322718,472.52,0.533933,false
"""SPY 240102P00479000""",2024-01-02,"""P""",479.0,6.6,0,1.371371,472.52,0.48429,false
"""SPY 240102P00447000""",2024-01-02,"""P""",447.0,0.01,0,-5.40083,472.52,0.000152,false
"""SPY 240102P00467000""",2024-01-02,"""P""",467.0,0.04,0,-1.168205,472.52,0.156075,false
"""SPY 240102C00474000""",2024-01-02,"""C""",474.0,0.43,0,-0.312236,472.52,0.748851,false
…,…,…,…,…,…,…,…,…,…
"""SPY 240131C00492000""",2024-01-31,"""C""",492.0,0.01,389,-1.849593,482.9,9.0109e-7,false
"""SPY 240131C00480000""",2024-01-31,"""C""",480.0,2.82,389,0.604167,482.9,0.098208,false
"""SPY 240131C00493000""",2024-01-31,"""C""",493.0,0.01,389,-2.048682,482.9,9.0109e-7,false
"""SPY 240131P00492000""",2024-01-31,"""P""",492.0,8.61,389,1.884448,482.9,0.45036,false


In [126]:
# Keep only the rows flagged as entries (`entry` is already a boolean column)
df_entry_true = df.filter(pl.col("entry"))
df_entry_true

symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,entry
str,date,str,f64,f64,i32,f64,f64,f64,bool
"""SPY 240102C00475000""",2024-01-02,"""C""",475.0,0.22,0,-0.522105,472.52,0.788184,true
"""SPY 240102P00470000""",2024-01-02,"""P""",470.0,0.23,0,-0.533311,472.52,0.930577,true
"""SPY 240102C00475000""",2024-01-02,"""C""",475.0,0.22,1,-0.490526,472.67,0.788277,true
"""SPY 240102P00471000""",2024-01-02,"""P""",471.0,0.4,1,-0.353312,472.67,0.935494,true
"""SPY 240102P00471000""",2024-01-02,"""P""",471.0,0.34,2,-0.376496,472.78,0.944917,true
…,…,…,…,…,…,…,…,…,…
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.38,385,0.064091,483.69,0.809527,true
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.34,386,0.057885,483.72,0.80301,true
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.31,387,0.028934,483.86,0.633739,true
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.47,388,0.100307,483.515,0.624752,true


In [55]:
# `ebm` must be the unpickled model object here (one exposing `classes_` and
# `predict_proba`), not an `EBMPredictor` wrapper, which defines neither.
print(f"Classes {ebm.classes_}")
# Class probabilities for the feature frame X.
# Presumably one column per entry of `ebm.classes_`, in the same order — confirm.
proba = ebm.predict_proba(X)

Classes [False  True]


In [56]:
# Attach the model output, keep only the element for classes_[1] under a
# lower-cased "proba_<class>" name, then discard the temporary nested column.
df = (
    df.with_columns(pl.Series("proba", proba))
    .with_columns(
        pl.col("proba").arr.get(1).alias(f"proba_{ebm.classes_[1]}".lower())
    )
    .drop("proba")
)
df

symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true
str,date,str,f64,f64,i32,f64,f64,f64
"""SPY 240102C00471000""",2024-01-02,"""C""",471.0,2.0,0,0.322718,472.52,0.533933
"""SPY 240102P00479000""",2024-01-02,"""P""",479.0,6.6,0,1.371371,472.52,0.48429
"""SPY 240102P00447000""",2024-01-02,"""P""",447.0,0.01,0,-5.40083,472.52,0.000152
"""SPY 240102P00467000""",2024-01-02,"""P""",467.0,0.04,0,-1.168205,472.52,0.156075
"""SPY 240102C00474000""",2024-01-02,"""C""",474.0,0.43,0,-0.312236,472.52,0.748851
…,…,…,…,…,…,…,…,…
"""SPY 240131C00492000""",2024-01-31,"""C""",492.0,0.01,389,-1.849593,482.9,9.0109e-7
"""SPY 240131C00480000""",2024-01-31,"""C""",480.0,2.82,389,0.604167,482.9,0.098208
"""SPY 240131C00493000""",2024-01-31,"""C""",493.0,0.01,389,-2.048682,482.9,9.0109e-7
"""SPY 240131P00492000""",2024-01-31,"""P""",492.0,8.61,389,1.884448,482.9,0.45036


In [None]:
# Rank by probability of true for each minute
# df = df.with_columns(
#     pl.col("proba_true").rank("min", descending=True).over('date', 'minute_index', 'option_type').alias("proba_true_rank")
# )
# df

symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,proba_true_rank
str,date,str,f64,f64,i32,f64,f64,f64,u32
"""SPY 240102C00471000""",2024-01-02,"""C""",471.0,2.0,0,0.322718,472.52,0.533933,7
"""SPY 240102P00479000""",2024-01-02,"""P""",479.0,6.6,0,1.371371,472.52,0.48429,11
"""SPY 240102P00447000""",2024-01-02,"""P""",447.0,0.01,0,-5.40083,472.52,0.000152,15
"""SPY 240102P00467000""",2024-01-02,"""P""",467.0,0.04,0,-1.168205,472.52,0.156075,14
"""SPY 240102C00474000""",2024-01-02,"""C""",474.0,0.43,0,-0.312236,472.52,0.748851,4
…,…,…,…,…,…,…,…,…,…
"""SPY 240131C00492000""",2024-01-31,"""C""",492.0,0.01,389,-1.849593,482.9,9.0109e-7,6
"""SPY 240131C00480000""",2024-01-31,"""C""",480.0,2.82,389,0.604167,482.9,0.098208,3
"""SPY 240131C00493000""",2024-01-31,"""C""",493.0,0.01,389,-2.048682,482.9,9.0109e-7,6
"""SPY 240131P00492000""",2024-01-31,"""P""",492.0,8.61,389,1.884448,482.9,0.45036,5


In [84]:
# Want the highest probability of true among rows inside the price band
# and above a minimum probability.
minimum_price, maximum_price, minimum_proba = 0.2, 1.5, 0.5

# Step 1: rank by probability within each (date, minute, option type) group,
# keeping the rank only on rows that satisfy the price and probability filters.
df = df.with_columns(
    pl.when(
        pl.col("close").is_between(minimum_price, maximum_price)
        & (pl.col("proba_true") >= minimum_proba)
    )
    .then(
        pl.col("proba_true")
        .rank("min", descending=True)
        .over('date', 'minute_index', 'option_type')
    )
    .otherwise(None)
    .alias("proba_true_rank")
)

# Step 2: re-rank the surviving ranks, because after masking they may no
# longer start at one within a group.
df = df.with_columns(
    pl.col("proba_true_rank")
    .rank("min", descending=False)
    .over('date', 'minute_index', 'option_type')
)
df

symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,proba_true_rank
str,date,str,f64,f64,i32,f64,f64,f64,u32
"""SPY 240102C00471000""",2024-01-02,"""C""",471.0,2.0,0,0.322718,472.52,0.533933,
"""SPY 240102P00479000""",2024-01-02,"""P""",479.0,6.6,0,1.371371,472.52,0.48429,
"""SPY 240102P00447000""",2024-01-02,"""P""",447.0,0.01,0,-5.40083,472.52,0.000152,
"""SPY 240102P00467000""",2024-01-02,"""P""",467.0,0.04,0,-1.168205,472.52,0.156075,
"""SPY 240102C00474000""",2024-01-02,"""C""",474.0,0.43,0,-0.312236,472.52,0.748851,2
…,…,…,…,…,…,…,…,…,…
"""SPY 240131C00492000""",2024-01-31,"""C""",492.0,0.01,389,-1.849593,482.9,9.0109e-7,
"""SPY 240131C00480000""",2024-01-31,"""C""",480.0,2.82,389,0.604167,482.9,0.098208,
"""SPY 240131C00493000""",2024-01-31,"""C""",493.0,0.01,389,-2.048682,482.9,9.0109e-7,
"""SPY 240131P00492000""",2024-01-31,"""P""",492.0,8.61,389,1.884448,482.9,0.45036,


In [85]:
# Show the five highest-probability calls and puts for one (date, minute) snapshot.
# Change the two values below to inspect a different snapshot; the earlier
# minute-0 / 2024-01-02 selection was overwritten before use and has been removed.
snapshot_date = "2024-01-31"
snapshot_minute = 389

# by_date_str (from ml.utils) is presumed to keep only rows for the given date — confirm.
df_snapshot = by_date_str(
    df.filter(pl.col("minute_index") == snapshot_minute),
    snapshot_date,
).sort("proba_true", descending=True)

display(df_snapshot.filter(pl.col("option_type") == "C").head(5))
display(df_snapshot.filter(pl.col("option_type") == "P").head(5))



symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,proba_true_rank
str,date,str,f64,f64,i32,f64,f64,f64,u32
"""SPY 240131C00483000""",2024-01-31,"""C""",483.0,0.21,389,-0.020704,482.9,0.150849,
"""SPY 240131C00460000""",2024-01-31,"""C""",460.0,23.42,389,4.978261,482.9,0.106464,
"""SPY 240131C00480000""",2024-01-31,"""C""",480.0,2.82,389,0.604167,482.9,0.098208,
"""SPY 240131C00481000""",2024-01-31,"""C""",481.0,2.4,389,0.39501,482.9,0.097546,
"""SPY 240131C00482000""",2024-01-31,"""C""",482.0,1.17,389,0.186722,482.9,0.096613,


symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,proba_true_rank
str,date,str,f64,f64,i32,f64,f64,f64,u32
"""SPY 240131P00483000""",2024-01-31,"""P""",483.0,0.18,389,0.020708,482.9,0.618875,
"""SPY 240131P00488000""",2024-01-31,"""P""",488.0,4.72,389,1.056119,482.9,0.503702,
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.9,389,0.22779,482.9,0.502329,1.0
"""SPY 240131P00485000""",2024-01-31,"""P""",485.0,1.78,389,0.434873,482.9,0.451682,
"""SPY 240131P00493000""",2024-01-31,"""P""",493.0,8.38,389,2.09153,482.9,0.45036,


In [87]:
# Rows that hold rank one within their group
df_top_proba = df.filter(pl.col("proba_true_rank").eq(1))
df_top_proba

symbol,date,option_type,strike,close,minute_index,moneyness,close_underlying,proba_true,proba_true_rank
str,date,str,f64,f64,i32,f64,f64,f64,u32
"""SPY 240102C00475000""",2024-01-02,"""C""",475.0,0.22,0,-0.522105,472.52,0.788184,1
"""SPY 240102P00470000""",2024-01-02,"""P""",470.0,0.23,0,-0.533311,472.52,0.930577,1
"""SPY 240102C00475000""",2024-01-02,"""C""",475.0,0.22,1,-0.490526,472.67,0.788277,1
"""SPY 240102P00471000""",2024-01-02,"""P""",471.0,0.4,1,-0.353312,472.67,0.935494,1
"""SPY 240102P00471000""",2024-01-02,"""P""",471.0,0.34,2,-0.376496,472.78,0.944917,1
…,…,…,…,…,…,…,…,…,…
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.38,385,0.064091,483.69,0.809527,1
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.34,386,0.057885,483.72,0.80301,1
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.31,387,0.028934,483.86,0.633739,1
"""SPY 240131P00484000""",2024-01-31,"""P""",484.0,0.47,388,0.100307,483.515,0.624752,1
