# PyCaret
Based on "Python for Finance cookbook: Chapter 7."

In [1]:
!pip install pycaret scipy==1.11.4



In [2]:
%matplotlib inline
%config InlineBackend.figure_format = "retina"
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from decimal import Decimal

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data

#### SPX OHLC

In [4]:
spx = yf.Ticker('^SPX')
spx_history = spx.history(period='2y')
spx_history

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-08-17 00:00:00-04:00,4280.399902,4302.180176,4253.080078,4274.040039,3885030000,0.0,0.0
2022-08-18 00:00:00-04:00,4273.129883,4292.529785,4261.979980,4283.740234,3340330000,0.0,0.0
2022-08-19 00:00:00-04:00,4266.310059,4266.310059,4218.700195,4228.479980,3761340000,0.0,0.0
2022-08-22 00:00:00-04:00,4195.080078,4195.080078,4129.859863,4137.990234,3907430000,0.0,0.0
2022-08-23 00:00:00-04:00,4133.089844,4159.770020,4124.029785,4128.729980,3823520000,0.0,0.0
...,...,...,...,...,...,...,...
2024-08-12 00:00:00-04:00,5351.879883,5371.200195,5324.370117,5344.390137,3360160000,0.0,0.0
2024-08-13 00:00:00-04:00,5376.979980,5436.500000,5376.979980,5434.430176,3648980000,0.0,0.0
2024-08-14 00:00:00-04:00,5442.359863,5463.220215,5415.910156,5455.209961,3380050000,0.0,0.0
2024-08-15 00:00:00-04:00,5501.129883,5546.229980,5501.129883,5543.220215,3723310000,0.0,0.0


#### ATM Options

In [5]:
date_range = pd.date_range(start="2024-01-03", end = "2024-01-31", freq='D')
df_concat = pd.DataFrame()

for date in date_range:
    date_hyphens = date.strftime('%Y-%m-%d')
    date_no_hyphens = date.strftime('%Y%m%d')

    if date_hyphens not in spx_history.index:
        print(date_hyphens + " OHLC not found")
        continue
    else:
        print(f"{date_hyphens}")
        row = spx_history.loc[date_hyphens]
        open = row['Open']
        print(f"\tOpen:  {open:.2f}")
        path =  "/content/drive/MyDrive/Finance/Trading/data/spxw"
        strikes = pd.read_csv(f"{path}/strikes/{date_no_hyphens}.csv.gz", compression='gzip')
        atm_strike = strikes.loc[(strikes.sub(Decimal(str(open * 1000))).abs().idxmin())]
        atm_strike = atm_strike.iloc[0].strike
        print(f"\tATM Strike: {atm_strike}")

        interval = '1m'
        df = pd.read_csv(f"{path}/0dte/{interval}/{date_no_hyphens}.csv.gz", compression='gzip')
        df = df[df['strike'] == atm_strike]
        expected_rows = 782
        if (len(df) != expected_rows):
            print(f"{date_no_hyphens} expected {expected_rows} rows but got {df.rows.count()}")
        print(f"\tOption prices: {len(df)}")

        df_concat = pd.concat([df_concat, df])

df = df_concat.copy()

2024-01-03
	Open:  4725.07
	ATM Strike: 4725000
	Option prices: 782
2024-01-04
	Open:  4697.42
	ATM Strike: 4695000
	Option prices: 782
2024-01-05
	Open:  4690.57
	ATM Strike: 4690000
	Option prices: 782
2024-01-06 OHLC not found
2024-01-07 OHLC not found
2024-01-08
	Open:  4703.70
	ATM Strike: 4705000
	Option prices: 782
2024-01-09
	Open:  4741.93
	ATM Strike: 4740000
	Option prices: 782
2024-01-10
	Open:  4759.94
	ATM Strike: 4760000
	Option prices: 782
2024-01-11
	Open:  4792.13
	ATM Strike: 4790000
	Option prices: 782
2024-01-12
	Open:  4791.18
	ATM Strike: 4790000
	Option prices: 782
2024-01-13 OHLC not found
2024-01-14 OHLC not found
2024-01-15 OHLC not found
2024-01-16
	Open:  4772.35
	ATM Strike: 4770000
	Option prices: 782
2024-01-17
	Open:  4739.13
	ATM Strike: 4740000
	Option prices: 782
2024-01-18
	Open:  4760.10
	ATM Strike: 4760000
	Option prices: 782
2024-01-19
	Open:  4796.28
	ATM Strike: 4795000
	Option prices: 782
2024-01-20 OHLC not found
2024-01-21 OHLC not found
20

### Prepare data

In [6]:
# Add columns
df['mid'] = round((df['bid'] + df['ask']) / 2, 4)
# df['vbid'] = round((df['bid'] * df['bid_size']) /2, 4)
# df['vask'] =  round((df['ask'] * df['ask_size']) /2, 4)

# Drop columns
df.drop(columns=['expiration', 'root', 'bid_exchange', 'bid_condition', 'ask_exchange', 'ask_condition'], inplace=True)
df.drop(columns=['bid', 'bid_size', 'ask', 'ask_size'], inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)
df.drop_duplicates(inplace=True)

# Pivot bid/ask from separate rows to columns
pivot_df = df.pivot_table(index=['date', 'ms_of_day'], columns='right', values='mid', aggfunc='first')
pivot_df.columns = ['call_mid', 'put_mid'] # Rename the columns
pivot_df = pivot_df.reset_index() # Reset the index

# Remove opening interval and set datetime index
pivot_df = pivot_df[pivot_df['ms_of_day'] != 34200000]
pivot_df['ts'] = pd.to_datetime(pivot_df['date'], format='%Y%m%d') + pd.to_timedelta(pivot_df['ms_of_day'], unit='ms')
pivot_df.set_index('ts', inplace=True)


print(f"Number of quotes loaded: {len(pivot_df)}")
pivot_df

df = pivot_df.copy()
df = df.asfreq('T')  # T = minute; S = second

Number of quotes loaded: 7800


## Forward fill non-trading hours!

In [16]:
df.fillna(method='ffill', inplace=True)

## Setup
* data=df: Specifies the dataset to be used for the experiment. df should be a DataFrame containing your time series data.
* fh=12: Forecast horizon (fh) is set to 12, meaning the experiment will aim to predict the next 12 time points into the future.
* fold=3: Number of cross-validation folds to 3. This means the data will be split into 3 parts to validate the model’s performance.
* session_id=123: Random seed.


In [17]:
from pycaret.datasets import get_data
from pycaret.time_series import TSForecastingExperiment

exp = TSForecastingExperiment()
exp.setup(data=df, fh=12, target='put_mid', fold=3, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,put_mid
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(40710, 4)"
5,Transformed data shape,"(40710, 4)"
6,Transformed train set shape,"(40698, 4)"
7,Transformed test set shape,"(12, 4)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x7df7ed356bc0>

In [18]:
exp.plot_model( plot="diagnostics", fig_kwargs={"height": 800, "width": 1000})
# exp.plot_model(plot="cv")
# exp.plot_model(plot="acf")
# exp.plot_model(plot="periodogram")
# exp.plot_model(plot="fft")

Output hidden; open in https://colab.research.google.com to view.

In [19]:
exp.check_stats()
# exp.check_stats(test="summary")

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Summary,Statistics,Transformed,Length,,40710.0
1,Summary,Statistics,Transformed,# Missing Values,,0.0
2,Summary,Statistics,Transformed,Mean,,4.279529
3,Summary,Statistics,Transformed,Median,,0.275
4,Summary,Statistics,Transformed,Standard Deviation,,6.368783
5,Summary,Statistics,Transformed,Variance,,40.561396
6,Summary,Statistics,Transformed,Kurtosis,,6.72031
7,Summary,Statistics,Transformed,Skewness,,2.212228
8,Summary,Statistics,Transformed,# Distinct Values,,773.0
9,White Noise,Ljung-Box,Transformed,Test Statictic,"{'alpha': 0.05, 'K': 24}",916786.074744


## Train

In [20]:
best_pipelines = exp.compare_models(
    sort="MAPE", turbo=False, n_select=5
)

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
xgboost_cds_dt,Extreme Gradient Boosting w/ Cond. Deseasonalize & Detrending,10.9597,3.464,4.279,5.0024,0.1025,0.1094,-1.9367,2.04
lightgbm_cds_dt,Light Gradient Boosting w/ Cond. Deseasonalize & Detrending,12.1085,3.7011,4.7314,5.3478,0.1089,0.1141,-8.7477,4.1333
et_cds_dt,Extra Trees w/ Cond. Deseasonalize & Detrending,11.6381,3.5842,4.5417,5.1739,0.111,0.1171,-1.8212,22.6833
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,12.7583,4.041,4.9839,5.8383,0.1144,0.127,-4.6512,1.64
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,12.2181,3.7525,4.7695,5.4186,0.115,0.1241,-2.2196,1.7133
ridge_cds_dt,Ridge w/ Cond. Deseasonalize & Detrending,12.2173,3.7523,4.7692,5.4183,0.115,0.1241,-2.2191,1.0467
br_cds_dt,Bayesian Ridge w/ Cond. Deseasonalize & Detrending,12.2162,3.7519,4.7688,5.4178,0.115,0.1241,-2.2184,1.11
rf_cds_dt,Random Forest w/ Cond. Deseasonalize & Detrending,12.1694,3.7456,4.749,5.4068,0.1162,0.1226,-2.1111,100.77
ada_cds_dt,AdaBoost w/ Cond. Deseasonalize & Detrending,12.7969,3.9285,4.9947,5.6717,0.121,0.1273,-2.8745,10.01
gbr_cds_dt,Gradient Boosting w/ Cond. Deseasonalize & Detrending,12.8713,3.816,5.0245,5.5098,0.1216,0.13,-2.4528,29.7567


Processing:   0%|          | 0/93 [00:00<?, ?it/s]

## Tune

In [None]:
best_pipelines_tuned = [exp.tune_model(model) for model in best_pipelines]
best_pipelines_tuned

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.4s finished


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2015-12,0.2904,0.3194,0.2512,0.2904,0.0526,0.0545,0.0143
1,2016-12,0.082,0.0944,0.0649,0.0805,0.0152,0.015,0.9575
2,2017-12,0.1545,0.1741,0.1163,0.142,0.0298,0.0305,0.8222
Mean,NaT,0.1757,0.1959,0.1441,0.171,0.0325,0.0334,0.598
SD,NaT,0.0864,0.0931,0.0786,0.0881,0.0154,0.0162,0.4164


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.4s finished


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2015-12,0.3279,0.3672,0.2837,0.3339,0.0593,0.0618,-0.3031
1,2016-12,0.0765,0.0903,0.0606,0.077,0.0143,0.0142,0.9611
2,2017-12,0.1635,0.1869,0.123,0.1524,0.0317,0.0325,0.795
Mean,NaT,0.1893,0.2148,0.1558,0.1878,0.0351,0.0361,0.4843
SD,NaT,0.1043,0.1147,0.094,0.1078,0.0185,0.0196,0.5609


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.2s finished


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2015-12,0.2798,0.3274,0.242,0.2977,0.049,0.0478,-0.0362
1,2016-12,0.2613,0.2656,0.2068,0.2265,0.0471,0.0458,0.6636
2,2017-12,0.2786,0.2844,0.2096,0.232,0.053,0.0516,0.5251
Mean,NaT,0.2732,0.2925,0.2195,0.2521,0.0497,0.0484,0.3842
SD,NaT,0.0085,0.0259,0.016,0.0324,0.0025,0.0024,0.3026


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2015-12,0.2134,0.2403,0.1846,0.2185,0.0385,0.0395,0.442
1,2016-12,0.1289,0.1337,0.1021,0.114,0.0238,0.0234,0.9147
2,2017-12,0.1017,0.1206,0.0765,0.0983,0.0199,0.0201,0.9147
Mean,NaT,0.148,0.1648,0.1211,0.1436,0.0274,0.0277,0.7571
SD,NaT,0.0476,0.0536,0.0461,0.0533,0.008,0.0085,0.2228


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished


[ThetaForecaster(sp=12),
 STLForecaster(low_pass_deg=0, seasonal_deg=0, sp=12, trend_deg=0),
 AutoETS(seasonal='add', sp=12, trend='mul')]

## Blend

In [None]:
blended_model = exp.blend_models(
    best_pipelines_tuned, method="mean"
)

Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2015-12,0.2245,0.2454,0.1942,0.2232,0.0405,0.0415,0.4178
1,2016-12,0.1183,0.1284,0.0937,0.1095,0.0222,0.0219,0.9213
2,2017-12,0.0844,0.1201,0.0635,0.0979,0.0166,0.0169,0.9154
Mean,NaT,0.1424,0.1646,0.1171,0.1435,0.0265,0.0268,0.7515
SD,NaT,0.0597,0.0572,0.0559,0.0565,0.0102,0.0106,0.236
