In [4]:
import os
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from model_settings import ms

# NOTE(review): presumably locates the project root starting from the current
# working directory (`Path()`) and stores it on `ms.root`, which the
# model-loading cell reads — confirm in model_settings.
ms.find_root(Path())

# functions

In [5]:
def compute_RMSE(diff):
    """Return the root-mean-square of the errors in ``diff``.

    Parameters
    ----------
    diff : pandas.Series, pandas.DataFrame or array-like
        Prediction errors. Objects exposing ``.values`` (pandas) are read
        through that attribute; anything else (lists, NumPy arrays) is
        converted with ``np.asarray``.

    Returns
    -------
    float or None
        ``sqrt(mean(diff ** 2))``, or ``None`` when ``diff`` is empty.
    """
    if len(diff) == 0:
        # An empty sample has no defined error; callers receive None.
        return None
    values = diff.values if hasattr(diff, "values") else np.asarray(diff)
    return np.sqrt(np.mean(values**2))
        
def compute_MAE(diff):
    """Return the mean absolute value of the errors in ``diff``.

    Parameters
    ----------
    diff : pandas.Series, pandas.DataFrame or array-like
        Prediction errors. Objects exposing ``.values`` (pandas) are read
        through that attribute; anything else (lists, NumPy arrays) is
        converted with ``np.asarray``.

    Returns
    -------
    float or None
        ``mean(abs(diff))``, or ``None`` when ``diff`` is empty.
    """
    if len(diff) == 0:
        # An empty sample has no defined error; callers receive None.
        return None
    values = diff.values if hasattr(diff, "values") else np.asarray(diff)
    return np.mean(np.abs(values))

# loading model

In [6]:
root = os.path.join(ms.root,ms.MacDirEx)
models_dir = os.path.join(root,ms.trained_models)
# Keep every entry of the trained-models directory except hidden files and
# names beginning with 'Legacy'.  The earlier filter used the truthiness of
# `f.find('Legacy')`, which is falsy only for a match at index 0 (a miss
# returns -1, which is truthy); `startswith` states that prefix test
# explicitly.  If names containing 'Legacy' anywhere should be dropped, use
# `'Legacy' not in f` instead.
models = pd.Series([f for f in os.listdir(models_dir) if not f.startswith(('.', 'Legacy'))])
# Print a numbered listing so a model can be picked by position.
for i,m in enumerate(models):
    print(f"{i}     {m}")

0     2024-11-09 124653897492 cboe spx relative barrier
1     2024-11-09 112825652640 cboe spx relative asian


In [7]:
# Pick the model by its position in the listing printed by the previous cell.
# os.listdir gives no ordering guarantee, so re-check that listing before
# relying on a fixed position.
selected_model = models.iloc[1]
model_dir = os.path.join(models_dir,selected_model)
pickle_files = [f for f in os.listdir(model_dir) if f.endswith('.pkl')]
if not pickle_files:
    # Fail with a message naming the directory rather than a bare IndexError.
    raise FileNotFoundError(f"no .pkl file found in {model_dir}")
pickle_path = os.path.join(model_dir,pickle_files[0])
# joblib.load unpickles the file and can therefore execute arbitrary code:
# only load model files that come from a trusted source.
model = joblib.load(pickle_path)
model

{'seed': 1312,
 'raw_data':         spot_price  strike_price  days_to_maturity  n_fixings  \
 0           1274.0        637.00                 7        1.0   
 1           1274.0        637.00                 7        1.0   
 2           1274.0        637.00                 7        1.0   
 3           1274.0        637.00                 7        1.0   
 4           1274.0        955.50                 7        1.0   
 ...            ...           ...               ...        ...   
 338935      5857.0       7320.75                84        1.0   
 338936      5857.0       8785.00                84        1.0   
 338937      5857.0       8785.00                84        1.0   
 338938      5857.0       8785.00                84        1.0   
 338939      5857.0       8785.00                84        1.0   
 
         fixing_frequency  past_fixings averaging_type     w  risk_free_rate  \
 0                      7             0      geometric  call        0.001578   
 1                 

In [8]:
# List the keys stored in the loaded model dictionary, one per line.
print('model attributes:\n')
for attribute_name in model:
    print(attribute_name)

model attributes:

seed
raw_data
dataset
target_name
excluded_features
numerical_features
categorical_features
feature_set
n_features
development_dates
test_dates
train_data
test_data
train_X
train_y
test_X
test_y
preprocessor
pipeline
model
model_fit
dnn_runtime
numerical_scaler
dnn_params
transformers
regressor
dnn_pipeline


In [9]:
# Show the sorted distinct test-set values of every feature in the feature set.
for col in model['feature_set']:
    # Build the pieces outside the f-strings: reusing double quotes inside a
    # double-quoted f-string is only valid syntax from Python 3.12 (PEP 701).
    label = col.replace('_', ' ')
    distinct_values = (
        model['test_data'][col]
        .squeeze()
        .sort_values()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print(f"{label}:", f"\n{distinct_values}\n")
print()

days to maturity: 
0     7
1    28
2    84
Name: days_to_maturity, dtype: int64

fixing frequency: 
0     7
1    28
2    84
Name: fixing_frequency, dtype: int64

past fixings: 
0    0
Name: past_fixings, dtype: int64

risk free rate: 
0      0.000330
1      0.000355
2      0.000380
3      0.000406
4      0.000431
         ...   
781    0.054494
782    0.054500
783    0.054513
784    0.054801
785    0.054889
Name: risk_free_rate, Length: 786, dtype: float64

dividend rate: 
0       0.012624
1       0.012628
2       0.012641
3       0.012671
4       0.012687
          ...   
1152    0.023672
1153    0.023681
1154    0.023780
1155    0.023823
1156    0.024133
Name: dividend_rate, Length: 1157, dtype: float64

kappa: 
0       3.553361e-10
1       3.924365e-10
2       7.518568e-10
3       8.464905e-10
4       1.055398e-09
            ...     
1747    6.161121e+01
1748    6.214923e+01
1749    6.690466e+01
1750    8.814194e+01
1751    1.158504e+02
Name: kappa, Length: 1752, dtype: float64

th

In [10]:
def _indexed_by_calculation_date(frame):
    """Return a copy of ``frame`` indexed and sorted by its parsed ``calculation_date``."""
    parsed = pd.to_datetime(frame['calculation_date'],format='mixed')
    return (
        frame
        .assign(calculation_date=parsed)
        .set_index('calculation_date')
        .sort_index()
    )


# Work on copies so the frames stored inside `model` stay untouched.
train_data = _indexed_by_calculation_date(model['train_data'])
test_data = _indexed_by_calculation_date(model['test_data'])
print(model['feature_set'])

['days_to_maturity', 'fixing_frequency', 'past_fixings', 'risk_free_rate', 'dividend_rate', 'kappa', 'theta', 'rho', 'eta', 'v0', 'relative_spot', 'averaging_type', 'w']


# retraining

In [11]:
# convsklearn is the project's convenience-wrapper class around sklearn; it is
# instantiated in the retraining cell below.  help() prints its constructor
# signature and method list for reference.
from convsklearn import convsklearn
help(convsklearn)

Help on class convsklearn in module convsklearn.convsklearn:

class convsklearn(builtins.object)
 |  convsklearn(target_name='observed_price', excluded_features=['barrier_price', 'asian_price', 'observed_price', 'outin', 'updown', 'n_fixings'], seed=1312)
 |
 |  a proprietary class of convenience wrappers for sklearn
 |
 |  Methods defined here:
 |
 |  __init__(self, target_name='observed_price', excluded_features=['barrier_price', 'asian_price', 'observed_price', 'outin', 'updown', 'n_fixings'], seed=1312)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |
 |  load_data(self, data)
 |
 |  preprocess_data(self, development_dates, test_dates, plot=True)
 |
 |  run_dnn(self, print_details=True)
 |
 |  test_prediction_accuracy(self)
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |
 |  __dict__
 |      dictionary for instance variables
 |
 |  __weakref__
 |      list of weak references to the object



In [28]:
test_dates = model['test_dates']
full_dataset = model['dataset']
# Text before the first underscore of the first column whose name contains
# 'asian_price' or 'barrier_price' (e.g. 'asian_price' -> 'asian'); used in the
# output filename when the results are saved.
pricename = [
    name.partition('_')[0]
    for name in full_dataset.columns
    if 'asian_price' in name or 'barrier_price' in name
][0]
# Distinct dataset dates in chronological order.  The raw values are
# de-duplicated first only to keep parsing cheap; the final de-duplication and
# the sort run on the parsed timestamps, because values parsed with
# format='mixed' may be strings in differing formats, which would otherwise be
# ordered as text and could survive as duplicates of the same day.
all_dates = (
    pd.to_datetime(full_dataset['date'].drop_duplicates(),format='mixed')
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
# `retraining_frequency` and `retraining_dates` are not set in this cell; an
# earlier draft used retraining_frequency = 21 and
# test_dates.iloc[0::retraining_frequency].reset_index(drop=True).
all_dates[:20]

0    2012-01-03
1    2012-01-05
2    2012-01-06
3    2012-01-10
4    2012-01-18
5    2012-01-19
6    2012-01-24
7    2012-01-25
8    2012-01-27
9    2012-01-30
10   2012-02-01
11   2012-02-02
12   2012-02-03
13   2012-02-06
14   2012-02-08
15   2012-02-14
16   2012-02-15
17   2012-02-16
18   2012-02-17
19   2012-02-23
Name: date, dtype: datetime64[ns]

In [34]:
# Point the settings object at the data root before collecting calibrations.
# NOTE(review): collect_spx_calibrations() presumably populates
# ms.spx_calibrations, which is read right after — confirm in model_settings.
ms.root = root
ms.collect_spx_calibrations()
calibrations = ms.spx_calibrations
# assign() returns a new frame, so ms.spx_calibrations itself is not modified.
df = calibrations.assign(
    calculation_date=pd.to_datetime(calibrations['calculation_date'],format='mixed')
)
df

Unnamed: 0,calculation_date,spot_price,theta,kappa,rho,eta,v0,feller,contracts_count,total_volume,risk_free_rate,dividend_rate
0,2012-01-03 13:00:22.494,1274.0,2.008515e-01,3.183380,-1.000000,0.601996,0.012430,0.916374,8.0,323.0,0.001578,0.020858
1,2012-01-05 16:07:08.939,1274.0,5.275148e-02,20.705205,-0.798106,5.380077,0.281898,-26.760769,5.0,24528.0,0.001776,0.020865
2,2012-01-06 15:31:47.973,1278.0,1.008715e-01,1.039146,-1.000000,0.464080,0.039313,-0.005730,11.0,74203.0,0.001675,0.020934
3,2012-01-10 16:14:23.010,1289.0,2.803770e-01,4.503545,-1.000000,0.872259,0.017639,1.764546,9.0,134.0,0.001523,0.020710
4,2012-01-18 09:46:15.111,1292.0,8.745319e-02,9.048155,-0.840488,4.336362,0.173898,-17.221456,6.0,17882.0,0.001017,0.020491
...,...,...,...,...,...,...,...,...,...,...,...,...
1878,2024-10-07 15:57:53.651,5729.0,4.727916e-02,12.794262,-0.721929,3.059122,0.051629,-8.148423,5.0,43542.0,0.049994,0.013044
1879,2024-10-08 15:48:23.420,5726.0,4.301618e-02,12.417660,-0.752695,6.719982,0.158345,-44.089840,8.0,31196.0,0.049994,0.013044
1880,2024-10-09 15:33:43.664,5785.0,2.551542e-01,3.676912,-1.000000,0.917628,0.012088,1.034319,6.0,29867.0,0.040569,0.012830
1881,2024-10-14 12:23:32.382,5863.0,5.510754e-02,20.301854,-0.670089,6.124247,0.000915,-35.268836,7.0,79831.0,0.041813,0.012687


In [36]:
# Calibration rows whose timestamp lies between the dataset dates at positions
# 2 and 5 of `all_dates`, both bounds included.
window_start, window_end = all_dates[2], all_dates[5]
problem = df[df['calculation_date'].between(window_start, window_end)]
problem

Unnamed: 0,calculation_date,spot_price,theta,kappa,rho,eta,v0,feller,contracts_count,total_volume,risk_free_rate,dividend_rate
2,2012-01-06 15:31:47.973,1278.0,0.100872,1.039146,-1.0,0.46408,0.039313,-0.00573,11.0,74203.0,0.001675,0.020934
3,2012-01-10 16:14:23.010,1289.0,0.280377,4.503545,-1.0,0.872259,0.017639,1.764546,9.0,134.0,0.001523,0.02071
4,2012-01-18 09:46:15.111,1292.0,0.087453,9.048155,-0.840488,4.336362,0.173898,-17.221456,6.0,17882.0,0.001017,0.020491
5,2012-01-18 13:35:21.070,1299.0,0.123682,7.111792,-0.748386,2.702167,0.000188,-5.542509,6.0,54919.0,0.001017,0.020491


In [None]:
# df_collector is a project helper; help() prints its interface for reference.
# NOTE(review): collect_dfs presumably reads every file under
# <root>/<ms.cboe_spx_trades> into one DataFrame using a single worker
# (n_jobs=1) — confirm in df_collector.  `trades` is not used by the cells
# visible below.
from df_collector import df_collector
help(df_collector)
trades = df_collector.collect_dfs(os.path.join(root,ms.cboe_spx_trades),n_jobs=1)

In [None]:
# Walk-forward retraining: for every retraining date, fit on all dataset dates
# up to and including it, then evaluate on the next `retraining_frequency`
# dataset dates.

# These two names previously existed only as commented-out code, so this cell
# raised NameError on a fresh kernel.  Values restored from that draft.
# NOTE(review): assumes model['test_dates'] holds date-like values that
# pd.to_datetime can parse — confirm.
retraining_frequency = 21
retraining_dates = (
    pd.to_datetime(pd.Series(test_dates),format='mixed')
    .drop_duplicates()
    .sort_values()
    .iloc[::retraining_frequency]
    .reset_index(drop=True)
)

# One row of runtime/error metrics per retraining date, NaN until filled in.
cols = ['cpu','insample_MAE','insample_RMSE','outofsample_MAE','outofsample_RMSE']
df = pd.DataFrame(np.nan,index=retraining_dates,columns=cols)

models = {}

retrainer = convsklearn()
# Build a new list instead of using `+=`: in-place extension would also grow
# whatever list the instance shares (the constructor's default is a list
# literal), so re-running the cell would keep appending to it.
retrainer.excluded_features = [*retrainer.excluded_features, *model['excluded_features']]
retrainer.target_name = model['target_name']
retrainer.load_data(full_dataset)

# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=len(retraining_dates),leave=True) as bar:
    for date in retraining_dates:
        print(date)
        development_dates = all_dates[all_dates<=date]
        subset_test_dates = all_dates[~all_dates.isin(development_dates)].iloc[:retraining_frequency]
        if development_dates.empty or subset_test_dates.empty:
            # Nothing to train on, or no later date to test on (e.g. the last
            # dataset date): min()/max() below would raise, so leave the row NaN.
            print('skipped: empty development or test window')
            bar.update(1)
            continue
        print(f"training:  {development_dates.min().strftime('%A, %d %B, %Y')}  -  {development_dates.max().strftime('%A, %d %B, %Y')} ({len(development_dates)} trading days)")
        print(f"testing:   {subset_test_dates.min().strftime('%A, %d %B, %Y')}  -  {subset_test_dates.max().strftime('%A, %d %B, %Y')} ({len(subset_test_dates)} trading days)")
        retrainer.preprocess_data(development_dates, subset_test_dates,plot=False)
        retrainer.run_dnn(print_details=True)
        retrainer.test_prediction_accuracy()
        # Store a per-date snapshot of the attribute dictionary.  Storing
        # `retrainer.__dict__` itself made every entry of `models` the same
        # object, so all dates ended up showing the last fit.
        # NOTE(review): this is a shallow copy; it assumes each run rebinds the
        # fitted attributes rather than mutating them in place — confirm in
        # convsklearn.
        m = dict(vars(retrainer))
        models[date] = m
        df.at[date,'cpu'] = m['dnn_runtime']
        df.at[date,'insample_MAE'] = compute_MAE(m['train_data']['insample_error'])
        df.at[date,'insample_RMSE'] = compute_RMSE(m['train_data']['insample_error'])
        df.at[date,'outofsample_MAE'] = compute_MAE(m['test_data']['outofsample_error'])
        df.at[date,'outofsample_RMSE'] = compute_RMSE(m['test_data']['outofsample_error'])
        print(df.dropna())
        print('\n','%'*10,'\n')
        bar.update(1)

In [None]:
# Visualise the retraining metrics table built by the retraining cell.
# NOTE(review): PlotCols is a project helper; presumably it draws one plot per
# column of the frame — confirm in plotters.
from plotters import PlotCols
PlotCols(df)

In [None]:
# Bundle the metrics table and the per-date model dictionaries, then pickle
# them under a filename prefixed with the tag returned by ms.timetag().  The
# filename is used as given, with no directory prepended.
tag = ms.timetag()
output_name = f'{tag} retrained_relative_{pricename}_options.pkl'
retrained = {'errors':df,'models':models}
joblib.dump(retrained,output_name)