In [1]:
import os
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm

# functions

In [2]:
def compute_RMSE(diff):
    """Return the root-mean-square of a collection of errors.

    Parameters
    ----------
    diff : array-like
        Error values. A pandas Series/DataFrame, a NumPy array or a plain
        sequence are all accepted (anything ``len()`` and ``np.asarray``
        understand).

    Returns
    -------
    float or None
        ``sqrt(mean(diff**2))``, or ``None`` when ``diff`` is empty so that
        callers (e.g. a resampler applied to days without observations) get
        a missing value instead of a NumPy "mean of empty slice" warning.
    """
    if len(diff) == 0:
        return None
    errors = np.asarray(diff)
    return np.sqrt(np.mean(errors**2))
        
def compute_MAE(diff):
    """Return the mean absolute value of a collection of errors.

    Parameters
    ----------
    diff : array-like
        Error values. A pandas Series/DataFrame, a NumPy array or a plain
        sequence are all accepted (anything ``len()`` and ``np.asarray``
        understand).

    Returns
    -------
    float or None
        ``mean(|diff|)``, or ``None`` when ``diff`` is empty so that callers
        get a missing value instead of a NumPy "mean of empty slice" warning.
    """
    if len(diff) == 0:
        return None
    errors = np.asarray(diff)
    return np.mean(np.abs(errors))

def _daily_error_frame(data, error_column):
    """Build the one-row-per-date frame plotted by ``plot_errors``.

    ``data`` must carry a datetime index named ``calculation_date`` (required
    by ``resample`` and by the final ``drop``) and the columns ``date``,
    ``spot_price``, ``rho``, ``v0`` and ``error_column``.
    """
    errors = data[error_column]

    frame = data[['date','spot_price','rho','v0']].copy()
    frame = frame.reset_index().set_index('date')

    # Daily error statistics; assignment aligns them to the 'date' index.
    frame['MAE'] = errors.resample('D').apply(compute_MAE)
    frame['RMSE'] = errors.resample('D').apply(compute_RMSE)

    # Keep the last observation of every date and discard the original index.
    frame = (
        frame
        .reset_index()
        .drop_duplicates(subset=['date'],keep='last')
        .set_index('date')
        .drop(columns='calculation_date')
    )

    # The x-axis is an evenly spaced range between the first and last
    # calculation date, with one tick per remaining row.
    frame.index = pd.date_range(
        start=min(data.index),
        end=max(data.index),
        periods=frame.shape[0],
    )
    return frame


def plot_errors(train_data,test_data):
    """Plot market inputs and daily pricing errors, in-sample vs out-of-sample.

    One subplot is drawn per column (``spot_price``, ``rho``, ``v0``, ``MAE``,
    ``RMSE``); the in-sample series is green and the out-of-sample series is
    purple, sharing a common x-axis.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training observations indexed by a datetime index named
        ``calculation_date`` with columns ``date``, ``spot_price``, ``rho``,
        ``v0`` and ``insample_error``.
    test_data : pandas.DataFrame
        Test observations with the same layout, except that the error column
        is ``outofsample_error``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    The ``MAE`` panel is computed with ``compute_MAE``; it was previously
    filled with ``compute_RMSE`` and therefore duplicated the ``RMSE`` panel.
    """
    train_plot = _daily_error_frame(train_data, 'insample_error')
    test_plot = _daily_error_frame(test_data, 'outofsample_error')

    n_panels = max(len(train_plot.columns),len(test_plot.columns))
    fig,axs = plt.subplots(n_panels,figsize=(10,10),sharex=True)

    series_to_draw = (
        (train_plot, 'green', 'in-sample'),
        (test_plot, 'purple', 'out-of-sample'),
    )
    for frame, color, label in series_to_draw:
        for ax, col in zip(axs, frame.columns):
            ax.plot(frame[col],color=color,label=label)
            ax.set_title(col.replace('_',' '))
            ax.legend()
    plt.show()

# loading model

In [3]:
from model_settings import ms

# Trained models live two directory levels above the current working directory.
root = Path().resolve().parent.parent
models_dir = os.path.join(root,ms.trained_models)

# os.listdir returns entries in arbitrary order; sort them so the positional
# index printed below (and used to select a model) is stable between runs and
# across machines. Entries whose name contains 'Legacy' are skipped.
models = pd.Series(sorted(f for f in os.listdir(models_dir) if 'Legacy' not in f))
for i,m in enumerate(models):
    print(f"{i}     {m}")

0     2024-11-09 112825652640 cboe spx relative asian
1     2024-11-09 124653897492 cboe spx relative barrier


In [4]:
# Position of the model to load in the listing printed by the previous cell.
selected_model = models.iloc[1]
model_dir = os.path.join(models_dir,selected_model)

# Sorted so the choice is deterministic if the directory holds several pickles.
pickle_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.pkl'))
if not pickle_files:
    raise FileNotFoundError(f"no .pkl file found in {model_dir}")
pickle_path = os.path.join(model_dir,pickle_files[0])

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files produced by a trusted source.
model = joblib.load(pickle_path)
model

{'seed': 1312,
 'raw_data':          spot_price  strike_price  barrier  days_to_maturity updown outin  \
 0            1274.0        1146.6    637.0                60   Down   Out   
 1            1274.0        1146.6    637.0                60   Down   Out   
 2            1274.0        1146.6    637.0                60   Down    In   
 3            1274.0        1146.6    637.0                60   Down    In   
 4            1274.0        1146.6    637.0                90   Down   Out   
 ...             ...           ...      ...               ...    ...   ...   
 4067275      5857.0        6442.7   8785.5               540     Up    In   
 4067276      5857.0        6442.7   8785.5               720     Up   Out   
 4067277      5857.0        6442.7   8785.5               720     Up   Out   
 4067278      5857.0        6442.7   8785.5               720     Up    In   
 4067279      5857.0        6442.7   8785.5               720     Up    In   
 
             w barrier_type_name  r

In [5]:
# List the top-level entries stored in the loaded model dictionary.
print('model attributes:\n')
for attribute_name in model:
    print(attribute_name)

model attributes:

seed
raw_data
dataset
target_name
excluded_features
numerical_features
categorical_features
feature_set
n_features
development_dates
test_dates
train_data
test_data
train_X
train_y
test_X
test_y
preprocessor
pipeline
model
model_fit
dnn_runtime
numerical_scaler
dnn_params
transformers
regressor
dnn_pipeline


In [6]:
# Show the sorted distinct values each feature takes in the test data.
for col in model['feature_set']:
    unique_values = (
        model['test_data'][col]
        .squeeze()
        .sort_values()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # Single quotes inside the f-string: reusing the enclosing double quote is
    # only valid syntax from Python 3.12 onwards.
    print(f"{col.replace('_', ' ')}:",f"\n{unique_values}\n")
print()

days to maturity: 
0     60
1     90
2    180
3    360
4    540
5    720
Name: days_to_maturity, dtype: int64

dividend rate: 
0       0.012624
1       0.012628
2       0.012641
3       0.012671
4       0.012687
          ...   
1152    0.023672
1153    0.023681
1154    0.023780
1155    0.023823
1156    0.024133
Name: dividend_rate, Length: 1157, dtype: float64

risk free rate: 
0      0.000330
1      0.000355
2      0.000380
3      0.000406
4      0.000431
         ...   
781    0.054494
782    0.054500
783    0.054513
784    0.054801
785    0.054889
Name: risk_free_rate, Length: 786, dtype: float64

theta: 
0       8.181674e-11
1       4.564269e-10
2       4.968155e-10
3       6.916618e-10
4       1.119760e-09
            ...     
1747    1.700398e+00
1748    1.732709e+00
1749    1.811241e+00
1750    1.836051e+00
1751    1.869788e+00
Name: theta, Length: 1752, dtype: float64

kappa: 
0       3.553361e-10
1       3.924365e-10
2       7.518568e-10
3       8.464905e-10
4       1.055398e

In [7]:
# Work on copies of the stored train/test frames: parse 'calculation_date'
# into datetimes, then index and sort each frame by it.
train_data, test_data = (
    model[key]
    .assign(
        calculation_date=lambda frame: pd.to_datetime(
            frame['calculation_date'], format='mixed'
        )
    )
    .set_index('calculation_date')
    .sort_index()
    for key in ('train_data', 'test_data')
)
print(model['feature_set'])

['days_to_maturity', 'dividend_rate', 'risk_free_rate', 'theta', 'kappa', 'rho', 'eta', 'v0', 'relative_spot', 'relative_barrier', 'relative_rebate', 'w', 'barrier_type_name']


# retraining

In [8]:
# Import the project's sklearn convenience wrapper and print its API
# (constructor defaults and available methods) for reference.
from convsklearn import convsklearn
help(convsklearn)

Help on class convsklearn in module convsklearn.convsklearn:

class convsklearn(builtins.object)
 |  convsklearn(target_name='observed_price', excluded_features=['barrier_price', 'asian_price', 'observed_price', 'outin', 'updown', 'n_fixings'], seed=1312)
 |
 |  a proprietary class of convenience wrappers for sklearn
 |
 |  Methods defined here:
 |
 |  __init__(self, target_name='observed_price', excluded_features=['barrier_price', 'asian_price', 'observed_price', 'outin', 'updown', 'n_fixings'], seed=1312)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |
 |  load_data(self, data)
 |
 |  preprocess_data(self, development_dates, test_dates, plot=True)
 |
 |  run_dnn(self, print_details=True)
 |
 |  test_prediction_accuracy(self)
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |
 |  __dict__
 |      dictionary for instance variables
 |
 |  __weakref__
 |      list of weak references to the object



In [9]:
test_dates = model['test_dates']
full_dataset = model['dataset']

# The option family is the text before the first underscore of the first
# '*asian_price*' / '*barrier_price*' column found in the dataset.
price_columns = [
    column
    for column in full_dataset.columns
    if 'asian_price' in column or 'barrier_price' in column
]
pricename = price_columns[0].partition('_')[0]

# Every distinct date in the dataset, in chronological order.
all_dates = (
    full_dataset['date']
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
all_dates

0      2012-01-03
1      2012-01-05
2      2012-01-06
3      2012-01-10
4      2012-01-18
          ...    
1409   2024-09-24
1410   2024-10-07
1411   2024-10-08
1412   2024-10-09
1413   2024-10-14
Name: date, Length: 1414, dtype: datetime64[ns]

In [None]:
# Retrain on an expanding window: every `retraining_frequency`-th test date the
# model is refitted on all dates up to and including that date, then evaluated
# on the next `retraining_frequency` dates.
retraining_frequency = 30
retraining_dates = test_dates.iloc[0::retraining_frequency]
retraining_dates = retraining_dates.reset_index(drop=True)

# One row of runtime / error statistics per retraining date, NaN until filled.
cols = ['cpu','insample_MAE','insample_RMSE','outofsample_MAE','outofsample_RMSE']
df = pd.DataFrame(np.tile(np.nan,(len(retraining_dates),len(cols))),columns=cols,index=retraining_dates)

# NOTE: this rebinds `models`, which an earlier cell uses for the listing of
# model directories; re-run that cell before selecting another model.
models = {}

retrainer_base = convsklearn()
# Build a new list instead of `+=`: the constructor's default for
# `excluded_features` is a list literal, so extending it in place would
# presumably grow the shared default on every run of this cell (confirm
# against convsklearn.__init__). Duplicates are dropped, order is preserved.
retrainer_base.excluded_features = list(
    dict.fromkeys([*retrainer_base.excluded_features, *model['excluded_features']])
)
retrainer_base.target_name = model['target_name']
retrainer_base.load_data(full_dataset)

bar = tqdm(retraining_dates,total=len(retraining_dates),leave=True)
for date in bar:
    print(date)
    development_dates = all_dates[all_dates<=date]
    subset_test_dates = all_dates[~all_dates.isin(development_dates)].iloc[:retraining_frequency]
    if subset_test_dates.empty:
        # Nothing left to evaluate on; min()/max() below would raise.
        print('no test dates after this retraining date, skipping')
        continue
    # Same object as retrainer_base (an alias, not a copy), so the training
    # sequence is identical to fitting repeatedly on one instance.
    retrainer = retrainer_base
    print(f"training:  {min(development_dates).strftime('%A, %d %B, %Y')}  -  {max(development_dates).strftime('%A, %d %B, %Y')} ({len(development_dates)} trading days)")
    print(f"testing:   {min(subset_test_dates).strftime('%A, %d %B, %Y')}  -  {max(subset_test_dates).strftime('%A, %d %B, %Y')} ({len(subset_test_dates)} trading days)")
    retrainer.preprocess_data(development_dates, subset_test_dates,plot=False)
    retrainer.run_dnn(print_details=True)
    retrainer.test_prediction_accuracy()
    # Snapshot the attributes: storing `retrainer.__dict__` itself would put
    # the very same dict under every date, so all entries would show the last
    # retraining. The snapshot is shallow.
    # NOTE(review): objects that convsklearn mutates in place rather than
    # rebinding would still be shared between snapshots; confirm.
    m = dict(retrainer.__dict__)
    models[date] = m
    df.at[date,'cpu'] = m['dnn_runtime']
    df.at[date,'insample_MAE'] = compute_MAE(m['train_data']['insample_error'])
    df.at[date,'insample_RMSE'] = compute_RMSE(m['train_data']['insample_error'])
    df.at[date,'outofsample_MAE'] = compute_MAE(m['test_data']['outofsample_error'])
    df.at[date,'outofsample_RMSE'] = compute_RMSE(m['test_data']['outofsample_error'])
    print(df.dropna())
    print('\n','%'*10,'\n')
bar.close()

  0%|                                                                                                                                                                                     | 0/44 [00:00<?, ?it/s]

2012-11-06 00:00:00
training:  Tuesday, 03 January, 2012  -  Tuesday, 06 November, 2012 (101 trading days)
testing:   Thursday, 08 November, 2012  -  Friday, 22 March, 2013 (30 trading days)

training on 268941 samples...

alpha: 0.01
hidden_layer_sizes: (13, 13)
learning_rate: adaptive
learning_rate_init: 0.1
solver: sgd
early_stopping: False
max_iter: 500
warm_start: True
tol: 0.0001
random_state: 1312
cpu: 56.302024126052856


  2%|███▉                                                                                                                                                                         | 1/44 [00:56<40:40, 56.76s/it]


in sample:
     RMSE: 0.0155463925008248
     MAE: 0.009653831727588923

out of sample:
     RMSE: 0.06803331708071107
     MAE: 0.023977838594117902
                  cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                  
2012-11-06  56.302024      0.009654       0.015546         0.023978   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  

 %%%%%%%%%% 

2013-03-22 00:00:00
training:  Tuesday, 03 January, 2012  -  Friday, 22 March, 2013 (131 trading days)
testing:   Tuesday, 26 March, 2013  -  Monday, 10 June, 2013 (30 trading days)

training on 358041 samples...

alpha: 0.01
hidden_layer_sizes: (13, 13)
learning_rate: adaptive
learning_rate_init: 0.1
solver: sgd
early_stopping: False
max_iter: 500
warm_start: True
tol: 0.0001
random_state: 1312
cpu: 82.06557607650757


  5%|███████▊                                                                                                                                                                     | 2/44 [02:19<50:21, 71.94s/it]


in sample:
     RMSE: 0.014768462310316616
     MAE: 0.009204596553839345

out of sample:
     RMSE: 0.09131353550084434
     MAE: 0.028433728421179236
                  cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                  
2012-11-06  56.302024      0.009654       0.015546         0.023978   
2013-03-22  82.065576      0.009205       0.014768         0.028434   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  
2013-03-22          0.091314  

 %%%%%%%%%% 

2013-06-10 00:00:00
training:  Tuesday, 03 January, 2012  -  Monday, 10 June, 2013 (161 trading days)
testing:   Wednesday, 12 June, 2013  -  Thursday, 29 August, 2013 (30 trading days)

training on 433101 samples...

alpha: 0.01
hidden_layer_sizes: (13, 13)
learning_rate: adaptive
learning_rate_init: 0.1
solver: sgd
early_stopping: False
max_iter: 500
warm_start: True
tol: 0.0001
random_state: 1312
cpu: 118.78674817085

  7%|███████████▋                                                                                                                                                               | 3/44 [04:18<1:03:58, 93.62s/it]


in sample:
     RMSE: 0.013174333704412315
     MAE: 0.008066376456901878

out of sample:
     RMSE: 0.05829185103492043
     MAE: 0.016714210932100387
                   cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                   
2012-11-06   56.302024      0.009654       0.015546         0.023978   
2013-03-22   82.065576      0.009205       0.014768         0.028434   
2013-06-10  118.786748      0.008066       0.013174         0.016714   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  
2013-03-22          0.091314  
2013-06-10          0.058292  

 %%%%%%%%%% 

2013-08-29 00:00:00
training:  Tuesday, 03 January, 2012  -  Thursday, 29 August, 2013 (191 trading days)
testing:   Monday, 09 September, 2013  -  Monday, 18 November, 2013 (30 trading days)

training on 521661 samples...

alpha: 0.01
hidden_layer_sizes: (13, 13)
learning_rate: adaptive
learning_rate_init: 0.1
sol

  9%|███████████████▍                                                                                                                                                          | 4/44 [06:31<1:12:48, 109.21s/it]


in sample:
     RMSE: 0.012106834760522402
     MAE: 0.007388693728062532

out of sample:
     RMSE: 0.022906469874282874
     MAE: 0.008700240083612785
                   cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                   
2012-11-06   56.302024      0.009654       0.015546         0.023978   
2013-03-22   82.065576      0.009205       0.014768         0.028434   
2013-06-10  118.786748      0.008066       0.013174         0.016714   
2013-08-29  132.374332      0.007389       0.012107         0.008700   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  
2013-03-22          0.091314  
2013-06-10          0.058292  
2013-08-29          0.022906  

 %%%%%%%%%% 

2013-11-18 00:00:00
training:  Tuesday, 03 January, 2012  -  Monday, 18 November, 2013 (221 trading days)
testing:   Thursday, 21 November, 2013  -  Friday, 14 February, 2014 (30 trading days)

training on 598827

 11%|███████████████████▎                                                                                                                                                      | 5/44 [09:18<1:24:30, 130.01s/it]


in sample:
     RMSE: 0.013716460455963343
     MAE: 0.008465828106514873

out of sample:
     RMSE: 0.07790404167734967
     MAE: 0.02449723932707823
                   cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                   
2012-11-06   56.302024      0.009654       0.015546         0.023978   
2013-03-22   82.065576      0.009205       0.014768         0.028434   
2013-06-10  118.786748      0.008066       0.013174         0.016714   
2013-08-29  132.374332      0.007389       0.012107         0.008700   
2013-11-18  166.158169      0.008466       0.013716         0.024497   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  
2013-03-22          0.091314  
2013-06-10          0.058292  
2013-08-29          0.022906  
2013-11-18          0.077904  

 %%%%%%%%%% 

2014-02-14 00:00:00
training:  Tuesday, 03 January, 2012  -  Friday, 14 February, 2014 (251 trading days)
testi

 14%|███████████████████████▏                                                                                                                                                  | 6/44 [11:32<1:23:08, 131.28s/it]


in sample:
     RMSE: 0.01624564549649192
     MAE: 0.01021841370026935

out of sample:
     RMSE: 0.04527912671545104
     MAE: 0.014823417884159442
                   cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                   
2012-11-06   56.302024      0.009654       0.015546         0.023978   
2013-03-22   82.065576      0.009205       0.014768         0.028434   
2013-06-10  118.786748      0.008066       0.013174         0.016714   
2013-08-29  132.374332      0.007389       0.012107         0.008700   
2013-11-18  166.158169      0.008466       0.013716         0.024497   
2014-02-14  132.935501      0.010218       0.016246         0.014823   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  
2013-03-22          0.091314  
2013-06-10          0.058292  
2013-08-29          0.022906  
2013-11-18          0.077904  
2014-02-14          0.045279  

 %%%%%%%%%% 

2014-06-0

 16%|███████████████████████████                                                                                                                                               | 7/44 [14:52<1:34:50, 153.79s/it]


in sample:
     RMSE: 0.0138488427500832
     MAE: 0.008266915134387601

out of sample:
     RMSE: 0.03561305633042726
     MAE: 0.010750398986350053
                   cpu  insample_MAE  insample_RMSE  outofsample_MAE  \
date                                                                   
2012-11-06   56.302024      0.009654       0.015546         0.023978   
2013-03-22   82.065576      0.009205       0.014768         0.028434   
2013-06-10  118.786748      0.008066       0.013174         0.016714   
2013-08-29  132.374332      0.007389       0.012107         0.008700   
2013-11-18  166.158169      0.008466       0.013716         0.024497   
2014-02-14  132.935501      0.010218       0.016246         0.014823   
2014-06-04  199.190350      0.008267       0.013849         0.010750   

            outofsample_RMSE  
date                          
2012-11-06          0.068033  
2013-03-22          0.091314  
2013-06-10          0.058292  
2013-08-29          0.022906  
2013-11-18    

In [None]:
# Pass the per-retraining-date runtime/error table to the project's plotting
# helper. NOTE(review): presumably draws one panel per column of `df`;
# confirm against plotters.PlotCols.
from plotters import PlotCols
PlotCols(df)

In [None]:
# Bundle the error table with the per-date model attributes and write them to
# a time-tagged pickle in the current working directory.
tag = ms.timetag()
retrained = dict(errors=df, models=models)
output_name = f'{tag} retrained_relative_{pricename}_options.pkl'
joblib.dump(retrained, output_name)