In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# functions

In [2]:
def compute_RMSE(diff):
    if len(diff)>0:
        return np.sqrt(np.mean(diff.values**2))
        
def compute_MAE(diff):
    if len(diff)>0:
        return np.mean(np.abs(diff.values))

def plot_errors(plotcols, test_data, train_data):
    test_diff = test_data['outofsample_error']
    train_diff = train_data['insample_error']
    
    test_data['RMSE'] = test_diff.resample('D').apply(compute_RMSE).dropna()
    test_data['MAE'] = test_diff.resample('D').apply(compute_MAE).dropna()
    test_plot = test_data[plotcols].copy().drop_duplicates()
    
    train_data['RMSE'] = train_diff.resample('D').apply(compute_RMSE).dropna()
    train_data['MAE'] = train_diff.resample('D').apply(compute_MAE).dropna()
    train_plot = train_data[plotcols].copy().drop_duplicates()
    
    trainx = pd.date_range(start=min(train_data.index),end=max(train_data.index),periods=train_plot.shape[0])
    testx = pd.date_range(start=min(test_data.index),end=max(test_data.index),periods=test_plot.shape[0])
    
    fig,axs = plt.subplots(len(plotcols),figsize=(10,10),sharex=True)
    for i,col in enumerate(plotcols):
        axs[i].plot(trainx,train_plot[col],color='green',label='in-sample')
        axs[i].set_title(col.replace('_',' '))
        axs[i].legend()
    for i,col in enumerate(plotcols):
        axs[i].plot(testx,test_plot[col],color='purple',label='out-of-sample')
        axs[i].set_title(col.replace('_',' '))
        axs[i].legend()
    plt.show()

def retrain(train_data,test_data,retraining_frequency):
    retraining_i = np.arange(retraining_frequency,len(test_dates),retraining_frequency)
    retraining_dates = test_dates[retraining_i].reset_index(drop=True)
    print(f"retraining dates:\n{retraining_dates}")
    for i,date in enumerate(retraining_dates):
        if i<len(retraining_dates)-1:
            print()
            print(date.strftime('%c'))
            new_train = pd.concat([train_data,test_data[test_data.index<=date]],ignore_index=False).copy().dropna(how='any',axis=1).reset_index(drop=False)
            new_test = test_data[
                (test_data.index>date)
                # &(test_data.index<=retraining_dates[i+1])
            ].reset_index(drop=False)
            arrs = trainer.get_train_test_arrays(new_train, new_test)
            train_X = arrs['train_X']
            train_y = arrs['train_y']
            test_X = arrs['test_X']
            test_y = arrs['test_y']
            preprocessor = trainer.preprocess()
            retrained_model = trainer.run_dnn(preprocessor,train_X,train_y)
            train_test = trainer.test_prediction_accuracy(new_train,new_test,retrained_model)
            new_test_data = train_test['test_data'].set_index('calculation_date')
            new_train_data = train_test['train_data'].set_index('calculation_date')
            print()
            plot_errors(plotcols,new_test_data,new_train_data)
    return retrained_model

# loading model

In [3]:
from model_settings import ms
root = Path().resolve().parent.parent
models_dir = os.path.join(root,ms.trained_models)
models = [f for f in os.listdir(models_dir) if f.find('.')==-1]
for i,m in enumerate(models):
    print(f"{i}     {m}")

0     2024_11_01 180435470848 short-term cboe asians
1     2024_11_01 195924619203 live cboe barriers
2     2024_11_01 204910611379 bloomberg barriers
3     2024_11_01 205356527985 bloomberg asians
4     2024_11_02 113141414136 cboe asians
5     2024_11_02 113706914873 cboe barriers
6     2024_11_03 145046372112 cboe asians
7     2024_11_05 184958719691 Oosterlee test


# loading data

In [5]:
"""
select model here
"""
model = models[-1]
""""""

plotcols = ['v0','RMSE', 'MAE','spot_price']

from convsklearn import asian_trainer, barrier_trainer

model_dir = os.path.join(models_dir,model)
model_files = [f for f in os.listdir(model_dir) if f.find('ipynb')==-1 and f.find('.html')==-1]
for i,m in enumerate(model_files):
    print(f"{i}     {m}")
print()
if any('asian' in file for file in model_files):
    trainer = asian_trainer
if any('barrier' in file for file in model_files):
    trainer = barrier_trainer

train_data = pd.read_csv(os.path.join(model_dir,[f for f in model_files if f.find('train')!=-1][0])).iloc[:,1:].copy()
test_data = pd.read_csv(os.path.join(model_dir,[f for f in model_files if f. find('test')!=-1][0])).iloc[:,1:].copy()
train_data['calculation_date'] = pd.to_datetime(train_data['calculation_date'],format='mixed').dt.normalize()
test_data['calculation_date'] = pd.to_datetime(test_data['calculation_date'],format='mixed').dt.normalize()
train_data = train_data.set_index('calculation_date')
test_data = test_data.set_index('calculation_date')
test_dates = pd.Series(test_data.index).sort_values(ascending=True).drop_duplicates().reset_index(drop=True)
model_fit = joblib.load(os.path.join(model_dir,[f for f in model_files if f.endswith('.pkl')][0]))

for col in trainer.feature_set:
    print(f"{col.replace("_"," "):}",f"\n{test_data[col].copy().squeeze().sort_values().drop_duplicates().reset_index(drop=True)}\n")
print()
print(model_fit)

plot_errors(plotcols, test_data, train_data)
test_data

0     2024_11_05 184958719691 Oosterlee test test_data.csv
1     2024_11_05 184958719691 Oosterlee test train_data.csv
2     2024_11_05 184958719691 Oosterlee test.pkl



NameError: name 'trainer' is not defined

In [7]:
test_data.columns

Index(['spot_price', 'strike_price', 'barrier', 'days_to_maturity', 'updown',
       'outin', 'w', 'barrier_type_name', 'rebate', 'dividend_rate',
       'risk_free_rate', 'theta', 'kappa', 'rho', 'eta', 'v0', 'date',
       'barrier_price', 'observed_price', 'moneyness', 'relative_moneyness',
       'relative_barrier', 'relative_price'],
      dtype='object')

# retraining

In [None]:
retrained_model = retrain(train_data=train_data,test_data=test_data,retraining_frequency=30)