In [1]:
import os
import joblib

import pandas as pd
import seaborn as sns

from pathlib import Path
from tqdm import tqdm
from model_settings import ms
from plotters import PlotCols
import matplotlib.pyplot as plt

# Path() is the relative path '.', i.e. the current working directory.
# NOTE(review): find_root presumably sets ms.root, which is read when the
# trained-models directory is built further down — confirm in model_settings.
ms.find_root(Path())

# functions

In [2]:
import matplotlib.pyplot as plt
import numpy as np

def compute_RMSE(diff):
    """Return the root-mean-square of the values in ``diff``.

    ``diff`` must support ``len()`` and expose ``.values`` (for example a
    pandas Series of errors). Empty input yields ``None``.
    """
    if len(diff) == 0:
        return None
    squared = np.square(diff.values)
    return np.sqrt(np.mean(squared))
        
def compute_MAE(diff):
    """Return the mean absolute value of the entries in ``diff``.

    ``diff`` must support ``len()`` and expose ``.values`` (for example a
    pandas Series of errors). Empty input yields ``None``.
    """
    if len(diff) == 0:
        return None
    magnitudes = np.absolute(diff.values)
    return np.mean(magnitudes)

def CompareDists(X,Y):
    """Overlay density-normalised histograms of two samples in a new figure.

    ``X`` is drawn as filled green bars and ``Y`` as a purple step outline.
    Both use the same bin count, floor(sqrt(min(len(X), len(Y)))), floored
    at 1 so that very small samples still get a valid histogram.

    Legend labels are taken from each input's ``name`` attribute with
    underscores replaced by spaces; inputs without a string name get an
    empty label.
    """
    def _legend_label(sample):
        # Only string names can be prettified; anything else falls back to ''.
        name = getattr(sample, 'name', None)
        return name.replace('_', ' ') if isinstance(name, str) else ''

    # The original line ended in a dangling '+', which is a syntax error.
    bins = max(1, int(np.sqrt(min(len(X), len(Y)))))
    xlabel = _legend_label(X)
    ylabel = _legend_label(Y)

    plt.figure()
    plt.hist(X,bins=bins,label=xlabel,density=True,color='green')
    plt.hist(Y,bins=bins,label=ylabel,density=True,color='purple',histtype='step')
    plt.legend()
    plt.show()

# loading model

In [3]:
# Trained-model folders are listed from <ms.root>/<ms.trained_models>.
root = os.path.join(ms.root)
models_dir = os.path.join(root,ms.trained_models)

# Skip hidden entries and any folder whose name contains 'Legacy'.
# str.find() returns -1 (truthy) when the text is absent and 0 (falsy) only
# when the name starts with it, so it cannot be used as a containment test.
# Sorting keeps the printed indices stable, since os.listdir order is arbitrary.
models = pd.Series(sorted(
    f for f in os.listdir(models_dir)
    if not f.startswith('.') and 'Legacy' not in f
))
for i,m in enumerate(models):
    print(f"{i}     {m}")

# The chosen position refers to the listing printed above.
i = int(input('select model: '))
selected_model = models.iloc[i]

0     2024-12-03 115353819604 inital cboe spx relative asian
1     2024-12-03 221708375538 inital cboe spx relative barrier


select model:  0


In [4]:
# Resolve the selected model's folder and take the first '.pkl' file found in it.
model_dir = os.path.join(models_dir, selected_model)
pickle = [entry for entry in os.listdir(model_dir) if entry.endswith('.pkl')][0]
picke_dir = os.path.join(model_dir, pickle)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts from a trusted source.
model = joblib.load(picke_dir)

# The last space-separated word of the folder name, suffixed with '_price',
# names the price column used later in the notebook.
pricename = f"{selected_model.rsplit(' ', 1)[-1]}_price"

# Keep a direct handle on the object stored under the 'model' key.
initial = model['model']
model

{'seed': 1312,
 'raw_data':         spot_price  strike_price  days_to_maturity  n_fixings  \
 0           1274.0        637.00                 7        1.0   
 1           1274.0        637.00                 7        1.0   
 2           1274.0        637.00                 7        1.0   
 3           1274.0        637.00                 7        1.0   
 4           1274.0        955.50                 7        1.0   
 ...            ...           ...               ...        ...   
 338935      5857.0       7320.75                84        1.0   
 338936      5857.0       8785.00                84        1.0   
 338937      5857.0       8785.00                84        1.0   
 338938      5857.0       8785.00                84        1.0   
 338939      5857.0       8785.00                84        1.0   
 
         fixing_frequency  past_fixings averaging_type     w  risk_free_rate  \
 0                      7             0      geometric  call        0.001578   
 1                 

In [5]:
# Show the price column name derived from the selected model's folder name.
pricename

'asian_price'

In [6]:
# Display the object stored under the 'model' key (the same object bound to `initial`).
model['model']

In [7]:
# Bind the test split stored in the model dictionary and add the squared
# out-of-sample error as a new column.
# NOTE(review): no .copy() is taken here, so the column assignment writes
# into the very frame held by model['test_data'].
test_data = model['test_data']
test_data['sqerr'] = test_data['outofsample_error']**2

In [None]:
# List the top-level keys of the loaded model dictionary, one per line.
print('model attributes:\n')
for attribute in model:
    print(attribute)

In [None]:
# For every model feature, print its distinct test-set values in ascending order.
for col in model['feature_set']:
    unique_values = (
        model['test_data'][col]
        .squeeze()
        .sort_values()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # Single quotes inside the double-quoted f-string: reusing the outer quote
    # character is only accepted from Python 3.12 onwards.
    print(f"{col.replace('_', ' ')}:", f"\n{unique_values}\n")
print()

In [None]:
# Build working copies of both splits, indexed by the parsed calculation date
# and sorted chronologically; the frames stored in `model` are not modified.
train_data = (
    model['train_data']
    .assign(calculation_date=lambda frame: pd.to_datetime(frame['calculation_date'], format='mixed'))
    .set_index('calculation_date')
    .sort_index()
)
test_data = (
    model['test_data']
    .assign(calculation_date=lambda frame: pd.to_datetime(frame['calculation_date'], format='mixed'))
    .set_index('calculation_date')
    .sort_index()
)

# Show which columns the model consumes, then the numerical subset.
print(model['feature_set'])
model['numerical_features']

In [None]:
# Stack the train and test splits into one frame.
full_dataset = pd.concat((train_data, test_data))

# Distinct values of the test split's 'date' column, in order of first appearance.
test_dates = test_data['date'].drop_duplicates(keep='first').reset_index(drop=True)

# Distinct 'date' values across both splits, sorted and then parsed to timestamps.
all_dates = pd.to_datetime(
    full_dataset['date'].drop_duplicates().sort_values().reset_index(drop=True),
    format='mixed',
)

# Number of complete windows of `retraining_frequency` test dates.
retraining_frequency = 20
n = len(test_dates) // retraining_frequency
n

In [None]:
# Display the concatenated train + test frame.
full_dataset

In [None]:
# Out-of-sample error of the initial model, evaluated on consecutive windows
# of `retraining_frequency` test dates. Each window contributes one row,
# keyed by the first date of the window.
cols = ['outofsample_MAE','outofsample_RMSE']
models = {}

# Parse the 'date' column once so that window membership compares timestamps
# with timestamps, whatever the stored representation is.
test_date_keys = pd.to_datetime(test_data['date'], format='mixed')

records = {}
for i in range(0,n):
    # 'fixed' is not a valid to_datetime format; 'mixed' matches the parsing
    # used for every other date column in this notebook.
    subset_test_dates = pd.to_datetime(
        model['test_dates'][(i*retraining_frequency):(i+1)*retraining_frequency],
        format='mixed'
    )
    subset_test = test_data[test_date_keys.isin(subset_test_dates)]
    if subset_test.empty:
        # No rows fall in this window: skip it rather than call predict on zero rows.
        continue

    target = subset_test['relative_observed']
    prediction = initial.predict(subset_test[model['feature_set']])

    error = prediction-target

    # The prediction is multiplied by the strike to obtain a price, and it is
    # that price (not the raw prediction) that is compared with the reference
    # price column.
    predicted_price = prediction*subset_test['strike_price']
    pricing_error = predicted_price-subset_test[pricename]

    date = subset_test_dates.iloc[0]
    row = {
        'outofsample_MAE': compute_MAE(error),
        'outofsample_RMSE': compute_RMSE(error),
        # 'price_MAE': compute_MAE(pricing_error),
        # 'price_RMSE': compute_RMSE(pricing_error),
        'avgsqrtv0': np.mean(np.sqrt(subset_test['v0'])),
    }
    for col in [
        'rho','theta',
        'spot_price'
    ]:
        row[f"avg_{col}"] = np.mean(subset_test[col])
    records[date] = row

# Build the summary frame in one go instead of growing it cell by cell.
df = pd.DataFrame.from_dict(records, orient='index')
df.index = pd.to_datetime(df.index)
df

In [None]:
# Plot the per-window summary frame `df` on a 10x10 figure.
# NOTE(review): PlotCols comes from the local `plotters` module; presumably it
# draws one panel per column — confirm there.
PlotCols(df,figsize=(10,10))

In [None]:
# Fresh copy of the stored test split with the square root of v0 added as a column.
test_data = model['test_data'].assign(sqrtv0=lambda frame: np.sqrt(frame['v0']))
sqrtv0 = test_data['sqrtv0']

# describe() yields count, mean, std, min, 25%, 50%, 75%, max; dropping the
# first three entries keeps min through max.
quants = sqrtv0.describe().iloc[3:]
quants

In [None]:
# Summary statistics for v0, its square root and the relative spot.
test_data.loc[:, ['v0', 'sqrtv0', 'relative_spot']].describe()

In [None]:
# Print the dtype of every column in test_data as plain text.
print(test_data.dtypes)

In [None]:
# Add two derived columns:
#   observed_price = relative_observed * strike_price
#   relative_error = observed_price / <pricename column> - 1
test_data = test_data.assign(
    observed_price=lambda frame: frame['relative_observed'] * frame['strike_price'],
    relative_error=lambda frame: frame['observed_price'] / frame[pricename] - 1,
)

# visual inspection

In [None]:
# Overlay the density curve of observed_price on the density histogram of the
# price column, both drawn on the same axes.
dist_ax = plt.gca()
sns.kdeplot(data=test_data, x='observed_price', label='Estimated', color='purple', ax=dist_ax)
sns.histplot(data=test_data, x=pricename, label='Target', color='green', stat='density', alpha=0.5, ax=dist_ax)
dist_ax.legend()
plt.show()

In [None]:
# Same comparison as the full-sample plot, restricted to rows whose
# relative_observed lies strictly between 0.05 and 0.5.
train_zoom = test_data[
    test_data['relative_observed'].between(0.05, 0.5, inclusive='neither')
]
zoom_ax = plt.gca()
sns.kdeplot(data=train_zoom, x='observed_price', label='Estimated', color='purple', ax=zoom_ax)
sns.histplot(data=train_zoom, x=pricename, label='Target', color='green', stat='density', alpha=0.5, ax=zoom_ax)
zoom_ax.legend()
plt.show()

# feature importance

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every model feature on a copy of the training
# split, scored by negative mean squared error over 30 shuffles per feature.
train = model['train_data'].copy()

r = permutation_importance(
    initial,
    train[model['feature_set']],
    train[model['target_name']],
    scoring='neg_mean_squared_error',
    n_repeats=30,
    random_state=1312,
)

# One column per feature, one row per repeat.
importances = pd.DataFrame(r['importances'].T, columns=model['feature_set'])
importances_mean, importances_std = (
    pd.Series(r[key], index=model['feature_set'])
    for key in ('importances_mean', 'importances_std')
)
importances

In [None]:
# Title-cased text before the first underscore of pricename, followed by ' Options'.
security_tag = f"{pricename[:pricename.find('_')].title()} Options"

In [None]:
import plotly.express as px

# Notched box plot of the per-repeat permutation importances, one box per feature.
fig = px.box(
    importances[model['feature_set']],
    title=f'Feature Importance for {security_tag}',
    notched=True,
    width=1200,
    height=1000,
    facet_row_spacing=0,
    facet_col_spacing=0,
)
fig.update_xaxes(title='Feature')
fig.update_yaxes(title='')

# partial dependence

In [None]:
import matplotlib.pyplot as plt
from time import time
from sklearn.inspection import PartialDependenceDisplay

# Settings shared by every partial-dependence plot in this notebook.
common_params = dict(
    subsample=50,
    n_jobs=2,
    grid_resolution=20,
    random_state=0,
)

# Six features laid out on a 2 x 3 grid of axes, average dependence only.
heston_and_price = ['kappa','relative_spot','theta','rho','eta','v0']
print("Computing partial dependence plots...")
features_info = dict(features=heston_and_price, kind="average")

tic = time()
_, ax = plt.subplots(nrows=2, ncols=3, figsize=(9, 8), constrained_layout=True)
display = PartialDependenceDisplay.from_estimator(
    initial,
    model['train_X'],
    ax=ax,
    **features_info,
    **common_params,
)
print(f"done in {time() - tic:.3f}s")

_ = display.figure_.suptitle(
    f"Partial dependence of {pricename.replace('_',' option ')}s",
    fontsize=16,
)

In [None]:
# Average partial dependence for every numerical feature, drawn inside a
# single 9 x 9 figure and reusing the shared plot settings.
PDPfeatures = list(model['numerical_features'])
features_info = dict(
    features=PDPfeatures,
    kind="average",
    categorical_features=model['categorical_features'],
)

tic = time()
_, ax = plt.subplots(figsize=(9, 9), constrained_layout=True)
display = PartialDependenceDisplay.from_estimator(
    initial,
    model['train_X'],
    ax=ax,
    **features_info,
    **common_params,
)
print(f"done in {time() - tic:.3f}s")

_ = display.figure_.suptitle(
    f"Partial dependence of {pricename.replace('_',' option ')}s",
    fontsize=16,
)

In [None]:
# Pairwise relationship grid for the kappa, theta, rho, eta and v0 columns of test_data.
sns.pairplot(test_data[['kappa','theta','rho','eta','v0']])