In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
root = Path().resolve().parent.parent
data_dir = os.path.join(root,ms.cboe_spx_barriers['dump'])
files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
dates = pd.Series([d[:d.find('_',0)]for d in files]).drop_duplicates().reset_index(drop=True)
dates

0      2022-04-18
1      2022-04-19
2      2022-04-20
3      2022-04-21
4      2022-04-22
          ...    
623    2024-10-09
624    2024-10-10
625    2024-10-11
626    2024-10-14
627    2024-10-15
Length: 628, dtype: object

In [2]:
files = [f for f in files if f.find('2022-04-20')!=-1]
files = [os.path.join(data_dir,f) for f in files]
dataset = pd.concat([pd.read_csv(f) for f in files],ignore_index=True).iloc[:,1:]

In [3]:
price_name = 'barrier_price' 
dataset = dataset[dataset[price_name]<dataset['spot_price']].dropna().reset_index(drop=True).copy()

In [4]:
dataset['observed_price'] = np.maximum(dataset[price_name] + np.random.normal(scale=(0.15)**2,size=dataset.shape[0]),0)
dataset['moneyness'] = ms.df_moneyness(dataset)
dataset['relative_moneyness'] = dataset['moneyness']/dataset['spot_price']
dataset['relative_barrier'] = dataset['barrier']/dataset['spot_price']
dataset['relative_price'] = dataset['observed_price']/dataset['spot_price']
dataset = dataset[(dataset['w']=='put')&(dataset['barrier_type_name']=='DownOut')].reset_index(drop=True)

In [5]:
dates = pd.Series(np.sort(dataset['date'].unique()))
development_dates = dates[:5]
test_dates = dates[~dates.isin(development_dates)]
train_data = dataset[dataset['date'].isin(development_dates)]
test_data = dataset[dataset['date'].isin(development_dates)]
train_data

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,...,eta,v0,calculation_date,date,barrier_price,observed_price,moneyness,relative_moneyness,relative_barrier,relative_price
0,4482.0,4033.80,2241.000,60,Down,Out,put,DownOut,0.0,0.0,...,2.137911,0.035035,2022-04-20 14:01:48.274,2022-04-21,31.484624,31.507318,-448.20,-0.1,0.50,0.007030
1,4482.0,4033.80,2241.000,90,Down,Out,put,DownOut,0.0,0.0,...,2.137911,0.035035,2022-04-20 14:01:48.274,2022-04-21,41.709863,41.697809,-448.20,-0.1,0.50,0.009303
2,4482.0,4033.80,2241.000,180,Down,Out,put,DownOut,0.0,0.0,...,2.137911,0.035035,2022-04-20 14:01:48.274,2022-04-21,60.414210,60.418080,-448.20,-0.1,0.50,0.013480
3,4482.0,4033.80,2241.000,360,Down,Out,put,DownOut,0.0,0.0,...,2.137911,0.035035,2022-04-20 14:01:48.274,2022-04-21,78.052508,78.088728,-448.20,-0.1,0.50,0.017423
4,4482.0,4033.80,2241.000,540,Down,Out,put,DownOut,0.0,0.0,...,2.137911,0.035035,2022-04-20 14:01:48.274,2022-04-21,82.619711,82.609782,-448.20,-0.1,0.50,0.018431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5395,4470.5,4917.55,4425.795,90,Down,Out,put,DownOut,0.0,0.0,...,1.781419,0.035274,2022-04-20 16:17:40.131,2022-04-21,30.476177,30.475804,447.05,0.1,0.99,0.006817
5396,4470.5,4917.55,4425.795,180,Down,Out,put,DownOut,0.0,0.0,...,1.781419,0.035274,2022-04-20 16:17:40.131,2022-04-21,7.538869,7.514594,447.05,0.1,0.99,0.001681
5397,4470.5,4917.55,4425.795,360,Down,Out,put,DownOut,0.0,0.0,...,1.781419,0.035274,2022-04-20 16:17:40.131,2022-04-21,1.055007,1.098168,447.05,0.1,0.99,0.000246
5398,4470.5,4917.55,4425.795,540,Down,Out,put,DownOut,0.0,0.0,...,1.781419,0.035274,2022-04-20 16:17:40.131,2022-04-21,0.376165,0.377944,447.05,0.1,0.99,0.000085


In [6]:
def make_dnn_pipeline(feature_set,numerical_scaler):
    preprocessor = ColumnTransformer([('scaling',numerical_scaler,feature_set)])
    dnn_pipeline = make_pipeline(preprocessor,MLPRegressor(max_iter=1000,random_state=1312))
    return TransformedTargetRegressor(
        regressor=dnn_pipeline,
        transformer= Pipeline([("StandardScaler", StandardScaler())])
    )

In [7]:
feature_set = [
    'relative_moneyness',
    'days_to_maturity',
    'risk_free_rate',
    'dividend_rate',
    'kappa',
    'theta',
    'rho',
    'eta',
    'v0',
    'relative_barrier',
    # 'barrier_type_name',
    # 'w'
]


In [8]:
train_X = train_data[feature_set]
train_y = train_data['relative_price']

dnn = make_dnn_pipeline(feature_set,numerical_scaler)

model_fit = dnn.fit(train_X,train_y)
train_pred = model_fit.predict(train_X)*train_data['spot_price']
print('MAE:',np.mean(np.abs(train_pred-train_data['spot_price'])))

NameError: name 'numerical_scaler' is not defined

In [None]:
feature_set = [
    'spot_price',
    'strike_price',
    'days_to_maturity',
    'risk_free_rate',
    'dividend_rate',
    'kappa',
    'theta',
    'rho',
    'eta',
    'v0',
    'barrier',
    # 'barrier_type_name',
    # 'w'
]

In [None]:
train_X = train_data[feature_set]
train_y = train_data['observed_price']

dnn = make_dnn_pipeline(feature_set,numerical_scaler)

model_fit = dnn.fit(train_X,train_y)
train_pred = model_fit.predict(train_X)
print('MAE:',np.mean(np.abs(train_pred-train_data['spot_price'])))

In [None]:
from convsklearn import barrier_trainer
barrier_trainer.feature_set = feature_set
preprocessor = ColumnTransformer([('scaling',numerical_scaler,feature_set)])
old_dnn = barrier_trainer.run_dnn(preprocessor,train_X,train_y)
train_pred = old_dnn.predict(train_X)
print('MAE:',np.mean(np.abs(train_pred-train_data['spot_price'])))