In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Thu Oct 24 14:08:33 2024



# Loading data

In [2]:
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.cboe_spx_barrier_dump)
files = [f for f in os.listdir(datadir) if f.endswith('.csv')]
files = [os.path.join(datadir,f) for f in files]
dfs = []
bar = tqdm(total=len(files))
for f in files:
    dfs.append(pd.read_csv(f).iloc[:,1:])
    bar.update(1)
bar.close()
dataset = pd.concat(dfs,ignore_index=True).dropna().reset_index(drop=True)
dataset

100%|████████████████████████████████████████| 755/755 [00:01<00:00, 439.45it/s]


Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price
0,3826.79000,3444.11100,1913.39500,60,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,443.17479
1,3826.79000,3444.11100,1913.39500,60,Down,Out,put,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,35.97999
2,3826.79000,3444.11100,1913.39500,60,Down,In,call,DownIn,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,0.23768
3,3826.79000,3444.11100,1913.39500,60,Down,In,put,DownIn,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,1.92670
4,3826.79000,3444.11100,1913.39500,90,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,474.83357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616890,4142.17500,4556.39250,6213.26250,540,Up,In,put,UpIn,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,8.15521
1616891,4142.17500,4556.39250,6213.26250,720,Up,Out,call,UpOut,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,242.52494
1616892,4142.17500,4556.39250,6213.26250,720,Up,Out,put,UpOut,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,460.70698
1616893,4142.17500,4556.39250,6213.26250,720,Up,In,call,UpIn,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,161.92059


In [3]:
from model_settings import vanilla_pricer
vanillas = vanilla_pricer()
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')
dataset.dtypes


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



spot_price                  float64
strike_price                float64
barrier                     float64
days_to_maturity              int64
updown                       object
outin                        object
w                            object
barrier_type_name            object
rebate                      float64
dividend_rate               float64
risk_free_rate              float64
theta                       float64
kappa                       float64
rho                         float64
eta                         float64
v0                          float64
calculation_date     datetime64[ns]
barrier_price               float64
dtype: object

# Preprocessing

In [4]:
from convsklearn import barrier_trainer
price = 'barrier_price'
trainer = barrier_trainer
trainer.activation_function = 'relu'
trainer.solver = 'sgd'
dataset[price] = pd.to_numeric(dataset[price],errors='coerce')
dataset['observed_price'] = np.maximum(dataset[price] + np.random.normal(scale=(0.15)**2,size=dataset.shape[0]),0)
dataset

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
0,3826.79000,3444.11100,1913.39500,60,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,443.17479,443.17868
1,3826.79000,3444.11100,1913.39500,60,Down,Out,put,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,35.97999,35.95563
2,3826.79000,3444.11100,1913.39500,60,Down,In,call,DownIn,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,0.23768,0.21598
3,3826.79000,3444.11100,1913.39500,60,Down,In,put,DownIn,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,1.92670,1.92848
4,3826.79000,3444.11100,1913.39500,90,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,474.83357,474.82356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616890,4142.17500,4556.39250,6213.26250,540,Up,In,put,UpIn,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,8.15521,8.12957
1616891,4142.17500,4556.39250,6213.26250,720,Up,Out,call,UpOut,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,242.52494,242.52468
1616892,4142.17500,4556.39250,6213.26250,720,Up,Out,put,UpOut,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,460.70698,460.65242
1616893,4142.17500,4556.39250,6213.26250,720,Up,In,call,UpIn,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,161.92059,161.91646


In [5]:
dataset.dropna()

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
0,3826.79000,3444.11100,1913.39500,60,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,443.17479,443.17868
1,3826.79000,3444.11100,1913.39500,60,Down,Out,put,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,35.97999,35.95563
2,3826.79000,3444.11100,1913.39500,60,Down,In,call,DownIn,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,0.23768,0.21598
3,3826.79000,3444.11100,1913.39500,60,Down,In,put,DownIn,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,1.92670,1.92848
4,3826.79000,3444.11100,1913.39500,90,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.09671,3.94232,-0.62153,1.60064,0.04595,2022-07-18,474.83357,474.82356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616890,4142.17500,4556.39250,6213.26250,540,Up,In,put,UpIn,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,8.15521,8.12957
1616891,4142.17500,4556.39250,6213.26250,720,Up,Out,call,UpOut,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,242.52494,242.52468
1616892,4142.17500,4556.39250,6213.26250,720,Up,Out,put,UpOut,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,460.70698,460.65242
1616893,4142.17500,4556.39250,6213.26250,720,Up,In,call,UpIn,0.00000,0.00000,0.04000,0.06146,15.19219,-0.53665,7.97166,0.00056,2021-04-13,161.92059,161.91646


## Train/test split

In [6]:
unique_dates = dataset['calculation_date'].sort_values(
    ascending=True).unique().tolist()
filter_date = unique_dates[int(0.85*len(unique_dates))]
train_data = dataset[
    (
      # (dataset['calculation_date']>=datetime(2007,1,1))
      #  &
      (dataset['calculation_date']<=filter_date)
    )
].copy()

test_data = dataset[
    (
        (dataset['calculation_date']>filter_date)
        # &
        # (dataset['calculation_date']<=datetime(2012,12,31))
    )
].copy()

In [7]:
test_data

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
2160,3777.46500,3399.71850,1888.73250,60,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.08650,2.86339,-0.67597,1.46652,0.07612,2022-09-22,450.77271,450.77703
2161,3777.46500,3399.71850,1888.73250,60,Down,Out,put,DownOut,0.00000,0.00000,0.04000,0.08650,2.86339,-0.67597,1.46652,0.07612,2022-09-22,47.43322,47.42873
2162,3777.46500,3399.71850,1888.73250,60,Down,In,call,DownIn,0.00000,0.00000,0.04000,0.08650,2.86339,-0.67597,1.46652,0.07612,2022-09-22,0.24981,0.30090
2163,3777.46500,3399.71850,1888.73250,60,Down,In,put,DownIn,0.00000,0.00000,0.04000,0.08650,2.86339,-0.67597,1.46652,0.07612,2022-09-22,3.30424,3.29803
2164,3777.46500,3399.71850,1888.73250,90,Down,Out,call,DownOut,0.00000,0.00000,0.04000,0.08650,2.86339,-0.67597,1.46652,0.07612,2022-09-22,482.80868,482.78184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1614730,3838.59000,4222.44900,5757.88500,540,Up,In,put,UpIn,0.00000,0.00000,0.04000,0.11578,6.33951,-0.69412,2.86226,0.04775,2022-10-26,9.65886,9.67786
1614731,3838.59000,4222.44900,5757.88500,720,Up,Out,call,UpOut,0.00000,0.00000,0.04000,0.11578,6.33951,-0.69412,2.86226,0.04775,2022-10-26,170.77971,170.78475
1614732,3838.59000,4222.44900,5757.88500,720,Up,Out,put,UpOut,0.00000,0.00000,0.04000,0.11578,6.33951,-0.69412,2.86226,0.04775,2022-10-26,608.09181,608.09634
1614733,3838.59000,4222.44900,5757.88500,720,Up,In,call,UpIn,0.00000,0.00000,0.04000,0.11578,6.33951,-0.69412,2.86226,0.04775,2022-10-26,398.44330,398.45039


In [8]:
test_train_ratio = int(round(100*test_data.shape[0]/train_data.shape[0],0))
print(f"train/test: {100-test_train_ratio}/{test_train_ratio}")

train/test: 83/17


In [9]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Training

In [10]:
train_y

0         443.17868
1          35.95563
2           0.21598
3           1.92848
4         474.82356
             ...   
1616890     8.12957
1616891   242.52468
1616892   460.65242
1616893   161.91646
1616894    12.17275
Name: observed_price, Length: 1379970, dtype: float64

In [11]:
print("features:")
for f in trainer.feature_set:
    print(f"    {f.replace('_',' ')}")
print(f"target:\n    {trainer.target_name.replace('_',' ')}")

features:
    spot price
    strike price
    days to maturity
    risk free rate
    dividend rate
    kappa
    theta
    rho
    eta
    v0
    barrier
    barrier type name
    w
target:
    observed price


In [12]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (13, 13, 13)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001




ValueError: Input y contains NaN.

# Testing

In [None]:
test_data.describe()

In [None]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
        )
outofsample_RMSE = errors['outofsample_RMSE']

# Saving