In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Thu Oct 24 01:11:08 2024



# Loading data

In [2]:
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.bloomberg_spx_barrier_dump)
files = [f for f in os.listdir(datadir) if f.endswith('.csv')]
files = [os.path.join(datadir,f) for f in files]
dfs = [pd.read_csv(f).iloc[:,1:] for f in files]
dataset = pd.concat(dfs,ignore_index=True).dropna().reset_index(drop=True)
dataset

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price
0,1416.59000,1274.93100,708.29500,60,Down,Out,call,DownOut,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,146.29070
1,1416.59000,1274.93100,708.29500,60,Down,Out,put,DownOut,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,0.48509
2,1416.59000,1274.93100,708.29500,60,Down,In,call,DownIn,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,0.06683
3,1416.59000,1274.93100,708.29500,60,Down,In,put,DownIn,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,0.00000
4,1416.59000,1274.93100,708.29500,90,Down,Out,call,DownOut,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,149.46448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806475,1071.64000,1178.80400,1607.46000,540,Up,In,put,UpIn,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,4.13846
1806476,1071.64000,1178.80400,1607.46000,720,Up,Out,call,UpOut,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,16.47482
1806477,1071.64000,1178.80400,1607.46000,720,Up,Out,put,UpOut,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,204.52813
1806478,1071.64000,1178.80400,1607.46000,720,Up,In,call,UpIn,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,134.47694


In [3]:
from model_settings import vanilla_pricer
vanillas = vanilla_pricer()
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')
dataset.dtypes


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



spot_price                  float64
strike_price                float64
barrier                     float64
days_to_maturity              int64
updown                       object
outin                        object
w                            object
barrier_type_name            object
rebate                      float64
dividend_rate               float64
risk_free_rate              float64
theta                       float64
kappa                       float64
rho                         float64
eta                         float64
v0                          float64
calculation_date     datetime64[ns]
barrier_price               float64
dtype: object

# Preprocessing

In [4]:
from convsklearn import barrier_trainer

trainer = barrier_trainer
dataset['barrier_price'] = pd.to_numeric(dataset['barrier_price'],errors='coerce')
dataset['observed_price'] = ms.noisyfier(dataset['barrier_price'])

## Train/test split

In [5]:
unique_dates = dataset['calculation_date'].sort_values(
    ascending=True).unique().tolist()
filter_date = unique_dates[int(0.85*len(unique_dates))]
train_data = dataset[
    (
      # (dataset['calculation_date']>=datetime(2007,1,1))
      #  &
      (dataset['calculation_date']<=filter_date)
    )
].copy()

test_data = dataset[
    (
        (dataset['calculation_date']>filter_date)
        # &
        # (dataset['calculation_date']<=datetime(2012,12,31))
    )
].copy()

In [6]:
test_data

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
1532160,1110.24000,999.21600,555.12000,60,Down,Out,call,DownOut,0.00000,0.02070,0.04000,0.06728,3.94373,-0.90383,0.18945,0.03106,2009-11-17,118.64290,118.58499
1532161,1110.24000,999.21600,555.12000,60,Down,Out,put,DownOut,0.00000,0.02070,0.04000,0.06728,3.94373,-0.90383,0.18945,0.03106,2009-11-17,4.83450,4.95680
1532162,1110.24000,999.21600,555.12000,60,Down,In,call,DownIn,0.00000,0.02070,0.04000,0.06728,3.94373,-0.90383,0.18945,0.03106,2009-11-17,0.05403,0.00000
1532163,1110.24000,999.21600,555.12000,60,Down,In,put,DownIn,0.00000,0.02070,0.04000,0.06728,3.94373,-0.90383,0.18945,0.03106,2009-11-17,0.00000,0.11833
1532164,1110.24000,999.21600,555.12000,90,Down,Out,call,DownOut,0.00000,0.02070,0.04000,0.06728,3.94373,-0.90383,0.18945,0.03106,2009-11-17,124.96143,125.24602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806475,1071.64000,1178.80400,1607.46000,540,Up,In,put,UpIn,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,4.13846,4.29835
1806476,1071.64000,1178.80400,1607.46000,720,Up,Out,call,UpOut,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,16.47482,16.78454
1806477,1071.64000,1178.80400,1607.46000,720,Up,Out,put,UpOut,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,204.52813,204.31814
1806478,1071.64000,1178.80400,1607.46000,720,Up,In,call,UpIn,0.00000,0.02157,0.04000,0.09480,11.78475,-0.57234,0.63227,0.13851,2010-05-20,134.47694,134.45926


In [7]:
test_train_ratio = int(round(100*test_data.shape[0]/train_data.shape[0],0))
print(f"train/test: {100-test_train_ratio}/{test_train_ratio}")

train/test: 82/18


In [8]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Training

In [9]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (13, 13, 13)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 367.7598912715912


# Testing

In [10]:
test_data.describe()

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
count,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320.0,274320,274320.0,274320.0
mean,1134.10921,1134.10921,1134.10921,325.0,0.0,0.02023,0.04,0.05812,3.95241,-0.85783,0.19371,0.02856,2010-02-18 08:30:14.173228800,43.58023,43.59005
min,1056.71,951.039,528.355,60.0,0.0,0.01886,0.04,0.04162,0.25135,-1.0,0.11096,0.00992,2009-11-17 00:00:00,0.0,0.0
25%,1103.23,1069.0965,839.0637,90.0,0.0,0.01967,0.04,0.04715,3.09774,-0.95828,0.16073,0.01704,2010-01-04 00:00:00,0.8325,0.85095
50%,1126.26,1133.445,1126.65295,270.0,0.0,0.02014,0.04,0.05353,3.86078,-0.84123,0.17583,0.02156,2010-02-19 00:00:00,21.64612,21.64265
75%,1165.95,1197.08725,1413.4563,540.0,0.0,0.02075,0.04,0.06744,4.72555,-0.81554,0.19331,0.03157,2010-04-07 00:00:00,77.02175,77.01203
max,1217.14,1338.854,1825.71,720.0,0.0,0.02207,0.04,0.21985,12.96091,-0.47333,0.75922,0.13851,2010-05-20 00:00:00,244.36173,244.31197
std,39.63419,83.28635,352.07697,241.50613,0.0,0.00077,0.0,0.0187,1.65355,0.11377,0.08645,0.01977,,49.45577,49.44724


In [11]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
        )
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RMSE: 5.719257852866361
     MAE: 3.8823675896859724

out of sample:
     RMSE: 7.33625689636539
     MAE: 4.717669695526482


# Saving