In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Thu Oct 24 17:26:41 2024



# Loading data

In [3]:
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.bloomberg_spx_barrier_dump)
files = [f for f in os.listdir(datadir) if f.endswith('.csv')]
files = [os.path.join(datadir,f) for f in files]
dfs = []
bar = tqdm(total=len(files))
for f in files:
    dfs.append(pd.read_csv(f).iloc[:,1:])
    bar.update(1)
bar.close()
dataset = pd.concat(dfs,ignore_index=True).dropna().reset_index(drop=True)

100%|█████████████████████████████████████████████████████████████████████████████| 1510/1510 [00:11<00:00, 132.60it/s]


In [4]:
from model_settings import vanilla_pricer
vanillas = vanilla_pricer()
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



spot_price                  float64
strike_price                float64
barrier                     float64
days_to_maturity              int64
updown                       object
outin                        object
w                            object
barrier_type_name            object
rebate                      float64
dividend_rate               float64
risk_free_rate              float64
theta                       float64
kappa                       float64
rho                         float64
eta                         float64
v0                          float64
calculation_date     datetime64[ns]
barrier_price               float64
dtype: object

# Preprocessing

In [6]:
from convsklearn import barrier_trainer
price = 'barrier_price'
trainer = barrier_trainer
trainer.activation_function = 'relu'
trainer.solver = 'sgd'
dataset[price] = pd.to_numeric(dataset[price],errors='coerce')
dataset['observed_price'] = np.maximum(dataset[price] + np.random.normal(scale=(0.15)**2,size=dataset.shape[0]),0)

In [7]:
dataset.dropna()

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,updown,outin,w,barrier_type_name,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
0,1416.59000,1274.93100,708.29500,60,Down,Out,call,DownOut,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,146.29070,146.28191
1,1416.59000,1274.93100,708.29500,60,Down,Out,put,DownOut,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,0.48509,0.45735
2,1416.59000,1274.93100,708.29500,60,Down,In,call,DownIn,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,0.06683,0.06697
3,1416.59000,1274.93100,708.29500,60,Down,In,put,DownIn,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,0.00000,0.00000
4,1416.59000,1274.93100,708.29500,90,Down,Out,call,DownOut,0.00000,0.01812,0.04000,0.08296,0.23344,-1.00000,0.09850,0.01066,2007-01-03,149.46448,149.46844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3226675,1425.77000,1568.34700,2138.65500,540,Up,In,put,UpIn,0.00000,0.02223,0.04000,0.05621,0.89821,-0.41897,0.32487,0.02466,2012-12-31,0.35048,0.33773
3226676,1425.77000,1568.34700,2138.65500,720,Up,Out,call,UpOut,0.00000,0.02223,0.04000,0.05621,0.89821,-0.41897,0.32487,0.02466,2012-12-31,54.54956,54.56071
3226677,1425.77000,1568.34700,2138.65500,720,Up,Out,put,UpOut,0.00000,0.02223,0.04000,0.05621,0.89821,-0.41897,0.32487,0.02466,2012-12-31,189.96091,189.96628
3226678,1425.77000,1568.34700,2138.65500,720,Up,In,call,UpIn,0.00000,0.02223,0.04000,0.05621,0.89821,-0.41897,0.32487,0.02466,2012-12-31,51.74802,51.73937


## Train/test split

In [12]:
unique_dates = dataset['calculation_date'].sort_values(
    ascending=True).unique().tolist()

development_dates = unique_dates[:100]
train_data = dataset[dataset['calculation_date'].isin(development_dates)]
test_data = dataset[~dataset['calculation_date'].isin(development_dates)]

In [15]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

0        146.28191
1          0.45735
2          0.06697
3          0.00000
4        149.46844
            ...   
215995     0.06028
215996    78.34833
215997   163.07695
215998     5.57172
215999     0.05337
Name: observed_price, Length: 216000, dtype: float64

# Training

In [16]:
print("features:")
for f in trainer.feature_set:
    print(f"    {f.replace('_',' ')}")
print(f"target:\n    {trainer.target_name.replace('_',' ')}")

features:
    spot price
    strike price
    days to maturity
    risk free rate
    dividend rate
    kappa
    theta
    rho
    eta
    v0
    barrier
    barrier type name
    w
target:
    observed price


In [17]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (13, 13, 13)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 291.321035861969


# Testing

In [18]:
test_data.describe()

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
count,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680.0,3010680,3010680.0,3010680.0
mean,1226.75927,1226.75927,1226.75927,323.2291,0.0,0.02217,0.04,0.07068,3.11697,-0.88889,0.22035,0.05459,2010-03-19 01:36:25.483678208,49.7317,49.73239
min,676.03,608.427,338.015,60.0,0.0,0.01791,0.04,0.02342,0.08326,-1.0,0.10989,0.00739,2007-05-29 00:00:00,0.0,0.0
25%,1099.45,1089.29,877.7708,90.0,0.0,0.01993,0.04,0.05322,1.62844,-1.0,0.16426,0.02066,2008-10-28 00:00:00,1.34306,1.3429
50%,1267.76,1246.53775,1204.33448,180.0,0.0,0.02159,0.04,0.06273,2.41118,-0.99997,0.18868,0.03432,2010-03-25 00:00:00,26.55503,26.55704
75%,1370.2,1380.9315,1528.77308,540.0,0.0,0.02276,0.04,0.0749,4.22586,-0.83105,0.2293,0.05839,2011-08-11 00:00:00,87.1186,87.11947
max,1565.18,1721.698,2347.77,720.0,0.0,0.03773,0.04,0.3561,15.99144,-0.18176,1.52882,0.69402,2012-12-31 00:00:00,293.09806,293.0738
std,193.73254,209.66463,429.09777,241.30482,0.0,0.00347,0.0,0.03304,2.15832,0.18073,0.13079,0.06233,,55.44691,55.44633


In [19]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
)
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RMSE: 4.698364674058325
     MAE: 3.0770484563643716

out of sample:
     RMSE: 50.080127438029955
     MAE: 20.2497888745519


# Saving

In [20]:
train_end_tag = str(datetime.fromtimestamp(
    train_end).strftime("%Y_%m_%d %H-%M-%S"))
file_tag = str(train_end_tag + " " + specs[0] + " " + str(int(outofsample_RMSE)) + "RMSE")
os.chdir(os.path.join(notebook_dir,'trained_models'))
files_dir = os.path.join(
    notebook_dir,'trained_models','trained_models',
    file_tag)

if Path(files_dir).exists():
    pass
else:
    os.mkdir(files_dir)

file_dir = os.path.join(files_dir,file_tag)

S = np.sort(train_data['spot_price'].unique())
K = np.sort(train_data['strike_price'].unique())
T = np.sort(train_data['days_to_maturity'].unique())
W = np.sort(train_data['w'].unique())
n_calls = train_data[train_data['w']=='call'].shape[0]
n_puts = train_data[train_data['w']=='put'].shape[0]
insample.to_csv(f"{file_dir} insample.csv")
outsample.to_csv(f"{file_dir} outsample.csv")
joblib.dump(model_fit,str(f"{file_dir}.pkl"))
pd.set_option("display.max_columns",None)

with open(f'{file_dir}.txt', 'w') as file:
    file.write(train_start_tag)
    file.write(f"\nspot(s):\n{S}")
    file.write(f"\n\nstrikes:\n{K}\n")
    file.write(f"\nmaturities:\n{T}\n")
    file.write(f"\ntypes:\n{W}\n")
    try:
        file.write(f"\n{train_data['barrier_type_name'].unique()}")
    except Exception:
        pass
    try:
        for col in ['averaging_type','fixing_frequency','past_fixings','n_fixings']:
            file.write(f"\n{col}:")
            file.write(f"\n{dataset[col].drop_duplicates().sort_values().values}\n")
    except Exception as e:
        print(e)
        pass
    file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
    file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
    for spec in specs:
        file.write(f"{spec}\n")
    file.write("#"*17+"\n# training data #\n"+"#"*17+
          f"\n{train_data.describe()}\n")
    file.write("#"*13+"\n# test data #\n"+"#"*13+
          f"\n{test_data.describe()}\n")
    file.write(f"\n{dataset.dtypes}")
    file.write(
        f"\nin sample results:"
        f"\n     RMSE: {errors['insample_RMSE']}"
        f"\n     MAE: {errors['insample_MAE']}\n"
        f"\nout of sample results:"
        f"\n     RMSE: {errors['outofsample_RMSE']}"
        f"\n     MAE: {errors['outofsample_MAE']}\n"
        )
    file.write("\nfeatures:\n")
    for feature in trainer.feature_set:
        file.write(f"     {feature}\n")
    file.write(f"\ntarget: {trainer.target_name}\n")
    file.write(f"\ncpu: {train_runtime}\n")
    file.write(datetime.fromtimestamp(train_end).strftime('%c'))

'averaging_type'
