In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Thu Oct 24 07:51:17 2024



# Loading data

In [2]:
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.cboe_spx_asian_option_dump)
files = [f for f in os.listdir(datadir) if f.endswith('.csv')]
files = [os.path.join(datadir,f) for f in files]
dfs = []
bar = tqdm(total=len(files))
for f in files:
    dfs.append(pd.read_csv(f).iloc[:,1:])
    bar.update(1)
bar.close()
dataset = pd.concat(dfs,ignore_index=True).dropna().reset_index(drop=True)
dataset

100%|████████████████████████████████████████| 310/310 [00:00<00:00, 446.60it/s]


Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,averaging_type,w,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian
0,2325.82500,1162.00000,7,7.00000,1,0,geometric,call,0.04000,0.00000,2020-03-20,7.77781,0.18907,-0.69179,9.49267,0.92894,1164.80602
1,2325.82500,1162.00000,7,7.00000,1,0,geometric,put,0.04000,0.00000,2020-03-20,7.77781,0.18907,-0.69179,9.49267,0.92894,0.18868
2,2325.82500,1162.00000,7,7.00000,1,0,arithmetic,call,0.04000,0.00000,2020-03-20,7.77781,0.18907,-0.69179,9.49267,0.92894,1164.80602
3,2325.82500,1162.00000,7,7.00000,1,0,arithmetic,put,0.04000,0.00000,2020-03-20,7.77781,0.18907,-0.69179,9.49267,0.92894,0.18868
4,2325.82500,1743.50000,7,7.00000,1,0,geometric,call,0.04000,0.00000,2020-03-20,7.77781,0.18907,-0.69179,9.49267,0.92894,591.22437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130195,3253.56500,4066.50000,336,1.00000,336,0,arithmetic,put,0.04000,0.00000,2020-10-30,3.54927,0.20945,-1.00000,0.70181,0.02613,745.41279
130196,3253.56500,4880.00000,336,1.00000,336,0,geometric,call,0.04000,0.00000,2020-10-30,3.54927,0.20945,-1.00000,0.70181,0.02613,0.00130
130197,3253.56500,4880.00000,336,1.00000,336,0,geometric,put,0.04000,0.00000,2020-10-30,3.54927,0.20945,-1.00000,0.70181,0.02613,1547.37714
130198,3253.56500,4880.00000,336,1.00000,336,0,arithmetic,call,0.04000,0.00000,2020-10-30,3.54927,0.20945,-1.00000,0.70181,0.02613,0.01325


In [3]:
from model_settings import vanilla_pricer
vanillas = vanilla_pricer()
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')
dataset.dtypes


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



spot_price                 float64
strike_price               float64
days_to_maturity             int64
n_fixings                  float64
fixing_frequency             int64
past_fixings                 int64
averaging_type              object
w                           object
risk_free_rate             float64
dividend_rate              float64
calculation_date    datetime64[ns]
kappa                      float64
theta                      float64
rho                        float64
eta                        float64
v0                         float64
asian                      float64
dtype: object

# Preprocessing

In [4]:
from convsklearn import asian_trainer
price = 'asian'
trainer = asian_trainer
dataset[price] = pd.to_numeric(dataset[price],errors='coerce')
dataset['observed_price'] = ms.noisyfier(dataset[price])

## Train/test split

In [5]:
unique_dates = dataset['calculation_date'].sort_values(
    ascending=True).unique().tolist()
filter_date = unique_dates[int(0.85*len(unique_dates))]
train_data = dataset[
    (
      # (dataset['calculation_date']>=datetime(2007,1,1))
      #  &
      (dataset['calculation_date']<=filter_date)
    )
].copy()

test_data = dataset[
    (
        (dataset['calculation_date']>filter_date)
        # &
        # (dataset['calculation_date']<=datetime(2012,12,31))
    )
].copy()

In [6]:
test_data

Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,averaging_type,w,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian,observed_price
9660,3966.44500,1983.00000,7,7.00000,1,0,geometric,call,0.04000,0.00000,2021-03-16,19.50634,0.06913,-0.52337,9.22073,0.01559,1985.11136,1985.20172
9661,3966.44500,1983.00000,7,7.00000,1,0,geometric,put,0.04000,0.00000,2021-03-16,19.50634,0.06913,-0.52337,9.22073,0.01559,0.00000,0.11105
9662,3966.44500,1983.00000,7,7.00000,1,0,arithmetic,call,0.04000,0.00000,2021-03-16,19.50634,0.06913,-0.52337,9.22073,0.01559,1985.11136,1985.09341
9663,3966.44500,1983.00000,7,7.00000,1,0,arithmetic,put,0.04000,0.00000,2021-03-16,19.50634,0.06913,-0.52337,9.22073,0.01559,0.00000,0.09899
9664,3966.44500,2974.50000,7,7.00000,1,0,geometric,call,0.04000,0.00000,2021-03-16,19.50634,0.06913,-0.52337,9.22073,0.01559,994.44300,994.45104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129775,3847.89000,4809.00000,336,1.00000,336,0,arithmetic,put,0.04000,0.00000,2021-01-25,13.14041,0.11018,-0.74373,6.87835,0.01651,860.33722,860.40020
129776,3847.89000,5771.00000,336,1.00000,336,0,geometric,call,0.04000,0.00000,2021-01-25,13.14041,0.11018,-0.74373,6.87835,0.01651,0.08598,0.00000
129777,3847.89000,5771.00000,336,1.00000,336,0,geometric,put,0.04000,0.00000,2021-01-25,13.14041,0.11018,-0.74373,6.87835,0.01651,1810.75320,1810.75476
129778,3847.89000,5771.00000,336,1.00000,336,0,arithmetic,call,0.04000,0.00000,2021-01-25,13.14041,0.11018,-0.74373,6.87835,0.01651,0.14022,0.15162


In [7]:
test_train_ratio = int(round(100*test_data.shape[0]/train_data.shape[0],0))
print(f"train/test: {100-test_train_ratio}/{test_train_ratio}")

train/test: 83/17


In [8]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Training

In [9]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (15, 15, 15)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 7.666822910308838


# Testing

In [10]:
test_data.describe()

Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian,observed_price
count,19320.0,19320.0,19320.0,19320.0,19320.0,19320.0,19320.0,19320.0,19320,19320.0,19320.0,19320.0,19320.0,19320.0,19320.0,19320.0
mean,3884.85935,3884.38043,156.71429,35.52381,51.28571,0.0,0.04,0.0,2021-02-24 04:41:44.347826176,13.064,0.11688,-0.70446,6.88556,0.02649,599.81379,599.82146
min,3723.08,1861.0,1.0,1.0,1.0,0.0,0.04,0.0,2021-01-22 00:00:00,0.18961,0.06715,-1.0,0.50144,0.0,0.0,0.0
25%,3844.35,2883.0,28.0,1.0,1.0,0.0,0.04,0.0,2021-02-08 00:00:00,4.72908,0.0831,-0.73362,4.11381,0.00992,1.58943,1.59578
50%,3903.07,3902.5,168.0,4.0,7.0,0.0,0.04,0.0,2021-02-24 12:00:00,12.1385,0.0974,-0.68073,6.76313,0.01527,101.06027,101.10872
75%,3925.375,4906.5,336.0,24.0,84.0,0.0,0.04,0.0,2021-03-12 00:00:00,19.05655,0.11808,-0.65628,9.30971,0.04168,998.73987,998.77145
max,3973.385,5960.0,336.0,336.0,336.0,0.0,0.04,0.0,2021-03-29 00:00:00,39.35249,0.71559,-0.52337,14.5814,0.1442,2094.09975,2093.88966
std,58.25073,1374.92877,126.00861,77.39009,81.26133,0.0,0.0,0.0,,9.03114,0.09449,0.09496,3.29906,0.02774,750.15123,750.14466


In [11]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
        )
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RMSE: 35.89133629347023
     MAE: 23.634440686128393

out of sample:
     RMSE: 40.140164787246505
     MAE: 28.455412919747125


In [12]:
trainer.feature_set

['spot_price',
 'strike_price',
 'days_to_maturity',
 'risk_free_rate',
 'dividend_rate',
 'kappa',
 'theta',
 'rho',
 'eta',
 'v0',
 'fixing_frequency',
 'n_fixings',
 'past_fixings',
 'averaging_type',
 'w']

# Saving