In [2]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
# Session setup: working directory, pandas display options, import path for
# the historical-generation helpers, and the wall-clock start of the run.
os.chdir(os.path.abspath(os.curdir))
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", '{:.5f}'.format)

# Absolute path of the notebook's working directory; later cells build the
# output paths from it.
notebook_dir = str(Path().resolve())
generation_dir = os.path.join(
    notebook_dir, 'historical_data', 'historical_generation'
)
sys.path.append(generation_dir)

# The run is timed from here (not from the start of model fitting), so the
# runtime reported after training also covers loading and preprocessing.
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')

banner_rule = "#" * 18
print(
    f"\n{banner_rule}\n# training start #\n{banner_rule}\n"
    f"\n{train_start_tag}\n"
)


##################
# training start #
##################

Thu Oct 24 20:50:47 2024



# Loading data

In [6]:
# Load every CSV in the option dump directory into a single DataFrame.
# The dump directory is resolved two levels above the working directory,
# using the relative location stored in the project settings object `ms`.
root = Path().resolve().parent.parent
datadir = os.path.join(root, ms.cboe_spx_asian_option_dump)

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame (and everything derived from it by position,
# e.g. per-row noise) reproducible across machines and runs.
files = sorted(
    os.path.join(datadir, name)
    for name in os.listdir(datadir)
    if name.endswith('.csv')
)
if not files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"no .csv files found in {datadir}")

# The first column of each file is dropped on read
# (presumably a saved index column — TODO confirm against the dump writer).
dfs = [pd.read_csv(path).iloc[:, 1:] for path in tqdm(files)]

# Rows with any missing value are discarded and the index is renumbered.
dataset = pd.concat(dfs, ignore_index=True).dropna().reset_index(drop=True)

100%|███████████████████████████████████████████████████████████████████████████████| 559/559 [00:04<00:00, 135.48it/s]


In [7]:
from model_settings import vanilla_pricer

# Instantiate the project's vanilla pricer (it reports its day counter and
# seed when constructed).
vanillas = vanilla_pricer()

# Parse the ISO-formatted calculation dates into datetimes so they can be
# sorted and compared chronologically further down.
dataset['calculation_date'] = pd.to_datetime(
    dataset['calculation_date'],
    format='%Y-%m-%d',
)


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



# Preprocessing

In [12]:
from convsklearn import asian_trainer

# Configure the trainer and name the column that holds the model price.
price = 'asian'
trainer = asian_trainer
trainer.activation_function = 'relu'
trainer.solver = 'sgd'

# Non-numeric price entries become NaN rather than raising.
dataset[price] = pd.to_numeric(dataset[price], errors='coerce')

# Build the training target: model price plus Gaussian noise, floored at 0.
# A dedicated seeded generator is used so that re-running this cell (or the
# whole notebook) reproduces exactly the same noise, independent of whatever
# else has consumed NumPy's global random state.
noise_seed = 123
# NOTE(review): `scale` is a standard deviation, so the noise std here is
# 0.15**2 = 0.0225, not 0.15 — confirm this is the intended magnitude.
noise_scale = (0.15)**2
noise_rng = np.random.default_rng(noise_seed)
noise = noise_rng.normal(scale=noise_scale, size=dataset.shape[0])
dataset['observed_price'] = np.maximum(dataset[price] + noise, 0)

In [13]:
# The price column was coerced with errors='coerce', which can introduce
# NaNs. DataFrame.dropna() returns a new frame, so the result has to be
# assigned back for the filter to reach the train/test split below.
dataset = dataset.dropna().reset_index(drop=True)
dataset

Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,averaging_type,w,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian,observed_price
0,3250.84000,1625.00000,7,7.00000,1,0,geometric,call,0.04000,0.00000,2020-01-02,9.58021,0.03326,-0.65526,2.89669,0.00525,1627.09177,1627.09320
1,3250.84000,1625.00000,7,7.00000,1,0,geometric,put,0.04000,0.00000,2020-01-02,9.58021,0.03326,-0.65526,2.89669,0.00525,0.00000,0.02309
2,3250.84000,1625.00000,7,7.00000,1,0,arithmetic,call,0.04000,0.00000,2020-01-02,9.58021,0.03326,-0.65526,2.89669,0.00525,1627.09177,1627.05875
3,3250.84000,1625.00000,7,7.00000,1,0,arithmetic,put,0.04000,0.00000,2020-01-02,9.58021,0.03326,-0.65526,2.89669,0.00525,0.00000,0.00162
4,3250.84000,2437.75000,7,7.00000,1,0,geometric,call,0.04000,0.00000,2020-01-02,9.58021,0.03326,-0.65526,2.89669,0.00525,814.96501,814.95672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234775,4509.21000,5635.75000,336,1.00000,336,0,arithmetic,put,0.04000,0.00000,2022-03-24,3.16652,0.10202,-0.72608,2.79473,0.03740,1008.85734,1008.83812
234776,4509.21000,6763.00000,336,1.00000,336,0,geometric,call,0.04000,0.00000,2022-03-24,3.16652,0.10202,-0.72608,2.79473,0.03740,0.09540,0.10452
234777,4509.21000,6763.00000,336,1.00000,336,0,geometric,put,0.04000,0.00000,2022-03-24,3.16652,0.10202,-0.72608,2.79473,0.03740,2115.39489,2115.41695
234778,4509.21000,6763.00000,336,1.00000,336,0,arithmetic,call,0.04000,0.00000,2022-03-24,3.16652,0.10202,-0.72608,2.79473,0.03740,0.16357,0.12787


## Train/test split

In [26]:
# Chronologically ordered list of the distinct calculation dates; the
# train/test split below slices this list by position.
calculation_dates = dataset['calculation_date']
unique_dates = calculation_dates.sort_values().unique().tolist()
pd.Series(unique_dates)

0     2020-01-02
1     2020-01-03
2     2020-01-06
3     2020-01-07
4     2020-01-08
         ...    
554   2022-03-18
555   2022-03-21
556   2022-03-22
557   2022-03-23
558   2022-03-24
Length: 559, dtype: datetime64[ns]

In [28]:
# Split by date: the earliest dates form the development (training) set and
# every remaining date is held out for testing.
n_development_dates = 400
development_dates = unique_dates[:n_development_dates]

# Membership mask computed once and reused for both sides of the split.
in_development = dataset['calculation_date'].isin(development_dates)
train_data = dataset[in_development]
test_data = dataset[~in_development]

In [29]:
# Ask the trainer for the feature/target arrays of both splits, then build
# its preprocessor. The call order is kept as in the original cell.
arrs = trainer.get_train_test_arrays(train_data, test_data)
preprocessor = trainer.preprocess()

# Unpack the four arrays by their dictionary keys.
train_X, train_y = arrs['train_X'], arrs['train_y']
test_X, test_y = arrs['test_X'], arrs['test_y']

# Training

In [30]:
# Show the trainer's feature names and target name with underscores
# replaced by spaces for readability.
feature_lines = [f"    {name.replace('_',' ')}" for name in trainer.feature_set]
print("\n".join(["features:"] + feature_lines))
print(f"target:\n    {trainer.target_name.replace('_',' ')}")

features:
    spot price
    strike price
    days to maturity
    risk free rate
    dividend rate
    kappa
    theta
    rho
    eta
    v0
    fixing frequency
    n fixings
    past fixings
    averaging type
    w
target:
    observed price


In [31]:
# Fit the network; the trainer returns the fitted model, its own runtime
# measurement and a list of specification strings.
fit_result = trainer.run_dnn(preprocessor, train_X, train_y)
model_fit, runtime, specs = fit_result

# Elapsed wall-clock time since `train_start`, which was taken when the
# session was set up, so it spans loading and preprocessing as well.
train_end = time.time()
train_runtime = train_end - train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (15, 15, 15)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 262.4084196090698


# Testing

In [32]:
# Summary statistics of the held-out rows (dates outside the development
# window), displayed as a sanity check before scoring the model on them.
test_data.describe()

Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian,observed_price
count,66780.0,66780.0,66780.0,66780.0,66780.0,66780.0,66780.0,66780.0,66780,66780.0,66780.0,66780.0,66780.0,66780.0,66780.0,66780.0
mean,4514.61025,4514.12579,156.71429,35.52381,51.28571,0.0,0.04,0.0,2021-11-29 21:44:09.056603648,16.33833,0.2329,-0.71204,7.69233,0.03021,694.41715,694.418
min,4174.015,2087.0,1.0,1.0,1.0,0.0,0.04,0.0,2021-08-09 00:00:00,0.02807,0.0433,-1.0,0.6109,0.0,0.0,0.0
25%,4406.05,3304.5,28.0,1.0,1.0,0.0,0.04,0.0,2021-10-04 00:00:00,5.21499,0.0635,-0.74359,4.31819,0.00167,1.43146,1.42238
50%,4505.095,4504.5,168.0,4.0,7.0,0.0,0.04,0.0,2021-11-30 00:00:00,13.32861,0.07473,-0.70773,7.23044,0.01318,108.84715,108.83066
75%,4648.785,5810.75,336.0,24.0,84.0,0.0,0.04,0.0,2022-01-27 00:00:00,20.93834,0.10772,-0.67923,9.84748,0.0462,1169.51508,1169.49555
max,4801.65,7202.0,336.0,336.0,336.0,0.0,0.04,0.0,2022-03-24 00:00:00,138.4092,13.04739,-0.52998,52.54234,0.16352,2524.1856,2524.16523
std,145.43856,1603.60792,126.00629,77.38867,81.25983,0.0,0.0,0.0,,18.53713,1.21588,0.06792,5.43732,0.03685,873.54879,873.54805


In [33]:
# Score the fitted model on both splits. The call returns two frames
# (written to "insample.csv" / "outsample.csv" in the saving cell) and a
# dict of error metrics.
# NOTE(review): arguments are passed positionally as (model, test, train) —
# confirm this matches the parameter order of test_prediction_accuracy.
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
)
# Kept as a standalone name because the output file tag embeds it.
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RMSE: 29.162589923391174
     MAE: 18.984397160322427

out of sample:
     RMSE: 51.41802136849447
     MAE: 31.619709384580265


# Saving

In [34]:
# Persist the fitted model, the in/out-of-sample result frames and a plain
# text report describing the run. Everything is written into a directory
# named after the training end time, the first spec string and the
# truncated out-of-sample RMSE.
train_end_tag = datetime.fromtimestamp(train_end).strftime("%Y_%m_%d %H-%M-%S")
file_tag = f"{train_end_tag} {specs[0]} {int(outofsample_RMSE)}RMSE"

files_dir = os.path.join(
    notebook_dir, 'trained_models', 'trained_models', file_tag)

# makedirs creates any missing parent directories and tolerates an existing
# target, so a fresh checkout without 'trained_models/' no longer fails.
# It runs before the chdir below so that the chdir target is guaranteed to
# exist.
os.makedirs(files_dir, exist_ok=True)

# NOTE(review): this changes the kernel's working directory; cells that call
# Path().resolve() will see a different location if re-run afterwards.
os.chdir(os.path.join(notebook_dir, 'trained_models'))

# Common path prefix for every artifact of this run.
file_dir = os.path.join(files_dir, file_tag)

# Distinct values and counts of the training data, recorded in the report.
S = np.sort(train_data['spot_price'].unique())
K = np.sort(train_data['strike_price'].unique())
T = np.sort(train_data['days_to_maturity'].unique())
W = np.sort(train_data['w'].unique())
n_calls = int((train_data['w'] == 'call').sum())
n_puts = int((train_data['w'] == 'put').sum())

insample.to_csv(f"{file_dir} insample.csv")
outsample.to_csv(f"{file_dir} outsample.csv")
joblib.dump(model_fit, f"{file_dir}.pkl")

# Ensure the describe() tables written below are not column-truncated.
pd.set_option("display.max_columns", None)

with open(f'{file_dir}.txt', 'w') as file:
    file.write(train_start_tag)
    file.write(f"\nspot(s):\n{S}")
    file.write(f"\n\nstrikes:\n{K}\n")
    file.write(f"\nmaturities:\n{T}\n")
    file.write(f"\ntypes:\n{W}\n")

    # Optional column: only written when the data actually carries it,
    # instead of swallowing every exception.
    if 'barrier_type_name' in train_data.columns:
        file.write(f"\n{train_data['barrier_type_name'].unique()}")

    # Averaging/fixing descriptors, taken from the full dataset. A missing
    # column is reported and skipped without aborting the remaining ones.
    for col in ['averaging_type', 'fixing_frequency', 'past_fixings', 'n_fixings']:
        if col not in dataset.columns:
            print(f"column not found, skipped in report: {col}")
            continue
        file.write(f"\n{col}:")
        file.write(f"\n{dataset[col].drop_duplicates().sort_values().values}\n")

    file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
    file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
    for spec in specs:
        file.write(f"{spec}\n")
    file.write("#"*17+"\n# training data #\n"+"#"*17+
          f"\n{train_data.describe()}\n")
    file.write("#"*13+"\n# test data #\n"+"#"*13+
          f"\n{test_data.describe()}\n")
    file.write(f"\n{dataset.dtypes}")
    file.write(
        f"\nin sample results:"
        f"\n     RMSE: {errors['insample_RMSE']}"
        f"\n     MAE: {errors['insample_MAE']}\n"
        f"\nout of sample results:"
        f"\n     RMSE: {errors['outofsample_RMSE']}"
        f"\n     MAE: {errors['outofsample_MAE']}\n"
        )
    file.write("\nfeatures:\n")
    for feature in trainer.feature_set:
        file.write(f"     {feature}\n")
    file.write(f"\ntarget: {trainer.target_name}\n")
    file.write(f"\ncpu: {train_runtime}\n")
    file.write(datetime.fromtimestamp(train_end).strftime('%c'))