In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
# Re-enter the current working directory via its absolute path.
os.chdir(os.path.abspath(os.curdir))

# Display preferences: never truncate columns, render floats with five decimals.
pd.options.display.max_columns = None
pd.set_option("display.float_format", '{:.5f}'.format)

# Extend the import search path with the historical_generation folder that
# sits under the notebook directory.
notebook_dir = str(Path().resolve())
sys.path.append(
    os.path.join(notebook_dir, 'historical_data', 'historical_generation')
)

# Wall-clock reference point; the reported runtime is measured from here.
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')

banner_rule = "#" * 18
print(f"\n{banner_rule}\n# training start #\n{banner_rule}\n\n{train_start_tag}\n")


##################
# training start #
##################

Sun Oct 27 22:48:01 2024



# loading data

In [2]:
# The dump folder is resolved two directory levels above the working directory.
root = Path().resolve().parent.parent
datadir = os.path.join(root, ms.cboe_spx_intraday_barriers['dump'])

# Full paths of every CSV file found directly inside the dump folder.
files = [
    os.path.join(datadir, name)
    for name in os.listdir(datadir)
    if name.endswith('.csv')
]

# One progress tick per file read, plus a final tick for the concatenation.
dfs = []
bar = tqdm(total=len(files) + 1)
for f in files:
    frame = pd.read_csv(f)
    dfs.append(frame.iloc[:, 1:])  # the first column of each file is discarded
    bar.update(1)

# Stack all files, drop incomplete rows and renumber the index.
dataset = pd.concat(dfs, ignore_index=True)
dataset = dataset.dropna().reset_index(drop=True)
bar.update(1)
bar.close()

100%|█████████████████████████████████████████████████████████████████████████████████| 76/76 [00:00<00:00, 159.43it/s]


In [3]:
from quantlib_pricers import vanilla_pricer

# Instantiating the pricer prints its configuration (see the cell output).
vanillas = vanilla_pricer()

# Parse the timestamp strings using the exact layout stored in the CSV files.
calculation_timestamps = pd.to_datetime(
    dataset['calculation_date'],
    format='%Y-%m-%d %H:%M:%S',
)
dataset['calculation_date'] = calculation_timestamps


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



# model specification

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder

from convsklearn import asian_trainer, barrier_trainer

# Raw price column to learn from and the label used when naming saved output.
price = 'barrier_price'
filetag = 'intraday barrier options'
trainer = barrier_trainer

# Network settings applied on top of whatever the trainer already carries.
for setting, value in (
    ('activation_function', 'relu'),
    ('max_iter', 1000),
    ('solver', 'sgd'),
):
    setattr(trainer, setting, value)

# Disabled alternative preprocessing (kept for reference only):
#   trainer.target_transformer_pipeline -> Pipeline with a QuantileTransformer step
#   trainer.transformers -> QuantileTransformer over the eleven numerical features
#       (spot_price, strike_price, days_to_maturity, risk_free_rate, dividend_rate,
#        kappa, theta, rho, eta, v0, barrier)
#       and OneHotEncoder(sparse_output=False) over ['barrier_type_name', 'w']

# Show the full trainer configuration as the cell output.
vars(trainer)

{'target_name': 'observed_price',
 'numerical_features': ['spot_price',
  'strike_price',
  'days_to_maturity',
  'risk_free_rate',
  'dividend_rate',
  'kappa',
  'theta',
  'rho',
  'eta',
  'v0',
  'barrier'],
 'categorical_features': ['barrier_type_name', 'w'],
 'feature_set': ['spot_price',
  'strike_price',
  'days_to_maturity',
  'risk_free_rate',
  'dividend_rate',
  'kappa',
  'theta',
  'rho',
  'eta',
  'v0',
  'barrier',
  'barrier_type_name',
  'w'],
 'random_state': None,
 'max_iter': 1000,
 'n_layers': None,
 'layer_size': 13,
 'hidden_layer_sizes': (13, 13, 13),
 'solver': 'sgd',
 'alpha': 0.0001,
 'learning_rate': 'adaptive',
 'activation_function': 'relu',
 'rf_n_estimators': 50,
 'rf_min_samples_leaf': 2000,
 'transformers': [('StandardScaler',
   StandardScaler(),
   ['spot_price',
    'strike_price',
    'days_to_maturity',
    'risk_free_rate',
    'dividend_rate',
    'kappa',
    'theta',
    'rho',
    'eta',
    'v0',
    'barrier']),
  ('OneHotEncoder',
   On

# preprocessing

In [5]:
# Coerce the model price to numeric; entries that cannot be parsed become NaN
# and are removed by the dropna() further down.
dataset[price] = pd.to_numeric(dataset[price], errors='coerce')

# Synthetic "observed" price: model price plus Gaussian noise, floored at zero.
# NOTE(review): `scale` is a standard deviation, so 0.15**2 = 0.0225 is used as
# the std here — confirm that a variance of 0.15**2 was not intended.
# NOTE(review): the noise is drawn from numpy's global RNG and no seed is set in
# this cell, so observed prices differ between runs unless seeded elsewhere.
noise = np.random.normal(scale=(0.15)**2, size=dataset.shape[0])
dataset['observed_price'] = np.maximum(dataset[price] + noise, 0)

# Order chronologically, drop incomplete rows and renumber the index.
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'])
dataset = dataset.sort_values(by='calculation_date')
dataset = dataset.dropna().reset_index(drop=True)

# Distinct calculation timestamps, kept as a Series. The previous `.squeeze()`
# collapsed a single-date dataset to a scalar Timestamp, which breaks the
# boolean indexing and `.isin` used by the train/test split.
unique_dates = dataset['calculation_date'].drop_duplicates()
dataset.describe()

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
count,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000.0,162000,162000.0,162000.0
mean,1285.55485,1285.55485,1285.55485,325.0,0.0,0.0,0.04,0.09556,1.87907,-1.0,0.57935,0.01568,2012-01-10 01:48:38.399999744,51.34116,51.34209
min,1266.532,1139.8788,633.266,60.0,0.0,0.0,0.04,0.03188,0.45086,-1.0,0.21502,0.01047,2012-01-03 09:42:00,0.0,0.0
25%,1278.91727,1214.97141,957.12981,90.0,0.0,0.0,0.04,0.07168,1.22943,-1.0,0.5205,0.01157,2012-01-04 11:46:00,0.32034,0.32322
50%,1284.738,1284.738,1285.70339,270.0,0.0,0.0,0.04,0.09812,1.60462,-1.0,0.61645,0.01278,2012-01-10 12:42:00,21.94762,21.94467
75%,1292.6425,1357.27463,1612.34619,540.0,0.0,0.0,0.04,0.12005,2.14469,-1.0,0.64503,0.01843,2012-01-13 10:31:00,86.66665,86.66559
max,1301.44714,1431.59186,1952.17071,720.0,0.0,0.0,0.04,0.17686,7.3413,-1.0,1.13352,0.03074,2012-01-18 11:16:00,341.60284,341.63388
std,8.50613,83.41909,396.41398,241.50644,0.0,0.0,0.0,0.03169,1.14459,0.0,0.13688,0.0055,,63.71246,63.71164


## train/test split

In [6]:
# Display the distinct calculation timestamps that the train/test split uses.
unique_dates

0        2012-01-03 09:42:00
2160     2012-01-03 09:50:00
4320     2012-01-03 10:00:00
6480     2012-01-03 10:05:00
8640     2012-01-03 10:20:00
                 ...        
151200   2012-01-18 09:53:00
153360   2012-01-18 10:38:00
155520   2012-01-18 10:55:00
157680   2012-01-18 11:00:00
159840   2012-01-18 11:16:00
Name: calculation_date, Length: 75, dtype: datetime64[ns]

In [7]:
# Split by calculation date: the first third of `unique_dates` is used for
# development (training) and every remaining date is held out for testing.
# (A previous date-threshold assignment to `development_dates` was removed: it
# was overwritten immediately by the line below and never took effect.)
development_dates = unique_dates[:len(unique_dates)//3]
test_dates = unique_dates[~unique_dates.isin(development_dates)]

# Select the rows belonging to each group of dates.
train_data = dataset[dataset['calculation_date'].isin(development_dates)]
test_data = dataset[dataset['calculation_date'].isin(test_dates)]

In [8]:
# Ask the trainer for the feature/target arrays first, then for its
# preprocessor (same call order as before).
arrs = trainer.get_train_test_arrays(train_data, test_data)
preprocessor = trainer.preprocess()

# Unpack the four arrays by key.
train_X, train_y, test_X, test_y = (
    arrs[key] for key in ('train_X', 'train_y', 'test_X', 'test_y')
)

# Number of training and test targets.
print(len(train_y), len(test_y))

54000 108000


# training

In [9]:
# Fit the network; the trainer hands back the fitted model, its own runtime
# figure and a list of specification strings.
dnn_results = trainer.run_dnn(preprocessor, train_X, train_y)
model_fit, runtime, specs = dnn_results

# Elapsed wall-clock time since `train_start` was recorded in the first cell.
train_end = time.time()
train_runtime = train_end - train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (13, 13, 13)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 24.24937891960144


# testing

In [10]:
# Display the fitted estimator returned by trainer.run_dnn.
model_fit

In [11]:
# Score the fitted model; the trainer returns the in-sample predictions, the
# out-of-sample predictions and a dictionary of error metrics.
prediction_results = trainer.test_prediction_accuracy(model_fit, test_data, train_data)
insample, outsample, errors = prediction_results

# Keep the headline out-of-sample metric under its own name.
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RMSE: 11.354643548621649
     MAE: 7.677111935293059

out of sample:
     RMSE: 11.887898680780271
     MAE: 8.035070689767114


# saving

In [12]:
# Name the run after its end time, the first spec string and the file tag.
train_end_tag = datetime.fromtimestamp(train_end).strftime("%Y_%m_%d %H-%M-%S")
file_tag = " ".join([train_end_tag, specs[0], filetag])

# Move into the notebook's trained_models folder; results go one level deeper,
# into trained_models/trained_models/<file_tag>.
trained_models_dir = os.path.join(notebook_dir, 'trained_models')
os.chdir(trained_models_dir)
files_dir = os.path.join(trained_models_dir, 'trained_models', file_tag)

def save_model():
    """Write the fitted model, its predictions and a text report to ``files_dir``.

    Takes no arguments and returns nothing; everything is read from
    notebook-level names (``files_dir``, ``file_tag``, ``train_data``,
    ``test_data``, ``dataset``, ``insample``, ``outsample``, ``model_fit``,
    ``errors``, ``specs``, ``trainer``, ``train_start_tag``, ``train_runtime``
    and ``train_end``).

    ``files_dir`` is created, together with any missing parent folders, and
    the following files are written inside it:

    * ``<file_tag> insample.csv`` and ``<file_tag> outsample.csv``
    * ``<file_tag>.pkl`` - the fitted model, serialised with ``joblib.dump``
    * ``<file_tag>.txt`` - a plain-text summary of the data, specs and errors

    Optional columns (``barrier_type_name`` and the averaging/fixing columns)
    are reported only when present, so no empty section header is left behind
    for a column the data does not have.
    """
    # makedirs also creates the intermediate folder and is a no-op when the
    # target already exists (os.mkdir failed if the parent was missing).
    os.makedirs(files_dir, exist_ok=True)
    file_dir = os.path.join(files_dir,file_tag)

    # Sorted distinct values of the main contract dimensions in the training set.
    S = np.sort(train_data['spot_price'].unique())
    K = np.sort(train_data['strike_price'].unique())
    T = np.sort(train_data['days_to_maturity'].unique())
    W = np.sort(train_data['w'].unique())
    n_calls = train_data[train_data['w']=='call'].shape[0]
    n_puts = train_data[train_data['w']=='put'].shape[0]

    insample.to_csv(f"{file_dir} insample.csv")
    outsample.to_csv(f"{file_dir} outsample.csv")
    joblib.dump(model_fit,f"{file_dir}.pkl")

    # describe() is embedded as text below; make sure no column is elided.
    pd.set_option("display.max_columns",None)

    with open(f'{file_dir}.txt', 'w') as file:
        file.write(train_start_tag)
        file.write(f"\nspot(s):\n{S}")
        file.write(f"\n\nstrikes:\n{K}\n")
        file.write(f"\nmaturities:\n{T}\n")
        file.write(f"\ntypes:\n{W}\n")

        # Optional column: only written when the training data carries it.
        if 'barrier_type_name' in train_data.columns:
            file.write(f"\n{train_data['barrier_type_name'].unique()}")

        # Optional columns: each header is written only together with its
        # values, so an absent column leaves no trace in the report.
        for col in ['averaging_type','fixing_frequency','past_fixings','n_fixings']:
            if col in dataset.columns:
                file.write(f"\n{col}:")
                file.write(f"\n{dataset[col].drop_duplicates().sort_values().values}\n")

        file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
        file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
        for spec in specs:
            file.write(f"{spec}\n")
        file.write("#"*17+"\n# training data #\n"+"#"*17+
              f"\n{train_data.describe()}\n")
        file.write("#"*13+"\n# test data #\n"+"#"*13+
              f"\n{test_data.describe()}\n")
        file.write(f"\n{dataset.dtypes}")
        file.write(
            f"\nin sample results:"
            f"\n     RMSE: {errors['insample_RMSE']}"
            f"\n     MAE: {errors['insample_MAE']}\n"
            f"\nout of sample results:"
            f"\n     RMSE: {errors['outofsample_RMSE']}"
            f"\n     MAE: {errors['outofsample_MAE']}\n"
            )
        file.write("\nfeatures:\n")
        for feature in trainer.feature_set:
            file.write(f"     {feature}\n")
        file.write(f"\ntarget: {trainer.target_name}\n")
        file.write(f"\ncpu: {train_runtime}\n")
        file.write(datetime.fromtimestamp(train_end).strftime('%c'))

    # Reported only after the report file has been closed.
    print(f"model saved to {file_dir}")

# Saving is opt-in: this cell only defines save_model() and prints a reminder;
# nothing is written until the function is called explicitly.
print(f"execute the command 'save_model()' to save the following model: {file_tag}")

execute the command 'save_model()' to save the following model: 2024_10_27 22-48-25 Deep Neural Network intraday barrier options
