In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
# Re-enter the current directory via its absolute path, then remember the
# fully resolved location of the notebook for building paths later on.
os.chdir(os.path.abspath(os.curdir))
notebook_dir = os.fspath(Path().resolve())

# pandas display: never truncate columns, print floats with five decimals.
pd.options.display.max_columns = None
pd.set_option("display.float_format", '{:.5f}'.format)

# Make the historical-data generation modules importable from this notebook.
historical_generation_dir = os.path.join(
    notebook_dir, 'historical_data', 'historical_generation')
sys.path.append(historical_generation_dir)

# Start of the run: the clock reading used later to compute the total runtime,
# plus a locale-formatted tag that is printed here and written to the summary.
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = f"{train_start_datetime:%c}"

banner_rule = "#" * 18
print("\n".join(
    ["", banner_rule, "# training start #", banner_rule, "", train_start_tag, ""]
))


##################
# training start #
##################

Tue Oct 29 00:33:32 2024



# loading data

In [2]:
# The data dump lives under the directory two levels above the current
# working directory; its relative location comes from the model settings.
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.bloomberg_spx_barriers['dump'])

# os.listdir returns entries in arbitrary order, so sort the paths to make the
# concatenation order (and therefore the row order) reproducible across runs.
files = sorted(
    os.path.join(datadir,f) for f in os.listdir(datadir) if f.endswith('.csv')
)
if not files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"no .csv files found in {datadir}")

dfs = []
# One tick per file plus a final tick for the concatenation step; the context
# manager closes the bar even if a read fails part-way through.
with tqdm(total=len(files)+1) as bar:
    for f in files:
        # The first column of every file is discarded
        # (presumably a saved index column -- TODO confirm).
        dfs.append(pd.read_csv(f).iloc[:,1:])
        bar.update(1)
    # Stack all files, drop incomplete rows and renumber the index from zero.
    dataset = pd.concat(dfs,ignore_index=True).dropna().reset_index(drop=True)
    bar.update(1)

100%|█████████████████████████████████████████████████████████████████████████████| 1511/1511 [00:11<00:00, 133.14it/s]


In [3]:
# Project-local pricer module (presumably found via the directory appended to
# sys.path in the first cell -- TODO confirm).
from quantlib_pricers import vanilla_pricer
# Per the recorded output, constructing the pricer prints its day counter and seed.
# NOTE(review): `vanillas` is not referenced again in the visible cells.
vanillas = vanilla_pricer()
# Parse the YYYY-MM-DD date strings into pandas datetimes.
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



# model specification

In [4]:
# Project-local trainer object used for every fitting / evaluation step below.
from convsklearn import barrier_trainer
# Name of the dataset column holding the option price; the preprocessing cell
# coerces it to numeric and derives `observed_price` from it.
price = 'barrier_price' 
# Free-text label appended to the saved model's file tag.
filetag = 'intraday barrier options'
# Alias so the remaining cells can stay agnostic of the concrete trainer.
trainer = barrier_trainer

# preprocessing

In [5]:
# Coerce the price column to numeric; unparseable entries become NaN and are
# removed by the dropna() further down.
dataset[price] = pd.to_numeric(dataset[price],errors='coerce')

# Draw the synthetic pricing noise from a dedicated, seeded generator so that
# `observed_price` (and everything trained on it) is reproducible run to run.
NOISE_SEED = 123  # arbitrary fixed value
noise_rng = np.random.default_rng(NOISE_SEED)
# NOTE(review): `scale` is a standard deviation, so the noise has std
# 0.15**2 = 0.0225 -- confirm this is intended rather than std 0.15.
price_noise = noise_rng.normal(scale=(0.15)**2,size=dataset.shape[0])
# Prices cannot be negative, so noisy values are floored at zero.
dataset['observed_price'] = np.maximum(dataset[price] + price_noise,0)

dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'])
# A stable sort keeps the incoming row order within each date, so the final
# row order does not depend on the sorting algorithm's tie-breaking.
dataset = dataset.sort_values(by='calculation_date',kind='stable')
dataset = dataset.dropna().reset_index(drop=True)

# Distinct calculation dates in ascending order. This is kept as a Series:
# squeeze() would collapse a single-date dataset to a scalar and break the
# slicing / .isin() calls in the train/test split.
unique_dates = dataset['calculation_date'].drop_duplicates()
dataset.describe()

Unnamed: 0,spot_price,strike_price,barrier,days_to_maturity,rebate,dividend_rate,risk_free_rate,theta,kappa,rho,eta,v0,calculation_date,barrier_price,observed_price
count,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680.0,3226680,3226680.0,3226680.0
mean,1241.58404,1241.58404,1241.58404,323.34765,0.0,0.02194,0.04,0.06826,3.01194,-0.89051,0.2143,0.05174,2010-01-04 11:51:24.279818496,48.9665,48.96721
min,676.03,608.427,338.015,60.0,0.0,0.0178,0.04,0.01992,0.0397,-1.0,0.08818,0.00524,2007-01-03 00:00:00,0.0,0.0
25%,1108.72,1102.88675,889.14787,90.0,0.0,0.01963,0.04,0.05171,1.54387,-1.0,0.15604,0.01891,2008-06-26 00:00:00,1.10267,1.10289
50%,1284.21,1264.23375,1221.79568,180.0,0.0,0.02142,0.04,0.06148,2.30859,-0.99998,0.18637,0.03175,2010-01-12 00:00:00,25.38862,25.38911
75%,1397.95,1397.95,1545.735,540.0,0.0,0.02264,0.04,0.07378,4.07581,-0.82945,0.22585,0.05573,2011-07-07 00:00:00,85.79311,85.79368
max,1565.18,1721.698,2347.77,720.0,0.0,0.03773,0.04,0.40505,15.99144,-0.18176,1.52882,0.69402,2012-12-31 00:00:00,293.09806,293.1273
std,195.40461,211.57766,433.95284,241.31868,0.0,0.00347,0.0,0.03476,2.16348,0.17935,0.1301,0.06115,,55.18902,55.18841


## train/test split

In [6]:
# Display the distinct calculation dates that the split below is based on.
unique_dates

0         2007-01-03
2160      2007-01-04
4320      2007-01-05
6480      2007-01-08
8640      2007-01-09
             ...    
3215880   2012-12-24
3218040   2012-12-26
3220200   2012-12-27
3222360   2012-12-28
3224520   2012-12-31
Name: calculation_date, Length: 1510, dtype: datetime64[ns]

In [7]:
# Chronological split: the earliest third of the calculation dates is used for
# development (training) and every later date for testing. `unique_dates` is
# ascending because the dataset was sorted by calculation_date beforehand.
n_development_dates = len(unique_dates)//3
development_dates = unique_dates[:n_development_dates]
test_dates = unique_dates[~unique_dates.isin(development_dates)]

train_data = dataset[dataset['calculation_date'].isin(development_dates)]
test_data = dataset[dataset['calculation_date'].isin(test_dates)]

# Every row must land in exactly one of the two partitions.
assert len(train_data) + len(test_data) == len(dataset), \
    "train/test split does not partition the dataset"

In [8]:
# Ask the trainer for the feature/target arrays of both partitions, then for
# the preprocessor that is handed to the fitting step.
arrs = trainer.get_train_test_arrays(train_data, test_data)
preprocessor = trainer.preprocess()

# Unpack the four arrays from the returned mapping in one go.
train_X, train_y, test_X, test_y = (
    arrs[key] for key in ('train_X', 'train_y', 'test_X', 'test_y')
)

# Sample counts: training first, test second.
print(len(train_y), len(test_y))

1052640 2174040


# training

In [9]:
# Fit the network on the training arrays (per the recorded output, the trainer
# prints the hyperparameters it uses and its own timing).
model_fit = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
# NOTE: train_start was taken with time.time() in the first cell, so this is
# wall-clock time from notebook start (data loading and preprocessing
# included), not CPU time of the fit alone, despite the "cpu" label below.
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

alpha: 0.01
hidden_layer_sizes: (13, 13)
learning_rate: adaptive
learning_rate_init: 0.1
solver: sgd
early_stopping: False
max_iter: 500
warm_start: True
tol: 0.0001
cpu: 114.35860466957092

cpu: 129.1106824874878




# testing

In [10]:
# Display the fitted model returned by trainer.run_dnn.
model_fit

In [11]:
# Score the fitted model on both partitions. `insample` / `outsample` are
# later written with .to_csv(), and `errors` is read with the keys
# insample_RMSE, insample_MAE, outofsample_RMSE and outofsample_MAE.
# NOTE(review): the test partition is passed before the training partition --
# confirm this matches the parameter order of test_prediction_accuracy.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# these names were not produced in that session.
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
)
outofsample_RMSE = errors['outofsample_RMSE']

KeyboardInterrupt: 

# saving

In [None]:
# Filesystem-friendly timestamp of the end of training; prefixes every output.
train_end_tag = datetime.fromtimestamp(train_end).strftime("%Y_%m_%d %H-%M-%S")

# `specs` is not assigned by any cell of this notebook, so a fresh
# "Restart & Run All" used to stop here with a NameError. Use it when an
# earlier step defined it and fall back to an empty list otherwise.
specs = globals().get('specs', [])

# Tag layout: "<timestamp> [<first spec>] <filetag>".
tag_parts = [train_end_tag, filetag]
if len(specs) > 0:
    tag_parts.insert(1, specs[0])
file_tag = " ".join(tag_parts)

# NOTE(review): this changes the working directory for every later cell and
# requires <notebook_dir>/trained_models to exist already; `files_dir` below is
# built from notebook_dir explicitly, so it does not rely on the chdir.
os.chdir(os.path.join(notebook_dir,'trained_models'))
files_dir = os.path.join(
    notebook_dir,'trained_models','trained_models',
    file_tag)

def save_model():
    """Persist the fitted model, its predictions and a plain-text run summary.

    Takes no arguments; it reads the notebook-level names ``files_dir``,
    ``file_tag``, ``train_data``, ``test_data``, ``dataset``, ``insample``,
    ``outsample``, ``errors``, ``model_fit``, ``trainer``, ``train_start_tag``,
    ``train_runtime``, ``train_end`` and, when defined, ``specs``.

    Files written into ``files_dir`` (created together with any missing
    parent directories), each named after ``file_tag``:
        * ``<file_tag> insample.csv`` and ``<file_tag> outsample.csv``
        * ``<file_tag>.pkl`` -- joblib dump of ``model_fit``
        * ``<file_tag>.txt`` -- summary of the data, errors, features and runtime

    Returns:
        None
    """
    # mkdir would fail when the parent directory does not exist yet; makedirs
    # creates the whole chain and is a no-op for an existing directory.
    os.makedirs(files_dir, exist_ok=True)
    file_dir = os.path.join(files_dir,file_tag)

    # Distinct contract characteristics present in the training data.
    spots = np.sort(train_data['spot_price'].unique())
    strikes = np.sort(train_data['strike_price'].unique())
    maturities = np.sort(train_data['days_to_maturity'].unique())
    option_types = np.sort(train_data['w'].unique())
    n_calls = train_data[train_data['w']=='call'].shape[0]
    n_puts = train_data[train_data['w']=='put'].shape[0]

    # `specs` is not assigned by any notebook cell; treat it as optional so a
    # missing definition does not abort the save half-way through.
    run_specs = globals().get('specs', [])

    insample.to_csv(f"{file_dir} insample.csv")
    outsample.to_csv(f"{file_dir} outsample.csv")
    joblib.dump(model_fit,f"{file_dir}.pkl")

    # describe() is embedded via its text repr, which would otherwise be
    # truncated to the default number of columns.
    pd.set_option("display.max_columns",None)

    with open(f'{file_dir}.txt', 'w') as file:
        file.write(train_start_tag)
        file.write(f"\nspot(s):\n{spots}")
        file.write(f"\n\nstrikes:\n{strikes}\n")
        file.write(f"\nmaturities:\n{maturities}\n")
        file.write(f"\ntypes:\n{option_types}\n")

        # Optional columns are reported only when present. Checking up front
        # (instead of catching every exception) keeps real errors visible and
        # never leaves a dangling "<col>:" header behind.
        if 'barrier_type_name' in train_data.columns:
            file.write(f"\n{train_data['barrier_type_name'].unique()}")
        for col in ['averaging_type','fixing_frequency','past_fixings','n_fixings']:
            if col not in dataset.columns:
                continue
            distinct = dataset[col].drop_duplicates()
            try:
                values = distinct.sort_values().values
            except TypeError:
                # Mixed, unorderable values: report them unsorted rather than
                # dropping the section.
                values = distinct.values
            file.write(f"\n{col}:")
            file.write(f"\n{values}\n")

        file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
        file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
        for spec in run_specs:
            file.write(f"{spec}\n")
        file.write("#"*17+"\n# training data #\n"+"#"*17+
              f"\n{train_data.describe()}\n")
        file.write("#"*13+"\n# test data #\n"+"#"*13+
              f"\n{test_data.describe()}\n")
        file.write(f"\n{dataset.dtypes}")
        file.write(
            f"\nin sample results:"
            f"\n     RMSE: {errors['insample_RMSE']}"
            f"\n     MAE: {errors['insample_MAE']}\n"
            f"\nout of sample results:"
            f"\n     RMSE: {errors['outofsample_RMSE']}"
            f"\n     MAE: {errors['outofsample_MAE']}\n"
            )
        file.write("\nfeatures:\n")
        for feature in trainer.feature_set:
            file.write(f"     {feature}\n")
        file.write(f"\ntarget: {trainer.target_name}\n")
        file.write(f"\ncpu: {train_runtime}\n")
        file.write(datetime.fromtimestamp(train_end).strftime('%c'))

    # Report success only after the summary file has been flushed and closed.
    print(f"model saved to {file_dir}")

# Saving is deliberately manual: this only tells the user how to trigger it
# and which file tag the outputs would be written under.
print(f"execute the command 'save_model()' to save the following model: {file_tag}")