In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
# Re-enter the current directory through its absolute path.
os.chdir(os.path.abspath(str(Path())))

# Display settings: never truncate columns, show floats with five decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", '{:.5f}'.format)

# Make the historical-generation helpers importable from this notebook.
notebook_dir = str(Path().resolve())
sys.path.append(
    os.path.join(notebook_dir, 'historical_data', 'historical_generation')
)

# Timestamp the start of the run; later cells measure elapsed time from here.
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print(f"\n{'#'*18}\n# training start #\n{'#'*18}\n\n{train_start_tag}\n")


##################
# training start #
##################

Fri Oct 25 12:22:20 2024



# Loading data

In [2]:
# Locate the dump directory two levels above the notebook's working directory.
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.bloomberg_spx_short_term_asian_option_dump)

# os.listdir returns entries in arbitrary order; sort the paths so the row
# order of the concatenated dataset is the same on every run and machine.
files = sorted(
    os.path.join(datadir, f) for f in os.listdir(datadir) if f.endswith('.csv')
)
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"no .csv files found in {datadir}")

# Read every file, dropping its first column; wrapping the iterable in tqdm
# lets the progress bar clean up after itself even if a read raises.
dfs = [pd.read_csv(f).iloc[:,1:] for f in tqdm(files)]
dataset = pd.concat(dfs,ignore_index=True).dropna().reset_index(drop=True)

100%|████████████████████████████████████████████████████████████████████████████| 1052/1052 [00:00<00:00, 1141.23it/s]


In [3]:
from model_settings import vanilla_pricer

# Instantiating the pricer prints its configuration (day counter and seed).
vanillas = vanilla_pricer()

# Parse the 'YYYY-MM-DD' strings into pandas datetimes.
calculation_dates = pd.to_datetime(dataset['calculation_date'], format='%Y-%m-%d')
dataset['calculation_date'] = calculation_dates


initializing vanilla pricer
Actual/365 (Fixed) day counter
seed: 123



# Preprocessing

In [4]:
from convsklearn import asian_trainer

# Name of the column holding the model price that the target is built from.
price = 'asian'

# Configure the shared trainer object used by every later cell.
trainer = asian_trainer
trainer.activation_function = 'tanh'
trainer.solver = 'lbfgs'

# Coerce prices to numbers; anything unparseable becomes NaN.
dataset[price] = pd.to_numeric(dataset[price],errors='coerce')

# Draw the observation noise from a dedicated, seeded generator so that the
# training target (and therefore the fitted model) is reproducible under
# Restart & Run All instead of depending on the global RNG state.
NOISE_SEED = 123
noise_rng = np.random.default_rng(NOISE_SEED)
# NOTE(review): numpy's `scale` is a standard deviation, so the noise std used
# here is 0.15**2 = 0.0225 -- confirm that a std of 0.15 was not intended.
noise = noise_rng.normal(scale=(0.15)**2, size=dataset.shape[0])

# Observed price = model price plus noise, floored at zero.
dataset['observed_price'] = np.maximum(dataset[price] + noise, 0)

In [5]:
# dropna() returns a new frame; assign it back so rows whose price failed the
# numeric coercion are actually removed rather than only hidden in the display.
dataset = dataset.dropna().reset_index(drop=True)
dataset

Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,averaging_type,w,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian,observed_price
0,1416.59000,708.00000,7,1.00000,7,0,geometric,call,0.04000,0.01812,2007-01-03,0.23344,0.08296,-1.00000,0.09850,0.01066,708.32950,708.33202
1,1416.59000,708.00000,7,1.00000,7,0,geometric,put,0.04000,0.01812,2007-01-03,0.23344,0.08296,-1.00000,0.09850,0.01066,0.00000,0.00819
2,1416.59000,708.00000,7,1.00000,7,0,arithmetic,call,0.04000,0.01812,2007-01-03,0.23344,0.08296,-1.00000,0.09850,0.01066,708.35337,708.36260
3,1416.59000,708.00000,7,1.00000,7,0,arithmetic,put,0.04000,0.01812,2007-01-03,0.23344,0.08296,-1.00000,0.09850,0.01066,0.00000,0.00000
4,1416.59000,1062.00000,7,1.00000,7,0,geometric,call,0.04000,0.01812,2007-01-03,0.23344,0.08296,-1.00000,0.09850,0.01066,354.60096,354.61931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189355,1310.02000,1637.50000,84,1.00000,84,0,arithmetic,put,0.04000,0.01978,2011-03-07,1.57555,0.05176,-1.00000,0.14746,0.02908,321.42690,321.42079
189356,1310.02000,1965.00000,84,1.00000,84,0,geometric,call,0.04000,0.01978,2011-03-07,1.57555,0.05176,-1.00000,0.14746,0.02908,0.00000,0.00000
189357,1310.02000,1965.00000,84,1.00000,84,0,geometric,put,0.04000,0.01978,2011-03-07,1.57555,0.05176,-1.00000,0.14746,0.02908,646.73684,646.72817
189358,1310.02000,1965.00000,84,1.00000,84,0,arithmetic,call,0.04000,0.01978,2011-03-07,1.57555,0.05176,-1.00000,0.14746,0.02908,0.00000,0.02969


## Train/test split

In [6]:
# Chronologically ordered list of the distinct calculation dates.
sorted_dates = dataset['calculation_date'].sort_values(ascending=True)
unique_dates = sorted_dates.unique().tolist()
pd.Series(unique_dates)

0      2007-01-03
1      2007-01-04
2      2007-01-05
3      2007-01-08
4      2007-01-09
          ...    
1047   2011-03-01
1048   2011-03-02
1049   2011-03-03
1050   2011-03-04
1051   2011-03-07
Length: 1052, dtype: datetime64[ns]

In [7]:
# The first 400 calculation dates form the development (training) window;
# rows on any other date are held out for testing.
development_dates = unique_dates[:400]
in_development = dataset['calculation_date'].isin(development_dates)
train_data = dataset[in_development]
test_data = dataset[~in_development]

In [8]:
# Ask the trainer for the feature/target arrays of both splits, then build
# the preprocessor that is handed to the training call.
arrs = trainer.get_train_test_arrays(train_data, test_data)
preprocessor = trainer.preprocess()
train_X, train_y, test_X, test_y = (
    arrs[key] for key in ('train_X', 'train_y', 'test_X', 'test_y')
)

# Training

In [9]:
# List the model inputs and the regression target, with underscores replaced
# by spaces for readability.
feature_lines = [f"    {name.replace('_',' ')}" for name in trainer.feature_set]
print("features:", *feature_lines, sep="\n")
print("target:", f"    {trainer.target_name.replace('_',' ')}", sep="\n")

features:
    spot price
    strike price
    days to maturity
    risk free rate
    dividend rate
    kappa
    theta
    rho
    eta
    v0
    fixing frequency
    n fixings
    past fixings
    averaging type
    w
target:
    observed price


In [10]:
# Fit the network on the training arrays.
model_fit, runtime, specs = trainer.run_dnn(preprocessor, train_X, train_y)

# Elapsed wall-clock seconds since `train_start`, which was captured in the
# first cell, so the figure also covers data loading and preprocessing.
train_end = time.time()
train_runtime = train_end - train_start
print("\ncpu: " + str(train_runtime))


training...

Deep Neural Network
hidden layers sizes: (15, 15, 15)
learning rate: adaptive
activation: tanh
solver: lbfgs
alpha: 0.0001

cpu: 43.33262300491333


# Testing

In [11]:
# Summary statistics of the held-out rows, shown before scoring the model.
test_data.describe()

Unnamed: 0,spot_price,strike_price,days_to_maturity,n_fixings,fixing_frequency,past_fixings,risk_free_rate,dividend_rate,calculation_date,kappa,theta,rho,eta,v0,asian,observed_price
count,117360.0,117360.0,117360.0,117360.0,117360.0,117360.0,117360.0,117360.0,117360,117360.0,117360.0,117360.0,117360.0,117360.0,117360.0,117360.0
mean,1061.05305,1060.54525,48.22222,2.77778,31.11111,0.0,0.04,0.02344,2009-11-19 06:39:45.276073728,4.18487,0.08245,-0.90925,0.22056,0.08194,162.54698,162.55025
min,676.03,338.0,7.0,1.0,7.0,0.0,0.04,0.01791,2008-08-05 00:00:00,0.09849,0.03784,-1.0,0.11096,0.00992,0.0,0.0
25%,927.0375,691.375,28.0,1.0,7.0,0.0,0.04,0.02009,2009-03-29 06:00:00,2.62004,0.0608,-1.0,0.17313,0.02577,0.0,0.01081
50%,1090.525,1055.875,28.0,1.0,28.0,0.0,0.04,0.02152,2009-11-17 12:00:00,3.8995,0.07043,-0.99907,0.1933,0.04051,19.52019,19.50916
75%,1170.115,1366.5,84.0,3.0,28.0,0.0,0.04,0.02519,2010-07-14 06:00:00,5.36111,0.08808,-0.85304,0.23216,0.09539,291.05885,291.06185
max,1342.85,2014.0,84.0,12.0,84.0,0.0,0.04,0.03773,2011-03-07 00:00:00,13.25023,0.3561,-0.24433,1.52388,0.69402,671.60633,671.621
std,150.56778,407.71854,32.90667,3.42469,29.71897,0.0,0.0,0.00473,,2.08836,0.03572,0.13682,0.10813,0.1018,211.51565,211.51314


In [12]:
# Score the fitted model on both splits; the trainer prints RMSE and MAE for
# each and hands back the two prediction frames plus the error metrics.
insample, outsample, errors = trainer.test_prediction_accuracy(
    model_fit, test_data, train_data)

# Keep the headline out-of-sample metric in its own variable.
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RMSE: 2.473474162106138
     MAE: 1.6090949636569643

out of sample:
     RMSE: 33.831156387805684
     MAE: 19.29436156457262


# Saving

In [16]:
# Tag the artefacts with the training end time, the first model spec line and
# the price column name.
train_end_tag = datetime.fromtimestamp(train_end).strftime("%Y_%m_%d %H-%M-%S")
file_tag = " ".join([train_end_tag, specs[0], price])

# Every run gets its own folder under trained_models/trained_models.
files_dir = os.path.join(
    notebook_dir, 'trained_models', 'trained_models', file_tag
)
os.chdir(os.path.join(notebook_dir, 'trained_models'))

def save_model():
    """Persist the fitted model, its predictions and a text summary of the run.

    Takes no arguments; it reads notebook-level state (``files_dir``,
    ``file_tag``, ``train_data``, ``test_data``, ``dataset``, ``insample``,
    ``outsample``, ``model_fit``, ``errors``, ``specs``, ``trainer``,
    ``train_start_tag``, ``train_end`` and ``train_runtime``).

    Files written inside ``files_dir``, each prefixed with ``file_tag``:

    * ``<tag> insample.csv`` and ``<tag> outsample.csv`` -- prediction frames
    * ``<tag>.pkl`` -- the fitted model, serialised with joblib
    * ``<tag>.txt`` -- plain-text summary of the data, specs and errors

    Returns None. Prints the common path prefix of the saved files.
    """
    # makedirs also creates a missing parent folder and tolerates an existing
    # target, which os.mkdir behind an exists() check did not.
    os.makedirs(files_dir, exist_ok=True)
    file_dir = os.path.join(files_dir,file_tag)

    # Distinct values of the main contract dimensions, for the summary file.
    S = np.sort(train_data['spot_price'].unique())
    K = np.sort(train_data['strike_price'].unique())
    T = np.sort(train_data['days_to_maturity'].unique())
    W = np.sort(train_data['w'].unique())
    n_calls = int((train_data['w']=='call').sum())
    n_puts = int((train_data['w']=='put').sum())

    insample.to_csv(f"{file_dir} insample.csv")
    outsample.to_csv(f"{file_dir} outsample.csv")
    joblib.dump(model_fit,str(f"{file_dir}.pkl"))

    # The describe() tables below are written via their string repr, so make
    # sure no columns are elided from it.
    pd.set_option("display.max_columns",None)

    with open(f'{file_dir}.txt', 'w') as file:
        file.write(train_start_tag)
        file.write(f"\nspot(s):\n{S}")
        file.write(f"\n\nstrikes:\n{K}\n")
        file.write(f"\nmaturities:\n{T}\n")
        file.write(f"\ntypes:\n{W}\n")

        # Optional column: only some datasets carry it. An explicit check
        # replaces a blanket try/except that also hid unrelated failures.
        if 'barrier_type_name' in train_data.columns:
            file.write(f"\n{train_data['barrier_type_name'].unique()}")

        # Handle each column on its own so one missing or unsortable column
        # neither leaves a dangling header nor drops the columns after it.
        # NOTE(review): these values are read from `dataset`, not `train_data`.
        for col in ['averaging_type','fixing_frequency','past_fixings','n_fixings']:
            try:
                values = dataset[col].drop_duplicates().sort_values().values
            except (KeyError, TypeError) as e:
                print(f"skipping {col} in summary: {e}")
                continue
            file.write(f"\n{col}:")
            file.write(f"\n{values}\n")

        file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
        file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
        for spec in specs:
            file.write(f"{spec}\n")
        file.write("#"*17+"\n# training data #\n"+"#"*17+
              f"\n{train_data.describe()}\n")
        file.write("#"*13+"\n# test data #\n"+"#"*13+
              f"\n{test_data.describe()}\n")
        file.write(f"\n{dataset.dtypes}")
        file.write(
            f"\nin sample results:"
            f"\n     RMSE: {errors['insample_RMSE']}"
            f"\n     MAE: {errors['insample_MAE']}\n"
            f"\nout of sample results:"
            f"\n     RMSE: {errors['outofsample_RMSE']}"
            f"\n     MAE: {errors['outofsample_MAE']}\n"
            )
        file.write("\nfeatures:\n")
        for feature in trainer.feature_set:
            file.write(f"     {feature}\n")
        file.write(f"\ntarget: {trainer.target_name}\n")
        file.write(f"\ncpu: {train_runtime}\n")
        file.write(datetime.fromtimestamp(train_end).strftime('%c'))

    # Report only after the summary file has been flushed and closed.
    print(f"model saved to {file_dir}")

In [17]:
# Write the model, the prediction frames and the text summary to disk.
save_model()

model saved to E:\git\machine-learning-option-pricing\trained_models\trained_models\2024_10_25 12-23-03 Deep Neural Network asian\2024_10_25 12-23-03 Deep Neural Network asian
