In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import modin.pandas as md
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Sat Oct 19 16:46:06 2024



# Loading data

In [2]:
filespath = os.path.join(notebook_dir,'asian_option_generation','historical_asian_options')

files = [file for file in os.listdir(filespath) if file.endswith('.csv')]
dfs = []
for file in files:
    dfs.append(pd.read_csv(os.path.join(filespath,file)))
    
try:
    dataset = pd.concat(dfs,ignore_index=True).iloc[:,1:]
except Exception as e:
    print(e)
    pass

In [3]:
from model_settings import vanilla_pricer
vanillas = vanilla_pricer.vanilla_pricer()
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')
dataset.dtypes

vanilla option pricer initialized


spot_price                 float64
strike_price                 int64
risk_free_rate             float64
dividend_rate              float64
w                           object
averaging_type              object
fixing_frequency             int64
n_fixings                    int64
past_fixings                 int64
kappa                      float64
theta                      float64
rho                        float64
eta                        float64
v0                         float64
calculation_date    datetime64[ns]
days_to_maturity             int64
asian_price                float64
dtype: object

In [4]:
dataset.loc[:,'vanilla'] = vanillas.df_heston_price(dataset)
dataset.loc[:,'difference'] = dataset['vanilla'] -  dataset['asian_price']
dataset.loc[:,'moneyness'] = ms.vmoneyness(dataset['spot_price'],dataset['strike_price'],dataset['w'])
dataset

Unnamed: 0,spot_price,strike_price,risk_free_rate,dividend_rate,w,averaging_type,fixing_frequency,n_fixings,past_fixings,kappa,theta,rho,eta,v0,calculation_date,days_to_maturity,asian_price,vanilla,difference,moneyness
0,558.30000,279,0.04000,0.00000,call,arithmetic,30,1,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,30,0.00000,280.21615,280.21615,1.00108
1,558.30000,279,0.04000,0.00000,call,arithmetic,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,279.69099,284.33338,4.64240,1.00108
2,558.30000,279,0.04000,0.00000,call,arithmetic,30,10,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,300,279.55904,289.88452,10.32547,1.00108
3,558.30000,279,0.04000,0.00000,call,geometric,30,1,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,30,0.00000,280.21615,280.21615,1.00108
4,558.30000,279,0.04000,0.00000,call,geometric,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,278.82847,284.33338,5.50491,1.00108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,582.35000,873,0.04000,0.00000,put,arithmetic,30,5,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,150,281.02293,276.43679,-4.58614,0.49910
2156,582.35000,873,0.04000,0.00000,put,arithmetic,30,10,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,300,271.69968,262.51777,-9.18191,0.49910
2157,582.35000,873,0.04000,0.00000,put,geometric,30,1,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,30,288.80321,287.78458,-1.01863,0.49910
2158,582.35000,873,0.04000,0.00000,put,geometric,30,5,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,150,281.76403,276.43679,-5.32724,0.49910


In [5]:
# maturities = np.sort(np.array(dataset['days_to_maturity'].unique().tolist()))
# strikes = np.sort(np.array(dataset['strike_price'].unique().tolist()))
# spots = np.sort(np.array(dataset['spot_price'].unique().tolist()))
# dates = np.sort(np.array(dataset['calculation_date'].unique().tolist()))
# test = dataset.copy()
# test = test[
#         (test['averaging_type']=='geometric')&
#         (test['n_fixings']==1)&
#         (test['calculation_date']==dates[0])
# ].sort_values(by=['strike_price'])

# test_puts = test[(test['w'] == 'put')][['strike_price','difference']].sort_values(by='strike_price').set_index('strike_price')
# test_calls = test[(test['w'] == 'call')][['strike_price','difference']].sort_values(by='strike_price').set_index('strike_price')
# plt.figure()
# plt.plot(test_puts)

# Preprocessing

In [6]:
import convsklearn
categorical_features = ['averaging_type', 'w']
numerical_features = [
    'spot_price',
    'strike_price',
    'days_to_maturity',
    'risk_free_rate',
    'dividend_rate',
    'kappa',
    'theta',
    'rho',
    'eta',
    'v0',
    'fixing_frequency',
    'n_fixings',
    'past_fixings'
]
target_name = 'observed_price'
trainer = convsklearn.convsklearn(categorical_features = categorical_features, numerical_features = numerical_features, target_name = target_name)
for col in trainer.numerical_features:
    dataset[col] = pd.to_numeric(dataset[col],errors='coerce')
dataset['asian_price'] = pd.to_numeric(dataset['asian_price'],errors='coerce')
dataset['observed_price'] = ms.noisyfier(dataset['asian_price'])

## Train/test split

In [14]:
dataset['n_fixings'].unique()

array([ 1,  5, 10])

In [7]:
test_data = dataset[dataset['n_fixings']==5]
train_data = dataset[dataset['n_fixings']!=5]

In [17]:
test_data

Unnamed: 0,spot_price,strike_price,risk_free_rate,dividend_rate,w,averaging_type,fixing_frequency,n_fixings,past_fixings,kappa,theta,rho,eta,v0,calculation_date,days_to_maturity,asian_price,vanilla,difference,moneyness,observed_price
1,558.30000,279,0.04000,0.00000,call,arithmetic,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,279.69099,284.33338,4.64240,1.00108,279.67686
4,558.30000,279,0.04000,0.00000,call,geometric,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,278.82847,284.33338,5.50491,1.00108,278.75105
7,558.30000,279,0.04000,0.00000,put,arithmetic,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,0.01084,0.48457,0.47373,-0.50027,0.29346
10,558.30000,279,0.04000,0.00000,put,geometric,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,0.03239,0.48457,0.45218,-0.50027,0.00000
13,558.30000,418,0.04000,0.00000,call,arithmetic,30,5,0,2.93594,0.05647,-0.76887,1.91828,0.02676,2024-08-28,150,143.16126,150.08953,6.92827,0.33565,143.04023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2146,582.35000,727,0.04000,0.00000,put,geometric,30,5,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,150,138.15415,133.07094,-5.08322,0.24839,138.29962
2149,582.35000,873,0.04000,0.00000,call,arithmetic,30,5,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,150,0.00050,0.02017,0.01967,-0.33293,0.00000
2152,582.35000,873,0.04000,0.00000,call,geometric,30,5,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,150,0.00000,0.02017,0.02017,-0.33293,0.11855
2155,582.35000,873,0.04000,0.00000,put,arithmetic,30,5,0,4.77644,0.04654,-0.72834,2.49610,0.03890,2024-10-17,150,281.02293,276.43679,-4.58614,0.49910,281.09580


In [18]:
test_train_ratio = int(round(100*test_data.shape[0]/train_data.shape[0],0))
print(f"train/test: {100-test_train_ratio}/{test_train_ratio}")

train/test: 50/50


In [19]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Training

In [20]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (15, 15, 15)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 71.12141704559326


# Testing

In [21]:
test_data.describe()

Unnamed: 0,spot_price,strike_price,risk_free_rate,dividend_rate,fixing_frequency,n_fixings,past_fixings,kappa,theta,rho,eta,v0,calculation_date,days_to_maturity,asian_price,vanilla,difference,moneyness,observed_price
count,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720,720.0,720.0,720.0,720.0,720.0,720.0
mean,566.11056,565.63333,0.04,0.0,30.0,5.0,0.0,4.31247,0.61756,-0.73545,2.19199,0.03081,2024-09-22 18:00:00,150.0,86.31982,88.79626,2.47644,0.08023,86.33363
min,540.36,270.0,0.04,0.0,30.0,5.0,0.0,0.00246,0.03353,-0.98127,0.80065,0.01068,2024-08-28 00:00:00,150.0,0.0,0.0,-5.40589,-0.5016,0.0
25%,558.905,418.75,0.04,0.0,30.0,5.0,0.0,1.80665,0.04973,-0.76503,1.34532,0.02068,2024-09-10 18:00:00,150.0,0.02238,0.4845,0.01514,-0.25091,0.13512
50%,568.435,568.0,0.04,0.0,30.0,5.0,0.0,3.06496,0.052,-0.73239,1.79007,0.03137,2024-09-23 12:00:00,150.0,13.51227,22.38876,1.18842,0.0,13.46922
75%,573.0275,716.0,0.04,0.0,30.0,5.0,0.0,6.89828,0.0691,-0.68618,2.86286,0.03872,2024-10-04 18:00:00,150.0,145.3763,152.64027,6.61824,0.33495,145.31115
max,584.32,876.0,0.04,0.0,30.0,5.0,0.0,12.42279,15.41199,-0.61937,4.66356,0.0504,2024-10-17 00:00:00,150.0,292.61274,297.56505,12.93469,1.00643,292.4336
std,10.80291,200.60797,0.0,0.0,0.0,0.0,0.0,3.44122,2.60226,0.06769,1.08489,0.01073,,0.0,109.64456,108.59602,5.01421,0.42805,109.62898


In [22]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
        )
outofsample_RMSE = errors['outofsample_RMSE']


in sample:
     RSME: 29.866946785070084
     MAE: 16.041160065959048

out of sample:
     RSME: 47.86525658839923
     MAE: 26.060682940776704


# Saving

In [23]:
train_end_tag = str(datetime.fromtimestamp(
    train_end).strftime("%Y_%m_%d %H-%M-%S"))
file_tag = str(train_end_tag + " " + specs[0] + " " + str(int(outofsample_RMSE)) + "RMSE")
os.chdir(os.path.join(notebook_dir,'trained_models'))
files_dir = os.path.join(
    notebook_dir,'trained_models','trained_models',
    file_tag)

if Path(files_dir).exists():
    pass
else:
    os.mkdir(files_dir)

file_dir = os.path.join(files_dir,file_tag)

S = np.sort(train_data['spot_price'].unique())
K = np.sort(train_data['strike_price'].unique())
T = np.sort(train_data['days_to_maturity'].unique())
W = np.sort(train_data['w'].unique())
n_calls = train_data[train_data['w']=='call'].shape[0]
n_puts = train_data[train_data['w']=='put'].shape[0]
insample.to_csv(f"{file_dir} insample.csv")
outsample.to_csv(f"{file_dir} outsample.csv")
joblib.dump(model_fit,str(f"{file_dir}.pkl"))
pd.set_option("display.max_columns",None)

with open(f'{file_dir}.txt', 'w') as file:
    file.write(train_start_tag)
    file.write(f"\nspot(s):\n{S}")
    file.write(f"\n\nstrikes:\n{K}\n")
    file.write(f"\nmaturities:\n{T}\n")
    file.write(f"\ntypes:\n{W}\n")
    try:
        file.write(f"\n{train_data['barrier_type_name'].unique()}")
    except Exception:
        pass
    try:
        for col in ['averaging_type','fixing_frequency','past_fixings','n_fixings']:
            file.write(f"\n{col}:")
            file.write(f"\n{dataset[col].drop_duplicates().sort_values().values}\n")
    except Exception as e:
        print(e)
        pass
    file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
    file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
    for spec in specs:
        file.write(f"{spec}\n")
    file.write("#"*17+"\n# training data #\n"+"#"*17+
          f"\n{train_data.describe()}\n")
    file.write("#"*13+"\n# test data #\n"+"#"*13+
          f"\n{test_data.describe()}\n")
    file.write(f"\n{dataset.dtypes}")
    file.write(
        f"\nin sample results:"
        f"\n     RMSE: {errors['insample_RMSE']}"
        f"\n     MAE: {errors['insample_MAE']}\n"
        f"\nout of sample results:"
        f"\n     RMSE: {errors['outofsample_RMSE']}"
        f"\n     MAE: {errors['outofsample_MAE']}\n"
        )
    file.write("\nfeatures:\n")
    for feature in trainer.feature_set:
        file.write(f"     {feature}\n")
    file.write(f"\ntarget: {trainer.target_name}\n")
    file.write(f"\ncpu: {train_runtime}\n")
    file.write(datetime.fromtimestamp(train_end).strftime('%c'))