In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
# Pin the working directory to the absolute form of the current directory.
os.chdir(os.path.abspath(str(Path())))

# Display options: show every column and render floats with five decimals.
pd.options.display.max_columns = None
pd.set_option("display.float_format", '{:.5f}'.format)

# Add <notebook dir>/historical_data/historical_generation to the import path.
notebook_dir = str(Path().resolve())
sys.path.append(
    os.path.join(notebook_dir, 'historical_data', 'historical_generation')
)

# Record when the run started (epoch seconds, datetime, and a printable tag)
# and announce it with a banner.
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
banner_rule = "#" * 18
print(f"\n{banner_rule}\n# training start #\n{banner_rule}\n\n{train_start_tag}\n")


##################
# training start #
##################

Fri Oct 18 23:28:35 2024



# Loading data

In [2]:
# Read every table stored in asians.h5 and stack them into a single frame.
#
# The store is opened read-only: HDFStore's default mode ('a') creates the
# file when it does not exist, so a wrong path would silently leave an empty
# asians.h5 behind and only fail later inside pd.concat.
with pd.HDFStore('asians.h5', mode='r') as store:
    keys = store.keys()
    dfs = []
    # Iterating through tqdm closes the progress bar even if a read fails.
    for key in tqdm(keys):
        # iloc[:, 1:] drops the first column of each stored table.
        dfs.append(store[key].iloc[:, 1:])
# The with-block has already closed the store; no explicit close is needed.
if not dfs:
    raise ValueError("asians.h5 contains no tables to load")
dataset = pd.concat(dfs, ignore_index=True)
dataset

100%|█████████████████████████████████████████████████████████████████████████████| 1511/1511 [00:08<00:00, 174.38it/s]


Unnamed: 0,asian_price,spot_price,strike_price,risk_free_rate,dividend_rate,w,averaging_type,fixing_frequency,n_fixings,past_fixings,kappa,theta,rho,eta,v0,calculation_date,days_to_maturity
0,0.00000,1416.59000,708,0.04000,0.01812,call,arithmetic,1,1,0,0.23344,0.08296,-1.00000,0.09850,0.01066,2007-01-03 00:00:00.000000,1
1,0.00000,1416.59000,708,0.04000,0.01812,call,arithmetic,1,5,0,0.23344,0.08296,-1.00000,0.09850,0.01066,2007-01-03 00:00:00.000000,5
2,0.00000,1416.59000,708,0.04000,0.01812,call,arithmetic,1,10,0,0.23344,0.08296,-1.00000,0.09850,0.01066,2007-01-03 00:00:00.000000,10
3,0.00000,1416.59000,708,0.04000,0.01812,call,arithmetic,7,1,0,0.23344,0.08296,-1.00000,0.09850,0.01066,2007-01-03 00:00:00.000000,7
4,0.00000,1416.59000,708,0.04000,0.01812,call,arithmetic,7,5,0,0.23344,0.08296,-1.00000,0.09850,0.01066,2007-01-03 00:00:00.000000,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453295,40.84120,100.00000,150,0.04000,0.00000,put,geometric,180,5,0,0.80000,0.00800,0.20000,0.10000,0.00500,2024-10-18 22:23:29.548587,900
453296,32.81657,100.00000,150,0.04000,0.00000,put,geometric,180,10,0,0.80000,0.00800,0.20000,0.10000,0.00500,2024-10-18 22:23:29.548587,1800
453297,46.20488,100.00000,150,0.04000,0.00000,put,geometric,360,1,0,0.80000,0.00800,0.20000,0.10000,0.00500,2024-10-18 22:23:29.548587,360
453298,32.85081,100.00000,150,0.04000,0.00000,put,geometric,360,5,0,0.80000,0.00800,0.20000,0.10000,0.00500,2024-10-18 22:23:29.548587,1800


# Preprocessing

In [13]:
import convsklearn

# Column schema handed to the trainer: two categorical inputs, thirteen
# numerical inputs, and the name of the target column.
categorical_features = ['averaging_type', 'w']
numerical_features = [
    'spot_price', 'strike_price', 'days_to_maturity',
    'risk_free_rate', 'dividend_rate',
    'kappa', 'theta', 'rho', 'eta', 'v0',
    'fixing_frequency', 'n_fixings', 'past_fixings',
]
target_name = 'observed_price'
trainer = convsklearn.convsklearn(
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    target_name=target_name,
)

# Coerce every numerical input to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
for col in trainer.numerical_features:
    dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

# Same coercion for the raw price, which is then passed through ms.noisyfier
# to produce the target column.
dataset['asian_price'] = pd.to_numeric(dataset['asian_price'], errors='coerce')
dataset['observed_price'] = ms.noisyfier(dataset['asian_price'])

## Train/test split

In [14]:
# Chronological split. The cutoff is the date sitting 85% of the way through
# the sorted list of distinct calculation dates.
calculation_dates = dataset['calculation_date']
unique_dates = calculation_dates.sort_values(ascending=True).unique().tolist()
filter_date = unique_dates[int(0.85*len(unique_dates))]

# Rows dated on or before the cutoff are used for training; strictly later
# rows are held out for testing.
train_data = dataset[calculation_dates <= filter_date].copy()
test_data = dataset[calculation_dates > filter_date].copy()


In [15]:
# Ask the trainer for the feature/target arrays of both splits, then build
# its preprocessor.
arrs = trainer.get_train_test_arrays(
    train_data,
    test_data,
    feature_set=trainer.feature_set,
    target_name=trainer.target_name,
)
preprocessor = trainer.preprocess()
train_X = arrs['train_X']
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Report the date range covered by the training rows. The min/max are read
# straight from the column instead of running describe() on the whole frame
# twice; the resulting timestamps are the same.
train_dates = train_data['calculation_date']
train_startdate = train_dates.min().strftime('%A, %Y-%m-%d')
train_enddate = train_dates.max().strftime('%A, %Y-%m-%d')
print(f"\ntraining from\n{train_startdate}\nto\n{train_enddate}\n")


training from
Wednesday, 2007-01-03
to
Tuesday, 2012-02-07



# Training

In [16]:
# Fit the network on the training arrays; run_dnn returns the fitted model
# together with a runtime value and a specs object.
fit_result = trainer.run_dnn(preprocessor, train_X, train_y)
model_fit, runtime, specs = fit_result

# time.time() differences are wall-clock seconds, so despite the "cpu" label
# this is the elapsed time since train_start was recorded.
train_end = time.time()
train_runtime = train_end - train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (15, 15, 15)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 348.5179822444916


# Testing

In [17]:
# Summary statistics of the held-out test rows.
test_data.describe()

Unnamed: 0,asian_price,spot_price,strike_price,risk_free_rate,dividend_rate,fixing_frequency,n_fixings,past_fixings,kappa,theta,rho,eta,v0,calculation_date,days_to_maturity,observed_price
count,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800.0,67800,67800.0,67800.0
mean,0.06452,1381.65779,1381.15221,0.04,0.02237,115.6,5.33333,0.0,1.3254,0.07058,-0.81372,0.19039,0.02036,2012-08-07 23:40:27.475878912,616.53333,0.1247
min,0.0,100.0,50.0,0.04,0.0,1.0,1.0,0.0,0.08326,0.008,-1.0,0.1,0.005,2012-02-08 00:00:00,1.0,0.0
25%,0.0,1357.67,1016.0,0.04,0.02207,7.0,1.0,0.0,0.90903,0.0551,-1.0,0.13563,0.01543,2012-04-30 00:00:00,10.0,0.0
50%,0.0,1393.515,1391.0,0.04,0.02236,30.0,5.0,0.0,1.23626,0.06355,-0.90935,0.15397,0.01869,2012-07-19 12:00:00,150.0,0.00116
75%,0.0,1414.97,1768.0,0.04,0.02285,180.0,10.0,0.0,1.7527,0.07159,-0.67331,0.20861,0.02417,2012-10-09 00:00:00,900.0,0.10354
max,50.05751,1465.27,2197.0,0.04,0.02436,360.0,10.0,0.0,3.2559,0.3115,0.2,0.55532,0.0511,2024-10-18 22:23:29.548587,3600.0,50.21497
std,1.58918,94.99834,498.77081,0.0,0.00162,138.66118,3.68181,0.0,0.62339,0.0353,0.2261,0.08508,0.00774,,994.32451,1.58906


In [18]:
# Score the fitted model. Arguments are passed positionally in the order
# (fitted model, test frame, train frame).
insample, outsample, errors = trainer.test_prediction_accuracy(
    model_fit, test_data, train_data
)


in sample:
     RSME: 0.08742095363630817
     MAE: 0.069230579546607

out of sample:
     RSME: 1.5898388875725606
     MAE: 0.13366468766792575


# Saving