In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Thu Oct 24 00:09:58 2024



# Loading data

In [3]:
ms.__dict__

{'day_count': <QuantLib.QuantLib.Actual365Fixed; proxy of <Swig Object of type 'QuantLib::Actual365Fixed *' at 0x10c2c9aa0> >,
 'calendar': <QuantLib.QuantLib.UnitedStates; proxy of <Swig Object of type 'QuantLib::UnitedStates *' at 0x11a74fc90> >,
 'compounding': 2,
 'frequency': 1,
 'settings_names_dictionary': {2: 'continuous', 1: 'annual'},
 'settings_string': '\npricing settings:\nActual/365 (Fixed) day counter\nNew York stock exchange calendar\ncompounding: continuous\nfrequency: annual\n',
 'av_key': '9ZDGLN9SFCLWFN32',
 'n_MCpaths': 100000,
 'rng': 'pseudorandom',
 'bloomberg_spx_calibrated': 'OneDrive - rsbrc/DATA/calibrated/bloomberg/SPX',
 'calibrations_dir': 'OneDrive - rsbrc/DATA/calibrated',
 'bloomberg_spx_barrier_dump': 'OneDrive - rsbrc/DATA/generated/bloomberg/SPX/barrier_options',
 'cboe_spx_barrier_dump': 'OneDrive - rsbrc/DATA/generated/cboe/SPX/barrier_options',
 'bloomberg_spx_asian_option_dump': 'OneDrive - rsbrc/DATA/generated/bloomberg/SPX/asian_options',
 'cb

In [2]:
root = Path().resolve().parent.parent
datadir = os.path.join(root,ms.cboe_spx_asian_option_dump)
files = [f for f in os.listdir(datadir) if f.endswith('.csv')]
files = [os.path.join(datadir,f) for f in files]
dfs = [pd.read_csv(f).iloc[:,1:] for f in files]
dataset = pd.concat(dfs,ignore_index=True) 
dataset

KeyboardInterrupt: 

In [None]:
from model_settings import vanilla_pricer
vanillas = vanilla_pricer()
dataset['calculation_date'] = pd.to_datetime(dataset['calculation_date'],format='%Y-%m-%d')
dataset.dtypes

In [None]:
dataset.loc[:,'vanilla'] = vanillas.df_heston_price(dataset)
dataset.loc[:,'difference'] = dataset['vanilla'] -  dataset['asian_price']
dataset.loc[:,'moneyness'] = ms.vmoneyness(dataset['spot_price'],dataset['strike_price'],dataset['w'])
dataset

# Preprocessing

In [None]:
import convsklearn
categorical_features = ['averaging_type', 'w']
numerical_features = [
    'spot_price',
    'strike_price',
    'days_to_maturity',
    'risk_free_rate',
    'dividend_rate',
    'kappa',
    'theta',
    'rho',
    'eta',
    'v0',
    'fixing_frequency',
    'n_fixings',
    'past_fixings'
]
target_name = 'observed_price'
trainer = convsklearn.convsklearn(categorical_features = categorical_features, numerical_features = numerical_features, target_name = target_name)
for col in trainer.numerical_features:
    dataset[col] = pd.to_numeric(dataset[col],errors='coerce')
dataset['asian_price'] = pd.to_numeric(dataset['asian_price'],errors='coerce')
dataset['observed_price'] = ms.noisyfier(dataset['asian_price'])

## Train/test split

In [None]:
dataset['n_fixings'].unique()

In [None]:
test_data = dataset[dataset['n_fixings']==1]
train_data = dataset[dataset['n_fixings']!=1]

In [None]:
test_data

In [None]:
test_train_ratio = int(round(100*test_data.shape[0]/train_data.shape[0],0))
print(f"train/test: {100-test_train_ratio}/{test_train_ratio}")

In [None]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Training

In [None]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")

# Testing

In [None]:
test_data.describe()

In [None]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
        )
outofsample_RMSE = errors['outofsample_RMSE']

# Saving