In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd
import modin.pandas as md
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from model_settings import ms
os.chdir(os.path.abspath(str(Path())))
pd.set_option("display.max_columns",None)
pd.options.display.float_format = '{:.5f}'.format
notebook_dir = str(Path().resolve())
sys.path.append(os.path.join(notebook_dir,'historical_data','historical_generation'))
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}\n")


##################
# training start #
##################

Sat Oct 19 00:56:25 2024



# Loading data

In [2]:
dataset = pd.read_csv(r'asian options.csv').iloc[:,1:]

# Preprocessing

In [3]:
import convsklearn
categorical_features = ['averaging_type', 'w']
numerical_features = [
    'spot_price',
    'strike_price',
    'days_to_maturity',
    'risk_free_rate',
    'dividend_rate',
    'kappa',
    'theta',
    'rho',
    'eta',
    'v0',
    'fixing_frequency',
    'n_fixings',
    'past_fixings'
]
target_name = 'observed_price'
trainer = convsklearn.convsklearn(categorical_features = categorical_features, numerical_features = numerical_features, target_name = target_name)
for col in trainer.numerical_features:
    dataset[col] = pd.to_numeric(dataset[col],errors='coerce')
dataset['asian_price'] = pd.to_numeric(dataset['asian_price'],errors='coerce')
dataset['observed_price'] = ms.noisyfier(dataset['asian_price'])

## Train/test split

In [4]:
unique_dates = dataset['calculation_date'].sort_values(
    ascending=True).unique().tolist()
filter_date = unique_dates[int(0.85*len(unique_dates))]
train_data = dataset[
    (
      # (dataset['calculation_date']>=datetime(2007,1,1))
      #  &
      (dataset['calculation_date']<=filter_date)
    )
].copy()

test_data = dataset[
    (
        (dataset['calculation_date']>filter_date)
        # &
        # (dataset['calculation_date']<=datetime(2012,12,31))
    )
].copy()

test_train_ratio = int(round(100*test_data.shape[0]/train_data.shape[0],0))
print(f"train/test: {100-test_train_ratio}/{test_train_ratio}")

train/test: 82/18


In [5]:
arrs = trainer.get_train_test_arrays(
    train_data, test_data,feature_set = trainer.feature_set, target_name=trainer.target_name)
preprocessor = trainer.preprocess()
train_X = arrs['train_X'] 
train_y = arrs['train_y']
test_X = arrs['test_X']
test_y = arrs['test_y']

# Training

In [6]:
model_fit, runtime, specs = trainer.run_dnn(preprocessor,train_X,train_y)
train_end = time.time()
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")


training...

Deep Neural Network
hidden layers sizes: (15, 15, 15)
learning rate: adaptive
activation: relu
solver: sgd
alpha: 0.0001

cpu: 165.1834466457367


# Testing

In [7]:
test_data.describe()

Unnamed: 0,spot_price,strike_price,risk_free_rate,dividend_rate,fixing_frequency,n_fixings,past_fixings,kappa,theta,rho,eta,v0,days_to_maturity,asian_price,observed_price
count,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0,230520.0
mean,1387.17611,1386.67595,0.04,0.02247,115.6,5.33333,0.0,1.32924,0.07078,-0.81903,0.19055,0.02042,616.53333,0.0,0.05975
std,41.60567,426.95919,0.0,0.00064,138.66046,3.68179,0.0,0.62281,0.03507,0.2161,0.08495,0.00767,994.31933,0.0,0.08757
min,1276.9,638.0,0.04,0.02112,1.0,1.0,0.0,0.08326,0.04174,-1.0,0.11461,0.00739,1.0,0.0,0.0
25%,1357.67,1018.0,0.04,0.02207,7.0,1.0,0.0,0.91495,0.0551,-1.0,0.13566,0.01552,10.0,0.0,0.0
50%,1393.515,1389.5,0.04,0.02236,30.0,5.0,0.0,1.24392,0.06355,-0.91291,0.15397,0.01869,150.0,0.0,0.0
75%,1414.97,1756.0,0.04,0.02285,180.0,10.0,0.0,1.7527,0.07159,-0.67757,0.20861,0.02417,900.0,0.0,0.10106
max,1465.27,2197.0,0.04,0.02436,360.0,10.0,0.0,3.2559,0.3115,-0.30144,0.55532,0.0511,3600.0,0.0,0.63652


In [8]:
insample, outsample, errors = trainer.test_prediction_accuracy(
        model_fit,
        test_data,
        train_data
        )


in sample:
     RSME: 0.08756626294884456
     MAE: 0.06922809014891505

out of sample:
     RSME: 0.08758468804541415
     MAE: 0.06924532032005316


# Saving

In [16]:
train_end_tag = str(datetime.fromtimestamp(
    train_end).strftime("%Y_%m_%d %H-%M-%S"))
file_tag = str(train_end_tag + " " + specs[0] )
os.chdir(os.path.join(notebook_dir,'trained_models'))
files_dir = os.path.join(
    notebook_dir,'trained_models','trained_models',
    file_tag)

if Path(files_dir).exists():
    pass
else:
    os.mkdir(files_dir)

file_dir = os.path.join(files_dir,file_tag)

S = np.sort(train_data['spot_price'].unique())
K = np.sort(train_data['strike_price'].unique())
T = np.sort(train_data['days_to_maturity'].unique())
W = np.sort(train_data['w'].unique())
n_calls = train_data[train_data['w']=='call'].shape[0]
n_puts = train_data[train_data['w']=='put'].shape[0]
insample.to_csv(f"{file_dir} insample.csv")
outsample.to_csv(f"{file_dir} outsample.csv")
joblib.dump(model_fit,str(f"{file_dir}.pkl"))
pd.set_option("display.max_columns",None)

with open(f'{file_dir}.txt', 'w') as file:
    file.write(train_start_tag)
    file.write(f"\nspot(s):\n{S}")
    file.write(f"\n\nstrikes:\n{K}\n")
    file.write(f"\nmaturities:\n{T}\n")
    file.write(f"\ntypes:\n{W}\n")
    try:
        file.write(f"\n{train_data['barrier_type_name'].unique()}")
    except Exception:
        pass
    try:
        for col in ['averaging_type','fixing_frequency','past_fixings','n_fixings']:
            file.write(f"\n{col}:")
            file.write(f"\n{dataset[col].drop_duplicates().sort_values().values}\n")
    except Exception as e:
        print(e)
        pass
    file.write(f"\nnumber of calls, puts:\n{n_calls},{n_puts}\n")
    file.write(f"\ntotal prices:\n{train_data.shape[0]}\n")
    for spec in specs:
        file.write(f"{spec}\n")
    file.write("#"*17+"\n# training data #\n"+"#"*17+
          f"\n{train_data.describe()}\n")
    file.write("#"*13+"\n# test data #\n"+"#"*13+
          f"\n{test_data.describe()}\n")
    file.write(f"\n{dataset.dtypes}")
    file.write(
        f"\nin sample results:"
        f"\n     RMSE: {errors['insample_RMSE']}"
        f"\n     MAE: {errors['insample_MAE']}\n"
        f"\nout of sample results:"
        f"\n     RMSE: {errors['outofsample_RMSE']}"
        f"\n     MAE: {errors['outofsample_MAE']}\n"
        )
    file.write("\nfeatures:\n")
    for feature in trainer.feature_set:
        file.write(f"     {feature}\n")
    file.write(f"\ntarget: {trainer.target_name}\n")
    file.write(f"\ncpu: {train_runtime}\n")
    file.write(datetime.fromtimestamp(train_end).strftime('%c'))