In [1]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from model_settings import ms
from df_collector import df_collector
pd.set_option("display.max_columns",0)
pd.options.display.float_format = '{:.6f}'.format
root = Path().resolve().parent.parent
df_collector.root = root
models_dir = os.path.join(root,ms.trained_models)
train_start = time.time()
train_start_datetime = datetime.fromtimestamp(train_start)
train_start_tag = train_start_datetime.strftime('%c')
print("\n"+"#"*18+"\n# training start #\n"+
      "#"*18+"\n"+f"\n{train_start_tag}")


##################
# training start #
##################

Fri Nov  8 17:17:40 2024


In [2]:
data = df_collector.cboe_spx_barriers().iloc[:,1:]
pricename = [f for f in data.columns if f.find('_price')!=-1 and f.find('spot_')==-1 and f.find('strike_')==-1][0]
relative_pricename = 'relative_'+pricename
data = data[data[pricename]<=data['spot_price']]
print(f'collected {pricename[:pricename.find('_',0)]} options')
print(data.describe())
print(data.dtypes)
print(data['calculation_date'].drop_duplicates())

collected barrier options
          spot_price   strike_price        barrier  days_to_maturity         rebate  dividend_rate  risk_free_rate          theta          kappa            rho            eta             v0  barrier_price
count 2037988.000000 2037988.000000 2037988.000000    2037988.000000 2037988.000000 2037988.000000  2037988.000000 2037988.000000 2037988.000000 2037988.000000 2037988.000000 2037988.000000 2037988.000000
mean     3771.972490    3771.988511    3785.619135        323.112634       0.000000       0.016717        0.023852       0.127303       5.861038      -0.795742       2.001972       0.060612     177.360437
std       844.635317     880.736463    1459.605277        240.806413       0.000000       0.002448        0.018788       0.186416       9.132415       0.162223       2.112885       0.090671     328.070614
min      2389.000000    2150.100000    1194.500000         60.000000       0.000000       0.012624        0.000330       0.000000       0.000000      -1.0

In [3]:
filetag = f'cboe spx relative {pricename[:pricename.find('_',0)]}'

In [4]:
data[pricename].describe()

count   2037988.000000
mean        177.360437
std         328.070614
min           0.000000
25%           1.635566
50%          50.164341
75%         225.734882
max        5424.825644
Name: barrier_price, dtype: float64

In [9]:
data_strikes = data['strike_price']
data['relative_spot'] = data['spot_price']/data_strikes
data[relative_pricename] = data[pricename]/data_strikes
try:
    data['relative_barrier'] = data['barrier']/data_strikes
    data['relative_rebate'] = data['rebate']/data_strikes
except Exception:
    pass

data['calculation_date'] = pd.to_datetime(data['calculation_date'],format='mixed')
data['date'] = pd.to_datetime(data['date'],format='mixed')
data.dtypes

spot_price                       float64
strike_price                     float64
barrier                          float64
days_to_maturity                   int64
updown                            object
outin                             object
w                                 object
barrier_type_name                 object
rebate                           float64
dividend_rate                    float64
risk_free_rate                   float64
theta                            float64
kappa                            float64
rho                              float64
eta                              float64
v0                               float64
calculation_date          datetime64[ns]
date                      datetime64[ns]
barrier_price                    float64
relative_spot                    float64
relative_barrier_price           float64
relative_barrier                 float64
relative_rebate                  float64
dtype: object

In [7]:
stop yo 

SyntaxError: invalid syntax (2701300105.py, line 1)

In [None]:
"""
plot
"""
rels = data[relative_pricename]
plt.figure()
plt.hist(rels,color='purple',label=relative_pricename.replace('_',' '),bins=int(np.sqrt(len(rels))))
plt.legend()
plt.show()
rels.describe()

In [None]:
from plotters import PlotCols
col_names = ['spot_price','kappa','theta','rho','eta','v0','risk_free_rate','dividend_rate','spot_price','relative_observed']
index = 'calculation_date'
plot_data = data[col_names+[index]].copy().sort_values(by='relative_observed').drop_duplicates(subset=index,keep='first').sort_values(by=index).reset_index(drop=True)
data_cols = [index] + col_names
PlotCols(
    plot_data,
    col_names=col_names,
    index=index,
    figsize=(10,15)
)
pd.Series(plot_data[index])
data['relative_observed'].describe()

# training procedure

In [None]:
from convsklearn import convsklearn
trainer = convsklearn()
trainer.target_name = 'relative_observed'
trainer.excluded_features = trainer.excluded_features + ['spot_price','strike_price','barrier','rebate','relative_observed']
trainer.__dict__

In [None]:
trainer.load_data(data)
print('features:')
for f in trainer.feature_set:
    print(f"   {f}")
print(f"\ntarget:\n   {trainer.target_name}")

## preprocessing

In [None]:
data['calculation_date'].drop_duplicates()

In [None]:
stop yo

In [None]:
development_dates = dates[:len(dates)//3]
test_dates = dates[~dates.isin(development_dates)]
trainer.preprocess_data(development_dates,test_dates)

## training

In [None]:
trainer.run_dnn()

In [None]:
print('instance variables:')

for key, value in trainer.__dict__.items():
    print(f"{key}:\n  {value}\n")

### initial test

In [None]:
train_test = trainer.test_prediction_accuracy()

In [None]:
oserr = trainer.test_data['outofsample_error']
iserr = trainer.train_data['insample_error']

In [None]:
plt.figure()
plt.hist(oserr,color='purple',label='out-of-sample',bins=int(np.sqrt(len(oserr))))
plt.hist(iserr,color='green',label='in-sample',bins=int(np.sqrt(len(iserr))))
plt.legend()
plt.title('distributions of prediction error')
plt.show()

## saving

In [None]:
train_end = time.time()
train_end_tag = str(datetime.fromtimestamp(
    train_end).strftime("%Y_%m_%d %H%M%S%f"))
file_tag = str(train_end_tag + " " + filetag)
files_dir = os.path.join(models_dir,file_tag)

def save_model():
    if Path(files_dir).exists():
        pass
    else:
        os.mkdir(files_dir)
    file_dir = os.path.join(files_dir,file_tag)
    joblib.dump(trainer.__dict__,str(f"{file_dir}.pkl"))
    pd.set_option("display.max_columns",None)
    print(f"model saved to {file_dir}")

print(f"execute the command 'save_model()' to save the following model: {file_tag}")
train_runtime = train_end-train_start
print(f"\ncpu: {train_runtime}")

In [None]:
save_model()