We calculate the average error of completing the volatility cube in 4 cases:

1) Temporal splitting. ATM vols are missing.

2) Temporal splitting. Only ATM vols are available.

3) Random splitting. ATM vols are missing.

4) Random splitting. Only ATM vols are available.

## Temporal splitting. ATM vols are missing.

In [1]:
import sys

# Put the main folder of the whole project on the import path.
sys.path.append('../..')

from src.data.vol.get_vol_cube_tenors_strikes_dates import get_vol_cube_tenors_strikes_dates
from src.data.vol.normalizer import Normalizer
from src.utils.get_train_test_datasets import get_train_test_datasets

# Download the vol cube together with its axis labels and dates.
(data,
 opt_tenors,
 swap_tenors,
 strikes,
 dates) = get_vol_cube_tenors_strikes_dates()

# Normalize the data; the same normalizer is reused later to denormalize.
normalizer = Normalizer()
data_norm = normalizer.normalize(data)

# Split the normalized data (and the matching dates) into train and test sets.
dataset_split_type = 'temporal_split'
(data_norm_train,
 dates_train,
 data_norm_test,
 dates_test) = get_train_test_datasets(data_norm, dates, type=dataset_split_type)

# Denormalized views of both parts, train first and test second.
data_train, data_test = (
    normalizer.denormalize(part) for part in (data_norm_train, data_norm_test)
)

##### Load best model for temporal splitting

In [2]:
from src.utils.load_model_and_history import load_model_and_history

# Identifier of the saved model used for the temporal-split experiments.
NAME = 'vae_van_leaky_3_128_48_3000ep_bat16_3e-4'

# Load the model together with its training history.
vae, history = load_model_and_history(NAME, data_type='vol')




  saveable.load_own_variables(weights_store.get(inner_path))


In [3]:
# Indices along the last (strike) axis of the cube that will be treated as missing.
# Here only the ATM vols are removed.
missed_strikes = [2]  # 0 is ATM-100bp, 2 is ATM, 4 is ATM+100bp (was "5"; the other cells of this notebook say 4 - TODO confirm)

Calculate the RMSE and the maximum absolute error of the completed values over the test dates (only every 5th test date is evaluated)

In [4]:
# Import what value is considered as a missed value
from references.global_parameters import MISSED_VALUE

import numpy as np
from src.utils.vol.find_z_to_complete_vol_cube import find_z_to_complete_vol_cube

# Only every EVAL_STEP-th test date is completed and scored; the completion
# search is slow (see the recorded timing at the bottom of this cell).
EVAL_STEP = 5

mse = 0                  # running SUM of squared errors over the completed cells (not yet a mean)
mse_max = float('-inf')  # largest single squared error seen so far
n = 0                    # number of completed (previously masked) cells scored so far

for date_idx in range(0, len(dates_test), EVAL_STEP):
    print(f'{date_idx} out of {len(dates_test)}')

    # Fresh copies on every date, so that only the current date carries the
    # missing-value marker in the selected strike columns (last axis).
    data_test_missed = data_test.copy()
    data_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE
    data_norm_test_missed = data_norm_test.copy()
    data_norm_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE

    # Complete the missed data in vol cube and check errors against real data
    z_optimal = find_z_to_complete_vol_cube(
        vae=vae,
        data=data_norm_test_missed[date_idx],
        random_attempt_num=5,  # number of attempts starting from different random z initial values
        random_seed=0,
        print_status=False,
    )

    # Decode z_optimal and map the result back to the original scale.
    # The leading axis has length 1 (a single decoded cube).
    # NOTE(review): an earlier comment gave the shape as (1,6,5,7); the recorded
    # n grows by 30 per date with one missed strike, i.e. 30 cells per strike
    # column - TODO confirm the length of the strike axis.
    predictions = normalizer.denormalize(vae.decoder.predict(np.array([z_optimal]), verbose=0))

    truth = data_test[date_idx]
    known_mask = data_test_missed[date_idx] != MISSED_VALUE
    missing_mask = ~known_mask

    # Squared error on the cells that had to be completed.
    sq_err = (predictions[0][missing_mask] - truth[missing_mask]) ** 2

    # Cells that were available keep their real values in the completed cube.
    predictions[0][known_mask] = truth[known_mask]

    if sq_err.size:  # nothing to score when no cell was masked
        mse += sq_err.sum()
        mse_max = max(mse_max, sq_err.max())
        n += sq_err.size

    if n:
        # Running RMSE, number of scored cells, largest absolute error so far.
        print((mse / n) ** 0.5, n, mse_max ** 0.5)

# Recorded output of an earlier run (element-wise accumulation; the last
# digits may differ slightly with the vectorized sum):
# 0 out of 445
# 5.330540463626246 30 17.691074066162102
# 440 out of 445
# 4.635297712341057 2670 23.884532165527318
#   -> after the last evaluated date (89 of the 445 test dates, 30 cells each), 106 minutes

0 out of 445
5.330540463626246 30 17.691074066162102
5 out of 445
5.4198631617941855 60 22.229789276123043
10 out of 445
5.420971299113311 90 22.229789276123043
15 out of 445
6.3141511899440275 120 23.884532165527318
20 out of 445
6.339959737105582 150 23.884532165527318
25 out of 445
6.244272308534952 180 23.884532165527318
30 out of 445
6.087572409057399 210 23.884532165527318
35 out of 445
6.085842024528327 240 23.884532165527318
40 out of 445
5.932996424855125 270 23.884532165527318
45 out of 445
5.851261091509109 300 23.884532165527318
50 out of 445
5.743317388532257 330 23.884532165527318
55 out of 445
5.645508637211773 360 23.884532165527318
60 out of 445
5.535821758743693 390 23.884532165527318
65 out of 445
5.440954365432423 420 23.884532165527318
70 out of 445
5.330507963927557 450 23.884532165527318
75 out of 445
5.2309651760535525 480 23.884532165527318
80 out of 445
5.1298000777885235 510 23.884532165527318
85 out of 445
5.078592999281551 540 23.884532165527318
90 out of 4

## Temporal splitting. Only ATM vols are available.

Select the strikes whose vols are treated as missing

In [5]:
# Indices along the last (strike) axis of the cube that will be treated as missing.
# Index 2 (ATM) is kept, matching the section title "Only ATM vols are available".
missed_strikes = [0, 1, 3, 4]  # 0 is ATM-100bp, 2 is ATM, 4 is ATM+100bp

Calculate the RMSE and the maximum absolute error of the completed values over the test dates (only every 5th test date is evaluated)

In [6]:
# Import what value is considered as a missed value
from references.global_parameters import MISSED_VALUE

import numpy as np
from src.utils.vol.find_z_to_complete_vol_cube import find_z_to_complete_vol_cube

# Only every EVAL_STEP-th test date is completed and scored; the completion
# search is slow (see the recorded timing at the bottom of this cell).
EVAL_STEP = 5

mse = 0                  # running SUM of squared errors over the completed cells (not yet a mean)
mse_max = float('-inf')  # largest single squared error seen so far
n = 0                    # number of completed (previously masked) cells scored so far

for date_idx in range(0, len(dates_test), EVAL_STEP):
    print(f'{date_idx} out of {len(dates_test)}')

    # Fresh copies on every date, so that only the current date carries the
    # missing-value marker in the selected strike columns (last axis).
    data_test_missed = data_test.copy()
    data_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE
    data_norm_test_missed = data_norm_test.copy()
    data_norm_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE

    # Complete the missed data in vol cube and check errors against real data
    z_optimal = find_z_to_complete_vol_cube(
        vae=vae,
        data=data_norm_test_missed[date_idx],
        random_attempt_num=5,  # number of attempts starting from different random z initial values
        random_seed=0,
        print_status=False,
    )

    # Decode z_optimal and map the result back to the original scale.
    # The leading axis has length 1 (a single decoded cube).
    # NOTE(review): an earlier comment gave the shape as (1,6,5,7); the recorded
    # n grows by 120 per date with four missed strikes, i.e. 30 cells per strike
    # column - TODO confirm the length of the strike axis.
    predictions = normalizer.denormalize(vae.decoder.predict(np.array([z_optimal]), verbose=0))

    truth = data_test[date_idx]
    known_mask = data_test_missed[date_idx] != MISSED_VALUE
    missing_mask = ~known_mask

    # Squared error on the cells that had to be completed.
    sq_err = (predictions[0][missing_mask] - truth[missing_mask]) ** 2

    # Cells that were available keep their real values in the completed cube.
    predictions[0][known_mask] = truth[known_mask]

    if sq_err.size:  # nothing to score when no cell was masked
        mse += sq_err.sum()
        mse_max = max(mse_max, sq_err.max())
        n += sq_err.size

    if n:
        # Running RMSE, number of scored cells, largest absolute error so far.
        print((mse / n) ** 0.5, n, mse_max ** 0.5)

# Recorded output of an earlier run (element-wise accumulation; the last
# digits may differ slightly with the vectorized sum):
# 0 out of 445
# 14.812059277362142 120 73.12985961914069
# 440 out of 445
# 12.505738262843272 10680 133.08577178955076
#   -> after the last evaluated date (89 of the 445 test dates, 120 cells each), 81 minutes

0 out of 445
14.812059277362142 120 73.12985961914069
5 out of 445
16.031718108019575 240 81.80189331054692
10 out of 445
14.622372375583906 360 81.80189331054692
15 out of 445
20.409357321555383 480 133.08577178955076
20 out of 445
19.599807163300653 600 133.08577178955076
25 out of 445
18.442750720740456 720 133.08577178955076
30 out of 445
17.943001662728395 840 133.08577178955076
35 out of 445
17.577359623909896 960 133.08577178955076
40 out of 445
17.067581717381874 1080 133.08577178955076
45 out of 445
16.57756967805008 1200 133.08577178955076
50 out of 445
16.27401506669379 1320 133.08577178955076
55 out of 445
16.003773987912467 1440 133.08577178955076
60 out of 445
15.720509887825923 1560 133.08577178955076
65 out of 445
15.47187650197727 1680 133.08577178955076
70 out of 445
15.186321323688347 1800 133.08577178955076
75 out of 445
14.975631794127056 1920 133.08577178955076
80 out of 445
14.764671880133493 2040 133.08577178955076
85 out of 445
14.527459118110286 2160 133.08577

## Random splitting. ATM vols are missing.

##### Download, normalize and split vol cube data into train/test datasets

In [1]:
import sys

# Put the main folder of the whole project on the import path.
sys.path.append('../..')

from src.data.vol.get_vol_cube_tenors_strikes_dates import get_vol_cube_tenors_strikes_dates
from src.data.vol.normalizer import Normalizer
from src.utils.get_train_test_datasets import get_train_test_datasets

# Download the vol cube together with its axis labels and dates.
(data,
 opt_tenors,
 swap_tenors,
 strikes,
 dates) = get_vol_cube_tenors_strikes_dates()

# Normalize the data; the same normalizer is reused later to denormalize.
normalizer = Normalizer()
data_norm = normalizer.normalize(data)

# Split the normalized data (and the matching dates) into train and test sets,
# using a fixed seed and an 80% train share.
dataset_split_type = 'random_split'
(data_norm_train,
 dates_train,
 data_norm_test,
 dates_test) = get_train_test_datasets(data_norm,
                                       dates,
                                       seed=0,
                                       train_ratio=0.8,
                                       type=dataset_split_type)

# Denormalized views of both parts, train first and test second.
data_train, data_test = (
    normalizer.denormalize(part) for part in (data_norm_train, data_norm_test)
)

##### Load best model for random splitting

In [2]:
from src.utils.load_model_and_history import load_model_and_history

# Identifier of the saved model used for the random-split experiments.
NAME = 'vae_van_leaky_randomsplit_3_200_100_50_25_3000ep_bat16_1e-5'

# Load the model together with its training history.
vae, history = load_model_and_history(NAME, data_type='vol')




  saveable.load_own_variables(weights_store.get(inner_path))


Select the strikes whose vols are treated as missing

In [6]:
# Indices along the last (strike) axis of the cube that will be treated as missing.
# Here only the ATM vols are removed.
missed_strikes = [2]  # 0 is ATM-100bp, 2 is ATM, 4 is ATM+100bp

Calculate the RMSE and the maximum absolute error of the completed values over the test dates (only every 5th test date is evaluated)

In [6]:
# Import what value is considered as a missed value
from references.global_parameters import MISSED_VALUE

import numpy as np
from src.utils.vol.find_z_to_complete_vol_cube import find_z_to_complete_vol_cube

# Only every EVAL_STEP-th test date is completed and scored; the completion
# search is slow (see the recorded timing at the bottom of this cell).
EVAL_STEP = 5

mse = 0                  # running SUM of squared errors over the completed cells (not yet a mean)
mse_max = float('-inf')  # largest single squared error seen so far
n = 0                    # number of completed (previously masked) cells scored so far

for date_idx in range(0, len(dates_test), EVAL_STEP):
    print(f'{date_idx} out of {len(dates_test)}')

    # Fresh copies on every date, so that only the current date carries the
    # missing-value marker in the selected strike columns (last axis).
    data_test_missed = data_test.copy()
    data_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE
    data_norm_test_missed = data_norm_test.copy()
    data_norm_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE

    # Complete the missed data in vol cube and check errors against real data
    z_optimal = find_z_to_complete_vol_cube(
        vae=vae,
        data=data_norm_test_missed[date_idx],
        random_attempt_num=5,  # number of attempts starting from different random z initial values
        random_seed=0,
        print_status=False,
    )

    # Decode z_optimal and map the result back to the original scale.
    # The leading axis has length 1 (a single decoded cube).
    # NOTE(review): an earlier comment gave the shape as (1,6,5,7); the recorded
    # n grows by 30 per date with one missed strike, i.e. 30 cells per strike
    # column - TODO confirm the length of the strike axis.
    predictions = normalizer.denormalize(vae.decoder.predict(np.array([z_optimal]), verbose=0))

    truth = data_test[date_idx]
    known_mask = data_test_missed[date_idx] != MISSED_VALUE
    missing_mask = ~known_mask

    # Squared error on the cells that had to be completed.
    sq_err = (predictions[0][missing_mask] - truth[missing_mask]) ** 2

    # Cells that were available keep their real values in the completed cube.
    predictions[0][known_mask] = truth[known_mask]

    if sq_err.size:  # nothing to score when no cell was masked
        mse += sq_err.sum()
        mse_max = max(mse_max, sq_err.max())
        n += sq_err.size

    if n:
        # Running RMSE, number of scored cells, largest absolute error so far.
        print((mse / n) ** 0.5, n, mse_max ** 0.5)

# Recorded output of an earlier run (element-wise accumulation; the last
# digits may differ slightly with the vectorized sum):
# 0 out of 265
# 0.9630661958469724 30 2.6965539550781017
# 260 out of 265
# 0.6128737208244198 1590 4.306733093261727
#   -> after the last evaluated date (53 of the 265 test dates, 30 cells each), 50 minutes

0 out of 265
0.9630661958469724 30 2.6965539550781017
5 out of 265
0.870293753403727 60 2.6965539550781017
10 out of 265
0.7617084955085855 90 2.6965539550781017
15 out of 265
0.6936416109314965 120 2.6965539550781017
20 out of 265
0.66191034503459 150 2.6965539550781017
25 out of 265
0.6384677515839475 180 2.6965539550781017
30 out of 265
0.6040386038930856 210 2.6965539550781017
35 out of 265
0.5739025875489283 240 2.6965539550781017
40 out of 265
0.5649221503668926 270 2.6965539550781017
45 out of 265
0.5508622040666196 300 2.6965539550781017
50 out of 265
0.5268891889276103 330 2.6965539550781017
55 out of 265
0.506115024272658 360 2.6965539550781017
60 out of 265
0.4989651725582347 390 2.6965539550781017
65 out of 265
0.4869506784093567 420 2.6965539550781017
70 out of 265
0.4745220280133632 450 2.6965539550781017
75 out of 265
0.46131116253818427 480 2.6965539550781017
80 out of 265
0.45313567000369054 510 2.6965539550781017
85 out of 265
0.4429488627241678 540 2.6965539550781017

## Random splitting. Only ATM vols are available.

Select the strikes whose vols are treated as missing

In [3]:
# Indices along the last (strike) axis of the cube that will be treated as missing.
# Index 2 (ATM) is kept, matching the section title "Only ATM vols are available".
missed_strikes = [0, 1, 3, 4]  # 0 is ATM-100bp, 2 is ATM, 4 is ATM+100bp

Calculate the RMSE and the maximum absolute error of the completed values over the test dates (only every 5th test date is evaluated)

In [4]:
# Import what value is considered as a missed value
from references.global_parameters import MISSED_VALUE

import numpy as np
from src.utils.vol.find_z_to_complete_vol_cube import find_z_to_complete_vol_cube

# Only every EVAL_STEP-th test date is completed and scored; the completion
# search is slow (see the recorded timing at the bottom of this cell).
EVAL_STEP = 5

mse = 0                  # running SUM of squared errors over the completed cells (not yet a mean)
mse_max = float('-inf')  # largest single squared error seen so far
n = 0                    # number of completed (previously masked) cells scored so far

for date_idx in range(0, len(dates_test), EVAL_STEP):
    print(f'{date_idx} out of {len(dates_test)}')

    # Fresh copies on every date, so that only the current date carries the
    # missing-value marker in the selected strike columns (last axis).
    data_test_missed = data_test.copy()
    data_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE
    data_norm_test_missed = data_norm_test.copy()
    data_norm_test_missed[date_idx, :, :, missed_strikes] = MISSED_VALUE

    # Complete the missed data in vol cube and check errors against real data
    z_optimal = find_z_to_complete_vol_cube(
        vae=vae,
        data=data_norm_test_missed[date_idx],
        random_attempt_num=5,  # number of attempts starting from different random z initial values
        random_seed=0,
        print_status=False,
    )

    # Decode z_optimal and map the result back to the original scale.
    # The leading axis has length 1 (a single decoded cube).
    # NOTE(review): an earlier comment gave the shape as (1,6,5,7); the recorded
    # n grows by 120 per date with four missed strikes, i.e. 30 cells per strike
    # column - TODO confirm the length of the strike axis.
    predictions = normalizer.denormalize(vae.decoder.predict(np.array([z_optimal]), verbose=0))

    truth = data_test[date_idx]
    known_mask = data_test_missed[date_idx] != MISSED_VALUE
    missing_mask = ~known_mask

    # Squared error on the cells that had to be completed.
    sq_err = (predictions[0][missing_mask] - truth[missing_mask]) ** 2

    # Cells that were available keep their real values in the completed cube.
    predictions[0][known_mask] = truth[known_mask]

    if sq_err.size:  # nothing to score when no cell was masked
        mse += sq_err.sum()
        mse_max = max(mse_max, sq_err.max())
        n += sq_err.size

    if n:
        # Running RMSE, number of scored cells, largest absolute error so far.
        print((mse / n) ** 0.5, n, mse_max ** 0.5)

# Recorded output of an earlier run (element-wise accumulation; the last
# digits may differ slightly with the vectorized sum):
# 0 out of 265
# 3.6780138560431777 120 33.06625366210946
# 260 out of 265
# 3.560947131751989 6360 63.23500305175784
#   -> after the last evaluated date (53 of the 265 test dates, 120 cells each), 62 minutes

0 out of 265
3.6780138560431777 120 33.06625366210946
5 out of 265
3.3507792122539244 240 33.06625366210946
10 out of 265
2.850805941176933 360 33.06625366210946
15 out of 265
2.626457513381438 480 33.06625366210946
20 out of 265
2.395172429735789 600 33.06625366210946
25 out of 265
2.2735229767739438 720 33.06625366210946
30 out of 265
2.1367997078446863 840 33.06625366210946
35 out of 265
2.015751589856803 960 33.06625366210946
40 out of 265
1.9227196827443782 1080 33.06625366210946
45 out of 265
1.8381390343795623 1200 33.06625366210946
50 out of 265
2.2958628570478306 1320 33.06625366210946
55 out of 265
2.1997758330743453 1440 33.06625366210946
60 out of 265
2.1191825647362417 1560 33.06625366210946
65 out of 265
2.049890520828636 1680 33.06625366210946
70 out of 265
1.985629111457822 1800 33.06625366210946
75 out of 265
1.9260888574777586 1920 33.06625366210946
80 out of 265
1.8730489407221327 2040 33.06625366210946
85 out of 265
1.8255214468017615 2160 33.06625366210946
90 out o