In [1]:
import numpy as np
import pandas as pd
import teneva
from sklearn.metrics import r2_score, mean_absolute_error as MAE, mean_absolute_percentage_error as MAPE

## This was run with 10% of the pysabr output as TRAINING data and 90% as TEST data. We did this to test the robustness of the ALS method and to see whether it performs well in real life scenarios when training data is sparse.

In [2]:
data_path = "/data/workspace_files/"

In [3]:
vols = np.load(data_path + "12_12_sample_lognormal_vol.npy")
vols.shape # S, T, V_atm, Beta, Rho, Volvol, (Displacement), K

(12, 5, 3, 2, 3, 5, 12)

In [4]:
names = ["S", "T", "V_atm", "Beta", "Rho", "Volvol", "K"]
# names = ["S", "T", "V_atm", "Beta", "Rho", "Volvol", "Displacement", "K"]

vols = vols[:,:,:,:,:,:,:] # 
# names.remove("Rho")

vols.shape

(12, 5, 3, 2, 3, 5, 12)

In [5]:

multiindex = pd.MultiIndex.from_product([range(i) for i in vols.shape],
                                        names=names
                                       )
full_df = pd.DataFrame(vols.reshape((-1,1)), index=multiindex, columns=["Lognormal_vol"])
full_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Lognormal_vol
S,T,V_atm,Beta,Rho,Volvol,K,Unnamed: 7_level_1
0,0,0,0,0,0,0,0.19802
0,0,0,0,0,0,1,0.195679
0,0,0,0,0,0,2,0.193395
0,0,0,0,0,0,3,0.191166
0,0,0,0,0,0,4,0.18899


In [6]:
full_df.describe()

Unnamed: 0,Lognormal_vol
count,64800.0
mean,0.176921
std,0.368926
min,0.004975
25%,0.009706
50%,0.040484
75%,0.128974
max,1.387807


In [7]:
# verification
vols[0,0,1,0,0,0,2], full_df.loc[0,0,1,0,0,0,2].values.item()

(0.19339689243337468, 0.19339689243337468)

In [8]:
missing_df = full_df.sample(frac=0.90, replace=False, random_state=1)
missing_df.shape

(58320, 1)

In [9]:
train_df = full_df.drop(missing_df.index)
train_df.shape

(6480, 1)

In [10]:
missing_df.shape[0] + train_df.shape[0] == full_df.shape[0]

True

In [11]:
Y_train = train_df.values
I_train = train_df.reset_index()[names].values
Y_train.shape, I_train.shape

((6480, 1), (6480, 7))

In [12]:
r = 4
Y_0 = teneva.rand(vols.shape, r, np.random.default_rng(12345).random)
nswp = 5

Y_new = teneva.als(I_train, Y_train, Y_0, nswp)
get = teneva.getter(Y_new)
Y_train_pred = np.array([get(i) for i in I_train])
# train_error = np.linalg.norm(Y_train_pred - Y_train)
# train_error /= np.linalg.norm(Y_train)
# print(f"train error {train_error:.4f}")

In [13]:
Y_test = missing_df.values
I_test = missing_df.reset_index()[names].values
Y_test.shape, I_test.shape 

((58320, 1), (58320, 7))

In [14]:
Y_test_pred = np.array([get(i) for i in I_test])
# test_error = np.linalg.norm(Y_test_pred - Y_test)
# test_error /= np.linalg.norm(Y_test)
# print(f"test error {test_error:.4f}")

In [16]:
print(r2_score(Y_train.reshape(-1,1), Y_train_pred))
print(r2_score(Y_test.reshape(-1,1), Y_test_pred))

0.9999998452316975
0.9999993875609762


In [17]:
print(MAE(Y_train.reshape(-1,1), Y_train_pred))
print(MAE(Y_test.reshape(-1,1), Y_test_pred))

6.976420242861728e-05
9.53404681102971e-05


In [18]:
print(Y_test.reshape(-1).shape)
print(Y_test_pred.shape)

(58320,)
(58320,)


In [19]:
print(np.max(np.abs(Y_test.reshape(-1) - Y_test_pred)))

print(np.argmax(np.abs(Y_test.reshape(-1)[:1000] - Y_test_pred[:1000])))
# np.abs(Y_test.reshape(-1,1)[:1000] - Y_test_pred[:1000])[690666]
# missing_df.iloc[690666]

0.009938532788126508
598


In [20]:
print(MAPE(Y_train.reshape(-1,1), Y_train_pred))
print(MAPE(Y_test.reshape(-1,1), Y_test_pred))

0.0008614707843976386
0.0008995659808580757
