<a href="https://colab.research.google.com/github/ashish1610dhiman/CSE8803_DLT_Project/blob/main/1_get_torch_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import torch
import pickle
import numpy as np
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from google.colab import drive
import os
import yaml

In [31]:
# Experiment configuration.
# Mount Google Drive first so the experiment folder below is reachable.
drive.mount('/content/drive')
# Folder on Drive that holds this experiment's input and output files.
EXP_PATH = '/content/drive/My Drive/call_prices_conditional_flow/'
# Version tag embedded in the names of the files read and written below.
VERSION = "v0"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 0. Load params and data dict (GBM paths + call prices)

In [35]:
# Load the dataset parameters stored next to the data for this VERSION.
# NOTE(review): yaml.unsafe_load can instantiate arbitrary Python objects, so
# it must only be used on files created by this project. If the file holds
# plain scalars/lists/dicts only, yaml.safe_load would suffice — TODO confirm.
with open(f"{EXP_PATH}/dataset_params_{VERSION}.yaml", "r") as params_file:
    PARAMS = yaml.unsafe_load(params_file)

In [30]:
# Split configuration: fractions of the data held out for validation and for
# test, plus the random seed used for the shuffled train/val split.
VAL_SIZE, TEST_SIZE = 0.15, 0.15
SEED = 42

In [32]:
# Read the pickled data dictionary from the mounted Google Drive folder.
# NOTE(review): pickle.load can execute arbitrary code on a malicious file;
# only load files produced by this project's own pipeline.
with open(f"{EXP_PATH}/dataset_{VERSION}.pkl", 'rb') as data_file:
    combined_data_dict = pickle.load(data_file)

In [33]:
# Number of entries in the data dict (it is keyed by (mu, sigma) tuples).
len(combined_data_dict)

100

In [34]:
# Inspect which arrays are stored under a single (mu, sigma) key.
combined_data_dict[(0.05,0.1)].keys()

dict_keys(['gbm_paths', 'call_prices'])

In [14]:
# Shape of the call-price array for one key; OptionDataset below unpacks it
# as (L, n_paths, K_size, M).
combined_data_dict[(0.05,0.1)]["call_prices"]["call_prices"].shape

(239, 30, 25, 10)

In [37]:
# Number of call-price time steps corresponding to the TEST_SIZE fraction.
# The number of time steps is read from the data instead of being hard-coded,
# so the cell stays correct if the dataset is regenerated with another length.
n_call_times = next(iter(combined_data_dict.values()))["call_prices"]["call_prices"].shape[0]
int(n_call_times * TEST_SIZE)

35

# 1. Create Torch Dataset

In [16]:
import torch
from torch.utils.data import Dataset
import numpy as np

class OptionDataset(Dataset):
    """Supervised samples pairing a flattened call-price grid with future path values.

    For every ``(mu, sigma)`` key, every path ``i`` and every call-price time
    index ``t`` in ``range(L - M)``, one sample is built (keys first, then
    paths, then time indices):

    * ``X``: ``call_prices[t, i]`` flattened (``K_size * M`` values) followed
      by the ``spot_history`` most recent path values up to and including path
      index ``t + burnin``.
    * ``Y``: the ``M`` path values that follow path index ``t + burnin``.
    * ``T``: the call-price time index ``t``.
    * ``is_test``: ``True`` when ``t + M >= L - train_val_cut``.
    * ``meta``: the ``(mu, sigma)`` key the sample came from.

    Time indices whose history window or target window would fall outside the
    path are skipped.

    Args:
        combined_data_dict: Mapping ``(mu, sigma) -> data`` where
            ``data["gbm_paths"]`` is a 2-D array of shape ``(n_steps, n_paths)``
            and ``data["call_prices"]["call_prices"]`` is a 4-D array of shape
            ``(L, n_paths, K_size, M)``.
        spot_history: Number of past path values appended to each ``X``.
        train_val_cut: Width of the trailing window used for the test flag; a
            sample is flagged as test when ``t + M >= L - train_val_cut``.
        burnin: Offset between a call-price time index and the matching path
            index (``gbm_t = t + burnin``).

    Raises:
        ValueError: If no sample can be built from the given data and window
            parameters.
    """

    def __init__(self, combined_data_dict, spot_history, train_val_cut, burnin):
        X_list, Y_list, T_list, test_flags, meta_list = [], [], [], [], []

        for (mu, sigma), data in combined_data_dict.items():
            paths = data["gbm_paths"]  # shape (n_steps, n_paths)
            call_prices = data["call_prices"]["call_prices"]  # shape (L, n_paths, K_size, M)

            n_steps, n_paths = paths.shape
            L, _, _, M = call_prices.shape

            # Whether a time index is usable, and whether it is a test index,
            # does not depend on the path: decide it once per key instead of
            # once per (path, time) pair.
            usable_times = []
            for t in range(L - M):
                gbm_t = t + burnin
                if gbm_t - spot_history + 1 < 0 or gbm_t + M >= n_steps:
                    continue  # history or target window would leave the path
                usable_times.append((t, gbm_t, (t + M) >= (L - train_val_cut)))

            for i in range(n_paths):  # one pass per path, same sample order as before
                for t, gbm_t, is_test in usable_times:
                    x_call = call_prices[t, i].reshape(-1)                                     # (K_size * M,)
                    x_spot_hist = paths[gbm_t - spot_history + 1 : gbm_t + 1, i].reshape(-1)  # (spot_history,)
                    y = paths[gbm_t + 1 : gbm_t + 1 + M, i].reshape(-1)                       # (M,)

                    X_list.append(np.concatenate([x_call, x_spot_hist]))
                    Y_list.append(y)
                    T_list.append(t)
                    test_flags.append(is_test)
                    meta_list.append((mu, sigma))

        # Fail with an actionable message instead of the generic np.stack error.
        if not X_list:
            raise ValueError(
                "OptionDataset: no valid samples could be built; check that "
                "spot_history, burnin and M fit within the GBM path length."
            )

        self.X = torch.from_numpy(np.stack(X_list)).float()
        self.Y = torch.from_numpy(np.stack(Y_list)).float()
        self.T = torch.from_numpy(np.array(T_list)).long()
        self.is_test = torch.tensor(test_flags, dtype=torch.bool)
        self.meta = torch.tensor(meta_list).float()  # shape: (N_samples, 2)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the sample(s) at ``idx`` as a dict of tensors."""
        return {
            "X": self.X[idx],
            "Y": self.Y[idx],
            "T": self.T[idx],
            "is_test": self.is_test[idx],
            "meta": self.meta[idx],  # (mu, sigma)
        }


In [40]:
# Build the dataset. The history length is named and the number of call-price
# time steps is read from the data instead of being hard-coded.
SPOT_HISTORY = 30  # number of past spot prices appended to each feature vector
n_call_times = next(iter(combined_data_dict.values()))["call_prices"]["call_prices"].shape[0]
dataset = OptionDataset(
    combined_data_dict,
    spot_history=SPOT_HISTORY,
    train_val_cut=int(n_call_times * TEST_SIZE),
    burnin=PARAMS["bs_call"]["BURNIN_WINDOW"],
)

# Example of a single dataset item
sample = dataset[0]
print(sample["X"].shape, sample["Y"].shape, sample["is_test"], sample["meta"])

torch.Size([280]) torch.Size([10]) tensor(False) tensor([0.0500, 0.1000])


In [41]:
# Sanity check on the actual feature width (not a number copied from a printed
# output): flattened call-price grid (K_GRID_SIZE * M) plus the 30 historical
# spot prices (the spot_history passed to OptionDataset).
dataset.X.shape[1] == PARAMS["bs_call"]["K_GRID_SIZE"]*PARAMS["bs_call"]["M"] + 30

True

In [42]:
# Total number of samples built across all keys, paths and time indices.
len(dataset)

687000

In [43]:
# Call-price array shape for one key, shown again for comparison with the
# sample count; OptionDataset unpacks it as (L, n_paths, K_size, M).
combined_data_dict[(0.05,0.1)]["call_prices"]["call_prices"].shape

(239, 30, 25, 10)

In [44]:
# Upper bound on the sample count if every (key, time step, path) produced a
# sample. The dataset is smaller because OptionDataset only iterates over
# range(L - M) time indices and skips out-of-range windows.
n_call_times, n_paths = next(iter(combined_data_dict.values()))["call_prices"]["call_prices"].shape[:2]
len(combined_data_dict) * n_call_times * n_paths

717000

# 2. Create Train/Val/Test

In [45]:
# 1. Partition sample indices using the precomputed is_test flag:
#    flagged samples form the test set, all others go to the train/val pool.
is_test_mask = dataset.is_test
trainval_indices = torch.where(torch.logical_not(is_test_mask))[0]
test_indices = torch.where(is_test_mask)[0]
len(trainval_indices), len(test_indices)

(582000, 105000)

In [46]:
# 2. Random split of the train/val pool. test_size is VAL_SIZE rescaled by the
#    pool's share of the data, i.e. VAL_SIZE / (1 - TEST_SIZE) (0.15 / 0.85 with
#    the constants defined earlier in this notebook) — not a fixed 80/20 split.
# NOTE(review): samples are shuffled individually, so overlapping windows from
# the same path can land in both train and val — confirm this is intended.
train_indices, val_indices = train_test_split(
    trainval_indices.numpy(),
    test_size=VAL_SIZE/(1-TEST_SIZE),
    shuffle=True,
    random_state=SEED
)

In [54]:
# Number of samples in the train / val / test splits.
np.array([len(train_indices), len(val_indices), len(test_indices)])

array([479294, 102706, 105000])

In [48]:
# Split sizes as fractions of the whole dataset, normalised by the actual
# dataset length rather than a hard-coded sample count.
np.array([len(train_indices), len(val_indices), len(test_indices)])/len(dataset)

array([0.6976623 , 0.14949927, 0.15283843])

In [49]:
# --- Package all splits ---
# For each split, slice the X / Y / T / meta tensors of the dataset with that
# split's indices (the is_test flag is not carried over).
dataset_splits = {
    split_name: {
        field: getattr(dataset, field)[split_idx]
        for field in ("X", "Y", "T", "meta")
    }
    for split_name, split_idx in (
        ("train", train_indices),
        ("val", val_indices),
        ("test", test_indices),
    )
}

In [53]:
# Earliest call-price time index in the test split. Tensor.min() reduces inside
# torch instead of iterating the tensor element by element with builtin min().
dataset_splits["test"]["T"].min()

tensor(194)

In [50]:
# --- Save to Pickle ---
# NOTE(review): the file name contains the typo "dataste"; it is kept as-is
# because other code may load the file by this exact name — TODO confirm
# before renaming.
with open(f"{EXP_PATH}/train_test_val_dataste_{VERSION}.pkl", "wb") as out_file:
    pickle.dump(dataset_splits, out_file)