In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the sector volume table (per the file name); the CSV's 'date' column becomes the row index.
data = pd.read_csv('index_data/mid_cap_all_sectors_volume.csv', index_col='date')

In [None]:
# Quick visual check of the loaded table: pandas draws one line per column against the frame's index.
data.plot()

In [2]:
# Returns table: every value is multiplied by 100 (presumably fraction ->
# percent; confirm against the data source) and every column gets a "_ret" tag.
ret = (
    pd.read_csv('index_data/mid_cap_all_sectors_ret.csv', index_col='date')
    .mul(100)
    .add_suffix('_ret')
)

# Volume table: values untouched, every column gets a "_volume" tag.
vol = pd.read_csv(
    'index_data/mid_cap_all_sectors_volume.csv', index_col='date'
).add_suffix('_volume')

# Side-by-side join on the 'date' index: return columns first, volume columns after.
full = pd.concat([ret, vol], axis=1)

# Positional split. Only the first 80% of the rows are used in this cell;
# of those, the leading 95% form the training frame and the rest validation.
n = int(len(full) * 0.8)
train_n = int(n * 0.95)
tmp = full.iloc[:n]
train_df, valid_df = tmp.iloc[:train_n], tmp.iloc[train_n:]

# Fit the scaler on the training frame only: one (mean, std) pair per column.
z_score_map = {
    name: (train_df[name].mean(), train_df[name].std())
    for name in train_df.columns
}

# Unpack the pairs into two Series keyed by column name so pandas can align
# them against the frames' columns in a single arithmetic expression.
centre = pd.Series({name: stats[0] for name, stats in z_score_map.items()})
spread = pd.Series({name: stats[1] for name, stats in z_score_map.items()})

# Standardise both splits with the *training* statistics. The arithmetic
# returns new frames, so the slices of `tmp` / `full` are left untouched.
train_df = (train_df - centre) / spread
valid_df = (valid_df - centre) / spread

In [6]:
# (rows, columns) of the normalised training frame.
train_df.shape

(4714, 22)

In [3]:
import torch
import torch.nn as nn

def data_to_tensor(data, dtype=torch.float32, device=None):
    """Copy array-like ``data`` into a tensor and move it to a device.

    Args:
        data: Anything ``np.array`` accepts (list, ndarray, pandas
            Series/DataFrame, ...).
        dtype: Torch dtype of the resulting tensor. Defaults to
            ``torch.float32``.
        device: Target device (``torch.device`` or a string such as
            ``"cpu"``). When ``None`` (the default) the device is picked
            automatically: ``"mps"`` if ``torch.backends.mps`` reports it as
            available, otherwise ``"cpu"``.

    Returns:
        torch.Tensor: A new tensor holding a copy of ``data`` on ``device``.
    """
    if device is None:
        # Default behaviour: prefer the MPS backend when present, else CPU.
        device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    return torch.tensor(np.array(data), dtype=dtype).to(device)

class CNNDataset(torch.utils.data.Dataset):
    """Sliding-window dataset of ``(input, target)`` pairs for reconstruction.

    Every sample is a window of ``seq_n`` consecutive rows of ``data``,
    transposed so that a DataFrame window has layout ``(columns, seq_n)``.
    The target of each pair is the very same tensor object as the input.

    Args:
        data: Table to window over. A DataFrame yields 2-D samples of shape
            ``(n_columns, seq_n)``; a Series yields 1-D samples of length
            ``seq_n``.
        seq_n: Number of consecutive rows per window (must be >= 1).

    Raises:
        ValueError: If ``seq_n`` is smaller than 1.

    Attributes:
        data_list: List of ``(tensor, tensor)`` tuples, one per window, in
            row order. All tensors are created eagerly in ``__init__``.
    """

    def __init__(self, data: pd.DataFrame, seq_n: int) -> None:
        if seq_n < 1:
            raise ValueError(f"seq_n must be a positive integer, got {seq_n!r}")

        # A row can close a window when the row (seq_n - 1) positions earlier
        # exists and is NaN-free: shifting pushes that earlier row down onto
        # the current one, and rows without enough history become NaN.
        # NOTE: only the window's first row is checked this way; NaNs in the
        # later rows of a window are not filtered out.
        has_history = data.shift(seq_n - 1).notna()
        if has_history.ndim == 2:
            has_history = has_history.all(axis=1)
        window_ends = np.flatnonzero(has_history.to_numpy())

        # Cut windows by row position rather than by index label, so that
        # duplicate or unsorted labels cannot change which rows are selected.
        values = np.asarray(data)
        self.data_list = []
        for end in window_ends:
            window = values[end - seq_n + 1:end + 1]
            data_tensor = data_to_tensor(window.T)
            self.data_list.append((data_tensor, data_tensor))

    def __len__(self):
        """Return the number of windows."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tuple stored at ``index``."""
        return self.data_list[index]

In [4]:
# Build the sliding-window dataset over the training frame; each sample spans 100 rows.
dataset = CNNDataset(train_df, 100)

In [5]:
# Shape of the first sample's input tensor; windows are transposed, so the layout is (columns, seq_n).
dataset[0][0].shape

torch.Size([22, 100])

In [None]:
import torch
import torch.nn as nn

# Custom loss function to focus more on return reconstruction using weighted loss
class CustomSectorLoss(nn.Module):
    """Weighted MSE over two channel groups: returns first, volumes after.

    Tensors are split along dimension 1: the first ``n_return_channels``
    channels are treated as returns, all remaining channels as volumes. The
    loss is::

        return_weight * mean(MSE(returns)) + volume_weight * mean(MSE(volumes))

    where each mean runs over every element of its group.

    Args:
        return_weight: Multiplier of the return-group loss.
        volume_weight: Multiplier of the volume-group loss.
        n_return_channels: Number of leading channels that hold returns.
            Defaults to 11.

    Raises:
        ValueError: If ``n_return_channels`` is smaller than 1.
    """

    def __init__(self, return_weight=2.0, volume_weight=1.0, n_return_channels=11):
        super().__init__()
        if n_return_channels < 1:
            raise ValueError(
                f"n_return_channels must be >= 1, got {n_return_channels!r}"
            )
        self.return_weight = return_weight  # Weight for return loss
        self.volume_weight = volume_weight  # Weight for volume loss
        self.n_return_channels = n_return_channels  # Channel index where volumes start
        self.mse_loss = nn.MSELoss(reduction='none')  # Element-wise MSE loss

    def forward(self, true, pred):
        """Return the scalar weighted loss between ``true`` and ``pred``.

        Args:
            true: Ground-truth tensor with channels on dimension 1, e.g.
                ``(batch_size, channels, seq_len)``.
            pred: Reconstruction with exactly the same shape as ``true``.

        Returns:
            torch.Tensor: 0-dimensional tensor holding the combined loss.

        Raises:
            ValueError: If the shapes differ, or if there are not more than
                ``n_return_channels`` channels (the volume group would be
                empty and its mean would be NaN).
        """
        if true.shape != pred.shape:
            raise ValueError(
                f"true and pred must have the same shape, got "
                f"{tuple(true.shape)} and {tuple(pred.shape)}"
            )
        split = self.n_return_channels
        if true.dim() < 2 or true.shape[1] <= split:
            raise ValueError(
                f"expected more than {split} channels on dimension 1, got "
                f"shape {tuple(true.shape)}"
            )

        # nn.MSELoss is called as (input, target): the prediction is the
        # input and the ground truth is the target. The value is symmetric.
        return_loss = self.mse_loss(pred[:, :split], true[:, :split])
        volume_loss = self.mse_loss(pred[:, split:], true[:, split:])

        # Average each group over all of its elements, then combine.
        return_loss_mean = return_loss.mean()
        volume_loss_mean = volume_loss.mean()
        total_loss = (
            self.return_weight * return_loss_mean
            + self.volume_weight * volume_loss_mean
        )

        return total_loss

# Example usage with data (batch_size=32, 22 channels for returns and volumes, seq_len=100)
# The batches are drawn from a private, seeded generator so the printed loss is
# the same on every run and torch's global RNG state is left untouched.
demo_rng = torch.Generator().manual_seed(0)
batch_true = torch.randn(32, 22, 100, generator=demo_rng)  # Ground truth: 11 sector returns + 11 sector volumes
batch_pred = torch.randn(32, 22, 100, generator=demo_rng)  # Predicted returns and volumes (22 channels)

# Custom loss
criterion = CustomSectorLoss(return_weight=2.0, volume_weight=1.0)  # Return is prioritized

# Compute total loss
loss = criterion(batch_true, batch_pred)
print(f"Total Loss: {loss.item()}")

# In inference, only output the first 11 channels (returns)
inference_output = batch_pred[:, :11, :]  # Only output returns
assert inference_output.shape == (32, 11, 100)  # (batch_size, 11, seq_len)
print(inference_output.shape)  # Should be (batch_size, 11, seq_len)