In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f
def data_to_tensor(data, dtype=torch.float32):
    """Convert array-like ``data`` into a tensor on the preferred device.

    Args:
        data: Anything ``np.array`` accepts (this notebook passes DataFrame
            slices).
        dtype: Torch dtype of the resulting tensor; defaults to
            ``torch.float32``.

    Returns:
        A ``torch.Tensor`` built from a NumPy copy of ``data``, moved to the
        MPS device when ``torch.backends.mps.is_available()`` is true and
        kept on the CPU otherwise.
    """
    backend_name = "mps" if torch.backends.mps.is_available() else "cpu"
    target_device = torch.device(backend_name)
    values = np.array(data)
    host_tensor = torch.tensor(values, dtype=dtype)
    return host_tensor.to(target_device)

class CNNChannelDataset(torch.utils.data.Dataset):
    """Sliding-window dataset built from the rows of a DataFrame.

    Each item is a tuple ``(window, window)`` holding the same tensor twice.
    A window covers ``seq_n`` consecutive rows of ``data`` and is stored
    transposed, so its shape is ``(number of columns, seq_n)``. Tensors are
    created with ``data_to_tensor``, defined elsewhere in this notebook.

    Windows whose *last* index label falls inside
    ``[exclude_date_start, exclude_date_end]`` (inclusive) are skipped.
    NOTE(review): only the last label is tested, so a window ending after
    ``exclude_date_end`` can still contain rows from the excluded period;
    confirm this is intended.
    """

    def __init__(self, data: pd.DataFrame, seq_n: int, exclude_date_start = '2008-08-01', exclude_date_end ='2009-04-01') -> None:
        """Materialise every admissible window up front.

        Args:
            data: Frame whose index labels can be parsed by ``pd.to_datetime``.
            seq_n: Number of rows per window.
            exclude_date_start: First date of the excluded range (inclusive).
            exclude_date_end: Last date of the excluded range (inclusive).
        """
        # The bounds do not change across windows, so parse them once here
        # rather than on every loop iteration.
        exclude_from = pd.to_datetime(exclude_date_start)
        exclude_to = pd.to_datetime(exclude_date_end)

        # Shifting down by seq_n - 1 puts NaN in the first seq_n - 1 rows;
        # dropna() removes them (and any label whose shifted row has a NaN),
        # leaving the labels that can end a full window.
        sample_index = data.shift(seq_n-1).dropna().index.tolist()
        self.data_list = []
        for sample in sample_index:
            if exclude_from <= pd.to_datetime(sample) <= exclude_to:
                continue
            # Rows up to and including `sample`, keeping the last seq_n, then
            # transposed so columns become the leading (channel) axis.
            data_tensor = data_to_tensor(data.loc[:sample].iloc[-seq_n:].T)
            # The same tensor is used for both elements of the pair.
            data_tuple = (data_tensor, data_tensor)
            self.data_list.append(data_tuple)

    def __len__(self):
        """Return the number of windows kept after the exclusion filter."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Return the ``(window, window)`` tuple stored at ``index``."""
        return self.data_list[index]

# Use the MPS backend when PyTorch reports it as available; otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Seed PyTorch's default random number generator.
seed = 42
torch.manual_seed(seed)

# Load the CSV with its 'date' column as the index (no parse_dates is
# requested, so the labels are kept as read from the file).
mid_cap_index = pd.read_csv('../data/mid_cap_all_sectors_ret.csv', index_col='date')
# Scale every value by 100 — presumably converting fractional returns to
# percent; confirm against the data source.
ret = mid_cap_index * 100
# Split by row position, without shuffling: keep the first 80% of rows, then
# divide that part 80/20 into training and validation frames. The remaining
# 20% of `ret` is not used in this cell — presumably held out for testing.
n = int(len(ret) * 0.8)
train_n = int(n * 0.8)
tmp = ret.iloc[:n]
train_df = tmp.iloc[:train_n]
valid_df = tmp.iloc[train_n:]

# Number of columns in the training frame.
input_dim = train_df.shape[1]
# Window length (rows per sample) passed to CNNChannelDataset.
seq_n = 100
# Build the training windows; the default exclusion dates apply.
train_dataset = CNNChannelDataset(train_df, seq_n)

In [8]:
# Number of training windows kept after the exclusion-date filter.
len(train_dataset)

3703

In [9]:
# Build validation windows from valid_df only, with the same window length as
# training; the default exclusion dates apply here too.
valid_dataset = CNNChannelDataset(valid_df, seq_n)

In [10]:
# Number of validation windows kept after the exclusion-date filter.
len(valid_dataset)

894

In [4]:
# Shape of the first sample's input tensor: (columns, seq_n), because each
# window is transposed before being converted to a tensor.
train_dataset[0][0].shape

torch.Size([11, 100])

In [5]:
# Row 0 of the first window's input tensor, i.e. the first DataFrame column
# across that window (windows are stored transposed).
train_dataset[0][0][0,:]

tensor([-1.1153,  1.4503,  1.9151,  0.1463,  0.2355, -1.1988, -0.4537,  1.2009,
        -0.7826, -0.3205, -0.4999, -1.4888, -0.1310, -1.2225, -1.2254,  0.6941,
        -0.3964, -0.9279, -1.5058,  0.4917, -0.4280, -0.4284, -0.8743, -0.4611,
         0.2229, -1.4902, -1.2817, -0.1858,  2.0856,  0.1991, -0.4207, -0.4453,
        -2.2361, -0.4923, -2.1606, -1.4957, -1.2640,  0.2613,  0.0862, -0.3621,
        -0.9226,  1.5017, -2.5942, -0.8134,  0.6152,  1.5348,  0.9780,  0.9661,
        -2.5440,  2.7498,  5.1937, -1.3348, -0.2242,  0.6582, -0.7371,  1.2696,
        -0.6286, -0.7516, -0.1700,  1.5803,  1.2503,  2.1284,  0.9733, -1.5711,
        -0.5828,  1.2936, -0.4622, -0.7783,  1.6094,  0.3027, -0.3278, -2.8866,
         0.1420, -0.3918, -0.5795,  1.0793, -1.5441,  1.8322, -0.1777,  0.1938,
         1.3905,  0.9461,  0.2078, -1.9711,  1.0518,  0.6165, -0.7011, -0.9314,
        -1.7466,  2.2142, -0.0506,  0.4815, -0.5831, -0.6365, -1.5877, -0.3565,
        -0.3672, -0.9340,  0.4168, -1.24

In [7]:
# First 100 rows of the first column of train_df. The literal 100 mirrors the
# seq_n value set earlier in the notebook, so these are the raw values behind
# the first window.
train_df.iloc[0:100,0]

date
2000-01-04   -1.115250
2000-01-05    1.450299
2000-01-06    1.915120
2000-01-07    0.146314
2000-01-10    0.235475
                ...   
2000-05-19   -0.356475
2000-05-22   -0.367219
2000-05-23   -0.933987
2000-05-24    0.416800
2000-05-25   -1.249260
Name: Materials, Length: 100, dtype: float64

In [27]:
# Small 2-D and 1-D arrays used to try out NumPy's element-wise comparison.
y = np.array([[8,2,3],
              [7,1,6]])

x = np.array([6,1,6])

In [28]:
# y has two rows of three values.
y.shape

(2, 3)

In [29]:
# x is one-dimensional with three values, matching y's last axis.
x.shape

(3,)

In [30]:
# x (shape (3,)) is broadcast against each row of y (shape (2, 3)), giving an
# element-wise boolean array with y's shape.
y > x

array([[ True,  True, False],
       [ True, False, False]])