In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f
import numpy as np
def data_to_tensor(data, dtype=torch.float32):
    """Convert array-like ``data`` to a tensor on the MPS device if available, else the CPU.

    Args:
        data: Anything ``np.array`` accepts (e.g. a nested list or a DataFrame).
        dtype: Torch dtype of the returned tensor; defaults to ``torch.float32``.

    Returns:
        A new ``torch.Tensor`` holding a copy of ``data``.
    """
    backend = "mps" if torch.backends.mps.is_available() else "cpu"
    values = np.array(data)
    tensor = torch.tensor(values, dtype=dtype)
    return tensor.to(torch.device(backend))

class CNNChannelDataset(torch.utils.data.Dataset):
    """Rolling-window dataset whose items are ``(window, window)`` tensor pairs.

    Every row of ``data`` that has ``seq_n - 1`` predecessors ends one window.
    Each window is transposed before conversion, so a sample has shape
    ``(number of columns, seq_n)``. Input and target are the same tensor object.

    Args:
        data: Frame with one row per time step and one column per channel.
            When ``exclude`` is true the index labels must be parseable by
            ``pd.to_datetime``.
        seq_n: Number of consecutive rows per window; must be at least 1.
        exclude: When true, skip windows whose *last* label lies inside
            ``[exclude_start, exclude_end]`` (both ends inclusive).
        exclude_start: First date of the skipped range.
        exclude_end: Last date of the skipped range.

    Raises:
        ValueError: If ``seq_n`` is smaller than 1.

    Note:
        Only the window's end label is tested. Windows ending shortly after
        ``exclude_end`` still contain rows from inside the excluded range.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        seq_n: int,
        exclude = True,
        exclude_start = '2008-08-01',
        exclude_end = '2009-04-01',
    ) -> None:
        if seq_n < 1:
            raise ValueError(f"seq_n must be >= 1, got {seq_n}")

        # Parse the bounds once up front; they do not change between samples.
        if exclude:
            skip_from = pd.to_datetime(exclude_start)
            skip_until = pd.to_datetime(exclude_end)

        # After shifting by seq_n - 1 rows, the labels that survive dropna() are
        # the rows with a complete, NaN-free shifted row, i.e. valid window ends.
        window_ends = data.shift(seq_n - 1).dropna().index.tolist()

        self.data_list = []
        for end_label in window_ends:
            if exclude and skip_from <= pd.to_datetime(end_label) <= skip_until:
                continue

            # Rows up to and including end_label, trimmed to the last seq_n rows,
            # then transposed so that columns become the leading (channel) axis.
            window = data.loc[:end_label].iloc[-seq_n:].T
            window_tensor = data_to_tensor(window)
            self.data_list.append((window_tensor, window_tensor))

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.data_list[index]

# Use the MPS device when torch reports it as available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# torch.manual_seed only covers torch's generator. Later cells draw from
# np.random, so NumPy's global generator is seeded as well to keep a fresh
# "Restart & Run All" reproducible.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# The 'date' column becomes the index. Values are scaled by 100 (presumably
# fractional returns -> percent; confirm against the data source).
mid_cap_index = pd.read_csv('../data/mid_cap_all_sectors_ret.csv', index_col='date')
ret = mid_cap_index * 100

# Split by row order: the first 80% of rows form `tmp`; of those, the first 80%
# are used for training and the rest for validation. The final 20% of `ret` is
# not used in this cell.
n = int(len(ret) * 0.8)
train_n = int(n * 0.8)
tmp = ret.iloc[:n]
train_df = tmp.iloc[:train_n]
valid_df = tmp.iloc[train_n:]

# One channel per column; every sample is a window of seq_n consecutive rows.
input_dim = train_df.shape[1]
seq_n = 20
train_dataset = CNNChannelDataset(train_df, seq_n, exclude=True)

In [13]:
# Number of training windows (built with exclude=True, so windows ending in the excluded date range are not counted).
len(train_dataset)

3783

In [14]:
# Build the validation windows with the same window length and exclusion flag as the training set.
valid_dataset = CNNChannelDataset(valid_df, seq_n, exclude=True)

In [15]:
# Number of validation windows.
len(valid_dataset)

974

In [16]:
# Shape of the first sample's input tensor: (columns, seq_n), because each window is transposed before conversion.
valid_dataset[0][0].shape

torch.Size([11, 20])

In [17]:
# Sanity check: row 0 of the first window should equal the first seq_n values of valid_df's first column.
valid_dataset[0][0][0,:]

tensor([ 0.8991, -0.7810, -1.2286,  0.1161, -2.0100,  2.5600,  0.7553, -1.6141,
        -1.2755,  1.4477, -0.1290,  0.4776,  2.3201,  0.6755, -1.1295, -1.3862,
         0.1219, -0.8741,  0.0721, -0.7097], device='mps:0')

In [18]:
# First 20 rows of valid_df's first column, for comparison with the first channel of valid_dataset[0].
valid_df.iloc[0:20,0]

date
2015-10-15    0.899112
2015-10-16   -0.780959
2015-10-19   -1.228553
2015-10-20    0.116133
2015-10-21   -2.010037
2015-10-22    2.560004
2015-10-23    0.755332
2015-10-26   -1.614055
2015-10-27   -1.275462
2015-10-28    1.447740
2015-10-29   -0.128994
2015-10-30    0.477637
2015-11-02    2.320069
2015-11-03    0.675544
2015-11-04   -1.129488
2015-11-05   -1.386199
2015-11-06    0.121921
2015-11-09   -0.874123
2015-11-10    0.072062
2015-11-11   -0.709686
Name: Materials, dtype: float64

In [24]:
# Rolling standard deviation of every column over a seq_n-row window. Rows that
# contain NaN (including the leading rows without a full window) are dropped.
vol_df = valid_df.rolling(window=seq_n).std().dropna()
vol_df

Unnamed: 0_level_0,Materials,Industrials,Health Care,Real Estate,Consumer Discretionary,Financials,Utilities,Information Technology,Energy,Consumer Staples,Communication Services
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-11-11,1.274430,1.197362,1.768121,1.010746,1.017918,1.251349,1.208442,1.372668,2.533204,1.041177,1.578974
2015-11-12,1.418532,1.368330,1.885932,1.003793,1.034886,1.320464,1.179030,1.426179,2.506651,1.305854,1.595268
2015-11-13,1.422650,1.352572,1.892655,1.010378,1.207406,1.343220,1.178586,1.442747,2.605158,1.259928,1.603272
2015-11-16,1.437995,1.382369,1.892467,1.005447,1.239528,1.361238,1.282009,1.458200,2.698825,1.271578,1.645992
2015-11-17,1.459591,1.387995,1.857816,1.005674,1.240432,1.356677,1.331811,1.444327,2.766972,1.266834,1.631911
...,...,...,...,...,...,...,...,...,...,...,...
2019-09-19,1.494247,1.304049,1.163494,0.715995,1.553525,1.226418,0.792354,1.517996,3.784347,1.029020,1.654919
2019-09-20,1.480756,1.307500,1.130630,0.705739,1.546955,1.233716,0.791903,1.548306,3.764133,0.987930,1.651488
2019-09-23,1.312959,1.043919,0.948518,0.571382,1.244876,0.968073,0.635667,1.278850,3.599477,0.864468,1.550232
2019-09-24,1.421530,1.098914,1.128802,0.574531,1.283272,1.012595,0.583122,1.366815,3.790757,0.825388,1.627157


In [43]:
# 30th / 70th percentile of each column's rolling volatility. After the
# transpose, rows are the vol_df columns and the two columns are 0.3 and 0.7.
thresholds_df = vol_df.quantile([0.3, 0.7]).T
# Map each row label to its {"low", "high"} pair of thresholds.
{
    label: {"low": row[0.3], "high": row[0.7]}
    for label, row in thresholds_df.iterrows()
}

{'Materials': {'low': 0.8721838637381618, 'high': 1.3675819710791473},
 'Industrials': {'low': 0.8271148481134133, 'high': 1.1939285748614032},
 'Health Care': {'low': 1.0330252326930105, 'high': 1.5209183302002873},
 'Real Estate': {'low': 0.7167286686795477, 'high': 0.9837557315595057},
 'Consumer Discretionary': {'low': 0.8438497447901601,
  'high': 1.1583618059278578},
 'Financials': {'low': 0.8548435384311283, 'high': 1.244340088946994},
 'Utilities': {'low': 0.7367512348340797, 'high': 1.0007386471211765},
 'Information Technology': {'low': 1.0272435160701177,
  'high': 1.3922118538989268},
 'Energy': {'low': 1.6862890588652093, 'high': 2.4190524869152688},
 'Consumer Staples': {'low': 0.7427326278158609, 'high': 0.9591672543538732},
 'Communication Services': {'low': 1.0160448301115739,
  'high': 1.3896489586562735}}

In [None]:
# Empty (all-NaN) frame aligned with vol_df's rows and columns; this cell does
# not populate it.
regimes = pd.DataFrame(
    columns=vol_df.columns,
    index=vol_df.index,
)


In [27]:
# Small arrays for checking how a 2-D array compares against a 1-D array.
y = np.array([[8, 2, 3], [7, 1, 6]])  # shape (2, 3)
x = np.array([6, 1, 6])               # shape (3,)

In [28]:
# y is 2-D: 2 rows by 3 columns.
y.shape

(2, 3)

In [29]:
# x is 1-D with 3 elements, matching y's last axis.
x.shape

(3,)

In [30]:
# x is broadcast across each row of y, so the comparison is element-wise per column and yields a (2, 3) boolean array.
y > x

array([[ True,  True, False],
       [ True, False, False]])

In [16]:
import itertools
import pandas as pd

# Search space: every combination of these values is one candidate configuration.
param_grid = {
    'hidden_channels1': [16, 32, 64],
    'activation_func': [nn.ReLU(), nn.LeakyReLU(), nn.Tanh()],
    'dropout_rate': [0.1, 0.2],
    'kernel_size': [3, 5, 7, 11],
    'lr': [1e-3],
}

# Cartesian product in the dict's insertion order:
# (hidden_channels1, activation_func, dropout_rate, kernel_size, lr).
param_list = list(itertools.product(*param_grid.values()))

records = []
for hidden, func, dropout_rate, kernel, lr in param_list:
    # Placeholder score: a uniform draw shifted into [1, 2), not a measured error.
    mock_mae = 1 + np.random.uniform(0, 1, 1).item()
    records.append({
        'hidden_channels1': hidden,
        'activation_func': str(func),  # keep the module's repr so the column is plain text
        'dropout_rate': dropout_rate,
        'kernel_size': kernel,
        'lr': lr,
        'mae': mock_mae,
    })

reslt = pd.DataFrame(records)
# Row with the smallest 'mae' value.
best_model_params = reslt.loc[reslt['mae'].idxmin()]

In [17]:
# One row per parameter combination; 'mae' holds the random placeholder score.
reslt

Unnamed: 0,hidden_channels1,activation_func,dropout_rate,kernel_size,lr,mae
0,16,ReLU(),0.1,3,0.001,1.045890
1,16,ReLU(),0.1,5,0.001,1.928602
2,16,ReLU(),0.1,7,0.001,1.962618
3,16,ReLU(),0.1,11,0.001,1.072399
4,16,ReLU(),0.2,3,0.001,1.273290
...,...,...,...,...,...,...
67,64,Tanh(),0.1,11,0.001,1.689278
68,64,Tanh(),0.2,3,0.001,1.392316
69,64,Tanh(),0.2,5,0.001,1.702005
70,64,Tanh(),0.2,7,0.001,1.085890


In [18]:
# The row of reslt with the lowest 'mae'.
best_model_params

hidden_channels1          16
activation_func       Tanh()
dropout_rate             0.2
kernel_size                5
lr                     0.001
mae                 1.000272
Name: 21, dtype: object