In [5]:
import os
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple

In [9]:
import torch
from torch import nn
from pytorch_forecasting.models.base_model import BaseModelWithCovariates
from pytorch_forecasting.models.nn import MultiEmbedding
from pytorch_forecasting import TimeSeriesDataSet

In [7]:
class FullyConnectedModule(nn.Module):
    def __init__(self, input_size: int, output_size: int, hidden_size: int, n_hidden_layers: int):
        super().__init__()
        module_list = [
            nn.Linear(input_size, hidden_size), nn.ReLU()
        ]
        for _ in range(n_hidden_layers):
            module_list.extend([
                nn.Linear(hidden_size, hidden_size), nn.ReLU()
            ])
        module_list.append(nn.Linear(hidden_size, output_size))
        self.sequential = nn.Sequential(*module_list)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x of shape: batch_size * n_timesteps_in
        # output of shape batch_size * n_timesteps_out
        return self.sequential(x)
    
network = FullyConnectedModule(input_size = 5, output_size = 2, hidden_size = 10, n_hidden_layers = 2)
x = torch.rand(20, 5)
network(x).shape

torch.Size([20, 2])

In [28]:
class FullyConnectedModel(BaseModelWithCovariates):
    def __init__(
        self, 
        input_size: int, 
        output_size: int, 
        hidden_size: int, 
        n_hidden_layers: int, 
        x_reals: List[str],
        x_categoricals: List[str],
        embedding_sizes: Dict[str, Tuple[int, int]],
        embedding_labels: Dict[str, List[str]],
        static_categoricals: List[str],
        static_reals: List[str],
        time_varying_categoricals_encoder: List[str],
        time_varying_categoricals_decoder: List[str],
        time_varying_reals_encoder: List[str],
        time_varying_reals_decoder: List[str],
        embedding_paddings: List[str],
        categorical_groups: Dict[str, List[str]],
        **kwargs):
        # saves arguments in signature to '.hparams' attribute, mandatory call
        self.save_hyperparameters()
        # pass additional arguments to BaseModel.__init__, mandatory call
        super().__init__(**kwargs)
        
        # create embedder
        self.input_embeddings = MultiEmbedding(
            embedding_sizes = self.hparams.embedding_sizes,
            categorical_groups = self.hparams.categorical_groups,
            embedding_paddings = self.hparams.embedding_paddings,
            x_categoricals = self.hparams.x_categoricals,
            max_embedding_size = self.hparams.hidden_size,
        )
        
        # Calculate the size of all concatenated embeddings + continuous variables
        n_features = sum(
            embedding_size for classes_size, embedding_size in self.hparams.embedding_sizes.values()
        ) + len(self.reals)
        
        self.network = FullyConnectedModule(
            input_size = self.hparams.input_size * n_features,
            output_size = self.hparams.output_size,
            hidden_size = self.hparams.hidden_size,
            n_hidden_layers = self.hparams.n_hidden_layers,
        )
        
    def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        # x is a batch generated based on the TimeSeriesDataset
        batch_size = x['encoder_lengths'].size(0)
        embeddings = self.input_embeddings(x['encoder_cat']) # returns dictionary with embedding tensors
        network_input = torch.cat(
            [x['encoder_cont']] + 
                [
                    emb for name, emb in embeddings.items()
                    if name in self.encoder_variables or name in self.static_variables
                ], dim = -1,
        )
        prediction = self.network(network_input.view(batch_size, -1))
        
        prediction = self.transform_output(prediction, target_scale = x['target_scale'])
        
        # We need to return a dictionary that at least contains the prediction
        # The parameter can be directly forwarded from the input.
        # The conversion to a named tuple can be directly achieved with the `to_network_output` function.
        return self.to_network_output(prediction=prediction)
    
    @classmethod
    def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs):
        new_kwargs = {
            'output_size': dataset.max_prediction_length,
            'input_size': dataset.max_encoder_length,
        }
        new_kwargs.update(kwargs)
        assert dataset.max_prediction_length == dataset.min_prediction_length, "Decoder only supports a fixed length"
        assert dataset.min_encoder_length == dataset.max_encoder_length, "Encoder only supports a fixed length"
        return super().from_dataset(dataset, **new_kwargs)

## Data 불러오기

In [11]:
with open('../data/crypto_currency.pickle', 'rb') as f:
    data = pickle.load(f)

#### Create index
  * 모형에서 integer ndex를 timestamp로 하고 있음
  * 가장 긴 데이터를 기준으로 index 생성

In [12]:
cnt = 0
m = 0
for k in data.keys():
    cnt += data[k].shape[0]
    m = np.max([m, data[k].shape[0]])

In [13]:
data['KRW-ZIL'].shape

(720, 16)

In [14]:
d1 = data['KRW-ZIL']
d1 = d1.sort_values('candle_date_time_kst')
d1['index'] = np.arange(len(d1))
date_index = d1[['index', 'candle_date_time_kst']]

In [15]:
print(cnt)

103848


In [16]:
res_data = pd.DataFrame()
for i, key in enumerate(data.keys()):
    tmp = data[key]
    tmp_1 = pd.merge(tmp, date_index, on ='candle_date_time_kst')
    try:
        res_data = pd.concat([res_data, tmp_1[['market', 'candle_acc_trade_volume', 'index', 'change_price', 'candle_date_time_kst', 'trade_price']]])
    except Exception as e:
        res_data = pd.concat([res_data, tmp_1[['market', 'candle_acc_trade_volume', 'index_y', 'change_price', 'candle_date_time_kst', 'trade_price']]])
        pass

### Dupe 발생
* Dupe 나는 data 파악

In [17]:
res_data['new_idx'] = res_data['market'] + res_data['index'].astype(str)

In [18]:
res_data.groupby(['new_idx']).count()[['index']][res_data.groupby(['new_idx']).count()['index'] > 1]

Unnamed: 0_level_0,index
new_idx,Unnamed: 1_level_1
BTC-AERGO517,2
BTC-AERGO518,2
BTC-AERGO519,2
BTC-AHT519,2
BTC-ANKR518,2
BTC-ANKR519,2
BTC-ARDR519,2
BTC-CBK519,2
BTC-DKA519,2
BTC-FCT2519,2


In [19]:
dup_new_idx = res_data.groupby(['new_idx']).count()[['index']][res_data.groupby(['new_idx']).count()['index'] > 1].index

In [21]:
dup_new_idx

Index(['BTC-AERGO517', 'BTC-AERGO518', 'BTC-AERGO519', 'BTC-AHT519',
       'BTC-ANKR518', 'BTC-ANKR519', 'BTC-ARDR519', 'BTC-CBK519', 'BTC-DKA519',
       'BTC-FCT2519', 'BTC-HUM519', 'BTC-IQ519', 'BTC-LOOM519', 'BTC-RFR517',
       'BTC-RFR518', 'BTC-RFR519', 'BTC-XEM519', 'BTC-ZIL519', 'USDT-SC519'],
      dtype='object', name='new_idx')

### Dupe 나는 row 제거

In [22]:
res_data.drop_duplicates(subset = ['new_idx'], inplace = True)

### Error 발생 -> AssertionError: data index has to be unique

In [23]:
res_data.set_index('new_idx', inplace = True)

In [24]:
dataset = TimeSeriesDataSet(
    res_data[['market', 'trade_price', 'index', 'candle_acc_trade_volume']], # , 'change_price'에는 미싱값이 많음
    group_ids = ['market'],
    target = 'trade_price',
    time_idx = 'index',
    min_encoder_length = 5,
    max_encoder_length = 5,
    min_prediction_length = 2,
    max_prediction_length = 2,
    time_varying_unknown_reals = ['trade_price'],
    time_varying_known_reals = ['candle_acc_trade_volume'],
    time_varying_known_categoricals = [],
    static_categoricals = ['market'],   
    allow_missing_timesteps=True, #미싱값이 있는 경우
)

In [25]:
dataset.get_parameters()

{'time_idx': 'index',
 'target': 'trade_price',
 'group_ids': ['market'],
 'weight': None,
 'max_encoder_length': 5,
 'min_encoder_length': 5,
 'min_prediction_idx': 0,
 'min_prediction_length': 2,
 'max_prediction_length': 2,
 'static_categoricals': ['market'],
 'static_reals': [],
 'time_varying_known_categoricals': [],
 'time_varying_known_reals': ['candle_acc_trade_volume'],
 'time_varying_unknown_categoricals': [],
 'time_varying_unknown_reals': ['trade_price'],
 'variable_groups': {},
 'constant_fill_strategy': {},
 'allow_missing_timesteps': True,
 'lags': {},
 'add_relative_time_idx': False,
 'add_target_scales': False,
 'add_encoder_length': False,
 'target_normalizer': GroupNormalizer(transformation='log'),
 'categorical_encoders': {'__group_id__market': NaNLabelEncoder(),
  'market': NaNLabelEncoder()},
 'scalers': {'candle_acc_trade_volume': StandardScaler()},
 'randomize_length': None,
 'predict_mode': False}

In [31]:
dataloader = dataset.to_dataloader(batch_size = 4)
x, y = next(iter(dataloader))
print('x = ', x)
print('\ny = ', y)
for key, value in x.items():
    print(f'\t{key} = {value.size()}')

x =  {'encoder_cat': tensor([[[112],
         [112],
         [112],
         [112],
         [112]],

        [[ 97],
         [ 97],
         [ 97],
         [ 97],
         [ 97]],

        [[154],
         [154],
         [154],
         [154],
         [154]],

        [[118],
         [118],
         [118],
         [118],
         [118]]]), 'encoder_cont': tensor([[[-0.0376,  1.0103],
         [-0.0376,  1.0105],
         [-0.0376,  1.0099],
         [-0.0376,  1.0101],
         [-0.0376,  1.0027]],

        [[-0.0376,  0.7061],
         [-0.0376,  0.7015],
         [-0.0376,  0.7048],
         [-0.0376,  0.7034],
         [-0.0376,  0.6982]],

        [[-0.0376,  0.7011],
         [-0.0376,  0.7011],
         [-0.0376,  0.7039],
         [-0.0376,  0.6863],
         [-0.0376,  0.6885]],

        [[-0.0376,  0.5295],
         [-0.0376,  0.5295],
         [-0.0376,  0.5323],
         [-0.0376,  0.5351],
         [-0.0376,  0.5404]]]), 'encoder_target': tensor([[4125.0000, 4135.00

In [39]:
# from torch.utils.data import WeightedRandomSampler

#probabilities = np.sqrt(1 + data.loc[dataset.index, 'trade_price'])
#sampler = WeightedRandomSampler(probabilities, len(probabilities))
# dataset.to_dataloader(train = True,  shuffle = True) # sampler = sampler

In [29]:
model = FullyConnectedModel.from_dataset(dataset, input_size = 5, output_size = 2, hidden_size = 10, n_hidden_layers = 2)
model.summarize(4)

   | Name                               | Type                 | Params
-----------------------------------------------------------------------------
0  | loss                               | SMAPE                | 0     
1  | logging_metrics                    | ModuleList           | 0     
2  | input_embeddings                   | MultiEmbedding       | 2.0 K 
3  | input_embeddings.embeddings        | ModuleDict           | 2.0 K 
4  | input_embeddings.embeddings.market | Embedding            | 2.0 K 
5  | network                            | FullyConnectedModule | 1.9 K 
6  | network.sequential                 | Sequential           | 1.9 K 
7  | network.sequential.0               | Linear               | 1.7 K 
8  | network.sequential.1               | ReLU                 | 0     
9  | network.sequential.2               | Linear               | 110   
10 | network.sequential.3               | ReLU                 | 0     
11 | network.sequential.4               | Linear          

In [32]:
x, y = next(iter(dataloader))
model(x)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x60 and 165x10)

In [216]:
dataset.x_to_index(x)

Unnamed: 0,index,market
0,564,BTC-REI
1,430,KRW-SRM
2,538,BTC-AERGO
3,634,KRW-AAVE


In [29]:
from pytorch_forecasting.models.nn import MultiEmbedding

class FullyConnectedModel(BaseModelWithCovariates):
    def __init__(
        self, 
        input_size: int, 
        output_size: int, 
        hidden_size: int, 
        n_hidden_layers: int, 
        x_reals: List[str],
        x_categoricals: List[str],
        embedding_sizes: Dict[str, Tuple[int, int]],
        embedding_labels: Dict[str, List[str]],
        static_categoricals: List[str],
        static_reals: List[str],
        time_varying_categoricals_encoder: List[str],
        time_varying_categoricals_decoder: List[str],
        time_varying_reals_encoder: List[str],
        time_varying_reals_decoder: List[str],
        embedding_paddings: List[str],
        categorical_groups: Dict[str, List[str]],
        **kwargs):
        # saves arguments in signature to '.hparams' attribute, mandatory call
        self.save_hyperparameters()
        # pass additional arguments to BaseModel.__init__, mandatory call
        super().__init__(**kwargs)
        
        # create embedder
        self.input_embeddings = MultiEmbedding(
            embedding_sizes = self.hparams.embedding_sizes,
            categorical_groups = self.hparams.categorical_groups,
            embedding_padding = self.hparams.embedding_paddings,
            x_categoricals = self.hparams.x_categoricals,
            max_embedding_size = self.hparams.hidden_size,
        )
        
        # Calculate the size of all concatenated embeddings + continuous variables
        n_features = sum(
            embedding_size for classes_size, embedding_size in self.hparams.embedding_sizes.values()
        ) + len(self.reals)
        
        self.network = FullyConnectedModule(
            input_size = self.hparams.input_size * n_features,
            output_size = self.hparams.output_size,
            hidden_size = self.hparams.hidden_size,
            n_hidden_layers = self.hparams.n_hidden_layers,
        )
        
    def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        # x is a batch generated based on the TimeSeriesDataset
        batch_size = x['encoder_lengths'].size(0)
        embeddings = self.input_embeddings(x['encoder_cat']) # returns dictionary with embedding tensors
        network_input = torch.cat(
            [x['encoder_cont']] + 
                [
                    emb for name, emb in embeddings.items()
                    if name in self.encoder_variables or name in self.static_variables
                ], dim = -1,
        )
        prediction = self.network(network_input.view(batch_size, -1))
        
        prediction = self.transform_output(prediction, target_scale = x['target_scale'])
        
        # We need to return a dictionary that at least contains the prediction
        # The parameter can be directly forwarded from the input.
        # The conversion to a named tuple can be directly achieved with the `to_network_output` function.
        return self.to_network_output(prediction=prediction)
    
    @classmethod
    def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs):
        new_kwargs = {
            'output_size': dataset.max_prediction_length,
            'input_size': dataset.max_encoder_length,
        }
        new_kwargs.update(kwargs)
        assert dataset.max_prediction_length == dataset.min_prediction_length, "Decoder only supports a fixed length"
        assert dataset.min_encoder_length == dataset.max_encoder_length, "Encoder only supports a fixed length"
        return super().from_dataset(dataset, **new_kwargs)

In [30]:
model = FullyConnectedModel.from_dataset(dataset, hidden_size = 10, n_hidden_layers = 2)
print(model.summarize(3))
model.hparams

TypeError: __init__() got an unexpected keyword argument 'embedding_padding'

In [235]:
from pytorch_lightning import Trainer

trainer = Trainer(max_epochs = 20, gpus = 0)
trainer.fit(model, train_dataloaders = dataloader, val_dataloaders = dataloader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/kyle/course-material/notebooks/lightning_logs

  | Name            | Type                 | Params
---------------------------------------------------------
0 | loss            | SMAPE                | 0     
1 | logging_metrics | ModuleList           | 0     
2 | network         | FullyConnectedModule | 302   
---------------------------------------------------------
302       Trainable params
0         Non-trainable params
302       Total params
0.001     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]