In [36]:
import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.loggers import TensorBoardLogger
import wandb

from DataPreparation import get_data

In [37]:
class TimeSeriesDataset(Dataset):
    def __init__(self, 
                 currency_list,
                 x: np.ndarray, 
                 y: np.ndarray,
                 data_use_type,
                 train_percentage,
                 val_percentage,
                 test_percentage,
                 seq_len, 
                 ):
        self.currencies = currency_list
        self.n_currencies = len(self.currencies)
        self.x = torch.tensor(x[:self.n_currencies]).float()
        self.y = torch.tensor(y[:self.n_currencies]).long()
        self.seq_len = seq_len
        self.data_use_type = data_use_type
        
        
        #self.train_size = int(len(self.x[0]) * train_percentage)
        self.val_size = int(len(self.x[0]) * val_percentage)
        self.test_size = int(len(self.x[0]) * test_percentage)
        self.train_size = len(self.x[0]) - self.val_size - self.test_size 
        print(self.test_size, self.val_size, self.train_size) 
        
        self.train_mean = [self.x[i][:self.train_size].mean(axis=0) for i in range(self.n_currencies)]
        print(self.train_mean)
        self.train_std = [self.x[i][:self.train_size].std(axis=0) for i in range(self.n_currencies)]
        
#         self.train_min = [self.x[i][:self.train_size].min() for i in range(n_currencies)]
#         self.train_max = [self.x[i][:self.train_size].max() for i in range(n_currencies)]
        
    def __len__(self):
        
        if self.data_use_type == "train":
            return self.train_size - ( self.seq_len)

        elif self.data_use_type == "val":
            return self.val_size
  
        else:
            return self.test_size
        
    
    def __getitem__(self, index):
        
        item = dict()
        
        if self.data_use_type =="val":
            index = self.train_size + index - self.seq_len
            
        elif self.data_use_type =="test":
            index = self.train_size + self.val_size + index - self.seq_len
        
        for i in range(self.n_currencies):
            window = self.x[i][index:index+self.seq_len]
            window = (window -self.train_mean[i]) / self.train_std[i]
            
            item[self.currencies[i] + "_window"] = window
            item[self.currencies[i] + "_label"]  = self.y[i][index+self.seq_len]

        return item

In [38]:
def name_model(config):
    task = "multi_task_" + "_".join(config["currency_list"]) if len(config["currency_list"]) > 1 else "single_task_" + config["currency_list"][0]
    classification = "multi_classification" if config["n_classes"] > 2 else "binary_classification"
    lstm = "stack_lstm" if len(config["lstm_hidden_sizes"]) > 1 else "single_lstm"
    trend_removed = "trend_removed" if config["remove_trend"] else ""
    loss_weighted = "loss_weighted" if config["loss_weight_calculate"] else ""
    indicators  = "indicators" if config["indicators"] else ""
    imfs = "imfs" if config["imfs"] else ""
    return "_".join([task, lstm, loss_weighted, classification, trend_removed])

CONFIG = {#fix for this project
          "window_size": 50, 
          "dataset_percentages": [0.97, 0.007, 0.023],
          "frenquency": "D", 
          "neutral_quantile": 0.33,
          "batch_size": 16,
          "bidirectional": False}
config = CONFIG.copy()
config.update({"n_classes": 2,
          "currency_list": ['BTC'],#['BTC', 'ETH', 'LTC'],
          "remove_trend": True,
          "lstm_hidden_sizes": [128, 128, 128],
          "loss_weight_calculate": False, 
          "indicators": True, 
          "imfs": False,
               "ohlv": True})

In [39]:
MODEL_NAME = name_model(config)

CURRENCY_LST = config["currency_list"]
N_CLASSES = config["n_classes"]
LSTM_HIDDEN_SIZES = config["lstm_hidden_sizes"]
BIDIRECTIONAL = config["bidirectional"]
REMOVE_TREND =config["remove_trend"]
LOSS_WEIGHT_CALCULATE = config["loss_weight_calculate"]

TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE = config["dataset_percentages"] 
WINDOW_SIZE = config["window_size"]
FREQUENCY = config["frenquency"]
NEUTRAL_QUANTILE = config["neutral_quantile"] if N_CLASSES > 2 else 0 
BATCH_SIZE= config["batch_size"]
INDICATORS = config["indicators"]
IMFS = config["imfs"]
OHLV = config["ohlv"]
#####
X, y, features, dfs = get_data(CURRENCY_LST,
                            N_CLASSES,
                             FREQUENCY, 
                             WINDOW_SIZE,
                             neutral_quantile = NEUTRAL_QUANTILE,
                             log_price=True,
                             remove_trend=REMOVE_TREND,
                             include_indicators = INDICATORS,
                             include_imfs = True
                            )
INPUT_FEATURE_SIZE = X.shape[-1]

ModuleNotFoundError: No module named 'PyEMD'

In [40]:
!pip install PyEMD

Collecting PyEMD
  Downloading pyemd-0.5.1.tar.gz (91 kB)
[K     |████████████████████████████████| 91 kB 228 kB/s eta 0:00:01
Building wheels for collected packages: PyEMD
  Building wheel for PyEMD (setup.py) ... [?25ldone
[?25h  Created wheel for PyEMD: filename=pyemd-0.5.1-cp38-cp38-linux_x86_64.whl size=582739 sha256=3417626fd554f5ec8ea6e6d57cdcca719ab41724fd6342da2baceae2751f2975
  Stored in directory: /home/aysenurk/.cache/pip/wheels/a2/a5/34/f960a47ca5c06b0e91b6f48117a79a66f53a879f8fac9529bf
Successfully built PyEMD
Installing collected packages: PyEMD
Successfully installed PyEMD-0.5.1


In [34]:
X[0][0].shape

(84,)

In [35]:
train_dataset, val_dataset, test_dataset = [TimeSeriesDataset(CURRENCY_LST, 
                                                          X, 
                                                          y, 
                                                          dtype, 
                                                          TRAIN_PERCENTAGE, 
                                                          VAL_PERCENTAGE, 
                                                          TEST_PERCENTAGE, 
                                                          WINDOW_SIZE) for dtype in ['train', 'val', 'test']]

31 9 1313
[tensor([-5.0744e-05,  1.8045e+06,  7.9578e+05,  1.0896e-01,  2.3417e+06,
         5.2833e+01,  2.6741e+08,  2.6392e+08,  3.1912e+02,  6.3611e+03,
         1.0571e+04,  7.1437e+02,  1.0489e+04,  1.2099e+04,  8.8788e+03,
         2.7589e+01,  5.6297e-01,  8.3778e-02,  4.4935e-02,  1.0654e+04,
         1.1382e+04,  9.9257e+03,  1.2225e+01,  6.1807e-01,  2.5590e-01,
         1.3481e-01,  8.7893e+03,  1.2222e+04,  1.0505e+04,  2.9928e+01,
         5.5979e-01,  8.3091e+00,  2.5967e+02,  2.5326e+02,  6.4109e+00,
         1.0650e+04,  1.0375e+04,  1.0651e+04,  1.0391e+04,  2.9581e+01,
         2.1951e+01,  1.9870e+01,  1.0315e+00,  9.5711e-01,  7.4427e-02,
         1.9354e-01,  2.4941e+01,  2.0288e+01, -1.5619e+01,  5.6813e+01,
         5.4309e+01,  2.5041e+00,  1.0660e+04,  1.0433e+04,  1.0547e+04,
         1.0156e+04,  9.7321e+03,  9.4731e+03,  5.4148e+01,  4.2142e+01,
         1.2006e+01,  1.0105e+04,  1.0918e+04,  3.7319e-02,  3.7319e-02,
         5.5104e+01,  5.3583e+01,  5.089

In [18]:
train_dataset[1]["BTC_window"]

tensor([[-0.0102, -0.0102, -0.0102,  ..., -0.0102, -0.0102, -0.0102],
        [-0.0102, -0.0102, -0.0102,  ..., -0.0102, -0.0102, -0.0102],
        [-0.0102, -0.0102, -0.0102,  ..., -0.0102, -0.0102, -0.0102],
        ...,
        [-0.0102, -0.0102, -0.0102,  ..., -0.0102, -0.0102, -0.0102],
        [-0.0102, -0.0102, -0.0102,  ..., -0.0102, -0.0102, -0.0102],
        [-0.0102, -0.0102, -0.0102,  ..., -0.0102, -0.0102, -0.0102]])