In [1]:
import pandas as pd
import numpy as np

In [2]:
try:
    from PyEMD import EEMD
except:
    !pip install EMD-signal
    from PyEMD import EEMD

from ta import add_all_ta_features

In [3]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.metrics import F1
from pytorch_lightning.loggers import WandbLogger

from pytorch_lightning.loggers import TensorBoardLogger

In [4]:
CURRENCY_LST = ['BTC', 'ETH', 'LTC']
PRICE_TYPE = 'close'
FREQUENCY = "D"
WINDOW_SIZE = 14
NEUTRAL_QUANTILE = 0.25
LOG_PRICE = True
INDICATORS = False
IMFS = False

In [5]:
N_CLASSES = 2 ###
TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE = 0.80, 0.10, 0.10

In [6]:
LSTM_HIDDEN_SIZES = [128, 128, 128]

In [7]:
BATCH_SIZE= 4

In [8]:
def get_data(currency_lst,
             frequency, 
             window_size,
             neutral_quantile = 0.25,
             beg_date = pd.Timestamp(2013,1,1),
             end_date = pd.Timestamp.now(),
             log_price = True, 
             include_indicators = True,
             include_imfs = True):
        
        X, y, dfs = {}, {}, {}     
        
        for cur in currency_lst:
            df = pd.read_csv(f"../data/0_raw/Binance/{str.lower(cur)}_usdt_1d.csv", index_col=0).reset_index()   
            
            if include_indicators:
                df = add_all_ta_features(df, open="Open", high="High", low="Low", close="Close", volume="Volume", 
                                         fillna=True)
            else:
                df.drop("Volume", axis=1, inplace=True)
            
            df.Date = df.Date.apply(pd.Timestamp)
            df.sort_values("Date", ascending=True, inplace=True)
            df.set_index("Date", inplace=True)
            df.drop(["Timestamp", "Open", "High", "Low"], axis=1, inplace=True)
            df.rename(str.lower, axis=1, inplace=True)
            
            if log_price:
                df["close"] = df["close"].apply(np.log)
      
            if include_imfs:
                eemd = EEMD()
                imfs = eemd(df[PRICE_TYPE].values)
                imf_features = ["imf_"+str(i) for i in range(imfs.shape[0])]
                df = pd.concat((df, pd.DataFrame(imfs.T, columns=imf_features, index=df.index)), axis=1)
                
            price_diff = df["close"].diff().dropna()
            rolling_quantiles = price_diff.abs().rolling(window_size).quantile(neutral_quantile).dropna()
#             conditions = [(price_diff[window_size-1:] < 0) & (price_diff[window_size-1:].abs() > rolling_quantiles),
#                           (price_diff[window_size-1:] > 0) & (price_diff[window_size-1:].abs() > rolling_quantiles)]
            conditions = [(price_diff[window_size-1:] < 0),
                          (price_diff[window_size-1:] > 0)]

#             classes = [1,2] #1 is decrease, 2 is decrease, and 0 is neutral if none of conditions is met
            classes = [0,1]
    
            y = pd.Series(np.select(conditions, classes, default=0), index=price_diff[window_size-1:].index)
            df.insert(loc=0, column="change_dir", value=y)
            dfs[cur] = df
            
        min_dates = [df.index.min() for cur, df in dfs.items()]
        max_dates = [df.index.max() for cur, df in dfs.items()]
        beg_date = max([max(min_dates), beg_date])
        end_date = min([min(max_dates), end_date])
        common_range = pd.date_range(beg_date, end_date, freq=frequency)
        
        X = np.array([dfs[cur].drop("change_dir", axis=1).loc[common_range].values for cur in currency_lst])
        y = np.array([dfs[cur].loc[common_range, "change_dir"].values for cur in currency_lst])
        features = df.columns.tolist()
        
        return X, y, features, dfs

In [9]:
X, y, features, dfs = get_data(CURRENCY_LST,
                                 FREQUENCY, 
                                 WINDOW_SIZE,
                                 neutral_quantile = NEUTRAL_QUANTILE,
                                 log_price=LOG_PRICE,
                                 include_indicators = INDICATORS, #True diyince patlıyor
                                 include_imfs = IMFS
                                )

In [10]:
N_CURRENCIES = X.shape[0]
INPUT_FEATURE_SIZE = X.shape[-1]

In [28]:
class MultiTimeSeriesDataset(Dataset):
    def __init__(self, 
                 n_currencies,
                 x: np.ndarray, 
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE, 
                 ):
        
        self.x = torch.tensor(x[:n_currencies]).float()
        self.y = torch.tensor(y[:n_currencies]).long()
        self.seq_len = seq_len
        self.data_use_type = data_use_type
        
        #self.train_size = int(len(self.x[0]) * train_percentage)
        self.val_size = int(len(self.x[0]) * val_percentage)
        self.test_size = int(len(self.x[0]) * test_percentage)
        self.train_size = len(self.x[0]) - self.val_size - self.test_size 
        
#         self.train_mean = [self.x[i][:self.train_size].mean() for i in range(n_currencies)]
#         self.train_std = [self.x[i][:self.train_size].std() for i in range(n_currencies)]
        
#         self.train_min = [self.x[i][:self.train_size].min() for i in range(n_currencies)]
#         self.train_max = [self.x[i][:self.train_size].max() for i in range(n_currencies)]
        
    def __len__(self):
        
        if self.data_use_type == "train":
            return self.train_size - ( self.seq_len)

        elif self.data_use_type == "val":
            return self.val_size
  
        else:
            return self.test_size
        
    
    def __getitem__(self, index):
        
        item = dict()
        
        if self.data_use_type =="val":
            index = self.train_size + index - self.seq_len
            
        elif self.data_use_type =="test":
            index = self.train_size + self.val_size + index - self.seq_len
        
        for i in range(N_CURRENCIES):
            window = self.x[i][index:index+self.seq_len]
            #window = (window -self.train_mean[i]) / self.train_std[i]
            
            item["currency_" + str(i) + "_window"] = window
            item["currency_" + str(i) + "_label"]  = self.y[i][index+self.seq_len]

        return item

In [29]:
train_dataset, val_dataset, test_dataset = [MultiTimeSeriesDataset(N_CURRENCIES, X, y, dtype) for dtype in ['train', 'val', 'test']]

In [30]:
X[1][:2].mean()

6.524841546163962

In [31]:
def collate_fn(batch):
    collated_batch = dict()
    for i in range(N_CURRENCIES):
        collated_batch["currency_" + str(i) + "_window"] = []
        collated_batch["currency_" + str(i) + "_label"] = []
        
    for sample in batch: 
        for i in range(N_CURRENCIES):
            collated_batch["currency_" + str(i) + "_window"].append(sample["currency_" + str(i) + "_window"])
            collated_batch["currency_" + str(i) + "_label"].append(sample["currency_" + str(i) + "_label"])
    
    for i in range(N_CURRENCIES):
        collated_batch["currency_" + str(i) + "_window"] = torch.stack(collated_batch["currency_" + str(i) + "_window"])
        collated_batch["currency_" + str(i) + "_label"] = torch.stack(collated_batch["currency_" + str(i) + "_label"])
    
    return collated_batch

#calculate loss' weights
train_labels = [train_dataset[i][1].item() for i in range (train_dataset.__len__())]

cnt = Counter(train_labels)
samples_size = np.array([cnt[key] for key in  sorted(cnt.keys())])

loss_weights = (1 / samples_size) * sum(samples_size)/2
loss_weights, cnt

In [32]:
train_dataset[0]

{'currency_0_window': tensor([[9.6824],
         [9.7011],
         [9.7722],
         [9.8576],
         [9.8448],
         [9.8446],
         [9.7582],
         [9.7104],
         [9.6481],
         [9.4975],
         [9.4955],
         [9.5104],
         [9.5251],
         [9.6607]]),
 'currency_0_label': tensor(0),
 'currency_1_window': tensor([[6.5279],
         [6.5217],
         [6.5146],
         [6.5262],
         [6.5619],
         [6.6644],
         [6.6821],
         [6.6723],
         [6.6624],
         [6.4552],
         [6.4620],
         [6.4892],
         [6.5722],
         [6.6201]]),
 'currency_1_label': tensor(0),
 'currency_2_window': tensor([[5.6699],
         [5.6073],
         [5.6836],
         [5.6835],
         [5.7411],
         [5.8636],
         [5.8245],
         [5.7127],
         [5.7104],
         [5.5254],
         [5.5351],
         [5.5718],
         [5.5653],
         [5.6275]]),
 'currency_2_label': tensor(0)}

In [33]:
for batch in DataLoader(train_dataset, batch_size=3, shuffle = True):
    print(batch)
    break

{'currency_0_window': tensor([[[9.1319],
         [9.1129],
         [9.1321],
         [9.1850],
         [9.1813],
         [9.1966],
         [9.1756],
         [9.1447],
         [9.1256],
         [9.1388],
         [9.1052],
         [9.0360],
         [9.0438],
         [9.0687]],

        [[9.0687],
         [9.0669],
         [9.0433],
         [9.0276],
         [8.9924],
         [9.0167],
         [9.0160],
         [9.0510],
         [9.0338],
         [8.9843],
         [8.9229],
         [8.9326],
         [8.9169],
         [8.9003]],

        [[9.0406],
         [9.0289],
         [9.0613],
         [9.0947],
         [9.1457],
         [9.1379],
         [9.1604],
         [9.1434],
         [9.1468],
         [9.1412],
         [9.1369],
         [9.1266],
         [9.1708],
         [9.1873]]]), 'currency_0_label': tensor([0, 1, 1]), 'currency_1_window': tensor([[[6.5084],
         [6.5103],
         [6.5346],
         [6.6580],
         [6.6669],
         [6.7087],

In [16]:
class LSTM_based_classification_model(pl.LightningModule):
    def __init__(self,
                 train_dataset = train_dataset,
                 val_dataset = val_dataset,
                 test_dataset = test_dataset,
#                  weights = loss_weights,
                 num_tasks = N_CURRENCIES,
                 num_classes = N_CLASSES,
                 window_size = WINDOW_SIZE,
                 input_size = INPUT_FEATURE_SIZE,
                 batch_size=8,
                 lstm_hidden_sizes = LSTM_HIDDEN_SIZES,
                 bidirectional = False,
                 ):
        
        super().__init__()
        self.num_classes = num_classes
        self.num_tasks = num_tasks
        self.window_size = window_size
        self.input_size = input_size
        self.batch_size = batch_size
        
        self.lstm_hidden_sizes = lstm_hidden_sizes
        self.bidirectional = bidirectional 
        
#         self.stack_lstm = nn.LSTM(input_size = self.input_size, 
#                 hidden_size = self.lstm_hidden_size, 
#                 num_layers= self.lstm_stack_size,
#                 dropout = self.lstm_dropout, # sadece stack arasına koyuyor
#                 bidirectional = self.bidirectional, 
#                 batch_first=True,) 
        
        self.lstm_1 = nn.LSTM(input_size = self.input_size, num_layers=1, batch_first=True, hidden_size = self.lstm_hidden_sizes[0])
        self.batch_norm1 = nn.BatchNorm2d(num_features=self.lstm_hidden_sizes[0])
        
        self.lstm_2 = nn.LSTM(input_size = self.lstm_hidden_sizes[0], num_layers=1, batch_first=True, hidden_size = self.lstm_hidden_sizes[1])
        self.batch_norm2 = nn.BatchNorm2d(num_features=self.lstm_hidden_sizes[1])
        
        self.lstm_3 = nn.LSTM(input_size = self.lstm_hidden_sizes[1], num_layers=1, batch_first=True, hidden_size = self.lstm_hidden_sizes[2])
        self.batch_norm3 = nn.BatchNorm2d(num_features=self.lstm_hidden_sizes[2])
        
        self.dropout = nn.Dropout(0.5)
        
        self.linear1 = nn.Linear(self.lstm_hidden_sizes[2], int(self.lstm_hidden_sizes[2]/2))
        self.activation = nn.ReLU()
        
        self.output_layers = [nn.Linear(int(self.lstm_hidden_sizes[2]/2), self.num_classes)] * N_CURRENCIES
        self.output_layers = torch.nn.ModuleList(self.output_layers)
        
        
#         self.cross_entropy_loss = nn.CrossEntropyLoss(weight= torch.tensor(weights).float()) # loss weight
        self.cross_entropy_loss = nn.CrossEntropyLoss()
        
        self.f1_score = pl.metrics.F1(num_classes=self.num_classes, average="macro")
        self.accuracy_score = pl.metrics.Accuracy()
        
        self.train_dl = DataLoader(train_dataset, batch_size=self.batch_size, shuffle = True)
        self.val_dl = DataLoader(val_dataset, batch_size=self.batch_size)
        self.test_dl = DataLoader(test_dataset, batch_size=self.batch_size)
        
    def forward(self, x, i):

        batch_size = x.size()[0]
        
        x = x.view(batch_size, self.window_size, self.input_size) #(batch, window_len, feature_size)
        x, _  = self.lstm_1(x)
        x = self.dropout(x)

        x = x.reshape(x.size()[-1], batch_size, self.window_size) #(feature_size, batch, window_len)
        x = self.batch_norm1(x.unsqueeze(0))

        x = x.view(batch_size, self.window_size, x.size()[1])
        x, _  = self.lstm_2(x)
        x = self.dropout(x)
        
        x = x.reshape(x.size()[-1], batch_size, self.window_size) #(feature_size, batch, window_len)
        x = self.batch_norm2(x.unsqueeze(0))
        
        x = x.view(batch_size, self.window_size, x.size()[1])
        x, _  = self.lstm_3(x)
        x = self.dropout(x)
        
        x = x.reshape(x.size()[-1], batch_size, self.window_size) #(feature_size, batch, window_len)
        x = self.batch_norm3(x.unsqueeze(0))
        
        x = x.view(batch_size, self.window_size, x.size()[1])
        x = x[:, -1, :] # equivalent to return sequence = False on keras :)
        
        x = self.dropout(x)
        
        x = self.linear1(x)
        x = self.activation(x)
        
#         x = self.dropout(x)
        
#         x = self.linear2(x)
#         x = self.activation(x)
#         x = self.dropout(x)
            
        output = self.output_layers[i](x)

        #output = F.log_softmax(output, dim = 1)
        #output = F.softmax(output)
        
        return output
    
    
    def training_step(self, batch, batch_nb):
        
        loss = (torch.tensor(0.0, device="cuda:0", requires_grad=True) + \
                torch.tensor(0.0, device="cuda:0", requires_grad=True)) 
        # araştırılabilir
        for i in range(self.num_tasks):
            x, y = batch["currency_" + str(i) + "_window"], batch["currency_" + str(i) + "_label"]

            output = self.forward(x, i)
            #loss = F.nll_loss(output, y)
            loss += self.cross_entropy_loss(output, y)
            
            acc = self.accuracy_score(torch.max(output, dim=1)[1], y)
            self.log("currency_" + str(i) +'_train_acc', acc, on_epoch=True, prog_bar=True)

            f1 = self.f1_score(torch.max(output, dim=1)[1], y)
            self.log("currency_" + str(i) +'_train_f1', f1, on_epoch=True, prog_bar=True)
        
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        loss = loss / torch.tensor(3)
        return loss 
    
#     def validation_step(self, batch, batch_nb):
#         x, y = batch
#         output = self(x)
#         #loss = F.nll_loss(output, y)
#         loss = self.cross_entropy_loss(output, y)
#         self.log('val_loss', loss, on_epoch=True, reduce_fx=torch.mean, prog_bar=True)
        
#         #print(torch.max(output, dim=1)[1])
#         acc = self.accuracy_score(torch.max(output, dim=1)[1], y)
#         self.log('val_acc', acc, on_epoch=True, reduce_fx=torch.mean, prog_bar=True)
        
#         f1 = self.f1_score(torch.max(output, dim=1)[1], y)
#         self.log('val_f1', f1, on_epoch=True, reduce_fx=torch.mean, prog_bar=True)
    
    def validation_step(self, batch, batch_nb):
        loss = torch.tensor(0.0, device="cuda:0") + torch.tensor(0.0, device="cuda:0")
        
        for i in range(self.num_tasks):
            x, y = batch["currency_" + str(i) + "_window"], batch["currency_" + str(i) + "_label"]

            output = self(x, i)
            #loss = F.nll_loss(output, y)
            loss += self.cross_entropy_loss(output, y)
 
            acc = self.accuracy_score(torch.max(output, dim=1)[1], y)
            self.log("currency_" + str(i) +'_val_acc', acc, on_epoch=True, prog_bar=True, reduce_fx=torch.mean)

            f1 = self.f1_score(torch.max(output, dim=1)[1], y)
            self.log("currency_" + str(i) +'_val_f1', f1, on_epoch=True, prog_bar=True, reduce_fx=torch.mean)
        
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
    
    def test_step(self, batch, batch_nb):
        loss = torch.tensor(0.0, device="cuda:0") + torch.tensor(0.0, device="cuda:0")
        
        for i in range(self.num_tasks):
            x, y = batch["currency_" + str(i) + "_window"], batch["currency_" + str(i) + "_label"]

            output = self(x, i)
            #loss = F.nll_loss(output, y)
            loss += self.cross_entropy_loss(output, y)
            
            acc = self.accuracy_score(torch.max(output, dim=1)[1], y)
            self.log("currency_" + str(i) +'_test_acc', acc, on_epoch=True, reduce_fx=torch.mean)

            f1 = self.f1_score(torch.max(output, dim=1)[1], y)
            self.log("currency_" + str(i) +'_test_f1', f1, on_epoch=True, reduce_fx=torch.mean)
        
        self.log('test_loss', loss, on_epoch=True, reduce_fx=torch.mean)
        
#     def test_step(self, batch, batch_nb):
#         x, y = batch
#         output = self(x)
        
#         #print(y, torch.max(output, dim=1)[1])
#         #print(output)
#         print(F.softmax(output)) # mantıken fark etmiyor
#         print(y, torch.max(output, dim=1)[1])
        
#         #loss = F.nll_loss(output, y)
#         loss = self.cross_entropy_loss(output, y)
#         self.log('test_loss', loss, on_epoch=True, reduce_fx=torch.mean)
        
#         acc = self.accuracy_score(torch.max(output, dim=1)[1], y)
#         self.log('test_acc', acc, on_epoch=True, reduce_fx=torch.mean)
        
#         f1 = self.f1_score(torch.max(output, dim=1)[1], y)
#         self.log('test_f1', f1, on_epoch=True, reduce_fx=torch.mean)
        
    def configure_optimizers(self):
        
        optimizer = torch.optim.AdamW(model.parameters(), lr= 1e-3)#AdamW does weight decay
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

        return [optimizer], [{"scheduler": scheduler}]
    
    def train_dataloader(self):
        return self.train_dl

    def val_dataloader(self):
        return self.val_dl

    def test_dataloader(self):
        return self.test_dl

In [17]:
!rm -rf ./lightning_logs/version_*

In [18]:
# wandb_logger = WandbLogger(name='lstm.v1',project='pytorchlightning')
logger = TensorBoardLogger("../output/models/lstm_model_logs", name="lstm_multi_task")

In [19]:
model = LSTM_based_classification_model(batch_size=BATCH_SIZE)
trainer = pl.Trainer(gpus=-1, 
                     logger = logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [20]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name               | Type             | Params
---------------------------------------------------------
0  | lstm_1             | LSTM             | 67.1 K
1  | batch_norm1        | BatchNorm2d      | 256   
2  | lstm_2             | LSTM             | 132 K 
3  | batch_norm2        | BatchNorm2d      | 256   
4  | lstm_3             | LSTM             | 132 K 
5  | batch_norm3        | BatchNorm2d      | 256   
6  | dropout            | Dropout          | 0     
7  | linear1            | Linear           | 8.3 K 
8  | activation         | ReLU             | 0     
9  | output_layers      | ModuleList       | 130   
10 | cross_entropy_loss | CrossEntropyLoss | 0     
11 | f1_score           | F1               | 0     
12 | accuracy_score     | Accuracy         | 0     
---------------------------------------------------------
340 K     Trainable params
0         Non-trainable params
340 K     Total params
1.362     Total estimated model 

HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…



HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…






1

In [21]:
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'currency_0_test_acc': 0.5,
 'currency_0_test_f1': 0.31474655866622925,
 'currency_1_test_acc': 0.5967742204666138,
 'currency_1_test_f1': 0.35683563351631165,
 'currency_2_test_acc': 0.5564516186714172,
 'currency_2_test_f1': 0.33917051553726196,
 'test_loss': 2.0648059844970703}
--------------------------------------------------------------------------------


[{'currency_0_test_acc': 0.5,
  'currency_0_test_f1': 0.31474655866622925,
  'currency_1_test_acc': 0.5967742204666138,
  'currency_1_test_f1': 0.35683563351631165,
  'currency_2_test_acc': 0.5564516186714172,
  'currency_2_test_f1': 0.33917051553726196,
  'test_loss': 2.0648059844970703}]

In [22]:
#dropout, batch normalization 

In [23]:
model.test_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x7f3502a56730>