In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.metrics import F1
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.loggers import TensorBoardLogger

In [3]:
# Load the raw BTC/USD price CSV (per its file name, a CoinDesk export).
btc = pd.read_csv(
    filepath_or_buffer="../data/0_raw/BTC_USD_2013-10-01_2021-04-21-CoinDesk.csv",
)

In [4]:
# Show the column names of the freshly loaded frame.
btc.columns

Index(['Currency', 'Date', 'Closing Price (USD)', '24h Open (USD)',
       '24h High (USD)', '24h Low (USD)'],
      dtype='object')

In [5]:
# Parse the date column in one vectorised call rather than constructing a
# pd.Timestamp per row through .apply.
btc["Date"] = pd.to_datetime(btc["Date"])

In [6]:
# Put the rows in chronological order (oldest first); sort_values sorts
# ascending unless told otherwise.
btc = btc.sort_values(by="Date")

In [7]:
# Display the date-sorted frame (pandas truncates the rendered rows).
btc

Unnamed: 0,Currency,Date,Closing Price (USD),24h Open (USD),24h High (USD),24h Low (USD)
0,BTC,2013-10-01,123.654990,124.304660,124.751660,122.563490
1,BTC,2013-10-02,125.455000,123.654990,125.758500,123.633830
2,BTC,2013-10-03,108.584830,125.455000,125.665660,83.328330
3,BTC,2013-10-04,118.674660,108.584830,118.675000,107.058160
4,BTC,2013-10-05,121.338660,118.674660,121.936330,118.005660
...,...,...,...,...,...,...
2754,BTC,2021-04-17,61965.782598,63225.093917,63520.325374,60033.534667
2755,BTC,2021-04-18,60574.444728,61444.232503,62534.028498,59802.889267
2756,BTC,2021-04-19,56850.830166,60191.525406,60531.988848,52148.983544
2757,BTC,2021-04-20,56224.101588,56335.389141,57609.368118,54449.245330


In [8]:
# Drop everything before 2017: pd.Timestamp("2017") is midnight on 1 January 2017.
btc = btc.loc[btc["Date"] >= pd.Timestamp("2017")]

In [9]:
# Number of daily rows remaining after the date filter.
btc.shape[0]

1571

In [10]:
# Row count of a 70% share of the data, rounded down.
len(btc) * 7 // 10

1099

In [11]:
# Keep only the four price columns, ordered open, high, low, close.
btc = btc.loc[:, ['24h Open (USD)', '24h High (USD)', '24h Low (USD)', 'Closing Price (USD)']]

In [12]:
# Replace the verbose headers with short names, in the same order as the
# columns currently held by the frame.
btc.columns = pd.Index(["open", "high", "low", "close"])

In [13]:
# Display the renamed price frame (pandas truncates the rendered rows).
btc

Unnamed: 0,open,high,low,close
1188,952.455000,968.485000,949.086250,964.325000
1189,964.325000,1011.525000,963.530000,1009.973750
1190,1009.973750,1034.105000,998.621250,1028.333750
1191,1028.333750,1048.123750,1013.377500,1047.099990
1192,1047.099990,1141.997500,1047.063750,1140.385000
...,...,...,...,...
2754,63225.093917,63520.325374,60033.534667,61965.782598
2755,61444.232503,62534.028498,59802.889267,60574.444728
2756,60191.525406,60531.988848,52148.983544,56850.830166
2757,56335.389141,57609.368118,54449.245330,56224.101588


class TimeSeriesDataset(Dataset):
    """Sliding-window dataset labelling the direction of the next value.

    Item ``i`` is ``(window, label)`` where ``window`` is ``x[i:i + seq_len]``
    and ``label`` compares ``x[i + seq_len]`` with the last value of the window:
    0 = unchanged, 1 = increase, 2 = decrease.

    A class with the same name is defined again further down in this notebook;
    once that cell runs, it replaces this definition.

    Args:
        x: Series values; converted to a float tensor. The label comparison
            assumes a 1-D series so that each element is a scalar.
        seq_len: Window length. ``None`` (the default) resolves to the
            notebook-level ``WINDOW_SIZE`` when the dataset is constructed.
    """

    def __init__(self, x: np.ndarray, seq_len=None):
        # Resolve the default lazily. Writing `seq_len=WINDOW_SIZE` in the
        # signature is evaluated when the class statement runs and raises
        # NameError if the constants cell has not been executed yet.
        if seq_len is None:
            seq_len = WINDOW_SIZE
        self.x = torch.tensor(x).float()
        self.seq_len = seq_len

    def __len__(self):
        # Every window needs one following value to derive its label from.
        return len(self.x) - self.seq_len

    def __getitem__(self, index):
        target_pos = index + self.seq_len
        window = self.x[index:target_pos]
        price_change = self.x[target_pos] - self.x[target_pos - 1]
        if price_change == 0:
            label = 0  # unchanged
        elif price_change > 0:
            label = 1  # increase
        else:
            label = 2  # decrease
        return (window, label)

In [14]:
# The model is fed the closing price only, as a plain NumPy array.
time_series = btc["close"].to_numpy()

In [15]:
# Notebook-wide configuration.
N_CLASSES = 2           # default number of output classes of the classifier
N_CURRENCIES = 1
INPUT_FEATURE_SIZE = 1  # default number of features per time step fed to the LSTM
WINDOW_SIZE = 50        # default length of each input window

# Default shares of the series assigned to each split.
TRAIN_PERCENTAGE = 0.70
VAL_PERCENTAGE = 0.15
TEST_PERCENTAGE = 0.15

In [16]:
class TimeSeriesDataset(Dataset):
    """Split-aware sliding-window dataset labelling the next price move.

    The series is divided chronologically into train / val / test regions
    whose sizes are ``int(len(x) * percentage)``. Item ``i`` of a split is
    ``(window, label)``: ``window`` holds the ``seq_len`` values preceding the
    target value and ``label`` compares the target with the value before it
    (0 = decrease, 1 = increase, 2 = unchanged).

    Target positions per split:
        * train: ``x[i + seq_len]`` (``train_size - seq_len`` items)
        * val:   ``x[train_size + i]`` (``val_size`` items)
        * test:  ``x[train_size + val_size + i]`` (``test_size`` items)

    The windows of the first val/test items reach back into the preceding
    split. This assumes the train split is at least ``seq_len`` values long.

    Args:
        x: Series values; converted to a float tensor. The label comparison
            assumes a 1-D series so that each element is a scalar.
        data_use_type: One of ``"train"``, ``"val"`` or ``"test"``.
        train_percentage: Share of the series used for training.
        val_percentage: Share of the series used for validation.
        test_percentage: Share of the series used for testing.
        seq_len: Window length.

    Raises:
        ValueError: If ``data_use_type`` is not a known split name.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self,
                 x: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE,
                 ):
        # Reject unknown split names up front. Previously a typo was sized as
        # the test split by __len__ but indexed as the train split by
        # __getitem__.
        if data_use_type not in self._SPLITS:
            raise ValueError(
                f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}"
            )

        self.x = torch.tensor(x).float()
        self.seq_len = seq_len

        self.data_use_type = data_use_type

        self.train_size = int(len(self.x) * train_percentage)
        self.val_size = int(len(self.x) * val_percentage)
        self.test_size = int(len(self.x) * test_percentage)

    def __len__(self):
        if self.data_use_type == "train":
            # The first seq_len values can only serve as window content;
            # clamp so that a too-short train split yields an empty dataset.
            return max(self.train_size - self.seq_len, 0)

        if self.data_use_type == "val":
            return self.val_size

        return self.test_size

    def __getitem__(self, index):
        # Without this check an out-of-range train index would silently read
        # validation data and a negative index would build a malformed window.
        if not 0 <= index < len(self):
            raise IndexError(
                f"index {index} is out of range for the {self.data_use_type!r} split "
                f"of length {len(self)}"
            )

        # Shift the window start so that the target lands inside this split.
        if self.data_use_type == "val":
            start = self.train_size + index - self.seq_len
        elif self.data_use_type == "test":
            start = self.train_size + self.val_size + index - self.seq_len
        else:
            start = index

        target_pos = start + self.seq_len
        window = self.x[start:target_pos]

        price_change = self.x[target_pos] - self.x[target_pos - 1]
        if price_change == 0:
            # The original author noted that label 2 never occurs in this data;
            # if it did, training with N_CLASSES = 2 would fail.
            label = 2  # unchanged
        elif price_change > 0:
            label = 1  # increase
        else:
            label = 0  # decrease
        return (window, label)

In [17]:
# One dataset view per split, all backed by the same closing-price series.
train_dataset, val_dataset, test_dataset = (
    TimeSeriesDataset(time_series, split_name)
    for split_name in ("train", "val", "test")
)

In [18]:
# Class weights for the loss: inverse class frequency, scaled so that a
# perfectly balanced training split would give every class a weight of 1.
train_labels = [train_dataset[i][1] for i in range(len(train_dataset))]
# groupby sorts by label value, so entry k is the sample count of the k-th
# smallest label present in the training split.
samples_size = pd.DataFrame({"label": train_labels}).groupby("label").size().to_numpy()
# Fail fast: the loss needs exactly one weight per class, so a missing class or
# an unexpected extra label (e.g. "unchanged") must be caught here.
assert len(samples_size) == N_CLASSES, (
    f"expected {N_CLASSES} classes in the training labels, found {len(samples_size)}"
)
# Divide by the number of classes instead of a hard-coded 2.
loss_weights = (1 / samples_size) * sum(samples_size) / len(samples_size)
loss_weights

array([1.10654008, 0.91217391])

In [27]:
class LSTM_based_classification_model(pl.LightningModule):
    """Stacked-LSTM classifier over fixed-length windows of a time series.

    The network is ``nn.LSTM`` -> output of the last time step -> ``nn.Linear``
    producing raw class logits, trained with a class-weighted cross-entropy
    loss. The module builds its own three DataLoaders and returns them from
    the ``*_dataloader`` hooks.

    Args:
        train_dataset: Dataset yielding ``(window, label)`` pairs for training.
        val_dataset: Dataset used for validation.
        test_dataset: Dataset used for testing.
        weights: Per-class loss weights, passed to ``nn.CrossEntropyLoss``.
        num_classes: Size of the output layer.
        window_size: Window length; stored on the module, not used by the layers.
        input_size: Number of features per time step.
        batch_size: Batch size shared by all three DataLoaders.
        lstm_hidden_size: Hidden units per LSTM layer.
        lstm_stack_size: Number of stacked LSTM layers.
        lstm_dropout: Dropout passed to ``nn.LSTM``.
        bidirectional: Whether the LSTM runs in both directions.
    """

    def __init__(self,
                 train_dataset = train_dataset,
                 val_dataset = val_dataset,
                 test_dataset = test_dataset,
                 weights = loss_weights,
                 num_classes = N_CLASSES,
                 window_size = WINDOW_SIZE,
                 input_size = INPUT_FEATURE_SIZE,
                 batch_size=8,
                 lstm_hidden_size = 256,
                 lstm_stack_size = 3,
                 lstm_dropout = 0.5,
                 bidirectional = False,
                 ):

        super().__init__()
        self.num_classes = num_classes
        self.window_size = window_size
        self.input_size = input_size
        self.batch_size = batch_size

        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_stack_size = lstm_stack_size
        self.lstm_dropout = lstm_dropout
        self.bidirectional = bidirectional

        self.stack_lstm = nn.LSTM(input_size = self.input_size,
                hidden_size = self.lstm_hidden_size,
                num_layers= self.lstm_stack_size,
                dropout = self.lstm_dropout,
                bidirectional = self.bidirectional,
                batch_first=True,)

        # Derive the classifier input width from the LSTM configuration
        # instead of hard-coding 256. A bidirectional LSTM concatenates both
        # directions, doubling the feature size.
        lstm_output_size = self.lstm_hidden_size * (2 if self.bidirectional else 1)
        self.output_layer = nn.Linear(lstm_output_size, self.num_classes)

        self.cross_entropy_loss = nn.CrossEntropyLoss(weight= torch.tensor(weights).float())

        self.f1_score = pl.metrics.F1(num_classes=self.num_classes)
        self.accuracy_score = pl.metrics.Accuracy()

        self.train_dl = DataLoader(train_dataset, batch_size=self.batch_size)

        self.val_dl = DataLoader(val_dataset, batch_size=self.batch_size)

        self.test_dl = DataLoader(test_dataset, batch_size=self.batch_size)

    def forward(self, x):
        """Return class logits for a batch of windows.

        ``x`` is reshaped to ``(batch, window_len, input_size)`` before it is
        fed to the LSTM; only the output at the last time step is classified.
        """
        x = x.view(x.size()[0], x.size()[1], self.input_size)

        x, _ = self.stack_lstm(x)
        x = x[:, -1, :]  # keep the last time step only

        # Raw logits: nn.CrossEntropyLoss applies log-softmax itself.
        return self.output_layer(x)

    def _evaluate_batch(self, batch):
        """Run one batch and return ``(loss, accuracy, f1, predictions, targets)``."""
        x, y = batch
        output = self(x)
        loss = self.cross_entropy_loss(output, y)
        predictions = torch.max(output, dim=1)[1]  # index of the largest logit
        acc = self.accuracy_score(predictions, y)
        f1 = self.f1_score(predictions, y)
        return loss, acc, f1, predictions, y

    def training_step(self, batch, batch_nb):
        """Log per-step loss, accuracy and F1, and return the loss to optimise."""
        loss, acc, f1, _, _ = self._evaluate_batch(batch)
        self.log('train_loss', loss, on_step=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, prog_bar=True)
        self.log('train_f1', f1, on_step=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_nb):
        """Log validation loss, accuracy and F1, mean-reduced over the epoch."""
        loss, acc, f1, _, _ = self._evaluate_batch(batch)
        self.log('val_loss', loss, on_epoch=True, reduce_fx=torch.mean, prog_bar=True)
        self.log('val_acc', acc, on_epoch=True, reduce_fx=torch.mean, prog_bar=True)
        self.log('val_f1', f1, on_epoch=True, reduce_fx=torch.mean, prog_bar=True)

    def test_step(self, batch, batch_nb):
        """Print targets and predictions, and log test loss, accuracy and F1."""
        loss, acc, f1, predictions, y = self._evaluate_batch(batch)

        # Kept from the original: dumps the targets next to the predictions.
        print(y, predictions)

        self.log('test_loss', loss, on_epoch=True, reduce_fx=torch.mean)
        self.log('test_acc', acc, on_epoch=True, reduce_fx=torch.mean)
        self.log('test_f1', f1, on_epoch=True, reduce_fx=torch.mean)

    def configure_optimizers(self):
        """Return AdamW (lr 1e-3) with a StepLR schedule (x0.1 every 10 steps of the scheduler)."""
        # Optimise this module's own parameters. The original read the
        # notebook-global `model`, which breaks as soon as the instance has
        # another name or a second instance exists.
        optimizer = torch.optim.AdamW(self.parameters(), lr= 1e-3)  # AdamW applies weight decay
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

        return [optimizer], [{"scheduler": scheduler}]

    def train_dataloader(self):
        """Return the training DataLoader built in ``__init__``."""
        return self.train_dl

    def val_dataloader(self):
        """Return the validation DataLoader built in ``__init__``."""
        return self.val_dl

    def test_dataloader(self):
        """Return the test DataLoader built in ``__init__``."""
        return self.test_dl

In [28]:
!rm -rf ../output/models/lstm_v1/version_*

In [29]:
# TensorBoard logging for this experiment. A Weights & Biases logger
# (WandbLogger, imported above) is the alternative that was tried here.
logger = TensorBoardLogger(
    save_dir="../output/models/lstm_model_logs",
    name="lstm_v1",
)

In [30]:
# Build the classifier with a batch size of 128; every other setting keeps
# its default from the class signature.
model = LSTM_based_classification_model(batch_size=128)

# gpus=-1 asks Lightning for every available GPU.
trainer = pl.Trainer(logger=logger, gpus=-1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Train the model; the data comes from the module's own *_dataloader hooks.
trainer.fit(model=model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | stack_lstm         | LSTM             | 1.3 M 
1 | output_layer       | Linear           | 514   
2 | cross_entropy_loss | CrossEntropyLoss | 0     
3 | f1_score           | F1               | 0     
4 | accuracy_score     | Accuracy         | 0     
--------------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.274     Total estimated model params size (MB)


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

In [25]:
# Evaluate on the test split. No model is passed, so this presumably uses the
# one attached by the preceding fit() call — TODO confirm which weights are used.
# NOTE(review): this cell's execution count (25) is lower than that of the model
# class cell (27), so the output below may come from an earlier class definition.
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
        0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
        0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0') tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
        0, 0, 1, 1, 

[{'test_loss': 0.7001919150352478,
  'test_acc': 0.4000000059604645,
  'test_f1': 0.4000000059604645}]

In [None]:
# Rebuild the test split and gather the label of every item in it.
test = TimeSeriesDataset(time_series, "test")
labels = [test[position][1] for position in range(len(test))]

In [None]:
# Class balance of the test split: number of items per label.
pd.DataFrame({"label": labels}).groupby(by="label").size()

In [None]:
# TODO: experiment with dropout and batch normalization.