In [3]:
# !pip install torchsummary
# !pip install torchinfo
# !pip install lumnisfactors
# !pip install matplotlib
# !pip install torchmetrics
# !conda install cudnn=8.4.1

In [4]:
%load_ext autoreload
%autoreload 2

import grequests

from src.mvts_transformer.ts_transformer import TSTransformerEncoder, model_factory
from src.utils import create_3d_array, standardize, rolling_mean_diff, generate_univariate_data_labels, generate_data_labels_from_3d_array
from src.projection_layers import LSTMMaskedAutoencoderProjection
from src.dataset import TSDataset, ImputationDataset
from src.dataloader import TSDataLoader
from src.TFC.dataloader import TFCDataset
from src.encoders import TFC
from src.configs import Configs
from src.RevIN import RevIN
from src.TSFM import TSFM


import torch
import torch.nn as nn
import torch.fft as fft

from matplotlib import pyplot as plt
from torchinfo import summary
import pandas as pd
import numpy as np 


# torch is already imported earlier in this cell; the repeated import is harmless.
import torch
# Release the unused memory held by PyTorch's CUDA caching allocator before the run starts.
torch.cuda.empty_cache()

import sys
# Raise the interpreter's recursion limit to 5000.
# NOTE(review): the reason for the higher limit is not visible in this notebook — confirm which dependency needs it.
sys.setrecursionlimit(5000)



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# !conda uninstall pytorch torchvision -y
# !pip install torch torchvision -f https://download.pytorch.org/whl/cu111/torch_stable.html
import torch


def report_torch_environment(retries=1):
    """Print the torch version, CUDA availability and cuDNN version.

    The recorded output of this cell shows the first attempt stopping before
    the cuDNN line and a second attempt succeeding, so the probe is retried.

    Args:
        retries: Number of additional attempts after the first failure.

    Raises:
        Exception: Whatever the last attempt raised, once all attempts failed.
    """
    for attempt in range(retries + 1):
        try:
            print("Torch version:", torch.__version__)
            print("CUDA available:", torch.cuda.is_available())
            print("cuDNN version:", torch.backends.cudnn.version())
            return
        except Exception:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit are never swallowed by the retry.
            if attempt == retries:
                raise


# The former `!unset LD_LIBRARY_PATH` lines were dropped: `!` runs the command
# in a subshell, so unsetting a variable there cannot change this kernel's
# environment.
report_torch_environment()


Torch version: 1.13.1+cu117
CUDA available: True
Torch version: 1.13.1+cu117
CUDA available: True
cuDNN version: 8401


In [6]:
from lumnisfactors import LumnisFactors
from KEYS import LUMNIS_API_KEY
from pathlib import Path


import os

factorName          = "price"
lumnis              = LumnisFactors(LUMNIS_API_KEY)
path_to_data = "/home/ec2-user/TS-FM/src/data/"

btc_file = Path(path_to_data + "btc.csv")
eth_file = Path(path_to_data + "eth.csv")
xmr_file = Path(path_to_data + "xmr.csv")


def load_or_fetch_hourly_prices(cache_file, symbol, start="2021-01-23", end="2023-04-16"):
    """Return the hourly Binance frame for `symbol`, using `cache_file` as a CSV cache.

    If `cache_file` does not exist, the data is requested from the Lumnis
    client and written to `cache_file` first. In both cases the returned frame
    is the one read back from the CSV, so a first (fetching) run and later
    (cached) runs hand the same frame to the cells below.

    Args:
        cache_file: pathlib.Path of the CSV cache for this symbol.
        symbol: Exchange symbol passed to the client, e.g. "btcusdt".
        start: First date requested when the cache is missing.
        end: Last date requested when the cache is missing.

    Returns:
        The cached frame indexed by its "Unnamed: 0" column.
    """
    if not cache_file.is_file():
        fetched = lumnis.get_historical_data(factorName, "binance", symbol, "hour", start, end)
        fetched.to_csv(cache_file)
    # NOTE(review): "Unnamed: 0" is the header pandas assigns to an unnamed
    # index column on read — this presumes the fetched frame's index has no
    # name, as the original cached branch already did; confirm.
    return pd.read_csv(cache_file).set_index("Unnamed: 0")


temp_df_btc_raw     = load_or_fetch_hourly_prices(btc_file, "btcusdt")
temp_df_eth_raw     = load_or_fetch_hourly_prices(eth_file, "ethusdt")
temp_df_xmr_raw     = load_or_fetch_hourly_prices(xmr_file, "xmrusdt")

# ob_df_raw           = lumnis.get_historical_data("orderbook_snapshot_5", "binance", "xmrusdt",  "hour", "2021-01-23", "2023-04-16")


In [7]:
# Window lengths handed to rolling_mean_diff; the same list is used for every asset.
rolling_windows     = [5, 25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

# Each call receives its own copy of the window list, as in the original three literals.
temp_df_btc         = rolling_mean_diff(temp_df_btc_raw, list(rolling_windows), type='standard')
temp_df_eth         = rolling_mean_diff(temp_df_eth_raw, list(rolling_windows), type='standard')
temp_df_xmr         = rolling_mean_diff(temp_df_xmr_raw, list(rolling_windows), type='standard')

# Length passed to create_3d_array as its third argument.
max_seq_len         = 150
# Use every column of the BTC feature frame (alternative: ['close', 'volume']).
cols                = temp_df_btc.columns

# The BTC column selection is applied to all three frames.
btc_array           = create_3d_array(temp_df_btc[cols], temp_df_btc.index, max_seq_len)
eth_array           = create_3d_array(temp_df_eth[cols], temp_df_eth.index, max_seq_len)
xmr_array           = create_3d_array(temp_df_xmr[cols], temp_df_xmr.index, max_seq_len)


In [8]:
# NOTE(review): this bare expression is not the last statement of the cell, so its value is not displayed;
# the recorded cell output contains only the result of the print() call at the end of the cell.
btc_array.shape

def get_train_val_test_array(array, train_size, val_size, test_size):
    """Split `array` along its first axis into contiguous train/val/test parts.

    The parts are taken in order (train first, then val, then test) without
    shuffling. Part lengths are `int(len(array) * fraction)`.

    When the three fractions sum to 1, the test part receives every remaining
    element, so nothing is lost to rounding. When they sum to less than 1, the
    test part is limited to `int(len(array) * test_size)` elements.

    Args:
        array: Any sliceable sequence supporting `len()`.
        train_size: Fraction of elements for the train part.
        val_size: Fraction of elements for the validation part.
        test_size: Fraction of elements for the test part.

    Returns:
        A `(train, val, test)` tuple of slices of `array`.

    Raises:
        ValueError: If a fraction is negative or the fractions sum to more than 1.
    """
    tolerance = 1e-9
    if train_size < 0 or val_size < 0 or test_size < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative")

    total = train_size + val_size + test_size
    if total > 1 + tolerance:
        raise ValueError("train_size + val_size + test_size must not exceed 1")

    n_items = len(array)
    train_len = int(n_items * train_size)
    val_end = train_len + int(n_items * val_size)

    if total >= 1 - tolerance:
        # Fractions cover the whole array: hand the rounding remainder to the test part.
        test_end = n_items
    else:
        # Fractions leave part of the array unused: honour test_size.
        test_end = val_end + int(n_items * test_size)

    return array[:train_len], array[train_len:val_end], array[val_end:test_end]

# Chronological 80% / 10% / 10% split, applied identically to every asset.
split_fractions = (0.8, 0.1, 0.1)

btc_train_array, btc_val_array, btc_test_array = get_train_val_test_array(btc_array, *split_fractions)
eth_train_array, eth_val_array, eth_test_array = get_train_val_test_array(eth_array, *split_fractions)
xmr_train_array, xmr_val_array, xmr_test_array = get_train_val_test_array(xmr_array, *split_fractions)

# Show the shapes of the three BTC parts.
print(btc_train_array.shape, btc_val_array.shape, btc_test_array.shape)


(14515, 150, 104) (1814, 150, 104) (1815, 150, 104)


In [9]:
def _close_price_windows(raw_frame):
    """Run create_3d_array on the 'close' column of `raw_frame` with max_seq_len."""
    return create_3d_array(raw_frame[['close']], raw_frame.index, max_seq_len)


# Close-only arrays, one per asset (ETH, BTC, XMR order is kept for the concatenation below).
univariate_array_eth         = _close_price_windows(temp_df_eth_raw)
univariate_array_btc         = _close_price_windows(temp_df_btc_raw)
univariate_array_xmr         = _close_price_windows(temp_df_xmr_raw)

# Each helper call yields a (data, labels) pair for one asset.
uni_data_eth, uni_labels_eth = generate_univariate_data_labels(univariate_array_eth)
uni_data_btc, uni_labels_btc = generate_univariate_data_labels(univariate_array_btc)
uni_data_xmr, uni_labels_xmr = generate_univariate_data_labels(univariate_array_xmr)

# Stack the per-asset results along the first axis.
uni_data                     = np.concatenate([uni_data_eth, uni_data_btc, uni_data_xmr], axis=0)
uni_labels                   = np.concatenate([uni_labels_eth, uni_labels_btc, uni_labels_xmr], axis=0)

print(uni_data.shape, uni_labels.shape)

(56762, 150, 1) (56762, 150, 1)


In [10]:
# Derive a (data, labels) pair from each asset's full array, in BTC, ETH, XMR order.
(data_btc, labels_btc), (data_eth, labels_eth), (data_xmr, labels_xmr) = (
    generate_data_labels_from_3d_array(asset_array)
    for asset_array in (btc_array, eth_array, xmr_array)
)

In [11]:
# Training inputs keyed by dataset name.
# A value is either a plain array (used with ImputationDataset below) or a
# {'data': ..., 'labels': ...} dict (used with TSDataset below), e.g.
#   'univariate':  {"data": uni_data, "labels": uni_labels}
#   'dataset_btc': {'data': data_btc, 'labels': labels_btc}
data_dict = {
    'dataset_btc': btc_train_array,
    'dataset_eth': eth_train_array,
    'dataset_xmr': xmr_train_array,
}


def _as_float32_tensor(ndarray):
    """Wrap a numpy array as a torch tensor and cast it to float32."""
    return torch.from_numpy(ndarray).to(torch.float32)


# Replace every numpy array in data_dict by a float32 tensor.
for dataset_name, entry in list(data_dict.items()):
    if type(entry) is dict:
        entry['data'] = _as_float32_tensor(entry['data'])
        entry['labels'] = _as_float32_tensor(entry['labels'])
    else:
        data_dict[dataset_name] = _as_float32_tensor(entry)

# One dataset object per entry: TSDataset for dict entries, ImputationDataset otherwise.
datasets = {}
for dataset_name, entry in data_dict.items():
    if type(entry) is dict:
        datasets[dataset_name] = TSDataset(entry['data'], entry['labels'], max_len=max_seq_len, shuffle=True)
    else:
        datasets[dataset_name] = ImputationDataset(entry, masking_ratio=0.25)

# Custom loader over all datasets.
ts_data_loader = TSDataLoader(
    datasets,
    batch_size=512,
    max_len=max_seq_len,
    collate_fn='unsuperv',
    shuffle=False,
)

# Author's timing note: takes 6 mins to load 43371 samples with 150 timesteps each, and 104 features

In [12]:
# Per-dataset sample shape: the tensor shape with its leading axis dropped.
input_data_shapes_dict  = {}
for dataset_name, entry in data_dict.items():
    shape_source = entry['data'] if type(entry) is dict else entry
    input_data_shapes_dict[dataset_name] = shape_source.shape[1:]
# input_data_shapes_dict = {"temp": (max_seq_len, 104)}

# Use the GPU when one is visible to torch, otherwise the CPU.
DEVICE                  = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
MAX_SEQ_LENGTH          = max_seq_len
ENCODER_LAYER_DIMS      = 64
PROJECTION_DIMS         = 128

# Encoder hyper-parameters, collected first and then handed to Configs as keyword arguments.
encoder_config_options  = {
    "TSlength_aligned": max_seq_len,
    "features_len": PROJECTION_DIMS,
    "features_len_f": PROJECTION_DIMS,
    "encoder_layer_dims": ENCODER_LAYER_DIMS,
    "dim_feedforward": 128,
    "linear_encoder_dim": 256,
    "channel_output_size": 10,
    "time_output_size": 10,
    "d_model": 128,
    "num_transformer_layers": 1,
    "n_head": 1,
    "pos_encoding": 'learnable',
    "transformer_activation": 'gelu',
    "transformer_normalization_layer": 'BatchNorm',
    "freeze": False,
    "device": DEVICE,
}
encoder_configs         = Configs(**encoder_config_options)

# Model built from the per-dataset shapes and the encoder configuration above.
tsfm                    = TSFM(
    input_data_shapes_dict,
    model_name="INIT_TEST",
    device=DEVICE,
    max_seq_length=max_seq_len,
    encoder_config=encoder_configs,
    projection_layer_dims=PROJECTION_DIMS,
)

In [13]:
def _warmup_entry(dataset_name):
    """Build the warm-up settings dict for one entry of data_dict.

    "input_channels" is the last axis and "timesteps" is axis 1 of the entry's
    tensor (of its 'data' tensor when the entry is a dict).
    """
    entry = data_dict[dataset_name]
    shape_source = entry['data'] if type(entry) is dict else entry
    return {
        "batch_size": 512,
        "input_channels": shape_source.shape[-1],
        "timesteps": shape_source.shape[1],
        "data_set_type": ImputationDataset,
        "num_epochs": 30,
        "lr": 1e-4,
        "kwargs": {
            "verbose": False,
        },
    }


# One independent settings dict per dataset, in BTC, ETH, XMR order.
# A labelled "univariate" dataset would use the same layout with
# "data_set_type": TSDataset and shapes taken from data_dict['univariate']['data'].
warmup_config_kwargs = {
    dataset_name: _warmup_entry(dataset_name)
    for dataset_name in ("dataset_btc", "dataset_eth", "dataset_xmr")
}
# TODO: Add learning rate to warmup config kwargs
# NOTE(review): every entry above already carries an "lr" key — confirm whether this TODO is still open.

# Training hyper-parameters used by the fit call.
N_EPOCHS                 = 10
WARMUP_EPOCHS            = 30
WARMUP_BATCH_SIZE        = 512
WARMUP_PROJECTION_LAYERS = True
BATCH_SIZE               = 512
LR                       = 1e-4
LOG                      = True

In [14]:


# Keyword arguments for the training run, gathered in one place for readability.
fit_options   = dict(
    n_epochs=N_EPOCHS,
    warmup_projection_layers=WARMUP_PROJECTION_LAYERS,
    log=LOG,
    verbose=True,
    shuffle=True,
    warmup_epochs=WARMUP_EPOCHS,
    warmup_config_kwargs=warmup_config_kwargs,
    warmup_batch_size=WARMUP_BATCH_SIZE,
    batch_size=BATCH_SIZE,
    lr=LR,
    device=DEVICE,
    max_seq_length=MAX_SEQ_LENGTH,
)

# Train on data_dict; the value returned by fit is kept in `loss`.
loss          = tsfm.fit(data_dict, **fit_options)



Total number of data points: 43371
Warming up with 28 batches of size 512. Dataset name dataset_btc.
Epoch: 0, Loss: 0.681320322411401
Epoch: 1, Loss: 0.6668092054980141
Epoch: 2, Loss: 0.6282743364572525
Epoch: 3, Loss: 0.5819922941071647
Epoch: 4, Loss: 0.5357974746397564
Epoch: 5, Loss: 0.4838854819536209
Epoch: 6, Loss: 0.42434598611933844
Epoch: 7, Loss: 0.37400681312595097
Epoch: 8, Loss: 0.33761314089809147
Epoch: 9, Loss: 0.309508961226259
Epoch: 10, Loss: 0.2893134302326611
Epoch: 11, Loss: 0.27447117545775007
Epoch: 12, Loss: 0.2641363133277212
Epoch: 13, Loss: 0.2561964680041586
Epoch: 14, Loss: 0.2501043368663107
Epoch: 15, Loss: 0.24508238477366312
Epoch: 16, Loss: 0.24033734574913979
Epoch: 17, Loss: 0.2358790369970458
Epoch: 18, Loss: 0.23206295924527304
Epoch: 19, Loss: 0.22846026558961188
Epoch: 20, Loss: 0.2253107162458556
Epoch: 21, Loss: 0.2225188479891845
Epoch: 22, Loss: 0.2194064413862569
Epoch: 23, Loss: 0.2168084941804409
Epoch: 24, Loss: 0.21460471940892084
Ep