In [23]:
import click
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.datasets import MNIST
from tensorboardX import SummaryWriter
from sklearn.cluster import KMeans
import uuid
from datetime import time
import pytz
import datetime
from ptsdae.sdae import StackedDenoisingAutoEncoder
import ptsdae.model as ae
from ptsdae.utils import *
from downloader import load_dataset
import numpy as np

NY = 'America/New_York'
import pandas as pd



In [2]:
# Symbols to load; the per-ticker frames are later stacked in this order.
TICKERS = ['AAPL', 'IBM', 'MSFT', 'NVDA', 'ZM', 'AMZN']
# Technical-indicator spec handed to load_dataset as one space-separated string.
# NOTE(review): the 'MACD!macd' form presumably selects a single output of the MACD
# indicator — confirm against downloader.load_dataset.
TECH_IND = 'MACD!macd MA EMA ATR ROC'
# Bar resolution passed to load_dataset.
# NOTE(review): `res` is rebound to a model output in a later cell (`res = autoencoder(test)`),
# so re-running the load cell after that point would pass a tensor instead of this string.
res = 'minute'

In [3]:
# Load the bars plus indicators for every ticker; the result is looked up per ticker
# (df_dict[ticker]) by clean_up.
df_dict = load_dataset(TICKERS, TECH_IND, res)

In [4]:
def clean_up(df_dict, ticker):
    """Return one ticker's rows restricted to two intraday windows.

    Drops the 'tic' column, keeps rows whose index time-of-day lies in
    09:30:00-11:30:00 or 13:00:00-15:00:00 (both ends inclusive), sorts by the
    index and then replaces the index with a fresh 0..n-1 range.

    Args:
        df_dict: mapping from ticker to a DataFrame that has a 'tic' column and a
            DatetimeIndex (the code reads ``index.time``).
        ticker: key of the frame to clean.

    Returns:
        A new DataFrame; the frame stored in ``df_dict`` is not modified.
    """
    frame = df_dict[ticker].drop('tic', axis=1)

    # index.time builds one Python time object per row, so compute it once instead
    # of once per comparison.
    bar_times = frame.index.time

    # The bounds are plain (naive) times. The previous tzinfo=pytz.timezone(NY)
    # argument had no effect on these comparisons: pytz reports no UTC offset for a
    # bare time, so Python treated the bounds as naive anyway.
    # NOTE(review): the windows are therefore matched against whatever wall-clock
    # time the index carries — confirm load_dataset returns New York time.
    in_morning = (bar_times >= time(hour=9, minute=30)) & (bar_times <= time(hour=11, minute=30))
    in_afternoon = (bar_times >= time(hour=13)) & (bar_times <= time(hour=15))

    frame = frame[in_morning | in_afternoon]
    return frame.sort_index().reset_index(drop=True)

In [5]:
# Stack the cleaned per-ticker frames, in TICKERS order, into one table with a fresh
# 0..n-1 index. A single pd.concat replaces the DataFrame.append loop: append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
database = pd.concat([clean_up(df_dict, t) for t in TICKERS], ignore_index=True)

In [6]:
database

Unnamed: 0,open,high,low,close,volume,vwap,MACD,MA,EMA,ATR,ROC
0,0.3848,0.3866,0.3847,0.3863,2113944.0,0.3851,-0.000559,0.383877,0.383877,0.001400,-0.802485
1,0.3848,0.3873,0.3847,0.3868,964936.0,0.3863,-0.000559,0.383877,0.383877,0.001400,-0.802485
2,0.3869,0.3884,0.3863,0.3884,1960952.0,0.3865,-0.000559,0.383877,0.383877,0.001400,-0.802485
3,0.3882,0.3884,0.3875,0.3882,459200.0,0.3879,-0.000559,0.383877,0.383877,0.001400,-0.802485
4,0.3882,0.3884,0.3875,0.3875,584416.0,0.3878,-0.000559,0.383877,0.383877,0.001400,-0.802485
...,...,...,...,...,...,...,...,...,...,...,...
5256295,3252.0000,3252.6400,3251.2200,3251.2200,3708.0,3252.1155,0.741446,3250.743367,3250.781332,1.383237,0.013228
5256296,3251.1103,3251.4250,3250.7500,3251.1800,3081.0,3251.2117,0.641133,3250.964700,3250.807053,1.332649,-0.025215
5256297,3251.2700,3251.6300,3250.7400,3250.9150,2696.0,3251.2341,0.534094,3251.192533,3250.814017,1.301031,-0.041049
5256298,3251.2400,3251.7800,3250.8015,3250.8015,1834.0,3251.2947,0.435091,3251.361007,3250.813209,1.277993,-0.068813


In [7]:
# Convert the stacked frame to a plain 2-D array (one row per bar, one column per
# feature) and show its shape as a sanity check.
np_df = database.to_numpy()
np_df.shape

(5256300, 11)

In [88]:
np_df[0]

array([ 3.84800000e-01,  3.86600000e-01,  3.84700000e-01,  3.86300000e-01,
        2.11394400e+06,  3.85100000e-01, -5.58729603e-04,  3.83876667e-01,
        3.83876667e-01,  1.40000000e-03, -8.02485115e-01])

In [8]:
# Run configuration.
cuda = torch.cuda.is_available()  # use the GPU when one is visible
# NOTE(review): batch_size, the epoch counts and testing_mode are not read by any cell
# visible in this section (the evaluation DataLoader hard-codes batch_size=1024).
batch_size = 256
pretrain_epochs = 300
finetune_epochs = 500
testing_mode = False
# Layer widths handed to StackedDenoisingAutoEncoder; the first entry is the number of
# feature columns in np_df.
sae_dim = [np_df.shape[1], 10, 16]

In [9]:
# Cast to float32 in one conversion (torch.tensor(np_df).float() first materialised a
# full-size tensor in the array's own dtype), then move it to the GPU when available.
# NOTE(review): despite the name, train_data holds the whole dataset; it is cut into
# train/validation/test chunks in a later cell.
train_data = torch.tensor(np_df, dtype=torch.float32)
if cuda:
    train_data = train_data.cuda()

In [10]:
# 70 / 20 / 10 split: train and validation sizes are truncated to whole rows and the
# test split takes whatever is left over, so the three sizes always sum to `size`.
size = len(train_data)
train_sz, val_sz = int(size * 0.7), int(size * 0.2)
test_sz = size - (train_sz + val_sz)
print(f'Train size: {train_sz}, Validation Size: {val_sz}, Test Size: {test_sz}')

Train size: 3679409, Validation Size: 1051260, Test Size: 525631


In [11]:
writer = SummaryWriter()  # TensorBoard event writer, constructed with the library defaults
# Callback to invoke during training; it reads `writer` from the notebook's global scope.

def training_callback(epoch, lr, loss, validation_loss):
    """Log one epoch's learning rate and losses to TensorBoard.

    The three values are written together under the "data/autoencoder" tag via
    the module-level ``writer``, with ``epoch`` as the step.
    """
    scalars = {"lr": lr, "loss": loss, "validation_loss": validation_loss}
    writer.add_scalars("data/autoencoder", scalars, epoch)

In [12]:
# Cut the rows into three contiguous chunks, in order: train, validation, test.
# NOTE(review): the rows were stacked ticker by ticker and are not shuffled in any cell
# shown here, so each chunk holds a different mix of tickers rather than a random
# sample — confirm this is intended.
ds_train, ds_val, ds_test =  torch.split(train_data, [train_sz, val_sz, test_sz] , dim=0)

In [13]:
print(f'Train shape: {ds_train.shape}, Validation Shape: {ds_val.shape}, Test Shape: {ds_test.shape}')

Train shape: torch.Size([3679409, 11]), Validation Shape: torch.Size([1051260, 11]), Test Shape: torch.Size([525631, 11])


In [24]:
# Build the stacked denoising autoencoder from the layer widths in sae_dim;
# final_activation is explicitly set to None.
autoencoder = StackedDenoisingAutoEncoder(sae_dim, final_activation=None )

In [89]:
# NOTE(review): ae.load presumably restores previously saved weights into the model;
# which checkpoint it reads is not visible here — confirm in ptsdae.model.
autoencoder = ae.load(autoencoder)

In [90]:
# Move the model to the GPU when one is available, matching where train_data lives.
if cuda:
    autoencoder.cuda()

In [97]:
# Pick a single row of the test split to inspect one reconstruction by eye.
test = ds_test[4]
test

tensor([ 1.8667e+02,  1.8672e+02,  1.8646e+02,  1.8655e+02,  1.5002e+04,
         1.8653e+02,  3.7045e-02,  1.8664e+02,  1.8670e+02,  2.3831e-01,
        -1.3383e-01], device='cuda:0')

In [98]:
# Reconstruct the single sample. eval() puts the model in inference mode, and
# no_grad() skips autograd bookkeeping, which a pure forward pass does not need
# (without it the result carries a grad_fn and keeps the graph alive).
# NOTE(review): this rebinds `res`, which the configuration cell set to the data
# resolution string.
autoencoder.eval()
with torch.no_grad():
    res = autoencoder(test)

In [99]:
res

tensor([-1866.4093,   206.9140,   550.3729,  1203.5037, 11405.0811,  -830.6721,
         -656.0715,   -80.5345, -1926.2181,  -759.4712,  -133.9214],
       device='cuda:0', grad_fn=<AddBackward0>)

In [100]:
# NOTE(review): ae.load_trained presumably swaps in the fully trained weights before
# the evaluation below; what it loads, and whether the returned model stays on the
# GPU, is not visible here — confirm in ptsdae.model.
autoencoder = ae.load_trained(autoencoder)

In [101]:
# Unshuffled batches of 1024 rows over the test split, used for evaluation.
dataloader = DataLoader(ds_test, batch_size=1024, shuffle=False)

In [102]:
# Accumulate the reconstruction error over the test split.
# F.mse_loss averages within a batch, so each batch's loss is weighted by its row
# count; the running sum can then be divided by the total number of rows.
autoencoder.eval()
losses = 0.0
with torch.no_grad():  # evaluation only: no autograd graph is needed
    for batch in dataloader:
        if cuda:
            batch = batch.cuda(non_blocking=True)
        output = autoencoder(batch)
        loss = F.mse_loss(output, batch)
        losses += loss.item() * batch.size(0)

In [103]:
# Divide the batch-size-weighted sum by the number of rows the loader iterates over,
# giving the mean reconstruction MSE across the test split.
final_loss = losses/len(dataloader.sampler)
final_loss

435.6619319478116