In [1]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
#print np in non-scientific notation
np.set_printoptions(suppress=True)
from torch.utils.data import TensorDataset, DataLoader

# print torch data in non-scientific notation
torch.set_printoptions(sci_mode=False)


In [2]:

df = pd.read_feather('data/nq17-23_1min.feather')
df.index = df.index - pd.Timedelta(minutes=1)

In [3]:
tick_size = 0.25
df['volume'] = df['volume'].astype(float)
df['body'] = (df['close'] - df['open']) / tick_size
df['top_wick'] = ((df['high'] - df[['open', 'close']].max(axis=1)) / tick_size)
df['bottom_wick'] = ((df[['open', 'close']].min(axis=1) - df['low']) / tick_size)
#df['direction'] = np.sign(df['close'] - df['open'])

In [8]:


candles = df[['volume','top_wick','body', 'bottom_wick']].values
#standardize candles
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
candles = scaler.fit_transform(candles)

candles = torch.tensor(candles, dtype=torch.float32)

split_idx = int(len(candles) * 0.8)
train_ds = TensorDataset(candles[:split_idx])
valid_ds = TensorDataset(candles[split_idx:])

batch_size = 16
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, num_workers=4)

In [5]:
class CandleAutoencoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size, latent_size):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, latent_size),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(latent_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, input_size),
        )
        self.weigths_init(self.encoder)
        self.weigths_init(self.decoder)


    def weigths_init(self, module):
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
    

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat    

In [6]:
input_size = 4
hidden_size = 64
latent_size = 2
def init_model():
    device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

    #device = torch.device('cpu')

    print(f"Device to be used: {device}")
    #Initialize model
    torch.manual_seed(42)
    model = CandleAutoencoder(input_size, hidden_size, latent_size)

    model = model.to(device)
    #print(model)
    #print model device
    next(model.parameters()).device
    return model, device

In [9]:
model,device = init_model()

loss_func = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
best_vloss = float('inf')



Device to be used: mps


In [10]:
start_epoch = 0
num_epochs = 5

for epoch in range(start_epoch, num_epochs):
    model.train()
    train_loss = 0
    for batch_idx, (data,) in enumerate(train_dl):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_func(output, data)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"\rEpoch {epoch+1}/{num_epochs}, batch {batch_idx+1}/{len(train_dl)}, train loss: {train_loss/(batch_idx+1):.8f}", end="\r", flush=True)
    train_loss /= (batch_idx + 1)

    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch_idx, (data,) in enumerate(valid_dl):
            data = data.to(device)
            output = model(data)
            loss = loss_func(output, data)
            valid_loss += loss.item()
            if batch_idx % 100 == 0:
                print(f"\rEpoch {epoch+1}/{num_epochs}, batch {batch_idx+1}/{len(valid_dl)}, valid loss: {valid_loss/(batch_idx+1):.8f}", end="\r", flush=True)
    valid_loss /= (batch_idx + 1)

    if valid_loss/len(valid_dl) < best_vloss:
        best_vloss = valid_loss/len(valid_dl)
        lr = optimizer.param_groups[0]['lr']
        wd = optimizer.param_groups[0]['weight_decay']

        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss/len(train_dl),
            'valid_loss': valid_loss/len(valid_dl),
            'learning_rate': lr,
            'weight_decay': wd,
            'hidden_size': hidden_size,
            'latent_size': latent_size,
        }, 'models/candle_autoencoder.pt')
        print(f"\nBest model so far, saving...\n")

    print(f"Epoch {epoch+1}/{num_epochs}, train loss: {train_loss/len(train_dl):.6f}, valid loss: {valid_loss/len(valid_dl):.8f}")

Epoch 1/5, batch 29601/29661, valid loss: 0.2666924558
Best model so far, saving...

Epoch 1/5, train loss: 0.000001, valid loss: 0.00000899
Epoch 2/5, batch 29601/29661, valid loss: 0.2437081437
Best model so far, saving...

Epoch 2/5, train loss: 0.000001, valid loss: 0.00000821
Epoch 3/5, batch 29601/29661, valid loss: 0.2288503397
Best model so far, saving...

Epoch 3/5, train loss: 0.000001, valid loss: 0.00000771
Epoch 4/5, batch 29601/29661, valid loss: 0.2202829105
Best model so far, saving...

Epoch 4/5, train loss: 0.000001, valid loss: 0.00000742
Epoch 5/5, batch 29601/29661, valid loss: 0.2136756995
Best model so far, saving...

Epoch 5/5, train loss: 0.000001, valid loss: 0.00000720


In [11]:
#load best model

checkpoint = torch.load('models/candle_autoencoder.pt')
model.load_state_dict(checkpoint['model_state_dict'])

output = torch.empty(candles.shape[0], latent_size)
#run model encoder on validation set
model.eval()
complete_dataset = TensorDataset(candles)
complete_dataloader = DataLoader(complete_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

with torch.inference_mode():
    for batch_idx, (data,) in enumerate(complete_dataloader):
        data = data.to(device)
        out = (model.encoder(data).cpu())
        output[batch_idx*batch_size:(batch_idx+1)*batch_size] = out
        print(f"Batch number: {batch_idx+1}/{len(complete_dataloader)}", end="\r", flush=True)


output = torch.softmax(output, dim=1).argmax(dim=1)




Batch number: 148303/148303

In [12]:
df['class'] = output.numpy()

In [14]:
print(df['class'].value_counts())
df[df['class'] == 1]


class
1    1258648
0    1114188
Name: count, dtype: int64


Unnamed: 0_level_0,open,high,low,close,volume,body,top_wick,bottom_wick,class
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-02 18:01:00-05:00,4888.00,4888.50,4887.00,4887.00,90.0,-4.0,2.0,0.0,1
2017-01-02 18:02:00-05:00,4887.25,4888.00,4886.75,4887.75,70.0,2.0,1.0,2.0,1
2017-01-02 18:03:00-05:00,4887.75,4888.00,4887.50,4888.00,40.0,1.0,0.0,1.0,1
2017-01-02 18:05:00-05:00,4889.75,4890.00,4887.50,4888.00,116.0,-7.0,1.0,2.0,1
2017-01-02 18:07:00-05:00,4887.50,4888.00,4887.00,4887.75,53.0,1.0,1.0,2.0,1
...,...,...,...,...,...,...,...,...,...
2023-10-26 14:37:00-04:00,14282.75,14284.75,14271.00,14272.50,1981.0,-41.0,8.0,6.0,1
2023-10-26 14:39:00-04:00,14268.75,14269.25,14264.00,14265.75,797.0,-12.0,2.0,7.0,1
2023-10-26 14:40:00-04:00,14268.25,14268.25,14256.75,14259.50,1044.0,-35.0,0.0,11.0,1
2023-10-26 14:43:00-04:00,14281.25,14281.25,14281.00,14281.00,5.0,-1.0,0.0,0.0,1
