In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [28]:
df = pd.read_feather('data/nq17-23_1min_quantile.feather')


In [29]:
data = torch.from_numpy(df[['top_wick', 'body', 'bottom_wick']].values).float()
labels = torch.from_numpy(df['quantile'].values).long()

#standardize the data separately for each column
data = (data - data.mean(dim=0)) / data.std(dim=0)
# check if nan in data
if torch.isnan(data).any().item():
    print('Check the data for nan')
    exit()


In [30]:
#split for train and test
split_idx = int(len(data) * 0.8)
train_data = data[:split_idx]
train_labels = labels[:split_idx]
test_data = data[split_idx:]
test_labels = labels[split_idx:]

# create datasets and dataloaders

train_dataset = TensorDataset(train_data, train_labels)
test_dataset = TensorDataset(test_data, test_labels)

batch_size = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)



In [31]:
#create FF model for classification
class CandleTockenizer(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=64):
        super(CandleTockenizer, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.apply(self.init_weights)
        
    def init_weights(self,module):
        nn.init.xavier_normal_(self.fc1.weight, gain=nn.init.calculate_gain('relu'))
        nn.init.xavier_uniform_(self.fc2.weight)
        if self.fc1.bias is not None:
            nn.init.zeros_(self.fc1.bias)
        if self.fc2.bias is not None:
            nn.init.zeros_(self.fc2.bias)
     
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x

In [32]:
input_size = 3
vocab_size = 1024 # labels.max().item() + 1
model = CandleTockenizer(input_size, vocab_size)
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model = model.to(device)
print(f'Using {device} device')

Using mps device


In [33]:


#loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
best_loss = float('inf')


In [34]:
#adjust lr in optimizer
for param_group in optimizer.param_groups:
    param_group['lr'] = 0.0001
    param_group['weight_decay'] = 0.01

In [35]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_batch_metrics(y_true, y_pred, num_classes):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', labels=range(num_classes), zero_division=1)
    return accuracy, precision, recall, f1

In [37]:
#train loop
epochs = 10

for epoch in range(epochs):
    model.train()
    losses = 0
    vlosses = 0
    for i, (data, labels) in enumerate(train_dataloader):
        data = data.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = loss_fn(outputs, labels)
        loss.backward()
        losses += loss.item()
        optimizer.step()

        if i % 10 == 0:
            print(f'Epoch: {epoch}, batch: {i}/{len(train_dataloader)}, loss: {losses / (i + 1)}', end='\r', flush=True)

    model.eval()
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1 = 0
    num_batches = len(test_dataloader)

    with torch.no_grad():
        
        for vdata, vlabels in test_dataloader:
            vdata = vdata.to(device)
            vlabels = vlabels.to(device)
            voutputs = model(vdata)
            vloss = loss_fn(voutputs, vlabels)
            vlosses += vloss.item()
        #calculate metrics for the current batch
            _, predicted = torch.max(voutputs.data, 1)
            accuracy, precision, recall, f1 = calculate_batch_metrics(vlabels.cpu().numpy(), predicted.cpu().numpy(), vocab_size)
            total_accuracy += accuracy
            total_precision += precision
            total_recall += recall
            total_f1 += f1

    final_accuracy = total_accuracy / num_batches
    final_precision = total_precision / num_batches
    final_recall = total_recall / num_batches
    final_f1 = total_f1 / num_batches        
    print(f'Epoch: {epoch}, train loss: {losses / len(train_dataloader)}, test loss: {vlosses / len(test_dataloader)}, accuracy: {final_accuracy*100}%, precision: {final_precision*100}%, recall: {final_recall*100}%, f1: {final_f1*100}%')

            

    if vlosses / len(test_dataloader) < best_loss:
        best_loss = vlosses / len(test_dataloader)
        #save model, optimizer, epoch, best_loss and accuracy
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_loss': best_loss,
            'accuracy': final_accuracy,
            'precision': final_precision,
            'recall': final_recall,
            'f1': final_f1
    
        }, 'models/candle_tokenizer.pth')

        print('Model saved')    
    
    # if epoch in [19, 59]:
    #     for param_group in optimizer.param_groups:
    #         param_group['lr'] *= 0.1
    #         print(f'Learning rate adjusted to {param_group["lr"]}')



    

Epoch: 0, train loss: 0.10904064622834253, test loss: 0.33295359357468385, accuracy: 94.20029328478965%, precision: 98.93620739785568%, recall: 94.20029328478965%, f1: 94.1594748727839%
Model saved
Epoch: 1, train loss: 0.11512079936961574, test loss: 0.32763470936880046, accuracy: 94.17265877831716%, precision: 98.92554572521757%, recall: 94.17265877831716%, f1: 94.12494183476686%
Model saved
Epoch: 2, train loss: 0.12244532740963252, test loss: 0.3262218031389267, accuracy: 93.80162823624595%, precision: 98.84579552897192%, recall: 93.80162823624595%, f1: 93.75462784988221%
Model saved
Epoch: 3, train loss: 0.1301199394246079, test loss: 0.3356403020263811, accuracy: 93.42069511866235%, precision: 98.73494607862003%, recall: 93.42069511866235%, f1: 93.3666442061604%
Epoch: 4, train loss: 0.13770731401196737, test loss: 0.329393668333822, accuracy: 93.4902491235167%, precision: 98.75283872159169%, recall: 93.4902491235167%, f1: 93.48991703056451%
Epoch: 5, train loss: 0.14554817479770

In [36]:
#restore model with best_loss and test it
checkpoint = torch.load('models/candle_tokenizer.pth')
model.load_state_dict(checkpoint['model_state_dict'])


<All keys matched successfully>

In [25]:
with torch.no_grad():
    outputs = model(test_data.to(device))
    labels = test_labels.to(device)
(torch.softmax(outputs, dim=1).argmax(dim=1) == labels).unique(return_counts=True)

(tensor([False,  True], device='mps:0'),
 tensor([ 13466, 461102], device='mps:0'))

In [7]:
#verify tokenization with data of higher timeframe
df_5min = df[['open', 'high', 'low', 'close']].resample('5min').agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'})
tick_size = 0.25
df_5min.dropna(inplace=True)

df_5min['body'] = ((df_5min['close'] - df_5min['open']) / tick_size).astype(int)
df_5min['top_wick'] = ((df_5min['high'] - df_5min[['open', 'close']].max(axis=1)) / tick_size).astype(int)
df_5min['bottom_wick'] = ((df_5min[['open', 'close']].min(axis=1) - df_5min['low']) / tick_size).astype(int)

In [8]:
verify_data = torch.from_numpy(df_5min[['top_wick', 'body', 'bottom_wick']].values).float()


#standardize the data separately for each column
verify_data = (verify_data - verify_data.mean(dim=0)) / verify_data.std(dim=0)
# check if nan in data
if torch.isnan(verify_data).any().item():
    print('Check the data for nan')
    exit()

result = torch.empty(len(verify_data), dtype=torch.long)

batch_size = 1024*4
with torch.no_grad():
    for i in range(0, len(verify_data), batch_size):
        batch = verify_data[i:i+batch_size].to(device)
        outputs = model(batch)
        result[i:i+batch_size] = torch.softmax(outputs, dim=1).argmax(dim=1).cpu()

df_5min['quantile'] = result.numpy()


In [16]:
df_5min['quantile'].value_counts()

quantile
357     10422
236      6677
297      6618
728      5327
142      5014
        ...  
310         1
893         1
1003        1
524         1
1011        1
Name: count, Length: 952, dtype: int64

In [59]:
#plot candles chart with quantiles usind plotly
import plotly.graph_objects as go
import plotly.express as px

data_df = df_5min.iloc[:1000]
#find 10 most frequent quantiles
top_quantiles = data_df['quantile'].value_counts().index[:10]


fig = go.Figure(data=[go.Candlestick(x=data_df.index,
                open=data_df['open'],
                high=data_df['high'],
                low=data_df['low'],
                close=data_df['close'])])

fig.update_layout(xaxis_rangeslider_visible=False)
fig.update_layout(xaxis=dict(type='category'))
fig.update_layout(xaxis=dict(rangeslider=dict(visible=False)))
#update fig size
fig.update_layout(
    autosize=False,
    width=2000,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
)

#add text labels for most frequent quantiles

for quantile in top_quantiles:
    fig.add_trace(go.Scatter(x=data_df[data_df['quantile'] == quantile].index, y=data_df[data_df['quantile'] == quantile]['high'], mode='text',name=str(quantile), text=str(quantile)))

fig.show()

