In [1]:
import pandas as pd
import yfinance as yf
import plotly.graph_objects as go
import numpy as np
from create_vocab import tokenize_word
import json
import torch
#from nq_llm_decoder import AutoregressionDecoderModel

In [86]:
# load 1 min NQ data from yfinance
nq = yf.download(tickers='NQ=F', period='max', interval='1m')
nq.drop(['Adj Close'], axis=1, inplace=True)
nq.columns = ['open', 'high', 'low', 'close', 'volume']



[*********************100%%**********************]  1 of 1 completed


In [87]:
nq.loc[nq['volume'] == 0, 'volume'] = 1

In [88]:
def count_moves(df, tick_size):
    df['body_size'] = (abs(df['close'] - df['open']) /tick_size).astype(int)
    df['top_wick'] = ((df['high'] - df[['open', 'close']].max(axis=1)) /tick_size).astype(int)
    df['bottom_wick'] = ((df[['open', 'close']].min(axis=1) - df['low']) /tick_size).astype(int)
    df['direction'] = np.sign(df['close'] - df['open']).astype(int)
    df['gap_size'] = (abs(df['open'] - df['close'].shift(1)) /tick_size)
    df['gap_direction'] = np.sign(df['open'] - df['close'].shift(1))
    df.fillna(0, inplace=True)
    df['gap_size'] = df['gap_size'].astype(int)
    df['gap_direction'] = df['gap_direction'].astype(int)
    df['hour_start'] = (df.index.minute == 0).astype(int)
    df['15min_start'] = (((df.index.minute % 15) == 0).astype(int) * (df.index.minute // 15)).isin(range(1,4)).astype(int)
    df['5min_start'] = (((df.index.minute % 5) == 0).astype(int) * (df.index.minute // 5)).isin(range(1,12)).astype(int)
    df['volume_per_tick'] = np.where((df['body_size'] + df['top_wick'] + df['bottom_wick']) >0, np.ceil(df['volume'] / (df['body_size'] + df['top_wick'] + df['bottom_wick'])), 0)
    return df

def create_move(row, tick_size):
    if row['hour_start'] == 1:
        move = 'H'
    elif row['15min_start'] == 1:
        move = 'Q'
    elif row['5min_start'] == 1:
        move = 'F'
    else:
        move = 'M'
        
    if row['gap_direction'] == 1: # if there is gap up after the candle
        move = move + 'G' * int(row['gap_size'])
    elif row['gap_direction'] == -1: # if there is gap down after the candle
        move =  move + 'g' * int(row['gap_size'])
        
    if row['direction'] == 1 or row['direction'] == 0: # green candle. Assume doji is green
        move =  move + 'D' * int(row['bottom_wick']) + 'U' * int(row['bottom_wick'] + row['body_size'] + row['top_wick']) + 'D' * int(row['top_wick'])
    else: # red candle
        move = move + 'U' * int(row['bottom_wick']) + 'D' * int(row['bottom_wick'] + row['body_size'] + row['top_wick']) + 'U' * int(row['top_wick'])
    
    return move + 'E'


def create_move1(row, tick_size):
    if row['hour_start'] == 1:
        move = 'H'
    elif row['15min_start'] == 1:
        move = 'Q'
    elif row['5min_start'] == 1:
        move = 'F'
    else:
        move = 'M'

    if row['gap_direction'] == 1: # if there is gap up after the candle
        move = move + 'G' + str(int(row['gap_size'])) + ''
    elif row['gap_direction'] == -1: # if there is gap down after the candle
        move =  move + 'g' + str(int(row['gap_size'])) + ''

    if row['direction'] == 1 or row['direction'] == 0: # green candle. Assume doji is green
        move =  move + 'D' + str(int(row['bottom_wick'])) + 'U' + str(int(row['bottom_wick'] + row['body_size'] + row['top_wick'])) + 'D' + str(int(row['top_wick'])) + ''
    else: # red candle
        move = move + 'U' + str(int(row['bottom_wick'])) + 'D' + str(int(row['bottom_wick'] + row['body_size'] + row['top_wick'])) + 'U' + str(int(row['top_wick'])) + ''
    
     
    move = move + 'V' + str(int(row['volume_per_tick'])) + 'E'
    return move


In [89]:
tick_size = 0.25
nq = count_moves(nq, tick_size)
nq['move'] = nq.apply(lambda row: create_move1(row, tick_size), axis=1)


In [90]:

moves_data = nq['move'].tolist()


In [91]:
vocab_file = 'data/NQ_vocab_vol_99.json'
tok_to_idx_file = 'data/NQ_tok_to_idx_vol_100.json'
idx_to_tok_file = 'data/NQ_idx_to_tok_vol_100.json'

#load vocab and tok_to_idx
with open(vocab_file, 'r') as f:
        vocab = json.load(f)
    
#change vocab keys from str to int
vocab = {int(k):v for k,v in vocab.items()}
vocab = dict(sorted(vocab.items(), reverse=True))
#vocab_size = sum(len(v) for v in vocab.values()) +1
vocab_size = 62

with open(tok_to_idx_file, 'r') as f:
        token_to_idx = json.load(f)

with open(idx_to_tok_file, 'r') as f:
        idx_to_token = json.load(f)


print('Tokenizing data...')
tokens = [tokenize_word(word, vocab) for word in moves_data]
        

Tokenizing data...


In [92]:
print('Encoding tokens...')
encoded_tokens = []
for token in tokens:
    encoded_tokens.extend([token_to_idx[t] for t in token])

Encoding tokens...


In [93]:
import torch.nn as nn
class AutoregressionDecoderModel(nn.Module):
    def __init__(self, vocab_size, seq_len, embed_dim, num_heads, num_layers, dropout=0.1, ff_mult=4, global_tokens=None):
        super(AutoregressionDecoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0,)
        self.pos_encoder = nn.Embedding(seq_len, embed_dim, padding_idx=0) 
        #self.pos_encoder = PositionalEncoding(embed_dim, dropout, seq_len)
        self.decoder_layer = nn.TransformerDecoderLayer(embed_dim, num_heads, embed_dim*ff_mult, dropout, batch_first=True)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers)
        self.output_layer = nn.Linear(embed_dim, vocab_size)
        self.global_tokens = global_tokens
        
        self.embed_dim = embed_dim
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight, gain=nn.init.calculate_gain('relu'))
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        if isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0, std=0.05)
        if isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)
        if isinstance(module, nn.BatchNorm1d):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)


    def forward(self, src):
        positions = torch.arange(0, src.size(1)).unsqueeze(0).to(src.device)
        x = self.embedding(src) + self.pos_encoder(positions)
        #x = self.embedding(src)
        #x = self.pos_encoder(x)
        tgt_mask = torch.nn.Transformer.generate_square_subsequent_mask(x.size(1),)
        
        tgt_mask = tgt_mask.to(src.device)
        output = self.decoder(x, x, tgt_mask=tgt_mask)
        output = self.output_layer(output)
        if self.global_tokens != None:
            output = output[:,self.global_tokens:,:-self.global_tokens*10]

        return output

In [103]:

global_tokens = 2
vocab_size = 117
embed_dim = 64
num_heads = 8
num_layers = 8
dropout = 0.1
ff_mult = 4
seq_len = 98
model_path = 'models/nq-llm_decoder_bpe_vol_100_best.pth'

model = AutoregressionDecoderModel(vocab_size, seq_len, embed_dim, num_heads, num_layers, dropout, ff_mult, global_tokens)
checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])
model.eval();

In [134]:
input_data = torch.tensor(encoded_tokens).unsqueeze(0)
input_seq = input_data[:, -98:]
glob_0 = range(97,107)
glob_1 = range(107,117)
input_seq[:,0] = glob_1[-1]
input_seq[:,1] = glob_0[-1]
input_seq

tensor([[116, 106,   2,  28,  36,  59,   5,  46,   1,  11,  45,  23,  34,  51,
           1,   2,  33,   5,  56,  22,  49,   1,   2,  27,  21,  69,  10,  38,
           1,   2,  27,  40,   3,  23,   9,  41,   1,   2,   6,  69,  43,  49,
           1,  44,  40,   4,  24,   8,  38,   1,   2,  62,  16,  45,  20,  12,
          38,   1,   2,   8,   3,  23,   9,  25,   1,   2,   9,  56,   9,  58,
           1,   2,  61,  71,   8,  51,   1,  11,  34,  78,  21,  46,   1,   2,
          52,   5,   3,  26,   9,  41,   1,   2,  32,  60,  76,   5,  96,   1]])

In [218]:

with torch.inference_mode():
    iter = 0
    prediction = torch.tensor([0])
    while prediction[-1] != 1:
        output = model(input_seq)        
        prediction = torch.softmax(output, dim=2).argmax(dim=2).squeeze()
        input_seq[0, 2:] = prediction
        iter += 1

        
input_seq, prediction                


(tensor([[116, 106,  45,  16,  34,  41,   1,   2,  33,  45,  16,  34,  41,   1,
           11,  33,  45,  16,  34,  41,   1,   2,  33,  45,  16,  34,  41,   1,
            2,  33,  45,  16,  34,  41,   1,   2,  33,  45,  16,  34,  41,   1,
            2,  33,  45,  16,  34,  41,   1,  11,  33,  45,  16,  34,  41,   1,
            2,  33,  45,  16,  34,  41,   1,   2,  33,  45,  16,  34,  41,   1,
            2,  33,  45,  16,  34,  41,   1,   2,  33,  45,  16,  34,  41,   1,
           44,  33,  45,  16,  34,  41,   1,   2,  33,  45,  16,  34,  41,   1]]),
 tensor([45, 16, 34, 41,  1,  2, 33, 45, 16, 34, 41,  1, 11, 33, 45, 16, 34, 41,
          1,  2, 33, 45, 16, 34, 41,  1,  2, 33, 45, 16, 34, 41,  1,  2, 33, 45,
         16, 34, 41,  1,  2, 33, 45, 16, 34, 41,  1, 11, 33, 45, 16, 34, 41,  1,
          2, 33, 45, 16, 34, 41,  1,  2, 33, 45, 16, 34, 41,  1,  2, 33, 45, 16,
         34, 41,  1,  2, 33, 45, 16, 34, 41,  1, 44, 33, 45, 16, 34, 41,  1,  2,
         33, 45, 16, 34, 41,  1]

In [107]:
, iter

(tensor([33,  5, 56, 22, 49,  1,  2, 27, 21, 69, 10, 38,  1,  2, 27, 40,  3, 23,
          9, 41,  1,  2,  6, 69, 43, 49,  1, 44, 40,  4, 24,  8, 38,  1,  2, 62,
         16, 45, 20, 12, 38,  1,  2,  8,  3, 23,  9, 25,  1,  2,  9, 56,  9, 58,
          1,  2, 61, 71,  8, 51,  1, 11, 34, 78, 21, 46,  1,  2, 52,  5,  3, 26,
          9, 41,  1,  2, 32, 60, 76,  5, 96,  1,  2, 32, 45, 23, 34, 41,  1,  2,
         32, 45, 23, 34, 41,  1]),
 0)

In [82]:

#convert indexes to tokens
best_prediction_tokens = [idx_to_token[str(idx.item())] for idx in best_prediction[:]]
best_prediction_tokens

['D5',
 'V7',
 'E',
 'M',
 'G2',
 'U3',
 'D11',
 'U3',
 'V7',
 'E',
 'F',
 'g',
 '4',
 'U4',
 'D16',
 'U5',
 'V8',
 'E',
 'M',
 'G2',
 'U8',
 'D15',
 'U2',
 'V8',
 'E',
 'M',
 'g2',
 'U13',
 'D2',
 '7',
 'U0',
 'V11',
 'E',
 'M',
 'g1U1',
 '3',
 'D3',
 '0',
 'U7',
 'V9',
 'E',
 'M',
 'g1',
 'D0U1',
 '8',
 'D4',
 'V6',
 'E',
 'F',
 'g1D1',
 'U8',
 'D7',
 'V1',
 'E',
 'M',
 'g2',
 'D0U1',
 '2',
 'D5',
 'V7',
 'E',
 'M',
 'g2',
 'D0U1',
 '2',
 'D6',
 'V7',
 'E',
 'M',
 'g2',
 'D0U1',
 '2',
 'D6',
 'V7',
 'E',
 'M',
 'g2',
 'D0U1',
 '2',
 'D6',
 'V7',
 'E',
 'Q',
 'g2',
 'D0U1',
 '2',
 'D6',
 'V7',
 'E',
 'M',
 'g2',
 'D0U1',
 '2',
 'D6',
 'V7',
 'E']

In [12]:
def predict_candle(model, input_data, idx_to_token):
    with torch.inference_mode():
        output = model(input_data)
        prediction = torch.softmax(output, dim=2).argmax(dim=2).squeeze()

        probability = [torch.softmax(output, dim=2).squeeze()[-1][prediction[-1]].item()]
        iter = 1
    
        while prediction[-1] != 1 and iter < 10:
            
            output = model(prediction.unsqueeze(0))
            prob = torch.softmax(output, dim=2).squeeze()
            prediction = prob.argmax(dim=1)
            probability.append((prob[-1][prediction[-1]]).item())
            iter += 1
            print(prediction[-1])
        #input_data, prediction[-iter:]

    #decode prediction
    decoded = []
    for idx in prediction[-iter:]:
        decoded.append(idx_to_token[str(idx.item())])
    return decoded, prediction

In [2]:
def restore_candle (last_close, time, decoded, tick_size):
    move = ''.join(decoded[1:-1])
    
    
    price = [last_close]
    for step in move:
        if step == 'g':
            price[0] -= tick_size
        elif step == 'G':
            price[0] += tick_size
        elif step == 'U':
            price.append(price[-1] + tick_size)
        elif step == 'D':
            price.append(price[-1] - tick_size)
    open = price[0]    
    close = price[-1]
    high = max(price)
    low = min(price)    
    time = time + pd.Timedelta(minutes=1)
    df = pd.DataFrame({'open':open, 'high':high, 'low':low, 'close':close, 'move':move}, index=[time])
    return df
import re
def expand_prediction(s):
    s = s[1:-1]
    result = ''
    matches = re.findall(r'([A-Za-z])(\d+)', s)
    for match in matches:
        letter, number = match
        result += letter * int(number)
    else:
        result = None 
    return result

a = expand_prediction('44444444')

In [31]:
#predict and restore next 5 candles
nq.dropna(inplace=True)
number_of_candles = 1

#predicting first candle
decoded, prediction = predict_candle(model, input_data, idx_to_token)
# new_candle = restore_candle(nq['close'].iloc[-1], nq.index[-1], decoded, tick_size)
# nq = pd.concat([nq, new_candle])


# #predicting next candles
# for i in range(number_of_candles-1):
#     decoded, prediction = predict_candle(model, prediction.unsqueeze(0), idx_to_token)
#     new_candle = restore_candle(nq['close'].iloc[-1], nq.index[-1], decoded, tick_size)
#     nq = pd.concat([nq, new_candle])


tensor(12)
tensor(34)


tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)
tensor(34)


In [32]:
#plot candles using plotly
fig = go.Figure(data=[go.Candlestick(x=nq.index,
                open=nq['open'],
                high=nq['high'],
                low=nq['low'],
                close=nq['close'])])
fig.update_layout(xaxis_rangeslider_visible=False)
#set figure size
fig.update_layout(
    autosize=False,
    width=800,
    height=700,)
fig.show()

In [38]:
nq

Unnamed: 0,open,high,low,close,body_size,top_wick,bottom_wick,direction,gap_size,gap_direction,hour_start,15min_start,5min_start,move
2024-01-24 00:56:00-05:00,17605.5,17605.75,17604.25,17605.0,2.0,1.0,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,MUUUDDDDDDUE
2024-01-24 00:57:00-05:00,17604.75,17605.0,17603.5,17603.5,5.0,1.0,0.0,-1.0,1.0,-1.0,0.0,0.0,0.0,MgDDDDDDUE
2024-01-24 00:58:00-05:00,17603.25,17605.75,17603.0,17605.25,8.0,2.0,1.0,1.0,1.0,-1.0,0.0,0.0,0.0,MgDUUUUUUUUUUUDDE
2024-01-24 00:59:00-05:00,17605.0,17605.75,17603.0,17603.25,7.0,3.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,MgUDDDDDDDDDDDUUUE
2024-01-24 01:00:00-05:00,17603.5,17610.25,17603.5,17608.75,21.0,6.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,HGUUUUUUUUUUUUUUUUUUUUUUUUUUUDDDDDDE
2024-01-24 01:01:00-05:00,17609.25,17612.5,17607.25,17607.5,7.0,13.0,1.0,-1.0,2.0,1.0,0.0,0.0,0.0,MGGUDDDDDDDDDDDDDDDDDDDDDUUUUUUUUUUUUUE
2024-01-24 01:02:00-05:00,17607.75,17607.75,17600.75,17601.5,25.0,0.0,3.0,-1.0,1.0,1.0,0.0,0.0,0.0,MGUUUDDDDDDDDDDDDDDDDDDDDDDDDDDDDE
2024-01-24 01:03:00-05:00,17602.5,17606.25,17602.25,17605.25,11.0,4.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,MGGGGDUUUUUUUUUUUUUUUUDDDDE
2024-01-24 01:04:00-05:00,17605.75,17607.75,17604.75,17606.0,1.0,7.0,4.0,1.0,2.0,1.0,0.0,0.0,0.0,MGGDDDDUUUUUUUUUUUUDDDDDDDE
2024-01-24 01:05:00-05:00,17606.5,17612.25,17606.5,17612.25,23.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,1.0,FGGUUUUUUUUUUUUUUUUUUUUUUUE
