In [2]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import h5py
#print np in non-scientific notation
np.set_printoptions(suppress=True)


In [2]:

df = pd.read_feather('data/nq17-23_1min.feather')
df.index = df.index - pd.Timedelta(minutes=1)



In [3]:
tick_size = 0.25

df['body_size'] = (abs(df['close'] - df['open']) /tick_size).astype(int)
df['top_wick'] = ((df['high'] - df[['open', 'close']].max(axis=1)) /tick_size).astype(int)
df['bottom_wick'] = ((df[['open', 'close']].min(axis=1) - df['low']) /tick_size).astype(int)
df['direction'] = np.sign(df['close'] - df['open']).astype(int)
df['gap_size'] = (abs(df['open'] - df['close'].shift(1)) /tick_size)
df['gap_direction'] = np.sign(df['open'] - df['close'].shift(1))
df.dropna(inplace=True)
df['gap_size'] = df['gap_size'].astype(int)
df['gap_direction'] = df['gap_direction'].astype(int)
df['hour_start'] = (df.index.minute == 0).astype(int)
df['15min_start'] = (((df.index.minute % 15) == 0).astype(int) * (df.index.minute // 15)).isin(range(1,4)).astype(int)
df['5min_start'] = (((df.index.minute % 5) == 0).astype(int) * (df.index.minute // 5)).isin(range(1,12)).astype(int)





In [4]:
df

Unnamed: 0_level_0,open,high,low,close,volume,body_size,top_wick,bottom_wick,direction,gap_size,gap_direction,hour_start,15min_start,5min_start
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-01-02 18:01:00-05:00,4888.00,4888.50,4887.00,4887.00,90,4,2,0,-1,0,0,0,0,0
2017-01-02 18:02:00-05:00,4887.25,4888.00,4886.75,4887.75,70,2,1,2,1,1,1,0,0,0
2017-01-02 18:03:00-05:00,4887.75,4888.00,4887.50,4888.00,40,1,0,1,1,0,0,0,0,0
2017-01-02 18:04:00-05:00,4887.50,4890.00,4887.50,4890.00,89,10,0,0,1,2,-1,0,0,0
2017-01-02 18:05:00-05:00,4889.75,4890.00,4887.50,4888.00,116,7,1,2,-1,1,-1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-26 14:40:00-04:00,14268.25,14268.25,14256.75,14259.50,1044,35,0,11,-1,10,1,0,0,1
2023-10-26 14:41:00-04:00,14262.00,14282.50,14261.75,14282.00,1376,80,2,1,1,10,1,0,0,0
2023-10-26 14:42:00-04:00,14282.00,14286.25,14278.75,14281.00,1001,4,17,9,-1,0,0,0,0,0
2023-10-26 14:43:00-04:00,14281.25,14281.25,14281.00,14281.00,5,1,0,0,-1,1,1,0,0,0


In [8]:
def create_move(row, tick_size):
    if row['hour_start'] == 1:
        move = 'H'
    elif row['15min_start'] == 1:
        move = 'Q'
    elif row['5min_start'] == 1:
        move = 'F'
    else:
        move = 'M'
        
    if row['direction'] == 1 or row['direction'] == 0: # green candle. Assume doji is green
        move =  move + 'D' * int(row['bottom_wick']) + 'U' * int(row['bottom_wick'] + row['body_size'] + row['top_wick']) + 'D' * int(row['top_wick'])
    else: # red candle
        move = move + 'U' * int(row['bottom_wick']) + 'D' * int(row['bottom_wick'] + row['body_size'] + row['top_wick']) + 'U' * int(row['top_wick'])
    if row['gap_direction'] == 1: # if there is gap up after the candle
        move = move + 'G' * int(row['gap_size'])
    elif row['gap_direction'] == -1: # if there is gap down after the candle
        move =  move + 'g' * int(row['gap_size'])
    return move + 'E'

def create_move1(row, tick_size):
    if row['hour_start'] == 1:
        move = 'H'
    elif row['15min_start'] == 1:
        move = 'Q'
    elif row['5min_start'] == 1:
        move = 'F'
    else:
        move = 'M'
        
    if row['direction'] == 1 or row['direction'] == 0: # green candle. Assume doji is green
        move =  move + 'D' + str(int(row['bottom_wick'])) + 'U' + str(int(row['bottom_wick'] + row['body_size'] + row['top_wick'])) + 'D' + str(int(row['top_wick']))
    else: # red candle
        move = move + 'U' + str(int(row['bottom_wick'])) + 'D' + str(int(row['bottom_wick'] + row['body_size'] + row['top_wick'])) + 'U' + str(int(row['top_wick']))
    if row['gap_direction'] == 1: # if there is gap up after the candle
        move = move + 'G' + str(int(row['gap_size']))
    elif row['gap_direction'] == -1: # if there is gap down after the candle
        move =  move + 'g' + str(int(row['gap_size']))
    return move + 'E'

df['move'] = df.apply(lambda row: create_move1(row, tick_size), axis=1)


In [9]:
df.head(10)

Unnamed: 0_level_0,open,high,low,close,volume,body_size,top_wick,bottom_wick,direction,gap_size,gap_direction,hour_start,15min_start,5min_start,move
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-01-02 18:01:00-05:00,4888.0,4888.5,4887.0,4887.0,90,4,2,0,-1,0,0,0,0,0,MU0D6U2E
2017-01-02 18:02:00-05:00,4887.25,4888.0,4886.75,4887.75,70,2,1,2,1,1,1,0,0,0,MD2U5D1G1E
2017-01-02 18:03:00-05:00,4887.75,4888.0,4887.5,4888.0,40,1,0,1,1,0,0,0,0,0,MD1U2D0E
2017-01-02 18:04:00-05:00,4887.5,4890.0,4887.5,4890.0,89,10,0,0,1,2,-1,0,0,0,MD0U10D0g2E
2017-01-02 18:05:00-05:00,4889.75,4890.0,4887.5,4888.0,116,7,1,2,-1,1,-1,0,0,1,FU2D10U1g1E
2017-01-02 18:06:00-05:00,4887.75,4889.25,4887.25,4888.0,128,1,5,2,1,1,-1,0,0,0,MD2U8D5g1E
2017-01-02 18:07:00-05:00,4887.5,4888.0,4887.0,4887.75,53,1,1,2,1,2,-1,0,0,0,MD2U4D1g2E
2017-01-02 18:08:00-05:00,4887.5,4887.75,4887.0,4887.75,62,1,0,2,1,1,-1,0,0,0,MD2U3D0g1E
2017-01-02 18:09:00-05:00,4887.5,4887.5,4886.75,4887.25,44,1,0,2,-1,1,-1,0,0,0,MU2D3U0g1E
2017-01-02 18:10:00-05:00,4887.0,4887.25,4887.0,4887.0,40,0,1,0,0,1,-1,0,0,1,FD0U1D1g1E


In [10]:
candles_seq = " ".join(df['move'].values.tolist())
#write to file
with open('data/nq17-23_1min_moves_3.txt', 'w') as f:
    f.write(candles_seq)



In [5]:
# load indexed tocken sequence
with open('data/indexed_tokens_100.txt', 'r') as f:
    sequence = f.read().split('\n') 


In [6]:
sequence

['21 28 81',
 '23 20 77',
 '10 1 32',
 '46 25 50 17',
 '3 1 58 86',
 '23 44 19 9',
 '23 13 12 17',
 '23 7 56',
 '31 5 68',
 '72 6 74',
 '73 74',
 '10 52 32',
 '21 5 37',
 '23 13 77',
 '34 50 6 77',
 '41 20 56',
 '41 20 77',
 '73 89',
 '41 20 11 2',
 '72 6 56',
 '18 4 68',
 '46 56',
 '93 64',
 '21 12 57',
 '72 6 32',
 '10 1 74',
 '73 89',
 '10 1 77',
 '18 5 37',
 '34 12 1 12 17',
 '23 29 32',
 '21 5 37',
 '10 1 48',
 '92 56',
 '72 1 56',
 '88 56',
 '88 32',
 '92 56',
 '88 56',
 '72 6 32',
 '46 32',
 '18 5 64',
 '18 4 37',
 '93 37',
 '34 33 12 57',
 '21 12 57',
 '88 56',
 '73 32',
 '46 32',
 '3 6 4 68',
 '21 12 37',
 '10 6 61',
 '21 5 68',
 '45 29 5 9',
 '3 33 4 37',
 '46 56',
 '18 4 37',
 '46 74',
 '88 32',
 '83 50 6 61',
 '21 19 86',
 '10 7 89',
 '21 11 37',
 '21 5 86',
 '3 33 51 1 16',
 '73 48',
 '10 7 61',
 '93 33 16',
 '10 6 56',
 '3 33 4 57',
 '10 6 32',
 '21 5 81',
 '93 64',
 '21 19 37',
 '34 12 7 32',
 '21 5 37',
 '73 61',
 '21 36 86',
 '45 20 77',
 '72 13 56',
 '10 20 5 8',
 '21

In [7]:
#max word length in the sequence
max_word_len = max([len(word) for word in sequence])
print(max_word_len)
#average word length in the sequence
avg_word_len = np.median([len(word) for word in sequence])
print(avg_word_len)

39
10.0


In [8]:

#convert each line to list of integers
sequence = [list(map(int, line.split())) for line in sequence]

# merge list of tockens into one list
sequence = [item for sublist in sequence for item in sublist]

In [9]:
import torch, h5py
seq_tensor = torch.tensor(sequence)
seq_tensor = seq_tensor[~torch.isnan(seq_tensor)].to(torch.int64)


In [15]:
seq_tensor.unique(return_counts=True)

(tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
         37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
         55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
         73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
         91, 92, 93, 94, 95, 96, 97, 98, 99]),
 tensor([315301, 275521, 264866, 253045, 245420, 240689, 240339, 233859, 229113,
         227666, 211898, 200027, 186810, 186626, 180913, 180127, 178360, 178051,
         164736, 160645, 160528, 155786, 145989, 144580, 141645, 139779, 137660,
         134063, 131622, 128417, 127789, 125570, 123539, 119240, 119028, 112088,
         109591, 107146, 106292, 105966, 103786,  99283,  94276,  93739,  93455,
          93093,  87783,  83995,  83299,  83104,  81131,  77644,  74377,  72927,
          72201,  72067,  70577,  70479,  68589,  67247,  6611

In [11]:
with h5py.File('data/nq17-23_1min_candle_seq_bpe100.hdf5', 'w') as f:
       dataset = f.create_dataset('data', shape=seq_tensor.shape, dtype='i8')
       dataset[:] = seq_tensor[:]



In [12]:
dataset

<Closed HDF5 dataset>

In [13]:
#load hdf5 file
with h5py.File('data/nq17-23_1min_candle_seq_bpe100.hdf5', 'r') as f:
    data = f['data'][:]
    

In [14]:
data

array([21, 28, 81, ..., 66, 24,  2])

In [5]:
# loac index to token mapping json file
import json
with open('data/index_to_token_100.json', 'r') as f:
    index_to_token = json.load(f)

In [4]:
data

array([65, 61, 18, ..., 11, 95,  1])