In [1]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
#print np in non-scientific notation
np.set_printoptions(suppress=True)


In [2]:

df = pd.read_feather('data/nq17-23_1min.feather')
df.index = df.index - pd.Timedelta(minutes=1)



In [7]:
tick_size = 0.25
df['volume'] = df['volume'].astype(float)
df['body'] = (abs(df['close'] - df['open']) / tick_size)
df['top_wick'] = ((df['high'] - df[['open', 'close']].max(axis=1)) / tick_size)
df['bottom_wick'] = ((df[['open', 'close']].min(axis=1) - df['low']) / tick_size)
df['direction'] = np.sign(df['close'] - df['open'])
df['inner_candle']  = np.where((df['high'] < df['high'].shift(1)) & (df['low'] > df['low'].shift(1)), 1.0, 0.0)
df['outer_candle']  = np.where((df['high'] > df['high'].shift(1)) & (df['low'] < df['low'].shift(1)), 1.0, 0.0)

df.dropna(inplace=True)



In [8]:
df['bottom_wick'].value_counts()

bottom_wick
0.0      637803
1.0      397670
2.0      289428
3.0      208654
4.0      155104
          ...  
251.0         1
126.0         1
213.0         1
214.0         1
144.0         1
Name: count, Length: 233, dtype: int64

In [5]:
import h5py
#read cluster  data from data/nq17-23_1min_candle_seq_1024.hdf5 ad add to df
with h5py.File("data/nq17-23_1min_candle_seq_1024.hdf5", "r") as f:
    df['cluster'] = f["data"][:]
    


In [8]:
df

Unnamed: 0_level_0,open,high,low,close,volume,body,top_wick,bottom_wick,direction,inner_candle,outer_candle
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-02 18:00:00-05:00,4876.75,4890.25,4876.75,4888.00,488.0,45.0,9.0,0.0,1.0,0.0,0.0
2017-01-02 18:01:00-05:00,4888.00,4888.50,4887.00,4887.00,90.0,4.0,2.0,0.0,-1.0,1.0,0.0
2017-01-02 18:02:00-05:00,4887.25,4888.00,4886.75,4887.75,70.0,2.0,1.0,2.0,1.0,0.0,0.0
2017-01-02 18:03:00-05:00,4887.75,4888.00,4887.50,4888.00,40.0,1.0,0.0,1.0,1.0,0.0,0.0
2017-01-02 18:04:00-05:00,4887.50,4890.00,4887.50,4890.00,89.0,10.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2023-10-26 14:40:00-04:00,14268.25,14268.25,14256.75,14259.50,1044.0,35.0,0.0,11.0,-1.0,0.0,0.0
2023-10-26 14:41:00-04:00,14262.00,14282.50,14261.75,14282.00,1376.0,80.0,2.0,1.0,1.0,0.0,0.0
2023-10-26 14:42:00-04:00,14282.00,14286.25,14278.75,14281.00,1001.0,4.0,17.0,9.0,-1.0,0.0,0.0
2023-10-26 14:43:00-04:00,14281.25,14281.25,14281.00,14281.00,5.0,1.0,0.0,0.0,-1.0,1.0,0.0


In [149]:
# from statsmodels.tsa.stattools import adfuller

# result = adfuller(df['top_wick'].iloc[0:100000])
# print('ADF Statistic: %f' % result[0])
# print('p-value: %f' % result[1])

# # Вывод результатов
# for key, value in result[4].items():
#     print('\t%s: %.3f' % (key, value))

In [150]:

# body_list = sorted(df[['body'].unique().tolist())

# #convert value to index
# body_to_index = {v: (i) for i, v in enumerate(body_list)}

# #convert index to value
# index_to_body = {i: v for i, v in enumerate(body_list)}

# #apply mapping of value to index
# df['candle_index'] = df['body'].map(body_to_index)



In [9]:
from sklearn.cluster import KMeans

# Предполагаем, что df - это ваш DataFrame
X = df[['volume', 'top_wick', 'body', 'bottom_wick','direction','inner_candle',	'outer_candle']].to_numpy()
X [:,:4] = X[:,:4] + 1 #add 1 to avoid log(0) = -inf
X[:,:4] = np.log(X[:,:4])


#standardize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[:,:4] = scaler.fit_transform(X[:,:4])


# Количество квантилей
num_quantiles = 50

# Применение K-means
cluster = KMeans(n_clusters=num_quantiles, random_state=0, n_init=10).fit_predict(X)


In [9]:
cluster.max()

99

In [10]:

# Получение меток кластера для каждой свечи
df['cluster'] = cluster


In [14]:
#print(df['cluster'].value_counts())
df[df['cluster'] == 25]

Unnamed: 0_level_0,open,high,low,close,volume,body,top_wick,bottom_wick,direction,inner_candle,outer_candle,cluster
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-01-02 18:06:00-05:00,4887.75,4889.25,4887.25,4888.00,128.0,1.0,5.0,2.0,1.0,0.0,0.0,25
2017-01-03 09:54:00-05:00,4896.75,4898.50,4896.00,4897.00,493.0,1.0,6.0,3.0,1.0,1.0,0.0,25
2017-01-05 08:20:00-05:00,4923.50,4924.75,4923.00,4923.75,228.0,1.0,4.0,2.0,1.0,0.0,1.0,25
2017-01-05 09:59:00-05:00,4950.50,4952.75,4949.50,4951.50,272.0,4.0,5.0,4.0,1.0,0.0,0.0,25
2017-01-05 11:35:00-05:00,4944.75,4946.00,4944.25,4945.00,184.0,1.0,4.0,2.0,1.0,0.0,0.0,25
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-26 06:54:00-04:00,14337.50,14340.50,14336.50,14338.75,127.0,5.0,7.0,4.0,1.0,0.0,0.0,25
2023-10-26 06:55:00-04:00,14339.00,14341.50,14337.75,14339.25,126.0,1.0,9.0,5.0,1.0,0.0,0.0,25
2023-10-26 06:57:00-04:00,14336.50,14339.50,14334.50,14336.75,176.0,1.0,11.0,8.0,1.0,0.0,0.0,25
2023-10-26 08:14:00-04:00,14336.50,14338.25,14333.75,14337.00,151.0,2.0,5.0,11.0,1.0,0.0,0.0,25


In [15]:
df.to_feather('data/nq17-23_1min_quantile_50.feather')
#df = pd.read_feather('data/nq17-23_1min_quantile_100.feather')
#df

In [23]:
candles = torch.tensor(df['cluster'].values, dtype=torch.long)
candles = candles.unfold(0, 2, 1)
candles.unique(dim=1).shape

torch.Size([2372835, 2])

In [16]:
sizes_mean = df.groupby('cluster').mean()[['top_wick', 'body', 'bottom_wick']]
sizes_median = df.groupby('cluster').median()[['top_wick', 'body', 'bottom_wick']]
directions = df.groupby('cluster').mean()['direction']

In [17]:
#convert sizes median dataframe to dict with quantile as key abd tuple of sizes as value
index_to_candles = sizes_median.to_dict('index')
index_to_candles = {k: tuple(v.values()) for k, v in index_to_candles.items()}
index_to_direction = directions.to_dict()

# merge sizes dict with directions dict in index_to_candles
for k, v in index_to_candles.items():
    index_to_candles[k] = {'sizes': v, 'direction': index_to_direction[k]}


In [18]:
def full_candle_restore(candle_index:(np.array,torch.Tensor), start_price:float, number_of_candles = None, index_map = index_to_candles) -> pd.DataFrame:
    '''
    Restore full candle from index of candle and start price
    '''
    if isinstance(candle_index, torch.Tensor):
        candle_index = candle_index.numpy()
    if number_of_candles is not None:
        candle_index = candle_index[:number_of_candles]
    
    
    candles = []
    for idx, cdl_idx in enumerate(candle_index):
        top_wick, body, bottom_wick = index_map[cdl_idx]['sizes']
        direction = index_map[cdl_idx]['direction']

        candle = {}
        if idx == 0:
            candle['open'] = start_price
        else:
            candle['open'] = candles[-1]['close']
        close = candle['open'] + body * tick_size * direction
        high = close + top_wick * tick_size if close > candle['open'] else candle['open'] + top_wick * tick_size
        low = candle['open'] - bottom_wick * tick_size if close > candle['open'] else close - bottom_wick * tick_size
        candle['high'] = high
        candle['low'] = low
        candle['close'] = close
        candles.append(candle)
    
    return pd.DataFrame(candles)

restored_df = full_candle_restore(df['cluster'].values, df['open'].values[0],number_of_candles=1000)

In [19]:
#plot original and restored candles using plotly
import plotly.graph_objects as go
import plotly.express as px

original_candles = df[['open', 'high','low','close']].iloc[:restored_df.shape[0]]

fig = go.Figure(data=[go.Candlestick(x=original_candles.index,
                open=original_candles['open'],
                high=original_candles['high'],
                low=original_candles['low'],
                close=original_candles['close'])])


fig.update_layout(xaxis_rangeslider_visible=False)
fig.show()

fig = go.Figure(data=[go.Candlestick(x=restored_df.index,
                open=restored_df['open'],
                high=restored_df['high'],
                low=restored_df['low'],
                close=restored_df['close'])])
fig.update_layout(xaxis_rangeslider_visible=False)
# add text lable witn cluster number above each candle
fig.add_trace(go.Scatter(x=restored_df.index, y=restored_df['high'] + 0.5, text=df['cluster'].values[:restored_df.shape[0]], mode="text"))

fig.show()




In [20]:
#contstruct the tensor with  sequence of candles.
seq_columns = ['cluster']

seq_tensor = torch.tensor(df[seq_columns].values).reshape(-1)
# seq_tensor = seq_tensor[:seq_tensor.shape[0] //5*5] #cut the tail to make it dividable by 5

# # add 5 min divider token to the sequence

# mark_5min = torch.zeros(seq_tensor.shape[0],1, dtype=torch.int8)
# mark_5min = seq_tensor.max() + 1
# seq_tensor_5min = seq_tensor.view(-1,5)
# full_seq_tensor = torch.empty(seq_tensor_5min.shape[0], seq_tensor_5min.shape[1] + 1, dtype=torch.long)
# full_seq_tensor[:,:-1] = seq_tensor_5min
# full_seq_tensor[:,-1] = mark_5min
# seq_tensor = full_seq_tensor.reshape(-1)

In [21]:
seq_tensor = seq_tensor[~torch.isnan(seq_tensor)].to(torch.int64)


In [38]:

# save sequense tensor to hdf5
import h5py
with h5py.File('data/nq17-23_1min_candle_seq_50.hdf5', 'w') as f:
       dataset = f.create_dataset('data', shape=seq_tensor.shape, dtype='i8')
       dataset[:] = seq_tensor[:]
       #save index_to_candle dictionary
       for key, value in index_to_candles.items():
              group = f.create_group(str(key))
              group.create_dataset('sizes', data=value['sizes'])
              group.create_dataset('direction', data=value['direction'])
              
       






In [59]:
struct_df = pd.read_feather('data/nq_struct_df.feather')

In [60]:
struct_df['move'] = struct_df['structure'] - struct_df['structure'].shift(1)
struct_df.fillna(0, inplace=True)

In [61]:
df = df.join(struct_df[['move','structure']])

In [62]:
df.bfill(inplace=True)

In [63]:
swing_len = 25
df.loc[df[(df['move'] >=swing_len) & (df['move'] != df['move'].shift(1))].index, 'trade'] = 1
df.loc[df[(df['move'] >=swing_len) & (df['move'] == df['move'].shift(1))].index, 'trade'] = 2
df.loc[df[(df['move'] >=swing_len) & (df['move'] != df['move'].shift(-1))].index, 'trade'] = 3

df.loc[df[(df['move'] <=-swing_len) & (df['move'] != df['move'].shift(1))].index, 'trade'] = 4
df.loc[df[(df['move'] <=-swing_len) & (df['move'] == df['move'].shift(1))].index, 'trade'] = 5
df.loc[df[(df['move'] <=-swing_len) & (df['move'] != df['move'].shift(-1))].index, 'trade'] = 6

df['trade'].fillna(0, inplace=True)


In [1]:
df

NameError: name 'df' is not defined