# Process data config

In [7]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
sys.path.append('..')

from os import environ
import pandas as pd
from indicators import indicators
from datetime import timedelta
from tqdm.auto import tqdm
from config.config import ConfigFactory

class CFG:
    cls_target_ratio = 1.021

# Load data and add indicators

In [8]:
# Set environment variable
environ["ENV"] = "1h_4h"

# Get configs
configs = ConfigFactory.factory(environ).configs

def get_file(ticker):
    ''' Find files buy ticker names, file names can be in different formats '''
    try:
        tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker}_1h.pkl')
        tmp_df_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker}_4h.pkl')
    except FileNotFoundError:
        pass
    else:
        return tmp_df_1h, tmp_df_4h
    
    try:
        tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}_1h.pkl')
        tmp_df_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}_4h.pkl')
    except FileNotFoundError:
        pass
    else:
        return tmp_df_1h, tmp_df_4h
    
    try:
        tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}-SWAP_1h.pkl')
        tmp_df_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}-SWAP_4h.pkl')
    except FileNotFoundError:
        pass
    else:
        return tmp_df_1h, tmp_df_4h
    
    return None, None

def add_indicators(df, ttype, configs):
    # add RSI
    rsi = indicators.RSI(ttype, configs)
    df = rsi.get_indicator(df, '', '', 0)
    # add RSI
    stoch = indicators.STOCH(ttype, configs)
    df = stoch.get_indicator(df, '', '', 0)
    # add Trend
    trend = indicators.Trend(ttype, configs)
    df = trend.get_indicator(df, '', '', 0)
    # add MACD
    macd = indicators.MACD(ttype, configs)
    df = macd.get_indicator(df, '', '', 0)
    # add ATR
    atr = indicators.ATR(ttype, configs)
    df = atr.get_indicator(df, '', '', 0)
    # add SMA
    # sma = indicators.SMA(ttype, configs)
    # df = sma.get_indicator(df, '', '', 0)
    return df

def create_train_df(df, ttype, configs, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data'''
    train_df = pd.DataFrame()
    tickers = df['ticker'].unique()
    
    for ticker in tqdm(tickers):
        # get signals with current ticker
        signal_df = df[df['ticker'] == ticker]
        times = signal_df['time']
        
        # load candle history of this ticker
        tmp_df_1h, tmp_df_4h = get_file(ticker)

        # add indicators 
        try:
            tmp_df_1h = add_indicators(tmp_df_1h, ttype, configs)
        except TypeError:
            continue

        # add historical data for current ticker
        for i, t in enumerate(times.to_list()):
            pass_cycle = False
            pattern = signal_df.iloc[i, signal_df.columns.get_loc('pattern')]
            row = tmp_df_1h.loc[tmp_df_1h['time'] == t, :].reset_index(drop=True)
            
            for i in range(first, last + step, step):
                time_prev = t + timedelta(hours= -i)
                try:
                    row_tmp = tmp_df_1h.loc[tmp_df_1h['time'] == time_prev, :].reset_index(drop=True)
                    row_tmp.columns = [c + f'_prev_{i}' for c in row_tmp.columns]
                except IndexError:
                    pass_cycle = True
                    break
                row = pd.concat([row, row_tmp.iloc[:,1:]], axis=1)
                row['ticker'] = ticker
                row['pattern'] = pattern
                
            if pass_cycle:
                continue

            row['target'] = 0
            
            if row['pattern'].values == 'STOCH_RSI':
                if ttype == 'buy':
                    row['ttype'] = 'sell'
                else:
                    row['ttype'] = 'buy'
            else:
                row['ttype'] = ttype
            
            # If ttype = buy and during the selected period high price was higher than close_price * target_ratio
            # and earlier low price wasn't lower than close_price / target_ratio, than target is True, else target is False.
            # Similarly for ttype = sell 
            close_price = tmp_df_1h.loc[tmp_df_1h['time'] == t, 'close'].values
            
            for i in range(1, target_offset + 1):
                time_next = t + timedelta(hours=i)
                target_buy = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, 'high'].reset_index(drop=True)
                target_sell = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, 'low'].reset_index(drop=True)

                try:
                    target_buy = target_buy > close_price * CFG.cls_target_ratio
                    target_sell = target_sell < close_price / CFG.cls_target_ratio
                except ValueError:
                    pass_cycle = True
                    break
                
                try:
                    if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
                        break
                    elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
                        row['target'] = 1
                        break
                except KeyError:
                    pass_cycle = True
                    break
            
            if pass_cycle:
                continue

            # add data to the dataset
            if train_df.shape[0] == 0:
                train_df = row
            else:
                train_df = pd.concat([train_df, row])
    
    return train_df

# for how long time (in hours) we want to predict
target_offset = 48
# first previous data point to collect for model training (value represents number of hours before signal point)
first = 4
# last previous data point to collect for model training (value represents number of hours before signal point)
last = 96
# step of previous data points collecting (total number of points to collect is (last - first + step) / step)
step = 4

# Buy
# dataset with the signal statistics
df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
# dataset for model train
train_buy = create_train_df(df, 'buy', configs, target_offset, first, last, step)
train_buy = train_buy.dropna()

# Sell
# dataset with the signal statistics
df = pd.read_pickle('signal_stat/sell_stat_1h.pkl')
# dataset for model train
train_sell = create_train_df(df, 'sell', configs, target_offset, first, last, step)
train_sell = train_sell.dropna()

train_buy = pd.concat([train_buy, train_sell[train_sell['ttype'] == 'buy']]).sort_values('time').reset_index(drop=True)
train_sell = pd.concat([train_sell, train_buy[train_buy['ttype'] == 'sell']]).sort_values('time').reset_index(drop=True)

train_buy.to_pickle(f'signal_stat/train_buy_{last}.pkl')
train_sell.to_pickle(f'signal_stat/train_sell_{last}.pkl')

display(df.head())
display(df.shape)


  0%|          | 0/362 [00:00<?, ?it/s]

  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or 

  0%|          | 0/355 [00:00<?, ?it/s]

  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or 

Unnamed: 0,time,ticker,timeframe,pattern,signal_price,signal_smooth_price,pct_price_diff_1,mfe_1,mae_1,pct_price_diff_2,...,mae_21,pct_price_diff_22,mfe_22,mae_22,pct_price_diff_23,mfe_23,mae_23,pct_price_diff_24,mfe_24,mae_24
0,2023-02-22 19:00:00,BTCUSDT,1h,Pattern_Trend,23634.98,24192.575,-0.156909,0.219405,0.713884,-0.32044,...,4.566143,-0.426914,0.219405,4.566143,-0.38571,0.219405,4.566143,-0.363563,0.219405,4.566143
1,2023-08-06 00:00:00,BTCUSDT,1h,Pattern_Trend,29073.45,29066.232917,0.004727,0.132513,0.288661,-0.001273,...,1.04531,0.010945,1.24385,1.04531,0.022703,1.24385,1.472278,0.024584,1.24385,2.007361
2,2023-02-09 07:00:00,ETHUSDT,1h,Pattern_Trend,1623.85,1657.19125,-0.119395,0.119872,0.525591,-0.220385,...,1.950987,-3.27399,7.641813,1.950987,-3.477739,7.641813,1.950987,-3.708031,7.641813,1.950987
3,2023-02-13 13:00:00,ETHUSDT,1h,Pattern_Trend,1482.54,1520.777917,-0.130064,0.097162,0.551435,-0.235273,...,2.377053,-1.995866,1.756582,2.377053,-1.911682,1.756582,3.177359,-1.836074,1.756582,3.177359
4,2023-02-22 19:00:00,ETHUSDT,1h,Pattern_Trend,1605.81,1646.4475,-0.168626,0.120118,0.497249,-0.341644,...,4.911422,-0.136769,0.120118,4.911422,-0.029618,0.120118,4.911422,0.075257,0.120118,4.911422


(6807, 78)

# Check pattern/target distribution

In [9]:
train_sell[['target', 'pattern']].value_counts()

target  pattern      
1       MACD             1987
0       MACD             1264
1       STOCH_RSI        1167
0       STOCH_RSI        1124
        Pattern_Trend     957
1       Pattern_Trend     952
dtype: int64

In [10]:
train_buy[['target', 'pattern']].value_counts()

target  pattern      
0       Pattern_Trend    1714
1       Pattern_Trend    1602
        STOCH_RSI        1167
0       STOCH_RSI        1124
1       MACD             1000
0       MACD              650
dtype: int64

# Check target correctness

In [11]:
# i = 1000

# x = train_df.loc[(train_df.target == 1) & (train_df.ttype == 'buy'), ['ticker', 'ttype', 'pattern', 'time', 'close', 'target']]
# y = x.iloc[i]
# low_price, high_price = y['close'] / CFG.cls_target_ratio, y['close'] * CFG.cls_target_ratio,
# print(y['ticker'], y['time'], y['ttype'])

# tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"]}_1h.pkl')
# # tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"][:-4]}-{y["ticker"][-4:]}_1h.pkl')
# # tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"][:-4]}-{y["ticker"][-4:]}-SWAP_4h.pkl')

# tmp_df_1h['low_price'] = low_price
# tmp_df_1h['high_price'] = high_price
# idx = tmp_df_1h[tmp_df_1h['time'] == y['time']].index[0]

# tmp_df_1h = tmp_df_1h.iloc[idx:idx+24][['time', 'close', 'high', 'high_price', 'low', 'low_price']]

# if y['ttype'] == 'buy':
#     tmp_df_1h['signal'] = tmp_df_1h['high'] > tmp_df_1h['high_price']
#     tmp_df_1h['anti_signal'] = tmp_df_1h['low'] < tmp_df_1h['low_price']
# else:
#     tmp_df_1h['signal'] = tmp_df_1h['low'] < tmp_df_1h['low_price']
#     tmp_df_1h['anti_signal'] = tmp_df_1h['high'] > tmp_df_1h['high_price']

# tmp_df_1h