# Process data config

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
sys.path.append('..')

from os import environ
import pandas as pd
from indicators import indicators
from datetime import timedelta
from tqdm.auto import tqdm
from config.config import ConfigFactory

class CFG:
    cls_target_ratio = 1.021

# Load data and add indicators

In [4]:
# Set environment variable
environ["ENV"] = "1h_4h"

# Get configs
configs = ConfigFactory.factory(environ).configs

def get_file(ticker):
    ''' Find files buy ticker names, file names can be in different formats '''
    try:
        tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker}_1h.pkl')
        tmp_df_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker}_4h.pkl')
    except FileNotFoundError:
        pass
    else:
        return tmp_df_1h, tmp_df_4h
    
    try:
        tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}_1h.pkl')
        tmp_df_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}_4h.pkl')
    except FileNotFoundError:
        pass
    else:
        return tmp_df_1h, tmp_df_4h
    
    try:
        tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}-SWAP_1h.pkl')
        tmp_df_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{ticker[:-4]}-{ticker[-4:]}-SWAP_4h.pkl')
    except FileNotFoundError:
        pass
    else:
        return tmp_df_1h, tmp_df_4h
    
    return None, None

def add_indicators(df, ttype, configs):
    # add RSI
    rsi = indicators.RSI(ttype, configs)
    df = rsi.get_indicator(df, '', '', 0)
    # add RSI
    stoch = indicators.STOCH(ttype, configs)
    df = stoch.get_indicator(df, '', '', 0)
    # add Trend
    trend = indicators.Trend(ttype, configs)
    df = trend.get_indicator(df, '', '', 0)
    # add MACD
    macd = indicators.MACD(ttype, configs)
    df = macd.get_indicator(df, '', '', 0)
    # add ATR
    atr = indicators.ATR(ttype, configs)
    df = atr.get_indicator(df, '', '', 0)
    # add SMA
    # sma = indicators.SMA(ttype, configs)
    # df = sma.get_indicator(df, '', '', 0)
    return df

def create_train_df(df, ttype, configs, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data'''
    train_df = pd.DataFrame()
    tickers = df['ticker'].unique()
    
    for ticker in tqdm(tickers):
        # get signals with current ticker
        signal_df = df[df['ticker'] == ticker]
        times = signal_df['time']
        
        # load candle history of this ticker
        tmp_df_1h, tmp_df_4h = get_file(ticker)

        # add indicators 
        try:
            tmp_df_1h = add_indicators(tmp_df_1h, ttype, configs)
        except TypeError:
            continue

        # add historical data for current ticker
        for i, t in enumerate(times.to_list()):
            pass_cycle = False
            pattern = signal_df.iloc[i, signal_df.columns.get_loc('pattern')]
            row = tmp_df_1h.loc[tmp_df_1h['time'] == t, :].reset_index(drop=True)
            
            for i in range(first, last + step, step):
                time_prev = t + timedelta(hours= -i)
                try:
                    row_tmp = tmp_df_1h.loc[tmp_df_1h['time'] == time_prev, :].reset_index(drop=True)
                    row_tmp.columns = [c + f'_prev_{i}' for c in row_tmp.columns]
                except IndexError:
                    pass_cycle = True
                    break
                row = pd.concat([row, row_tmp.iloc[:,1:]], axis=1)
                row['ticker'] = ticker
                row['pattern'] = pattern
                
            if pass_cycle:
                continue

            row['target'] = 0
            
            if row['pattern'].values == 'STOCH_RSI':
                if ttype == 'buy':
                    row['ttype'] = 'sell'
                else:
                    row['ttype'] = 'buy'
            else:
                row['ttype'] = ttype
            
            # If ttype = buy and during the selected period high price was higher than close_price * target_ratio
            # and earlier low price wasn't lower than close_price / target_ratio, than target is True, else target is False.
            # Similarly for ttype = sell 
            close_price = tmp_df_1h.loc[tmp_df_1h['time'] == t, 'close'].values
            
            for i in range(1, target_offset + 1):
                time_next = t + timedelta(hours=i)
                target_buy = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, 'high'].reset_index(drop=True)
                target_sell = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, 'low'].reset_index(drop=True)

                try:
                    target_buy = target_buy > close_price * CFG.cls_target_ratio
                    target_sell = target_sell < close_price / CFG.cls_target_ratio
                except ValueError:
                    pass_cycle = True
                    break
                
                try:
                    if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
                        break
                    elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
                        row['target'] = 1
                        break
                except KeyError:
                    pass_cycle = True
                    break
            
            if pass_cycle:
                continue

            # add data to the dataset
            if train_df.shape[0] == 0:
                train_df = row
            else:
                train_df = pd.concat([train_df, row])
    
    return train_df

# for how long time (in hours) we want to predict
target_offset = 48
# first previous data point to collect for model training (value represents number of hours before signal point)
first = 4
# last previous data point to collect for model training (value represents number of hours before signal point)
last = 96
# step of previous data points collecting (total number of points to collect is (last - first + step) / step)
step = 4

# Buy
# dataset with the signal statistics
df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
# dataset for model train
train_buy = create_train_df(df, 'buy', configs, target_offset, first, last, step)
train_buy = train_buy.dropna()

# Sell
# dataset with the signal statistics
df = pd.read_pickle('signal_stat/sell_stat_1h.pkl')
# dataset for model train
train_sell = create_train_df(df, 'sell', configs, target_offset, first, last, step)
train_sell = train_sell.dropna()

df = pd.concat([train_buy, train_sell]).sort_values('time').reset_index(drop=True)
df.to_pickle(f'signal_stat/train_df_{last}.pkl')

display(df.head())
display(df.shape)


  0%|          | 0/374 [00:00<?, ?it/s]

  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or 

  0%|          | 0/368 [00:00<?, ?it/s]

  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
  if row['pattern'].values == 'STOCH_RSI':
  if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
  elif (row['ttype'].values == 'buy' and target_buy[0]) or 

Unnamed: 0,time,open,high,low,close,volume,rsi,stoch_slowk,stoch_slowd,stoch_slowk_dir,...,linear_reg_angle_prev_96,macd_prev_96,macdsignal_prev_96,macdhist_prev_96,macd_dir_prev_96,macdsignal_dir_prev_96,atr_prev_96,close_smooth_prev_96,target,ttype
0,2022-10-13 18:00:00,22.39,23.02,22.37,22.93,22371.49,47.677698,34.732108,30.172518,0.106554,...,3.772075,-0.04147,-0.066519,0.025049,-0.105369,-0.059383,0.113042,24.149167,0,sell
1,2022-10-14 04:00:00,23.47,24.07,23.47,23.98,20004.17,72.477672,90.316467,91.151018,-0.009465,...,1.723219,-0.016099,-0.031895,0.015796,-0.194682,-0.064296,0.112924,24.142083,1,sell
2,2022-10-18 23:00:00,23.46,23.7,23.44,23.62,3346.24,43.727673,25.327885,24.045347,-0.007431,...,-6.562901,0.035983,0.107527,-0.071544,-0.288045,-0.100355,0.188287,23.758333,1,sell
3,2022-10-26 01:00:00,24.15,24.15,23.76,23.84,13772.65,64.054892,83.517327,87.090858,-0.029809,...,-10.369665,-0.020052,-0.052992,0.03294,-0.201703,-0.103966,0.133742,22.383333,1,buy
4,2022-10-26 05:00:00,24.04,24.1,23.98,24.03,8515.96,68.293118,54.118435,61.22084,-0.108235,...,-11.869113,-0.017024,-0.033512,0.016488,0.0,-0.106127,0.121932,22.38125,1,buy


(25093, 505)

# Check pattern/target distribution

In [5]:
train_sell[['target', 'pattern']].value_counts()

target  pattern       
1       MACD              1985
        PumpDump_Trend    1928
0       PumpDump_Trend    1902
        MACD              1250
1       Pattern_Trend      961
0       Pattern_Trend      946
        STOCH_RSI          580
1       STOCH_RSI          543
dtype: int64

In [6]:
train_buy[['target', 'pattern']].value_counts()

target  pattern       
0       PumpDump_Trend    4533
1       PumpDump_Trend    4440
0       Pattern_Trend     1681
1       Pattern_Trend     1548
        MACD              1003
0       MACD               643
1       STOCH_RSI          606
0       STOCH_RSI          544
dtype: int64

# Check target correctness

In [None]:
# i = 1000

# x = train_df.loc[(train_df.target == 1) & (train_df.ttype == 'buy'), ['ticker', 'ttype', 'pattern', 'time', 'close', 'target']]
# y = x.iloc[i]
# low_price, high_price = y['close'] / CFG.cls_target_ratio, y['close'] * CFG.cls_target_ratio,
# print(y['ticker'], y['time'], y['ttype'])

# tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"]}_1h.pkl')
# # tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"][:-4]}-{y["ticker"][-4:]}_1h.pkl')
# # tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"][:-4]}-{y["ticker"][-4:]}-SWAP_4h.pkl')

# tmp_df_1h['low_price'] = low_price
# tmp_df_1h['high_price'] = high_price
# idx = tmp_df_1h[tmp_df_1h['time'] == y['time']].index[0]

# tmp_df_1h = tmp_df_1h.iloc[idx:idx+24][['time', 'close', 'high', 'high_price', 'low', 'low_price']]

# if y['ttype'] == 'buy':
#     tmp_df_1h['signal'] = tmp_df_1h['high'] > tmp_df_1h['high_price']
#     tmp_df_1h['anti_signal'] = tmp_df_1h['low'] < tmp_df_1h['low_price']
# else:
#     tmp_df_1h['signal'] = tmp_df_1h['low'] < tmp_df_1h['low_price']
#     tmp_df_1h['anti_signal'] = tmp_df_1h['high'] > tmp_df_1h['high_price']

# tmp_df_1h