# Process data config

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
sys.path.append('..')

from os import environ
import pandas as pd
from indicators import indicators
from datetime import timedelta
from tqdm.auto import tqdm
from config.config import ConfigFactory

class CFG:
    ''' Notebook-wide switches and hyper-parameters. '''
    # --- pipeline stage switches ---
    # True: build the training dataset from the signal statistics and save it,
    # False: load the dataset saved by a previous run
    collect_data = False
    # NOTE(review): the three flags below are not read in the visible cells;
    # presumably they toggle feature selection and the NN / LightGBM training stages
    select_features = False
    train_NN = False
    train_LGBM = True

    # --- validation settings ---
    # NOTE(review): not read in the visible cells; presumably cross-validation parameters
    n_folds = 5
    n_repeats = 1

    # --- target definition ---
    # price-move ratio used when labelling a signal: the close price is multiplied
    # by this value for the upper threshold and divided by it for the lower one
    cls_target_ratio = 1.021

# Load data and add indicators

In [3]:
# Set environment variable
# "ENV" is written before the factory call below, which receives `environ`;
# presumably ConfigFactory picks the configuration profile from it (TODO confirm)
environ["ENV"] = "1h_4h"

# Get configs
# `configs` is passed to create_train_df(), which forwards it to every
# indicator constructor through add_indicators()
configs = ConfigFactory.factory(environ).configs

def get_file(ticker):
    ''' Load the 1h and 4h candle dataframes of a ticker.

    The pickles can be stored under several file name formats, which are tried
    in this order: "<ticker>", "<base>-<quote>" and "<base>-<quote>-SWAP", where
    <quote> is the last four characters of the ticker and <base> is everything
    before them.

    Returns a (df_1h, df_4h) tuple read from the first format for which both
    files exist, or (None, None) if no format matches.
    '''
    base, quote = ticker[:-4], ticker[-4:]
    candidate_stems = (ticker, f'{base}-{quote}', f'{base}-{quote}-SWAP')

    for stem in candidate_stems:
        try:
            candles_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{stem}_1h.pkl')
            candles_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{stem}_4h.pkl')
        except FileNotFoundError:
            # at least one timeframe is missing under this name, try the next format
            continue
        return candles_1h, candles_4h

    return None, None

def add_indicators(df, ttype, configs):
    ''' Run the candle dataframe through the RSI, STOCH, Trend, MACD and ATR indicators.

    Each indicator object is built from (ttype, configs) and applied in the order
    listed above; the dataframe returned by one indicator is the input of the next.
    The result of the last indicator is returned.
    '''
    # SMA is intentionally not in the list: it is disabled in this pipeline
    for indicator_name in ('RSI', 'STOCH', 'Trend', 'MACD', 'ATR'):
        indicator = getattr(indicators, indicator_name)(ttype, configs)
        # NOTE(review): the meaning of the ('', '', 0) arguments is defined by the
        # indicators module and is not visible here
        df = indicator.get_indicator(df, '', '', 0)
    return df

def create_train_df(df, ttype, configs, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data.

    For every signal in `df` the 1h candle found at the signal time is joined with
    the candles found `first`, `first + step`, ..., `last` hours earlier (their
    columns get a `_prev_<hours>` suffix). A binary `target` is then computed from
    the candles of the next `target_offset` hours.

    Parameters:
        df: signal statistics; the 'ticker', 'time' and 'pattern' columns are read
        ttype: trade type of the signals, 'buy' or 'sell'; it is inverted for
            signals whose pattern is 'STOCH_RSI'
        configs: configuration forwarded to the indicator constructors
        target_offset: number of hours after the signal scanned for the target
        first, last, step: hour offsets before the signal that are added as features

    Returns:
        DataFrame with one row per kept signal (the index is not reset), or an
        empty DataFrame if no signal was kept. Rows can contain NaN when a
        previous candle is missing, the caller is expected to drop them.

    Tickers without candle files are skipped with a warning.
    '''
    # rows are collected in a list and concatenated once at the end,
    # which avoids re-copying the whole dataset for every new signal
    collected_rows = []
    tickers = df['ticker'].unique()

    for ticker in tqdm(tickers):
        # get signals with current ticker
        signal_df = df[df['ticker'] == ticker]
        times = signal_df['time']

        # load candle history of this ticker (the 4h candles are not used here)
        tmp_df_1h, _ = get_file(ticker)
        if tmp_df_1h is None:
            # get_file() found no file in any known name format
            warnings.warn(f'No candle files found for ticker {ticker}, its signals are skipped')
            continue

        # add indicators
        tmp_df_1h = add_indicators(tmp_df_1h, ttype, configs)

        # add historical data for current ticker
        for signal_idx, t in enumerate(times.to_list()):
            pass_cycle = False
            pattern = signal_df.iloc[signal_idx, signal_df.columns.get_loc('pattern')]
            row = tmp_df_1h.loc[tmp_df_1h['time'] == t, :].reset_index(drop=True)

            for hours_back in range(first, last + step, step):
                time_prev = t + timedelta(hours=-hours_back)
                try:
                    row_tmp = tmp_df_1h.loc[tmp_df_1h['time'] == time_prev, :].reset_index(drop=True)
                    row_tmp.columns = [c + f'_prev_{hours_back}' for c in row_tmp.columns]
                except IndexError:
                    # NOTE(review): a missing previous candle gives an empty selection
                    # (NaN after the concat below) rather than an IndexError,
                    # so this branch is probably never taken
                    pass_cycle = True
                    break
                # the first column of the previous candle is left out
                # (presumably its time column - TODO confirm)
                row = pd.concat([row, row_tmp.iloc[:, 1:]], axis=1)
                # kept inside the loop so the position of these columns does not change
                row['ticker'] = ticker
                row['pattern'] = pattern

            if pass_cycle:
                continue

            row['target'] = 0

            # signals of the STOCH_RSI pattern are traded in the opposite direction
            if row['pattern'].values == 'STOCH_RSI':
                if ttype == 'buy':
                    row['ttype'] = 'sell'
                else:
                    row['ttype'] = 'buy'
            else:
                row['ttype'] = ttype

            # If ttype = buy and during the selected period high price was higher than close_price * target_ratio
            # and earlier low price wasn't lower than close_price / target_ratio, then target is True, else target is False.
            # Similarly for ttype = sell
            close_price = tmp_df_1h.loc[tmp_df_1h['time'] == t, 'close'].values

            for hours_ahead in range(1, target_offset + 1):
                time_next = t + timedelta(hours=hours_ahead)
                target_buy = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, 'high'].reset_index(drop=True)
                target_sell = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, 'low'].reset_index(drop=True)

                try:
                    target_buy = target_buy > close_price * CFG.cls_target_ratio
                    target_sell = target_sell < close_price / CFG.cls_target_ratio
                except ValueError:
                    # presumably raised when the selections differ in length,
                    # e.g. the next candle or the signal candle is missing; the signal is skipped
                    pass_cycle = True
                    break

                try:
                    # the adverse move is checked first: if it happens, target stays 0
                    if (row['ttype'].values == 'buy' and target_sell[0]) or (row['ttype'].values == 'sell' and target_buy[0]):
                        break
                    elif (row['ttype'].values == 'buy' and target_buy[0]) or (row['ttype'].values == 'sell' and target_sell[0]):
                        row['target'] = 1
                        break
                except KeyError:
                    # the comparison result is empty, there is no candle to check
                    pass_cycle = True
                    break

            if pass_cycle:
                continue

            # add data to the dataset
            collected_rows.append(row)

    if not collected_rows:
        return pd.DataFrame()
    return pd.concat(collected_rows)

# prediction horizon: number of hours after the signal that are scanned for the target
target_offset = 24
# window of previous data points collected for model training, in hours before the signal:
# points are taken from `first` to `last` inclusive, every `step` hours
# (total number of points to collect is (last - first + step) / step)
first = 4
last = 72
step = 4

if CFG.collect_data is True:
    # Buy: signal statistics -> train rows, rows with missing values are dropped
    df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
    train_buy = create_train_df(df, 'buy', configs, target_offset, first, last, step).dropna()

    # Sell: the same for the sell signals
    df = pd.read_pickle('signal_stat/sell_stat_1h.pkl')
    train_sell = create_train_df(df, 'sell', configs, target_offset, first, last, step).dropna()

    # join both parts, order them by time and save the result for the next runs
    df = pd.concat([train_buy, train_sell])
    df = df.sort_values('time')
    df = df.reset_index(drop=True)
    df.to_pickle(f'signal_stat/train_df_{last}.pkl')
else:
    # reuse the dataset saved by a previous run
    df = pd.read_pickle(f'signal_stat/train_df_{last}.pkl')

display(df.head())
display(df.shape)


Unnamed: 0,time,open,high,low,close,volume,rsi,stoch_slowk,stoch_slowd,stoch_slowk_dir,...,linear_reg_angle_prev_72,macd_prev_72,macdsignal_prev_72,macdhist_prev_72,macd_dir_prev_72,macdsignal_dir_prev_72,atr_prev_72,close_smooth_prev_72,target,ttype
0,2022-09-10 21:00:00,0.9999,0.9999,0.9998,0.9998,227818.0,58.3802,35.714286,34.52381,0.205808,...,-0.856226,-2e-05,-1e-05,-9e-06,0.318808,0.227337,0.000114,1.000004,0,buy
1,2022-09-15 15:00:00,1.0,1.0,0.9999,1.0,250135.0,58.463239,64.285714,53.174603,0.357436,...,5.250146,1.6e-05,1.1e-05,6e-06,0.229362,0.110533,0.000113,0.999742,0,buy
2,2022-09-21 19:00:00,1.0,1.0,0.9999,0.9999,49801.0,45.480088,57.142857,61.904762,-0.066667,...,5.078633,6e-06,-4e-06,1e-05,-1.898082,-0.192558,0.000113,0.999988,0,buy
3,2022-12-25 15:00:00,16.26,16.28,16.22,16.26,8091.48,34.890095,26.002932,23.959198,0.064161,...,10.131495,0.101978,0.165148,-0.06317,-0.103585,-0.067033,0.161075,16.47375,1,sell
4,2022-12-25 15:00:00,6.89,6.91,6.89,6.9,3986.13,30.533005,14.404762,18.392857,-0.204445,...,11.812509,0.028818,-0.003283,0.032101,-0.946146,-0.453481,0.088674,6.850417,0,sell


(12043, 385)

# Remove stablecoins

In [5]:
# TUSDUSDT is the only ticker filtered out here; the rows are renumbered afterwards
df = df.loc[df['ticker'].ne('TUSDUSDT')]
df = df.reset_index(drop=True)
display(df.head())
display(df.shape)
# overwrite the stored dataset with the filtered one
df.to_pickle(f'signal_stat/train_df_{last}.pkl')

Unnamed: 0,time,open,high,low,close,volume,rsi,stoch_slowk,stoch_slowd,stoch_slowk_dir,...,linear_reg_angle_prev_72,macd_prev_72,macdsignal_prev_72,macdhist_prev_72,macd_dir_prev_72,macdsignal_dir_prev_72,atr_prev_72,close_smooth_prev_72,target,ttype
0,2022-12-25 15:00:00,16.26,16.28,16.22,16.26,8091.48,34.890095,26.002932,23.959198,0.064161,...,10.131495,0.101978,0.165148,-0.06317,-0.103585,-0.067033,0.161075,16.47375,1,sell
1,2022-12-25 15:00:00,6.89,6.91,6.89,6.9,3986.13,30.533005,14.404762,18.392857,-0.204445,...,11.812509,0.028818,-0.003283,0.032101,-0.946146,-0.453481,0.088674,6.850417,0,sell
2,2022-12-25 15:00:00,0.3471,0.3478,0.3462,0.3478,3774232.0,35.11765,17.786212,17.135809,-0.04677,...,4.27932,0.000238,-0.00028,0.000518,0.0,-0.227125,0.002465,0.344346,0,sell
3,2022-12-25 15:00:00,0.397,0.398,0.393,0.396,304916.8,36.897427,42.857143,43.650794,-0.069109,...,0.303562,-0.000151,-0.000647,0.000496,-0.292488,-0.109834,0.003108,0.398667,0,sell
4,2022-12-25 15:00:00,0.01755,0.01755,0.0175,0.01754,4566524.6,21.09409,8.823657,9.251647,-0.033617,...,2.053866,3.9e-05,2e-05,1.9e-05,0.0,0.372109,0.000129,0.017925,1,sell


(11988, 385)

# Check target correctness

In [None]:
# i = 1000

# x = train_df.loc[(train_df.target == 1) & (train_df.ttype == 'buy'), ['ticker', 'ttype', 'pattern', 'time', 'close', 'target']]
# y = x.iloc[i]
# low_price, high_price = y['close'] / CFG.cls_target_ratio, y['close'] * CFG.cls_target_ratio,
# print(y['ticker'], y['time'], y['ttype'])

# tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"]}_1h.pkl')
# # tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"][:-4]}-{y["ticker"][-4:]}_1h.pkl')
# # tmp_df_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{y["ticker"][:-4]}-{y["ticker"][-4:]}-SWAP_4h.pkl')

# tmp_df_1h['low_price'] = low_price
# tmp_df_1h['high_price'] = high_price
# idx = tmp_df_1h[tmp_df_1h['time'] == y['time']].index[0]

# tmp_df_1h = tmp_df_1h.iloc[idx:idx+24][['time', 'close', 'high', 'high_price', 'low', 'low_price']]

# if y['ttype'] == 'buy':
#     tmp_df_1h['signal'] = tmp_df_1h['high'] > tmp_df_1h['high_price']
#     tmp_df_1h['anti_signal'] = tmp_df_1h['low'] < tmp_df_1h['low_price']
# else:
#     tmp_df_1h['signal'] = tmp_df_1h['low'] < tmp_df_1h['low_price']
#     tmp_df_1h['anti_signal'] = tmp_df_1h['high'] > tmp_df_1h['high_price']

# tmp_df_1h