# Load data and add indicators

In [4]:
import sys
sys.path.append('..')

from os import environ
import numpy as np
import pandas as pd
from indicators import indicators
from datetime import timedelta
from tqdm.auto import tqdm
from config.config import ConfigFactory

# Set the ENV variable before building the configs: `environ` itself is what
# gets handed to the factory below.
# NOTE(review): "1h_4h" presumably selects the 1h/4h timeframe profile — confirm
# against ConfigFactory.
environ["ENV"] = "1h_4h"

# Get configs (later passed to every indicator constructor in add_indicators)
configs = ConfigFactory.factory(environ).configs

def get_file(ticker, data_dir='../optimizer/ticker_dataframes'):
    ''' Find candle files by ticker name; file names can be in different formats.

    The following base names are tried in order, and the first one for which
    BOTH the `_1h.pkl` and `_4h.pkl` files exist wins:

        1. `<ticker>`                      e.g. GMXUSDT
        2. `<head>-<tail>`                 e.g. GMX-USDT
        3. `<head>-<tail>-SWAP`            e.g. GMX-USDT-SWAP

    where `tail` is the last four characters of `ticker` and `head` is the rest.

    Parameters
    ----------
    ticker : str
        Ticker name as it appears in the signal statistics.
    data_dir : str
        Directory that holds the pickled candle dataframes.

    Returns
    -------
    (df_1h, df_4h) on success, (None, None) when no naming scheme matches.

    NOTE: `pd.read_pickle` can execute arbitrary code while loading; only point
    `data_dir` at files produced by this project.
    '''
    head, tail = ticker[:-4], ticker[-4:]
    candidates = (ticker, f'{head}-{tail}', f'{head}-{tail}-SWAP')

    for name in candidates:
        try:
            tmp_df_1h = pd.read_pickle(f'{data_dir}/{name}_1h.pkl')
            tmp_df_4h = pd.read_pickle(f'{data_dir}/{name}_4h.pkl')
        except FileNotFoundError:
            # at least one timeframe is missing under this name: try the next format
            continue
        return tmp_df_1h, tmp_df_4h

    return None, None

def add_indicators(df, ttype, configs):
    ''' Apply the RSI, STOCH, Trend, MACD and ATR indicators to `df`, in that order.

    Each indicator class is looked up on the `indicators` module, built with
    (ttype, configs) and asked to extend the frame through `get_indicator`;
    the frame returned by the last indicator is the result.
    '''
    for indicator_name in ('RSI', 'STOCH', 'Trend', 'MACD', 'ATR'):
        indicator = getattr(indicators, indicator_name)(ttype, configs)
        df = indicator.get_indicator(df, '', '', 0)
    # indicators.SMA is currently not applied (its call is disabled)
    return df

def create_train_df(df, ttype, configs, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data.

    For every signal in `df` (columns used: 'ticker', 'time', 'pattern') one
    sample is built from the 1h candle history of its ticker:

      * the candle (with indicators) at the signal time,
      * the candles `first`, `first + step`, ... hours before the signal, never
        going further back than `last` hours; their columns get the suffix
        `_prev_<hours>`,
      * the 'ticker' and 'pattern' of the signal,
      * 'target': the 'high' (ttype == 'buy') or 'low' (any other ttype) of the
        candle `target_offset` hours after the signal.

    Signals are skipped when the ticker has no candle files or when any of the
    candles listed above is missing from the history.

    Returns a DataFrame with one row per usable signal (empty if there is none).
    '''
    samples = []
    target_col = 'high' if ttype == 'buy' else 'low'

    for ticker in tqdm(df['ticker'].unique()):
        # get signals with current ticker
        signal_df = df[df['ticker'] == ticker]

        # load candle history of this ticker (only the 1h frame is used here)
        tmp_df_1h, _ = get_file(ticker)
        if tmp_df_1h is None:
            # no candle files on disk for this ticker: nothing to build from
            continue

        # add indicators
        tmp_df_1h = add_indicators(tmp_df_1h, ttype, configs)

        signals = zip(signal_df['time'].to_list(), signal_df['pattern'].to_list())
        for t, pattern in signals:
            row = tmp_df_1h.loc[tmp_df_1h['time'] == t, :].reset_index(drop=True)
            if row.empty:
                continue

            # add historical data for the current signal
            has_full_history = True
            # `last + 1` makes `last` inclusive without overshooting it when
            # (last - first) is not a multiple of `step`
            for hours_back in range(first, last + 1, step):
                time_prev = t - timedelta(hours=hours_back)
                row_tmp = tmp_df_1h.loc[tmp_df_1h['time'] == time_prev, :].reset_index(drop=True)
                if row_tmp.empty:
                    # a boolean mask never raises on a miss, it just selects nothing
                    has_full_history = False
                    break
                row_tmp.columns = [f'{c}_prev_{hours_back}' for c in row_tmp.columns]
                # the first column of the lagged candle is dropped
                row = pd.concat([row, row_tmp.iloc[:, 1:]], axis=1)
                # assigned here (idempotent after the first pass) so that both
                # columns keep their place right after the first lag block
                row['ticker'] = ticker
                row['pattern'] = pattern

            if not has_full_history:
                continue

            # add target
            time_next = t + timedelta(hours=target_offset)
            target = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, target_col].reset_index(drop=True)
            if target.empty:
                continue
            target.name = 'target'

            samples.append(pd.concat([row, target], axis=1))

    if not samples:
        return pd.DataFrame()
    # a single concat instead of growing the frame sample by sample
    return pd.concat(samples)

# prediction horizon: the target is taken this many hours after the signal
target_offset = 24
# lagged data points are collected every `step` hours, starting `first` hours
# before the signal and going back to `last` hours before it
first, last, step = 4, 48, 4

# Buy: signal statistics -> model training samples, incomplete rows dropped
df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
train_buy = create_train_df(df, 'buy', configs, target_offset, first, last, step).dropna()

# Sell: same procedure on the sell signal statistics
df = pd.read_pickle('signal_stat/sell_stat_1h.pkl')
train_sell = create_train_df(df, 'sell', configs, target_offset, first, last, step).dropna()

# both sides in one frame, ordered by signal time
train_df = pd.concat([train_buy, train_sell])
train_df = train_df.sort_values('time').reset_index(drop=True)
display(train_df.head())
display(train_df.shape)


100%|██████████| 370/370 [01:07<00:00,  5.45it/s]
100%|██████████| 308/308 [00:35<00:00,  8.58it/s]


Unnamed: 0,time,open,high,low,close,volume,rsi,stoch_slowk,stoch_slowd,stoch_slowk_dir,stoch_slowd_dir,stoch_diff,linear_reg,linear_reg_angle,macd,macdsignal,macdhist,macd_dir,macdsignal_dir,atr,close_smooth,open_prev_4,high_prev_4,low_prev_4,close_prev_4,volume_prev_4,rsi_prev_4,stoch_slowk_prev_4,stoch_slowd_prev_4,stoch_slowk_dir_prev_4,stoch_slowd_dir_prev_4,stoch_diff_prev_4,linear_reg_prev_4,linear_reg_angle_prev_4,macd_prev_4,macdsignal_prev_4,macdhist_prev_4,macd_dir_prev_4,macdsignal_dir_prev_4,atr_prev_4,close_smooth_prev_4,ticker,pattern,open_prev_8,high_prev_8,low_prev_8,close_prev_8,volume_prev_8,rsi_prev_8,stoch_slowk_prev_8,stoch_slowd_prev_8,stoch_slowk_dir_prev_8,stoch_slowd_dir_prev_8,stoch_diff_prev_8,linear_reg_prev_8,linear_reg_angle_prev_8,macd_prev_8,macdsignal_prev_8,macdhist_prev_8,macd_dir_prev_8,macdsignal_dir_prev_8,atr_prev_8,close_smooth_prev_8,open_prev_12,high_prev_12,low_prev_12,close_prev_12,volume_prev_12,rsi_prev_12,stoch_slowk_prev_12,stoch_slowd_prev_12,stoch_slowk_dir_prev_12,stoch_slowd_dir_prev_12,stoch_diff_prev_12,linear_reg_prev_12,linear_reg_angle_prev_12,macd_prev_12,macdsignal_prev_12,macdhist_prev_12,macd_dir_prev_12,macdsignal_dir_prev_12,atr_prev_12,close_smooth_prev_12,open_prev_16,high_prev_16,low_prev_16,close_prev_16,volume_prev_16,rsi_prev_16,stoch_slowk_prev_16,stoch_slowd_prev_16,stoch_slowk_dir_prev_16,stoch_slowd_dir_prev_16,stoch_diff_prev_16,linear_reg_prev_16,linear_reg_angle_prev_16,macd_prev_16,macdsignal_prev_16,macdhist_prev_16,macd_dir_prev_16,macdsignal_dir_prev_16,atr_prev_16,close_smooth_prev_16,open_prev_20,high_prev_20,low_prev_20,close_prev_20,volume_prev_20,rsi_prev_20,stoch_slowk_prev_20,stoch_slowd_prev_20,stoch_slowk_dir_prev_20,stoch_slowd_dir_prev_20,stoch_diff_prev_20,linear_reg_prev_20,linear_reg_angle_prev_20,macd_prev_20,macdsignal_prev_20,macdhist_prev_20,macd_dir_prev_20,macdsignal_dir_prev_20,atr_prev_20,close_smooth_prev_20,open_prev_24,high_prev_24,low_prev_24,close_prev_24,volum
e_prev_24,rsi_prev_24,stoch_slowk_prev_24,stoch_slowd_prev_24,stoch_slowk_dir_prev_24,stoch_slowd_dir_prev_24,stoch_diff_prev_24,linear_reg_prev_24,linear_reg_angle_prev_24,macd_prev_24,macdsignal_prev_24,macdhist_prev_24,macd_dir_prev_24,macdsignal_dir_prev_24,atr_prev_24,close_smooth_prev_24,open_prev_28,high_prev_28,low_prev_28,close_prev_28,volume_prev_28,rsi_prev_28,stoch_slowk_prev_28,stoch_slowd_prev_28,stoch_slowk_dir_prev_28,stoch_slowd_dir_prev_28,stoch_diff_prev_28,linear_reg_prev_28,linear_reg_angle_prev_28,macd_prev_28,macdsignal_prev_28,macdhist_prev_28,macd_dir_prev_28,macdsignal_dir_prev_28,atr_prev_28,close_smooth_prev_28,open_prev_32,high_prev_32,low_prev_32,close_prev_32,volume_prev_32,rsi_prev_32,stoch_slowk_prev_32,stoch_slowd_prev_32,stoch_slowk_dir_prev_32,stoch_slowd_dir_prev_32,stoch_diff_prev_32,linear_reg_prev_32,linear_reg_angle_prev_32,macd_prev_32,macdsignal_prev_32,macdhist_prev_32,macd_dir_prev_32,macdsignal_dir_prev_32,atr_prev_32,close_smooth_prev_32,open_prev_36,high_prev_36,low_prev_36,close_prev_36,volume_prev_36,rsi_prev_36,stoch_slowk_prev_36,stoch_slowd_prev_36,stoch_slowk_dir_prev_36,stoch_slowd_dir_prev_36,stoch_diff_prev_36,linear_reg_prev_36,linear_reg_angle_prev_36,macd_prev_36,macdsignal_prev_36,macdhist_prev_36,macd_dir_prev_36,macdsignal_dir_prev_36,atr_prev_36,close_smooth_prev_36,open_prev_40,high_prev_40,low_prev_40,close_prev_40,volume_prev_40,rsi_prev_40,stoch_slowk_prev_40,stoch_slowd_prev_40,stoch_slowk_dir_prev_40,stoch_slowd_dir_prev_40,stoch_diff_prev_40,linear_reg_prev_40,linear_reg_angle_prev_40,macd_prev_40,macdsignal_prev_40,macdhist_prev_40,macd_dir_prev_40,macdsignal_dir_prev_40,atr_prev_40,close_smooth_prev_40,open_prev_44,high_prev_44,low_prev_44,close_prev_44,volume_prev_44,rsi_prev_44,stoch_slowk_prev_44,stoch_slowd_prev_44,stoch_slowk_dir_prev_44,stoch_slowd_dir_prev_44,stoch_diff_prev_44,linear_reg_prev_44,linear_reg_angle_prev_44,macd_prev_44,macdsignal_prev_44,macdhist_prev_44,macd_dir_prev_44,m
acdsignal_dir_prev_44,atr_prev_44,close_smooth_prev_44,open_prev_48,high_prev_48,low_prev_48,close_prev_48,volume_prev_48,rsi_prev_48,stoch_slowk_prev_48,stoch_slowd_prev_48,stoch_slowk_dir_prev_48,stoch_slowd_dir_prev_48,stoch_diff_prev_48,linear_reg_prev_48,linear_reg_angle_prev_48,macd_prev_48,macdsignal_prev_48,macdhist_prev_48,macd_dir_prev_48,macdsignal_dir_prev_48,atr_prev_48,close_smooth_prev_48,target
0,2022-12-25 11:00:00,46.26,46.36,46.22,46.32,118.178,46.491941,25.582139,30.565279,-0.153935,-0.094094,-5.058263,14.743892,0.792062,-0.000206,0.046597,-0.046803,-0.684832,-0.158108,0.292395,46.5,46.5,46.57,46.48,46.55,61.767,51.841832,40.816327,41.237307,-0.021462,-0.030866,-1.292134,16.886083,5.659086,0.075095,0.083534,-0.008439,0.0,-0.002941,0.313171,46.5125,GMXUSDT,Pattern_Trend,46.66,46.76,46.6,46.7,190.484,56.926473,46.790397,44.197805,0.019338,-0.01318,0.231851,16.155359,8.361473,0.087864,0.084188,0.003676,0.0,-0.009786,0.335724,46.48,46.85,46.85,46.42,46.58,564.118,54.860672,47.515528,44.710638,0.096618,0.018884,3.153348,14.493804,8.962271,0.099582,0.084137,0.015445,0.125895,0.037924,0.332216,46.390417,46.47,46.63,46.3,46.32,744.344,49.561511,40.228758,48.660131,-0.095115,-0.069233,-4.324573,12.730454,4.707387,0.07102,0.076278,-0.005257,0.0,0.020809,0.334046,46.29125,46.42,46.53,46.38,46.47,175.393,54.723225,60.662522,66.643248,-0.063949,-0.030833,-3.925741,11.989522,3.795526,0.094389,0.067517,0.026872,0.0,0.185419,0.350795,46.220833,46.45,46.53,46.36,46.36,164.49,53.272903,73.532209,72.698875,0.003284,0.028845,0.551213,11.527984,4.07582,0.072998,0.02844,0.044558,0.0,1.258746,0.383808,46.149583,46.5,46.72,46.46,46.62,241.902,59.954839,68.616726,59.071881,0.152659,0.124571,7.200074,11.193354,6.870587,0.049619,-0.031324,0.080943,0.0,-0.265167,0.406432,46.12,46.12,46.64,46.06,46.29,1703.863,54.342791,41.720687,36.55477,0.140004,0.095457,4.286842,10.218249,6.816037,-0.05924,-0.088933,0.029692,-0.153084,-0.026067,0.410635,46.07625,46.14,46.14,45.61,45.75,374.855,41.877823,26.028338,28.11342,-0.031486,-0.015907,-0.511938,10.533592,-2.6903,-0.093782,-0.095059,0.001277,0.0,-0.031598,0.420471,46.065,46.09,46.09,45.9,45.92,178.2,44.273884,28.400824,29.705455,-0.103387,-0.133153,-4.471784,11.833909,0.464248,-0.100972,-0.108983,0.008011,0.0,-0.026705,0.428276,46.03875,46.07,46.37,45.93,46.15,532.213,48.211249,46.422386,48.451402,-0.041116,-0.044675,-2.124732,14.199926,-
1.294932,-0.096485,-0.12539,0.028905,0.0,-0.052294,0.43492,46.0575,46.44,46.53,45.85,45.95,1809.134,43.505209,55.961221,54.684249,0.058372,0.05848,3.344189,15.836865,-4.003925,-0.110585,-0.156218,0.045632,0.0,-0.078445,0.460133,46.190833,45.83
1,2022-12-25 19:00:00,1.292,1.293,1.28,1.292,24194.5,27.640603,16.651064,16.746801,-0.150625,-0.236119,-5.811989,20.604183,-28.291688,-0.013059,-0.007752,-0.005307,0.232816,0.258062,0.010403,1.332208,1.313,1.324,1.309,1.322,36982.5,37.328301,38.601227,43.92359,-0.121833,-0.111811,-6.44081,14.146224,-15.773604,-0.005509,-0.003107,-0.002402,0.752244,0.199808,0.008463,1.340458,GTCUSDT,Pattern_Trend,1.353,1.36,1.35,1.351,88688.8,55.0453,65.034014,63.777254,0.059908,0.103472,4.510727,12.145535,8.918242,-0.000284,-0.00198,0.001696,-0.456386,-0.130956,0.00698,1.343333,1.347,1.347,1.341,1.342,1939.6,46.016761,45.906593,41.762821,0.089253,0.121617,3.869048,12.3583,0.167951,-0.00261,-0.003229,0.000619,0.0,-0.034974,0.006727,1.3435,1.339,1.343,1.339,1.342,6029.8,45.269107,27.747253,26.923077,-0.097426,-0.135222,-4.624978,13.364734,-6.222921,-0.003728,-0.003623,-0.000105,0.0,0.023556,0.006835,1.346958,1.344,1.345,1.341,1.343,8958.6,44.230995,43.233909,40.58298,0.059183,0.080942,2.589535,12.382839,-5.066362,-0.003558,-0.003304,-0.000254,0.0,0.037295,0.007228,1.350917,1.347,1.348,1.343,1.343,6577.0,43.141855,31.994609,25.556909,0.328699,0.309868,5.936692,12.023771,-4.461538,-0.003547,-0.002803,-0.000744,0.0,0.114736,0.007782,1.353083,1.342,1.342,1.338,1.34,7699.3,38.679233,9.942317,11.380843,-0.197216,-0.290066,-4.617121,12.263572,-9.255824,-0.003863,-0.001554,-0.002309,0.267208,-0.230515,0.008152,1.354708,1.35,1.35,1.348,1.349,5590.3,43.568529,32.02416,42.958425,-0.20892,-0.168189,-10.859008,10.379522,-2.862596,-0.000833,0.000763,-0.001595,0.0,-0.248891,0.008524,1.357583,1.363,1.365,1.358,1.359,8108.7,50.289166,75.839599,80.473406,-0.004597,0.033753,0.978488,11.292521,3.446809,0.002208,0.001968,0.00024,0.0,0.1118,0.00901,1.358708,1.362,1.367,1.358,1.364,35036.4,55.635844,71.506892,68.451337,0.054281,0.076197,4.404762,10.682233,6.075315,0.002011,0.0012,0.000811,0.273602,0.197346,0.009464,1.357917,1.356,1.359,1.351,1.352,7651.7,47.751214,51.090226,50.296575,0.070047,0.051488,3.264
643,10.158722,2.00219,0.000303,0.000629,-0.000326,-0.254458,-0.046887,0.009565,1.356333,1.358,1.359,1.353,1.354,14131.8,49.424916,41.507937,42.653061,-0.006001,-0.020624,-0.037793,10.488927,4.181646,0.0006,0.000702,-0.000102,-0.152035,-0.013956,0.010223,1.352333,1.322
2,2022-12-26 11:00:00,0.9162,0.9164,0.9041,0.9116,74806.21,46.098886,73.599792,71.800194,0.083753,0.118964,5.979474,12.428864,-5.306077,-0.003355,-0.005291,0.001936,-0.102655,-0.085493,0.015634,0.913917,0.9142,0.9206,0.9113,0.9145,91093.51,47.814372,51.240132,45.658605,0.158348,0.176941,5.994004,12.804207,-3.251928,-0.005473,-0.007477,0.002004,-0.13909,-0.053756,0.015953,0.915908,FTTBUSD,Pattern_Trend,0.8928,0.9084,0.8919,0.9002,171000.84,37.901082,26.746845,26.098241,0.014228,-0.025354,-0.152222,13.657995,-9.661385,-0.009624,-0.008879,-0.000746,0.0,0.014767,0.015916,0.918392,0.9103,0.9196,0.908,0.9144,70190.36,41.835764,27.052305,29.132548,-0.009182,0.007811,0.10786,11.759367,-3.661027,-0.008517,-0.008474,-4.2e-05,0.0,0.016719,0.015876,0.922304,0.9092,0.9145,0.9068,0.9109,45190.17,36.608648,28.210069,28.262818,0.019432,0.040866,1.151411,10.71752,-5.537014,-0.008528,-0.007842,-0.000686,0.0,0.011267,0.016965,0.926125,0.9249,0.9249,0.9167,0.92,87553.44,36.930106,21.691104,27.196041,-0.177899,-0.144111,-5.692908,10.359234,-2.578605,-0.007302,-0.007841,0.000539,0.0,-0.023094,0.01692,0.932442,0.9298,0.9303,0.924,0.9244,39370.81,38.541736,43.480226,47.294055,-0.042565,0.01736,-1.136807,12.077393,-0.334131,-0.007979,-0.008638,0.000658,0.0,-0.020558,0.018688,0.937158,0.9324,0.9375,0.9305,0.9314,27369.14,42.576961,45.535594,40.733646,0.088428,0.046355,2.604892,14.508144,1.496353,-0.008884,-0.009373,0.000489,0.0,-0.001378,0.020911,0.942904,0.928,0.9441,0.9234,0.9292,117893.29,40.246061,36.818815,35.221771,0.001545,-0.031234,-0.548302,16.992498,-1.908,-0.010758,-0.009093,-0.001665,0.0,0.071481,0.023128,0.948483,0.9406,0.9406,0.9221,0.9299,134211.07,38.125641,39.538588,38.739557,0.091481,0.088397,2.968918,17.911165,-4.304029,-0.009027,-0.006666,-0.002361,0.0,0.106769,0.024584,0.953925,0.952,0.953,0.945,0.9503,119788.19,43.523265,30.944313,29.51021,0.019537,-0.02977,-0.03278,19.354642,2.774436,-0.005312,-0.004313,-0.000999,0.0,0.079552,0.024739,0.963504,0.9491,0.955,0.9409,0.952
2,208440.26,44.140618,32.118506,35.724205,-0.057994,-0.025086,-1.883968,21.302644,2.964429,-0.005177,-0.002995,-0.002183,0.108176,0.324779,0.027409,0.972183,0.96,0.9694,0.9522,0.962,156139.94,47.234592,39.700335,37.123861,0.13191,0.189204,4.551732,23.329862,5.584309,-0.003143,-0.000664,-0.002479,0.0,4.144364,0.030074,0.977667,0.9273
3,2022-12-26 11:00:00,31.67,31.7,31.57,31.6,978.546,37.067652,39.271895,51.004334,-0.195484,-0.120471,-11.084099,40.337944,-15.19123,-0.173984,-0.196916,0.022932,0.0,-0.036265,0.202137,31.833333,31.73,32.03,31.66,31.87,5176.885,44.405214,77.214803,72.957601,0.083138,0.134993,6.203636,41.805591,-12.270913,-0.18008,-0.232283,0.052203,0.0,-0.050536,0.21231,31.98375,COMPUSDT,Pattern_Trend,31.82,32.08,31.79,32.08,2690.865,49.329084,50.661265,39.096185,0.344885,0.27069,8.8513,44.27926,-15.641882,-0.248246,-0.283865,0.035619,0.0,-0.001928,0.192595,32.133333,31.55,31.56,31.34,31.39,419.806,14.817168,18.576188,18.226242,-0.020843,-0.030746,-0.541802,43.149979,-35.919521,-0.35504,-0.272471,-0.082569,0.0,0.100306,0.185207,32.286667,31.57,31.68,31.5,31.61,2695.156,15.238461,20.310621,19.86053,0.03377,0.026231,0.558789,35.794147,-35.598268,-0.284421,-0.178513,-0.105908,0.21322,0.188514,0.179192,32.5125,32.4,32.4,32.17,32.34,1397.751,26.452889,18.684797,18.17018,-0.053577,-0.096185,-1.614801,28.063522,-23.578205,-0.120919,-0.092899,-0.02802,0.0,0.074154,0.141813,32.707917,32.6,32.6,32.49,32.53,647.272,34.205356,24.876976,26.869167,-0.088413,-0.054273,-2.443829,24.262583,-16.039489,-0.085765,-0.070313,-0.015452,0.113068,0.045685,0.137468,32.745417,32.79,32.8,32.72,32.72,123.653,42.463777,32.496599,31.094171,-0.027858,-0.093654,-1.986962,22.420778,-7.398875,-0.053753,-0.061305,0.007553,0.0,-0.033819,0.143725,32.780417,32.77,32.8,32.69,32.79,495.786,45.504895,41.285768,49.021775,-0.150984,-0.142728,-9.382902,23.643356,-7.86658,-0.059569,-0.070013,0.010443,0.0,-0.03017,0.154702,32.80875,32.84,32.92,32.82,32.82,214.973,45.072939,80.699302,83.438233,-0.004107,0.042584,0.922092,23.815575,-8.423689,-0.057145,-0.079537,0.022392,0.0,-0.067348,0.159325,32.853333,32.85,32.93,32.8,32.93,419.149,50.547007,74.747281,63.6699,0.200752,0.199359,10.383329,24.702102,-9.049436,-0.078736,-0.105996,0.027259,-0.134838,-0.04077,0.168501,32.90125,32.66,32.77,32.63,32.75,658.448,40.237558,34.245441,32.5907
74,0.092556,0.087545,2.868365,25.113733,-15.903928,-0.138647,-0.119743,-0.018904,0.0,0.069082,0.17805,32.952083,32.83,32.83,32.63,32.63,357.9,30.765687,23.314042,24.531985,0.012699,0.066965,0.949179,21.090391,-17.363179,-0.12211,-0.088182,-0.033928,0.102736,0.098692,0.184513,33.060833,32.62
4,2022-12-26 19:00:00,3.297,3.3,3.295,3.298,11074.08,27.71078,13.148723,11.58205,0.100909,-0.053247,0.446921,9.329446,-6.509616,-0.012609,-0.010707,-0.001903,0.0,0.03857,0.014612,3.325583,3.32,3.322,3.314,3.315,29703.63,33.36172,13.334557,18.915776,-0.260643,-0.237547,-7.080512,7.685691,-3.3566,-0.009783,-0.009322,-0.000461,0.0,0.007116,0.014603,3.333167,CAKEUSDT,Pattern_Trend,3.325,3.333,3.319,3.324,14014.23,36.000703,43.027481,49.856204,-0.068543,-0.00557,-2.292609,7.755127,-1.90671,-0.009164,-0.009033,-0.000131,0.0,-0.002399,0.01537,3.341125,3.344,3.346,3.331,3.333,5728.97,40.166732,51.134657,45.325231,0.202683,0.2593,7.797414,8.388108,0.84645,-0.008412,-0.009266,0.000854,0.0,-0.017817,0.016211,3.348917,3.33,3.339,3.319,3.327,15696.49,33.381038,20.40859,18.679968,0.045551,-0.035558,0.281018,9.220674,-1.057321,-0.01174,-0.00956,-0.00218,0.0,0.077523,0.015908,3.355417,3.334,3.337,3.32,3.322,3458.68,24.445134,20.588306,23.369145,-0.068634,-0.042399,-1.217966,10.592637,-1.569584,-0.011126,-0.006799,-0.004326,0.133891,0.222638,0.015151,3.363875,3.341,3.346,3.336,3.341,15092.73,30.950012,25.176264,28.783974,-0.046019,-0.02319,-1.097442,11.907217,3.226957,-0.006479,-0.002746,-0.003734,0.383997,1.005816,0.015208,3.370833,3.364,3.368,3.359,3.365,5398.61,43.964548,30.427388,31.992803,-0.080437,-0.091004,-3.103815,11.650768,-1.990766,-0.001436,0.000151,-0.001587,1.392652,-0.450975,0.0122,3.373,3.369,3.382,3.369,3.376,9235.78,51.113219,42.632653,48.23979,-0.113982,-0.103055,-6.447703,12.928458,2.679752,0.000738,0.001276,-0.000538,-0.234632,-0.077106,0.012382,3.372083,3.382,3.382,3.369,3.37,8673.79,46.576799,65.667885,70.412916,-0.020385,-0.00264,-0.983231,15.185299,1.81508,0.002184,0.001572,0.000612,0.0,0.498917,0.012646,3.372167,3.381,3.389,3.377,3.388,9093.3,60.54073,72.981615,69.472512,0.044199,0.051506,3.124864,15.299074,7.077771,0.002359,-1.2e-05,0.002372,-7.77537,-0.570936,0.012993,3.370625,3.373,3.375,3.366,3.373,6393.86,51.611109,59.343806,55.562461,0.148109,0.146041
,6.533922,15.11041,2.29309,-0.000738,-0.001905,0.001167,-0.385556,-0.107376,0.013621,3.371333,3.366,3.367,3.362,3.362,2736.57,44.489379,36.285714,34.0858,0.049769,0.031402,1.471185,16.509265,-5.070883,-0.003804,-0.002525,-0.001279,0.0,0.273099,0.013469,3.373042,3.305


(5292, 264)

# Select features

In [6]:
import lightgbm as lgb
from sklearn.metrics import log_loss

from eli5.sklearn import PermutationImportance
from shaphypetune import BoostBoruta

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

from sklearn.metrics import log_loss, mean_squared_error, accuracy_score

import warnings
warnings.filterwarnings('ignore')

from colorama import Style, Fore

# Hex colour palette (not referenced by the cells visible in this notebook section).
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

# colorama codes interpolated into the progress messages printed by lgbm_tuning.
# NOTE: `res` is rebound to a DataFrame by the feature-ranking cell further down,
# so code that prints with `res` must run before that cell (or after re-running this one).
blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL

class CFG:
    ''' Cross-validation settings read by lgbm_tuning. '''
    # how many times the whole K-fold procedure is repeated
    n_repeats = 1
    # passed as n_splits to the K-fold splitters
    n_folds = 5

def lgbm_tuning(df, permut=False, boruta=False):
    ''' Cross-validate a LightGBM model and accumulate feature importances.

    Reads the module-level `task_type` ('cls' -> classification of
    target >= close; anything else -> regression of the relative change
    (target - close) / close), the module-level LightGBM `params`, and the
    fold settings in `CFG`. Folds are grouped by 'ticker'.

    Parameters
    ----------
    df : DataFrame
        Must contain 'time', 'target', 'ticker', 'pattern', 'close'; every
        column except the first four is used as a feature.
    permut : bool
        Also compute eli5 permutation importance on each validation fold.
    boruta : bool
        Also run BoostBoruta (SHAP importances) on each fold.

    Returns
    -------
    perm_df_ : DataFrame ('Feature', 'importance'), summed over folds; empty if not permut
    feature_importances_ : DataFrame ('Value', 'Feature'), summed over folds
    boruta_df_ : DataFrame ('Feature', 'importance'), summed rankings; empty if not boruta
    score : float, mean out-of-fold score over repeats (log loss or RMSE)
    '''
    # Colour codes are resolved locally: a later notebook cell rebinds the
    # global `res` to a DataFrame, which would otherwise end up in the messages.
    blu_ = Style.BRIGHT + Fore.BLUE
    red_ = Style.BRIGHT + Fore.RED
    rst_ = Style.RESET_ALL

    is_cls = task_type == 'cls'
    metric_name = 'log_loss' if is_cls else 'rmse'

    features = [c for c in df.columns if c not in ['time', 'target', 'ticker', 'pattern']]
    groups = df['ticker']
    X = df[features]

    if is_cls:
        y = df['target'] >= df['close']
        eval_metric = 'logloss'
    else:
        y = (df['target'] - df['close']) / df['close']
        eval_metric = 'mse'

    outer_cv_score = []  # out-of-fold score of every repeat

    perm_df_ = pd.DataFrame()
    feature_importances_ = pd.DataFrame()
    boruta_df_ = pd.DataFrame()

    for i in range(CFG.n_repeats):
        print(f'Repeat {blu_}#{i+1}{rst_}')

        if is_cls:
            # shift the seed per repeat, otherwise every repeat uses the same split
            kf = StratifiedGroupKFold(n_splits=CFG.n_folds, shuffle=True,
                                      random_state=180820231 + i)
        else:
            kf = GroupKFold(n_splits=CFG.n_folds)

        oof = np.zeros(len(df))

        # folds are stratified by class (classification only) and grouped by ticker
        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y, groups)):
            # Split the dataset according to the fold indexes.
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            clf = lgb.LGBMClassifier(**params) if is_cls else lgb.LGBMRegressor(**params)
            clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                    eval_metric=eval_metric,
                    callbacks=[lgb.log_evaluation(100)])

            if is_cls:
                val_preds = clf.predict_proba(X_val)[:, 1]
                val_score = log_loss(y_val, val_preds)
            else:
                val_preds = clf.predict(X_val)
                # sqrt(MSE) instead of the deprecated `squared=False` argument
                val_score = np.sqrt(mean_squared_error(y_val, val_preds))

            oof[val_idx] = val_preds
            best_iter = clf.best_iteration_

            print(f'Fold: {blu_}{fold + 1:>3}{rst_}| loss: {blu_}{val_score:.5f}{rst_}| Best iteration: {blu_}{best_iter:>4}{rst_}')

            # permutation importance
            if permut:
                perm = PermutationImportance(clf, scoring=None, n_iter=1,
                                             random_state=42, cv=None, refit=False).fit(X_val, y_val)

                perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_},
                                                  index=X_val.columns).sort_index()

                if perm_df_.shape[0] == 0:
                    perm_df_ = perm_importance_df.copy()
                else:
                    perm_df_ += perm_importance_df

            # gboost feature importance; rows are ordered by feature name, so the
            # positional `+=` below lines up across folds
            f_i = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns),
                                      reverse=True, key=lambda x: x[1]),
                               columns=['Value', 'Feature'])

            if feature_importances_.shape[0] == 0:
                feature_importances_ = f_i.copy()
            else:
                feature_importances_['Value'] += f_i['Value']

            # BORUTA importance
            if boruta:
                model = BoostBoruta(clf, importance_type='shap_importances', train_importance=False)
                try:
                    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                              eval_metric=eval_metric,
                              callbacks=[lgb.log_evaluation(100)])
                except RuntimeError as err:
                    # best effort: this fold simply contributes no Boruta ranking
                    print(f'BoostBoruta failed on fold {fold + 1}: {err}')
                    continue

                boruta_importance_df = pd.DataFrame({'importance': model.ranking_},
                                                    index=X_train.columns).sort_index()
                if boruta_df_.shape[0] == 0:
                    boruta_df_ = boruta_importance_df.copy()
                else:
                    boruta_df_ += boruta_importance_df

        if is_cls:
            outer_cv = log_loss(y, oof)
        else:
            outer_cv = np.sqrt(mean_squared_error(y, oof))

        outer_cv_score.append(outer_cv)

    print(f'{red_} Outer Holdout avg score: {rst_} {metric_name}: {red_}{np.mean(outer_cv_score):.5f}{rst_}')
    print(f'{"*" * 50}\n')

    if permut:
        perm_df_ = perm_df_.sort_values('importance', ascending=False)
        perm_df_ = perm_df_.reset_index().rename({'index': 'Feature'}, axis=1)

    if boruta:
        boruta_df_ = boruta_df_.sort_values('importance')
        boruta_df_ = boruta_df_.reset_index().rename({'index': 'Feature'}, axis=1)

    feature_importances_ = feature_importances_.sort_values('Value', ascending=False).reset_index(drop=True)

    return perm_df_, feature_importances_, boruta_df_, np.mean(outer_cv_score)


# 'cls' -> binary classification, anything else -> regression
task_type = 'cls'

# LightGBM settings shared by both task types
params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    early_stopping_round=100,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.75,
    num_leaves=32,
    verbosity=-1,
    importance_type='gain',
)

# task-specific booster and objective
if task_type == 'cls':
    params.update(boosting_type='dart', objective='binary')
else:
    params.update(boosting_type='gbdt', objective='regression')

perm_df_, feature_importances_, boruta_df_, outer_cv_score = lgbm_tuning(
    train_df, permut=True, boruta=True
)

# perm_df_['rank'] = perm_df_['importance'].rank(ascending=False)
# boruta_df_['rank'] = boruta_df_['importance'].rank()
# feature_importances_['rank'] = feature_importances_['Value'].rank(ascending=False)

# res = pd.concat([perm_df_[['Feature','rank']], boruta_df_[['Feature','rank']], feature_importances_[['Feature','rank']]])
# res.groupby('Feature')['rank'].sum().sort_values().head(30).index.to_list()

Repeat [1m[34m#1
[100]	training's binary_logloss: 0.629555	valid_1's binary_logloss: 0.668531
[200]	training's binary_logloss: 0.600252	valid_1's binary_logloss: 0.658932
[300]	training's binary_logloss: 0.570159	valid_1's binary_logloss: 0.651191
[400]	training's binary_logloss: 0.543204	valid_1's binary_logloss: 0.646409
[500]	training's binary_logloss: 0.516703	valid_1's binary_logloss: 0.643065
[600]	training's binary_logloss: 0.504283	valid_1's binary_logloss: 0.640452
[700]	training's binary_logloss: 0.485808	valid_1's binary_logloss: 0.637168
[800]	training's binary_logloss: 0.471286	valid_1's binary_logloss: 0.633747
[900]	training's binary_logloss: 0.453675	valid_1's binary_logloss: 0.630768
[1000]	training's binary_logloss: 0.442356	valid_1's binary_logloss: 0.630699
Fold: [1m[34m  1[0m| loss: [1m[34m0.63070[0m| Best iteration: [1m[34m   0[0m
[100]	training's binary_logloss: 0.624893	valid_1's binary_logloss: 0.671183
[200]	training's binary_logloss: 0.594855	valid

In [7]:
# Rank the features inside each importance table; rank 1 is the best feature.
# Permutation and gain importances: the larger the value, the better the rank.
perm_df_['rank'] = perm_df_['importance'].rank(ascending=False)
# Boruta 'importance' holds summed rankings, so the smaller the value, the better the rank.
boruta_df_['rank'] = boruta_df_['importance'].rank()
feature_importances_['rank'] = feature_importances_['Value'].rank(ascending=False)

# Stack the three rankings so they can be summed per feature.
# NOTE: this rebinds `res`, which until now held the colorama reset code used in
# lgbm_tuning's messages; model_train reads this DataFrame through the same name.
res = pd.concat([perm_df_[['Feature','rank']], boruta_df_[['Feature','rank']], feature_importances_[['Feature','rank']]])
# The 30 features with the smallest rank sum (displayed as the cell output).
res.groupby('Feature')['rank'].sum().sort_values().head(30)

Feature
stoch_diff                   3.5
stoch_slowk_dir              9.0
stoch_slowd_prev_8          11.0
rsi                         14.0
macdsignal_dir              14.0
macd_prev_36                27.5
linear_reg_angle_prev_8     36.5
rsi_prev_4                  37.5
macdsignal_prev_36          57.5
stoch_slowk_prev_8          64.5
stoch_slowd_dir_prev_48     76.5
stoch_slowk                 85.5
rsi_prev_36                157.5
stoch_slowd_dir_prev_4     170.0
stoch_slowd_prev_4         176.0
stoch_slowk_dir_prev_24    176.0
stoch_slowd_prev_12        178.0
stoch_diff_prev_4          193.0
stoch_slowd_dir_prev_32    197.0
stoch_slowk_prev_40        198.0
stoch_slowd_dir_prev_40    199.0
stoch_diff_prev_8          212.0
rsi_prev_8                 212.0
linear_reg_prev_44         214.0
stoch_diff_prev_36         216.0
stoch_slowd                216.0
stoch_slowd_dir_prev_8     217.0
rsi_prev_24                227.0
rsi_prev_44                228.0
stoch_slowk_prev_16        231.0
Na

# Train model with selected features

In [114]:
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression, LinearRegression


def model_train(df, task_type, how, n_folds, features=None):
    ''' Train one model per cross-validation fold and collect out-of-fold predictions.

    Folds are grouped by 'ticker' (and stratified by class for classification).
    LightGBM models are built from the module-level `params`.

    Parameters
    ----------
    df : DataFrame
        Must contain 'target', 'close', 'ticker', 'pattern' and the feature columns.
    task_type : str
        'cls' -> classify target >= close; anything else -> regress
        (target - close) / close.
    how : str
        'lgbm' (LightGBM) or 'lreg' (logistic / linear regression on
        standardised features).
    n_folds : int
        Number of cross-validation folds.
    features : list of str, optional
        Feature columns to train on. Defaults to the 30 features with the
        smallest rank sum in the notebook-level `res` ranking table.

    Returns
    -------
    oof : ndarray of shape (n_samples, 1)
        Out-of-fold predictions (class-1 probability for classification).
    models : list
        The fitted model of every fold, in fold order.

    Raises
    ------
    ValueError
        If `how` is not 'lgbm' or 'lreg'.
    '''
    if how not in ('lgbm', 'lreg'):
        raise ValueError(f"Unknown model type {how!r}: expected 'lgbm' or 'lreg'")

    if features is None:
        # features = [c for c in df.columns if c not in ['time', 'target', 'ticker', 'pattern']]
        features = res.groupby('Feature')['rank'].sum().sort_values().head(30).index.to_list()

    is_cls = task_type == 'cls'
    oof = np.zeros([df['target'].shape[0], 1])

    # selected features plus one-hot encoded pattern (first level dropped)
    X = pd.concat([df[features], pd.get_dummies(df['pattern'], drop_first=True)], axis=1)

    if is_cls:
        y = df['target'] >= df['close']
        kf = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=180820231)
    else:
        y = (df['target'] - df['close']) / df['close']
        kf = GroupKFold(n_splits=n_folds)

    # integer group ids, flattened to the 1-D array the splitters expect
    oe_enc = OrdinalEncoder()
    groups = oe_enc.fit_transform(df['ticker'].values.reshape(-1, 1)).ravel()

    print(f"Training with {len(features)} features")

    if how == 'lreg':
        # NOTE(review): the scaler is fitted on all rows, validation folds
        # included, so the linear models see validation statistics.
        scaler = StandardScaler()
        X[X.columns] = scaler.fit_transform(X)

    # created once, outside the fold loop, so that every fold's model is returned
    models = []

    for fold, (fit_idx, val_idx) in enumerate(kf.split(X, y, groups)):
        # Split the dataset according to the fold indexes.
        X_train, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        if how == 'lgbm':
            estimator = lgb.LGBMClassifier if is_cls else lgb.LGBMRegressor
            model = estimator(**params)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      eval_metric='logloss' if is_cls else 'mse',
                      callbacks=[lgb.log_evaluation(100)])
        elif is_cls:
            model = LogisticRegression(C=0.1, max_iter=100000)  # , class_weight='balanced')
            model.fit(X_train, y_train)
        else:
            model = LinearRegression(positive=True)
            model.fit(X_train, y_train)

        if is_cls:
            val_preds = model.predict_proba(X_val)
            val_score = log_loss(y_val, val_preds)
            acc_score = confident_accuracy_score(y_val, val_preds[:, 1])
            print(f'Logloss: {val_score}, Confident objects accuracy: {acc_score}')
            oof[val_idx, 0] = val_preds[:, 1]
        else:
            val_preds = model.predict(X_val)
            # sqrt(MSE) instead of the deprecated `squared=False` argument
            val_score = np.sqrt(mean_squared_error(y_val, val_preds))
            print(f'RMSE: {val_score}')
            oof[val_idx, 0] = val_preds

        models.append(model)

    return oof, models

def confident_accuracy_score(y, oof, low_bound=0.35, high_bound=0.65):
    ''' Accuracy measured only on confidently predicted objects.

        An object counts as confident when its predicted probability is
        strictly below low_bound (treated as class 0) or strictly above
        high_bound (treated as class 1); objects in between are ignored.

        y: true labels; must expose `.values` (e.g. a pandas Series)
        oof: predicted probabilities, compared elementwise with the bounds
        low_bound, high_bound: confidence thresholds
        Returns the result of accuracy_score on the confident subset '''
    below = oof < low_bound
    above = oof > high_bound
    confident = below | above

    # hard labels: 1 above high_bound (unless also below low_bound), else 0
    hard_labels = np.zeros_like(oof)
    hard_labels[above & ~below] = 1

    # labels are viewed as a single column before the confidence mask is applied
    y_column = y.values.reshape(-1, 1)

    return accuracy_score(y_column[confident], hard_labels[confident])

# Task switch: 'cls' selects the binary-classification setup, anything else
# falls through to the regression setup.
# task_type = 'reg'
task_type = 'cls'

if task_type == 'cls':
    # best params for classification
    params = dict(
        boosting_type='dart',
        n_estimators=800,
        learning_rate=0.02,
        # early_stopping_round=50,
        max_depth=10,
        colsample_bytree=0.8,
        subsample=0.9,
        subsample_freq=1,
        num_leaves=26,
        verbosity=-1,
        max_bin=255,
        reg_alpha=1e-5,
        reg_lambda=1e-7,
        objective='binary',
    )
else:
    # best params for regression
    params = dict(
        boosting_type='gbdt',
        n_estimators=1000,
        learning_rate=0.021,
        early_stopping_round=100,
        max_depth=7,
        colsample_bytree=0.75,
        subsample=0.8,
        subsample_freq=1,
        num_leaves=27,
        verbosity=-1,
        max_bin=511,
        reg_alpha=1e-4,
        reg_lambda=1e-4,
        objective='regression',
    )

# Run 5-fold training; `oof` receives the validation-fold predictions and
# `models` is the list returned by model_train. Trailing numbers are scores
# recorded from earlier runs.
oof, models = model_train(train_df, task_type=task_type, how='lgbm', n_folds=5) # 0.061096263508601985 / logloss: 0.6226973017816237 acc: 0.6424792139077853
# oof, models = model_train(train_df, task_type=task_type, how='lreg', n_folds=5) # 0.06958035063954768 / logloss: 0.6985539186132326 acc: 0.587892049598833

if task_type == 'cls':
    # binary label: True when the target price is at or above the close price
    y = train_df['target'] >= train_df['close']
    low_bound, high_bound = 0.35, 0.65
    display(log_loss(y, oof))
    display(confident_accuracy_score(y, oof, low_bound, high_bound))
else:
    # regression label: relative change of target versus close
    y = (train_df['target'] - train_df['close']) / train_df['close']
    # RMSE is computed with np.sqrt because the `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    display(np.sqrt(mean_squared_error(y, oof)))


Training with 30 features
[100]	valid_0's binary_logloss: 0.655505
[200]	valid_0's binary_logloss: 0.645369
[300]	valid_0's binary_logloss: 0.635218
[400]	valid_0's binary_logloss: 0.628613
[500]	valid_0's binary_logloss: 0.624361
[600]	valid_0's binary_logloss: 0.621881
[700]	valid_0's binary_logloss: 0.621013
[800]	valid_0's binary_logloss: 0.620899
Logloss: 0.6208992424519378, Confident objects accuracy: 0.7933333333333333
[100]	valid_0's binary_logloss: 0.654601
[200]	valid_0's binary_logloss: 0.644081
[300]	valid_0's binary_logloss: 0.637742
[400]	valid_0's binary_logloss: 0.635968
[500]	valid_0's binary_logloss: 0.634911
[600]	valid_0's binary_logloss: 0.634532
[700]	valid_0's binary_logloss: 0.63317
[800]	valid_0's binary_logloss: 0.632782
Logloss: 0.6327821318886288, Confident objects accuracy: 0.7518987341772152
[100]	valid_0's binary_logloss: 0.65499
[200]	valid_0's binary_logloss: 0.646602
[300]	valid_0's binary_logloss: 0.640441
[400]	valid_0's binary_logloss: 0.635496
[500

0.6226973017816237

0.7686973749380882