# Load data and add indicators

In [1]:
import sys
sys.path.append('..')

from os import environ
import numpy as np
import pandas as pd
from indicators import indicators
from datetime import timedelta
from tqdm.auto import tqdm
from config.config import ConfigFactory

# Set environment variable
# ("ENV" is written into os.environ before the factory call below, which receives the
#  whole environ mapping; NOTE(review): presumably it selects the 1h/4h config set — confirm)
environ["ENV"] = "1h_4h"

# Get configs
# (`configs` is passed to every indicator constructor in add_indicators)
configs = ConfigFactory.factory(environ).configs

def get_file(ticker):
    ''' Load the 1h and 4h candle dataframes of a ticker.

        Pickles can be stored under several naming schemes, tried in this order:
        "<ticker>", "<head>-<tail>" and "<head>-<tail>-SWAP", where <tail> is the last
        four characters of the ticker and <head> is everything before them.

        Returns (df_1h, df_4h) for the first scheme where both files can be read,
        or (None, None) if no scheme matches. '''
    def _name_variants():
        # built lazily so the ticker is sliced only if the plain name was not found
        yield f'{ticker}'
        yield f'{ticker[:-4]}-{ticker[-4:]}'
        yield f'{ticker[:-4]}-{ticker[-4:]}-SWAP'

    for stem in _name_variants():
        path_prefix = f'../optimizer/ticker_dataframes/{stem}'
        try:
            candles_1h = pd.read_pickle(f'{path_prefix}_1h.pkl')
            candles_4h = pd.read_pickle(f'{path_prefix}_4h.pkl')
        except FileNotFoundError:
            # at least one timeframe is missing under this name, try the next scheme
            continue
        return candles_1h, candles_4h

    return None, None

def add_indicators(df, ttype, configs):
    ''' Run the candle dataframe through the RSI, STOCH, Trend, MACD and ATR
        indicators (in that order) and return the enriched dataframe '''
    # SMA is deliberately left out of the chain (it is disabled in this notebook)
    indicator_names = ('RSI', 'STOCH', 'Trend', 'MACD', 'ATR')

    for indicator_name in indicator_names:
        # each indicator is created with the trade type and configs,
        # then applied to the output of the previous one
        indicator = getattr(indicators, indicator_name)(ttype, configs)
        df = indicator.get_indicator(df, '', '', 0)

    return df

def create_train_df(df, ttype, configs, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data.

        df: signal statistics with (at least) 'ticker', 'time' and 'pattern' columns
        ttype: trade type; 'buy' takes the target from the 'high' column,
               any other value takes it from 'low'
        configs: indicator configs, passed through to add_indicators
        target_offset: number of hours after the signal at which the target candle is taken
        first, last, step: hour offsets before the signal at which previous candles are
                           collected (first, first + step, ..., last)

        Returns one row per signal: the signal candle, the previous candles with columns
        suffixed by "_prev_<offset>", 'ticker', 'pattern' and 'target'.
        Tickers without candle files and signals with an incomplete history are skipped.
        Rows can still contain NaN (e.g. when the target candle is missing), so callers
        are expected to drop them. An empty DataFrame is returned when nothing is collected. '''
    collected = []
    target_column = 'high' if ttype == 'buy' else 'low'

    for ticker in tqdm(df['ticker'].unique()):
        # get signals with current ticker
        signal_df = df[df['ticker'] == ticker]
        signal_times = signal_df['time'].to_list()
        signal_patterns = signal_df['pattern'].to_list()

        # load candle history of this ticker (only the 1h frame is used here)
        tmp_df_1h, _ = get_file(ticker)
        if tmp_df_1h is None:
            # get_file found no candle files for this ticker: nothing to build from
            continue

        # add indicators
        tmp_df_1h = add_indicators(tmp_df_1h, ttype, configs)

        # add historical data for current ticker
        for t, pattern in zip(signal_times, signal_patterns):
            history_complete = True
            row = tmp_df_1h.loc[tmp_df_1h['time'] == t, :].reset_index(drop=True)

            for offset in range(first, last + step, step):
                time_prev = t - timedelta(hours=offset)
                row_prev = tmp_df_1h.loc[tmp_df_1h['time'] == time_prev, :].reset_index(drop=True)
                # a boolean selection never raises on a miss, it just comes back empty
                if row_prev.empty:
                    history_complete = False
                    break
                row_prev.columns = [c + f'_prev_{offset}' for c in row_prev.columns]
                # the first column of the previous candle is dropped
                # (NOTE(review): presumably the time column — confirm)
                row = pd.concat([row, row_prev.iloc[:, 1:]], axis=1)
                # assigned here (idempotently) to keep the original column order
                row['ticker'] = ticker
                row['pattern'] = pattern

            if not history_complete:
                continue

            # add target
            time_next = t + timedelta(hours=target_offset)
            target = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, target_column].reset_index(drop=True)
            target.name = 'target'
            rows = pd.concat([row, target], axis=1)

            # add data to the dataset (concatenated once at the end instead of per signal)
            if rows.shape[0] > 0:
                collected.append(rows)

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected)

# for how long time (in hours) we want to predict
target_offset = 24
# first previous data point to collect for model training (value represents number of hours before signal point)
first = 4
# last previous data point to collect for model training (value represents number of hours before signal point)
last = 48
# step of previous data points collecting (total number of points to collect is (last - first + step) / step)
# with the values used here that is (48 - 4 + 4) / 4 = 12 previous points per signal
step = 4

# Buy
# dataset with the signal statistics
df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
# dataset for model train
train_buy = create_train_df(df, 'buy', configs, target_offset, first, last, step)
# drop signals for which some candle value or the target is missing
train_buy = train_buy.dropna()

# Sell
# dataset with the signal statistics
# (`df` is reused, so after this cell it holds the sell statistics)
df = pd.read_pickle('signal_stat/sell_stat_1h.pkl')
# dataset for model train
train_sell = create_train_df(df, 'sell', configs, target_offset, first, last, step)
# drop signals for which some candle value or the target is missing
train_sell = train_sell.dropna()

# merge buy and sell samples into one chronologically ordered dataset
train_df = pd.concat([train_buy, train_sell]).sort_values('time').reset_index(drop=True)
# `display` is the notebook (IPython) helper, it is not imported explicitly in this cell
display(train_df.head())
display(train_df.shape)


  0%|          | 0/344 [00:00<?, ?it/s]

  0%|          | 0/317 [00:00<?, ?it/s]

Unnamed: 0,time,open,high,low,close,volume,rsi,stoch_slowk,stoch_slowd,stoch_slowk_dir,...,linear_reg_prev_48,linear_reg_angle_prev_48,macd_prev_48,macdsignal_prev_48,macdhist_prev_48,macd_dir_prev_48,macdsignal_dir_prev_48,atr_prev_48,close_smooth_prev_48,target
0,2023-02-10 06:00:00,0.1702,0.1704,0.1688,0.1689,317715.0,26.327542,6.603062,6.975576,0.082533,...,38.615341,24.261279,0.001298,0.001086,0.000212,0.0,0.032781,0.001836,0.194158,0.1753
1,2023-02-10 06:00:00,0.1702,0.1704,0.1688,0.1689,317715.0,26.327542,6.603062,6.975576,0.082533,...,38.615341,24.261279,0.001298,0.001086,0.000212,0.0,0.032781,0.001836,0.194158,0.1753
2,2023-02-13 09:00:00,0.1722,0.1726,0.1721,0.1722,72840.0,31.468613,12.139043,9.610858,0.360603,...,33.009032,26.69252,0.000263,-0.000127,0.00039,0.481678,-0.360744,0.00204,0.172821,0.1686
3,2023-02-13 09:00:00,0.1722,0.1726,0.1721,0.1722,72840.0,31.468613,12.139043,9.610858,0.360603,...,33.009032,26.69252,0.000263,-0.000127,0.00039,0.481678,-0.360744,0.00204,0.172821,0.1686
4,2023-02-13 23:00:00,0.1662,0.1669,0.166,0.1667,273771.0,35.098695,17.575718,14.569659,0.119088,...,38.832289,24.306479,0.000472,0.000459,1.3e-05,0.0,0.014491,0.001569,0.174871,0.1792


(4534, 264)

# Select features

In [2]:
import lightgbm as lgb
from sklearn.metrics import log_loss

from eli5.sklearn import PermutationImportance
from shaphypetune import BoostBoruta

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

from sklearn.metrics import log_loss, mean_squared_error, accuracy_score

import warnings
warnings.filterwarnings('ignore')

from colorama import Style, Fore

# hex colour palette (not referenced by any code visible in this notebook export)
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

# colorama codes used to colour the training log prints
blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
# reset code; NOTE: the name `res` is rebound to a ranking DataFrame by a later cell
res = Style.RESET_ALL

class CFG:
    # cross-validation settings read by lgbm_tuning
    # number of times the whole cross-validation is repeated
    n_repeats = 1
    # number of folds (n_splits) of each cross-validation
    n_folds = 5

def lgbm_tuning(df, permut=False, boruta=False):
    ''' Cross-validate a LightGBM model on df and accumulate feature importances.

        Reads the module-level `task_type`, `params` and `CFG`:
        task_type == 'cls' trains LGBMClassifier on the binary target `target >= close`
        (StratifiedGroupKFold, scored with log loss); any other value trains LGBMRegressor
        on the relative change `(target - close) / close` (GroupKFold, scored with RMSE).

        df: dataset with 'target', 'close' and 'ticker' columns; every column except
            'time', 'target', 'ticker' and 'pattern' is used as a feature and
            'ticker' is the cross-validation group
        permut: also accumulate eli5 permutation importance computed on the validation folds
        boruta: also accumulate BoostBoruta feature rankings

        Returns (perm_df_, feature_importances_, boruta_df_, mean out-of-fold score).
        The permutation / Boruta frames stay empty when their flag is off.
        Importances are summed (not averaged) over folds and repeats. '''
    # Colour codes are built locally: the module-level `res` is rebound to a DataFrame
    # by the ranking cell, which would corrupt these prints when the function is re-run.
    c_blue = Style.BRIGHT + Fore.BLUE
    c_red = Style.BRIGHT + Fore.RED
    c_reset = Style.RESET_ALL

    is_cls = task_type == 'cls'
    metric_label = 'log_loss' if is_cls else 'rmse'

    features = [c for c in df.columns if c not in ['time', 'target', 'ticker', 'pattern']]
    groups = df['ticker']

    # target and LightGBM eval metric do not change between repeats
    if is_cls:
        y = df['target'] >= df['close']
        eval_metric = 'logloss'
    else:
        y = (df['target'] - df['close']) / df['close']
        eval_metric = 'mse'
    X = df[features]

    outer_cv_score = [] # store all cv scores of outer loop inference

    perm_df_ = pd.DataFrame()
    feature_importances_ = pd.DataFrame()
    boruta_df_ = pd.DataFrame()

    for i in range(CFG.n_repeats):
        print(f'Repeat {c_blue}#{i+1}{c_reset}')

        # NOTE(review): random_state is constant, so every repeat re-creates the same splits
        if is_cls:
            kf = StratifiedGroupKFold(n_splits=CFG.n_folds, shuffle=True, random_state=180820231)
        else:
            kf = GroupKFold(n_splits=CFG.n_folds)

        oof = np.zeros(len(df))

        # all rows of one ticker always land in the same fold
        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y, groups)):
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]

            if is_cls:
                clf = lgb.LGBMClassifier(**params)
            else:
                clf = lgb.LGBMRegressor(**params)
            clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                    eval_metric=eval_metric,
                    callbacks=[lgb.log_evaluation(100)])

            if is_cls:
                val_preds = clf.predict_proba(X_val)[:,1]
                val_score = log_loss(y_val, val_preds)
            else:
                val_preds = clf.predict(X_val)
                # RMSE computed explicitly: the `squared` argument is deprecated in scikit-learn
                val_score = np.sqrt(mean_squared_error(y_val, val_preds))

            oof[val_idx] = val_preds
            best_iter = clf.best_iteration_

            # str() keeps the alignment working even if best_iteration_ is not a number
            print(f'Fold: {c_blue}{fold + 1:>3}{c_reset}| loss: {c_blue}{val_score:.5f}{c_reset}| Best iteration: {c_blue}{str(best_iter):>4}{c_reset}')

            # permutation importance
            if permut:
                perm = PermutationImportance(clf, scoring=None, n_iter=1,
                                             random_state=42, cv=None, refit=False).fit(X_val, y_val)

                # sorted by feature name so that folds can be summed index-wise
                perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_},
                                                    index=X_val.columns).sort_index()

                if perm_df_.shape[0] == 0:
                    perm_df_ = perm_importance_df.copy()
                else:
                    perm_df_ += perm_importance_df

            # gboost feature importance
            # (rows are ordered by feature name, reversed, so that 'Value' of
            #  different folds can be summed position-wise)
            f_i = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns),
                                      reverse=True, key=lambda x: x[1]),
                                columns=['Value','Feature'])

            if feature_importances_.shape[0] == 0:
                feature_importances_ = f_i.copy()
            else:
                feature_importances_['Value'] += f_i['Value']

            # BORUTA importance
            if boruta:
                model = BoostBoruta(clf, importance_type='shap_importances', train_importance=False)
                try:
                    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                            eval_metric=eval_metric,
                            callbacks=[lgb.log_evaluation(100)])
                except RuntimeError:
                    # best effort: a fold on which Boruta fails adds nothing to the ranking
                    pass
                else:
                    boruta_importance_df = pd.DataFrame({'importance': model.ranking_},
                                                            index=X_train.columns).sort_index()
                    if boruta_df_.shape[0] == 0:
                        boruta_df_ = boruta_importance_df.copy()
                    else:
                        boruta_df_ += boruta_importance_df

        # score of the whole out-of-fold prediction of this repeat
        if is_cls:
            outer_cv = log_loss(y, oof)
        else:
            outer_cv = np.sqrt(mean_squared_error(y, oof))

        outer_cv_score.append(outer_cv)

    print(f'{c_red} Outer Holdout avg score: {c_reset} {metric_label}: {c_red}{np.mean(outer_cv_score):.5f}{c_reset}')
    print(f'{"*" * 50}\n')

    if permut:
        perm_df_ = perm_df_.sort_values('importance', ascending=False)
        perm_df_ = perm_df_.reset_index().rename({'index': 'Feature'}, axis=1)

    if boruta:
        # Boruta accumulates rankings, so the smallest sum goes first
        boruta_df_ = boruta_df_.sort_values('importance')
        boruta_df_ = boruta_df_.reset_index().rename({'index': 'Feature'}, axis=1)

    feature_importances_ = feature_importances_.sort_values('Value', ascending=False).reset_index(drop=True)

    return perm_df_, feature_importances_, boruta_df_, np.mean(outer_cv_score)


# LightGBM keyword arguments; read as a module-level global by lgbm_tuning
# NOTE(review): the captured log below reports "Best iteration: 0" after all 1000 rounds;
# LightGBM documents early stopping as unavailable in dart mode — confirm that
# 'early_stopping_round' has any effect for the 'cls' setup
params = {
          'n_estimators': 1000,
          'learning_rate': 0.01,
          'early_stopping_round': 100,
          'max_depth': 7,
          'subsample' : 0.8,
          'colsample_bytree': 0.75,
          'num_leaves': 32,
          'verbosity': -1,
          'importance_type': 'gain'
        }

# 'cls' - classification (target >= close), any other value - regression;
# read as a module-level global by lgbm_tuning
task_type = 'cls'

# booster and objective depend on the task type
if task_type == 'cls':
    params['boosting_type'] = 'dart'
    params['objective'] = 'binary'
else:
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'

# run the cross-validation with permutation and Boruta importances enabled
perm_df_, feature_importances_, boruta_df_, outer_cv_score = lgbm_tuning(train_df, permut=True, boruta=True)

# perm_df_['rank'] = perm_df_['importance'].rank(ascending=False)
# boruta_df_['rank'] = boruta_df_['importance'].rank()
# feature_importances_['rank'] = feature_importances_['Value'].rank(ascending=False)

# res = pd.concat([perm_df_[['Feature','rank']], boruta_df_[['Feature','rank']], feature_importances_[['Feature','rank']]])
# res.groupby('Feature')['rank'].sum().sort_values().head(30).index.to_list()

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

Repeat [1m[34m#1
[100]	training's binary_logloss: 0.620645	valid_1's binary_logloss: 0.673693
[200]	training's binary_logloss: 0.586947	valid_1's binary_logloss: 0.66539
[300]	training's binary_logloss: 0.552079	valid_1's binary_logloss: 0.662331
[400]	training's binary_logloss: 0.521368	valid_1's binary_logloss: 0.661721
[500]	training's binary_logloss: 0.489789	valid_1's binary_logloss: 0.660274
[600]	training's binary_logloss: 0.478111	valid_1's binary_logloss: 0.659816
[700]	training's binary_logloss: 0.454773	valid_1's binary_logloss: 0.657408
[800]	training's binary_logloss: 0.440695	valid_1's binary_logloss: 0.657467
[900]	training's binary_logloss: 0.421117	valid_1's binary_logloss: 0.656943
[1000]	training's binary_logloss: 0.4065	valid_1's binary_logloss: 0.65636
Fold: [1m[34m  1[0m| loss: [1m[34m0.65636[0m| Best iteration: [1m[34m   0[0m
[100]	training's binary_logloss: 0.62404	valid_1's binary_logloss: 0.664944
[200]	training's binary_logloss: 0.592152	valid_1's 

In [1]:
# Rank features inside each importance table (rank 1 = most important):
# permutation and gain importances - the larger the better,
# Boruta ranking - the smaller the better
perm_df_['rank'] = perm_df_['importance'].rank(ascending=False)
boruta_df_['rank'] = boruta_df_['importance'].rank()
feature_importances_['rank'] = feature_importances_['Value'].rank(ascending=False)

# Stack the three rankings; the summed rank per feature is the final score (lower is better).
# NOTE: this rebinds the module-level `res`, which earlier held the colorama reset code
# (Style.RESET_ALL); model_train reads `res` as this ranking frame.
res = pd.concat([perm_df_[['Feature','rank']], boruta_df_[['Feature','rank']], feature_importances_[['Feature','rank']]])
# 30 features with the lowest summed rank
res.groupby('Feature')['rank'].sum().sort_values().head(30)

NameError: name 'perm_df_' is not defined

# Train model with selected features

In [None]:
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression, LinearRegression


def model_train(df, task_type, how, n_folds):
    ''' Train one model per cross-validation fold and collect out-of-fold predictions.

        The feature set is the 30 features with the lowest summed rank in the
        module-level `res` ranking frame, plus one-hot encoded 'pattern' columns.
        LightGBM models are built from the module-level `params`.

        df: dataset with 'target', 'close', 'ticker', 'pattern' and the selected features
        task_type: 'cls' - binary target `target >= close` (StratifiedGroupKFold),
                   any other value - relative change `(target - close) / close` (GroupKFold)
        how: 'lgbm' (LightGBM) or 'lreg' (logistic / linear regression on scaled features)
        n_folds: number of cross-validation folds, grouped by ticker

        Returns (oof, models): oof has shape (n_samples, 1) and holds the positive class
        probability ('cls') or the predicted value; models holds the fitted model of every fold.
        Raises ValueError if `how` is not one of the supported values. '''
    if how not in ('lgbm', 'lreg'):
        raise ValueError(f"how must be 'lgbm' or 'lreg', got {how!r}")

    is_cls = task_type == 'cls'

    oof = np.zeros([df['target'].shape[0], 1])
    # features = [c for c in df.columns if c not in ['time', 'target', 'ticker', 'pattern']]
    features = res.groupby('Feature')['rank'].sum().sort_values().head(30).index.to_list()

    X, groups = df[features], df['ticker']
    X = pd.concat([X, pd.get_dummies(df['pattern'], drop_first=True)], axis=1)

    if is_cls:
        y = df['target'] >= df['close']
        kf = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=180820231)
    else:
        y = (df['target'] - df['close']) / df['close']
        kf = GroupKFold(n_splits=n_folds)

    # encode tickers as numbers; splitters expect groups of shape (n_samples,), hence ravel()
    oe_enc = OrdinalEncoder()
    groups = oe_enc.fit_transform(groups.values.reshape(-1, 1)).ravel()

    print(f"Training with {len(features)} features")

    if how == 'lreg':
        # NOTE(review): the scaler is fitted on the whole dataset before splitting,
        # so validation folds influence the scaling statistics
        scaler = StandardScaler()
        X[X.columns] = scaler.fit_transform(X)

    # created once, outside the fold loop, so that every fold's model is kept
    models = []

    for fit_idx, val_idx in kf.split(X, y, groups):
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        if how == 'lgbm':
            if is_cls:
                model = lgb.LGBMClassifier(**params)
                eval_metric = 'logloss'
            else:
                model = lgb.LGBMRegressor(**params)
                eval_metric = 'mse'
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      eval_metric=eval_metric, callbacks=[lgb.log_evaluation(100)])
        else:
            if is_cls:
                model = LogisticRegression(C=0.1, max_iter=100000)#, class_weight='balanced')
            else:
                model = LinearRegression(positive=True)
            model.fit(X_train, y_train)

        if is_cls:
            val_preds = model.predict_proba(X_val)
            val_score = log_loss(y_val, val_preds)
            acc_score = confident_accuracy_score(y_val, val_preds[:,1])
            print(f'Logloss: {val_score}, Confident objects accuracy: {acc_score}')
            oof[val_idx, 0] = val_preds[:,1]
        else:
            val_preds = model.predict(X_val)
            # RMSE computed explicitly: the `squared` argument is deprecated in scikit-learn
            val_score = np.sqrt(mean_squared_error(y_val, val_preds))
            print(f'RMSE: {val_score}')
            oof[val_idx, 0] = val_preds

        models.append(model)

    return oof, models

def confident_accuracy_score(y, oof, low_bound=0.35, high_bound=0.65):
    ''' Accuracy computed only on the objects the model is confident about:
        predicted probability below low_bound (counted as class 0)
        or above high_bound (counted as class 1) '''
    # objects outside of the [low_bound, high_bound] interval
    is_confident = (oof < low_bound) | (oof > high_bound)

    # hard labels: 1 above the high bound, 0 below the low bound
    hard_labels = np.zeros_like(oof)
    hard_labels[oof > high_bound] = 1
    hard_labels[oof < low_bound] = 0

    confident_preds = hard_labels[is_confident]
    confident_truth = y.values.reshape(-1,1)[is_confident]

    return accuracy_score(confident_truth, confident_preds)

# task_type = 'reg'
task_type = 'cls'

# NOTE: `params` is a module-level name that model_train reads for its LightGBM models;
# the assignment below replaces the dict used for the feature selection run above

# best params for classification
if task_type == 'cls':
    params = {
            'boosting_type': 'dart',
            'n_estimators': 800,
            'learning_rate': 0.02,
            #   'early_stopping_round': 50,
            'max_depth': 10,
            'colsample_bytree': 0.8,
            'subsample': 0.9,
            'subsample_freq': 1,
            'num_leaves': 26,
            'verbosity': -1,
            'max_bin': 255,
            'reg_alpha': 1e-5,
            'reg_lambda': 1e-7,
            'objective': 'binary'
            }
else:
    # best params for regression
    params = {
            'boosting_type': 'gbdt',
            'n_estimators': 1000,
            'learning_rate': 0.021,
            'early_stopping_round': 100,
            'max_depth': 7,
            'colsample_bytree': 0.75,
            'subsample': 0.8,
            'subsample_freq': 1,
            'num_leaves': 27,
            'verbosity': -1,
            'max_bin': 511,
            'reg_alpha': 1e-4,
            'reg_lambda': 1e-4,
            'objective': 'regression'
            }

# trailing numbers are scores recorded by the author for each variant
oof, models = model_train(train_df, task_type=task_type, how='lgbm', n_folds=5) # 0.061096263508601985 / logloss: 0.6226973017816237 acc: 0.6424792139077853
# oof, models = model_train(train_df, task_type=task_type, how='lreg', n_folds=5) # 0.06958035063954768 / logloss: 0.6985539186132326 acc: 0.587892049598833

# score the out-of-fold predictions of the whole dataset
if task_type == 'cls':
    # same binary target as inside model_train
    y = train_df['target'] >= train_df['close']
    # probability bounds outside of which a prediction counts as confident
    low_bound, high_bound = 0.35, 0.65
    display(log_loss(y, oof))
    display(confident_accuracy_score(y, oof, low_bound, high_bound))
else:
    # same relative-change target as inside model_train
    y = (train_df['target'] - train_df['close']) / train_df['close']
    display(mean_squared_error(y, oof, squared=False))


Training with 30 features
[100]	valid_0's binary_logloss: 0.655505
[200]	valid_0's binary_logloss: 0.645369
[300]	valid_0's binary_logloss: 0.635218
[400]	valid_0's binary_logloss: 0.628613
[500]	valid_0's binary_logloss: 0.624361
[600]	valid_0's binary_logloss: 0.621881
[700]	valid_0's binary_logloss: 0.621013
[800]	valid_0's binary_logloss: 0.620899
Logloss: 0.6208992424519378, Confident objects accuracy: 0.7933333333333333
[100]	valid_0's binary_logloss: 0.654601
[200]	valid_0's binary_logloss: 0.644081
[300]	valid_0's binary_logloss: 0.637742
[400]	valid_0's binary_logloss: 0.635968
[500]	valid_0's binary_logloss: 0.634911
[600]	valid_0's binary_logloss: 0.634532
[700]	valid_0's binary_logloss: 0.63317
[800]	valid_0's binary_logloss: 0.632782
Logloss: 0.6327821318886288, Confident objects accuracy: 0.7518987341772152
[100]	valid_0's binary_logloss: 0.65499
[200]	valid_0's binary_logloss: 0.646602
[300]	valid_0's binary_logloss: 0.640441
[400]	valid_0's binary_logloss: 0.635496
[500

0.6226973017816237

0.7686973749380882