In this notebook, the primary model is the one that sets a limit buy/sell order based on ATR and profit take/stop loss based on daily volatility.

In [1]:
import ccxt
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

bybit = ccxt.bybit({"apiKey":"", "secret":""})
bybit.set_sandbox_mode(True)

import math
from tqdm import tqdm


import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp
import seaborn as sns
import talib

from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit

In [70]:
def get_daily_vol(close, lookback=100):
    
    """
    This function is credited to Marcos Lopez de Prado. 
    
    """
    print('Calculating daily volatility for dynamic thresholds')
    
    df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    df0 = df0[df0 > 0]
    df0 = (pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:]))
        
    df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
    df1 = df0.ewm(span=lookback).std()
    return df1

In [72]:
def get_dollar_bars(time_bars, dollar_threshold): 
    
    """
    This function is credit to Max Bodoia and partially revised by Yusaku Fujii
    
    """
    
    # Make ohlcv df a dictionary
    time_bars = time_bars.to_dict('records') 
    
    # initialize an empty list of dollar bars
    dollar_bars = []

    # initialize the running dollar volume at zero
    running_volume = 0

    # initialize the running high and low with placeholder values
    running_high, running_low = 0, math.inf

    # for each time bar...
    for i in range(len(time_bars)):

        # get the timestamp, open, high, low, close, and volume of the next bar
        next_close, next_high, next_low, next_open, next_timestamp, next_volume,next_fee = [time_bars[i][k] for k in ['close', 'high', 'low', 'open', 'timestamp', 'volume','maker fee']]

        # get the midpoint price of the next bar (the average of the open and the close)
        midpoint_price = ((next_open) + (next_close))/2

        # get the approximate dollar volume of the bar using the volume and the midpoint price
        dollar_volume = next_volume * midpoint_price

        # update the running high and low
        running_high, running_low = max(running_high, next_high), min(running_low, next_low)

        # if the next bar's dollar volume would take us over the threshold...
        if dollar_volume + running_volume >= dollar_threshold:

            # set the timestamp for the dollar bar as the timestamp at which the bar closed (i.e. one minute after the timestamp of the last minutely bar included in the dollar bar)
            bar_timestamp = next_timestamp + timedelta(minutes=5)
            
            # add a new dollar bar to the list of dollar bars with the timestamp, running high/low, and next close
            dollar_bars += [{'timestamp': bar_timestamp, 'open': next_open, 'high': running_high, 'low': running_low, 'close': next_close,'maker fee':next_fee}]

            # reset the running volume to zero
            running_volume = 0

            # reset the running high and low to placeholder values
            running_high, running_low = 0, math.inf

        # otherwise, increment the running volume
        else:
            running_volume += dollar_volume

    # change the list of dollar bars to its dataframe
    dollar_bars = pd.DataFrame(dollar_bars)
    
    # set timestamp as index
    dollar_bars.set_index('timestamp',inplace=True)
    
    # remove the date that duplicates
    dollar_bars=dollar_bars.drop_duplicates()
    
    return dollar_bars


In [73]:
features = ['ADX', 'APO', 'AROONOSC', 'AROON_aroondown', 'AROON_aroonup', 'ATR',
       'BBANDS_lowerband', 'BBANDS_middleband', 'BBANDS_upperband', 'BETA',
       'CCI', 'DX', 'EMA', 'HT_TRENDMODE', 'KAMA', 'LINEARREG',
       'LINEARREG_ANGLE', 'LINEARREG_INTERCEPT', 'LINEARREG_SLOPE', 'MA',
       'MIDPOINT', 'MOM', 'RSI', 'STDDEV', 'STOCHF_fastk',
       'STOCH_slowd', 'STOCH_slowk', 'T3', 'TRIMA', 'ULTOSC', 'WILLR', 'WMA']

def calc_features(df):
    
    """
    This function is credited to richmanbtc.
    
    """
    open = df['open']
    high = df['high']
    low = df['low']
    close = df['close']
    #volume = df['volume']

    hilo = (df['high'] + df['low']) / 2
    df['BBANDS_upperband'], df['BBANDS_middleband'], df['BBANDS_lowerband'] = talib.BBANDS(close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
    df['BBANDS_upperband'] -= hilo
    df['BBANDS_middleband'] -= hilo
    df['BBANDS_lowerband'] -= hilo
    df['DEMA'] = talib.DEMA(close, timeperiod=30) - hilo
    df['EMA'] = talib.EMA(close, timeperiod=30) - hilo
    df['HT_TRENDLINE'] = talib.HT_TRENDLINE(close) - hilo
    df['KAMA'] = talib.KAMA(close, timeperiod=30) - hilo
    df['MA'] = talib.MA(close, timeperiod=30, matype=0) - hilo
    df['MIDPOINT'] = talib.MIDPOINT(close, timeperiod=14) - hilo
    df['SMA'] = talib.SMA(close, timeperiod=30) - hilo
    df['T3'] = talib.T3(close, timeperiod=5, vfactor=0) - hilo
    df['TEMA'] = talib.TEMA(close, timeperiod=30) - hilo
    df['TRIMA'] = talib.TRIMA(close, timeperiod=30) - hilo
    df['WMA'] = talib.WMA(close, timeperiod=30) - hilo

    df['ADX'] = talib.ADX(high, low, close, timeperiod=14)
    df['ADXR'] = talib.ADXR(high, low, close, timeperiod=14)
    df['APO'] = talib.APO(close, fastperiod=12, slowperiod=26, matype=0)
    df['AROON_aroondown'], df['AROON_aroonup'] = talib.AROON(high, low, timeperiod=14)
    df['AROONOSC'] = talib.AROONOSC(high, low, timeperiod=14)
    df['BOP'] = talib.BOP(open, high, low, close)
    df['CCI'] = talib.CCI(high, low, close, timeperiod=14)
    df['DX'] = talib.DX(high, low, close, timeperiod=14)
    df['MACD_macd'], df['MACD_macdsignal'], df['MACD_macdhist'] = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    # skip MACDEXT MACDFIX たぶん同じなので
    #df['MFI'] = talib.MFI(high, low, close, volume, timeperiod=14)
    df['MINUS_DI'] = talib.MINUS_DI(high, low, close, timeperiod=14)
    df['MINUS_DM'] = talib.MINUS_DM(high, low, timeperiod=14)
    df['MOM'] = talib.MOM(close, timeperiod=10)
    df['PLUS_DI'] = talib.PLUS_DI(high, low, close, timeperiod=14)
    df['PLUS_DM'] = talib.PLUS_DM(high, low, timeperiod=14)
    df['RSI'] = talib.RSI(close, timeperiod=14)
    df['STOCH_slowk'], df['STOCH_slowd'] = talib.STOCH(high, low, close, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    df['STOCHF_fastk'], df['STOCHF_fastd'] = talib.STOCHF(high, low, close, fastk_period=5, fastd_period=3, fastd_matype=0)
    df['STOCHRSI_fastk'], df['STOCHRSI_fastd'] = talib.STOCHRSI(close, timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
    df['TRIX'] = talib.TRIX(close, timeperiod=30)
    df['ULTOSC'] = talib.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28)
    df['WILLR'] = talib.WILLR(high, low, close, timeperiod=14)

    #df['AD'] = talib.AD(high, low, close, volume)
    #df['ADOSC'] = talib.ADOSC(high, low, close, volume, fastperiod=3, slowperiod=10)
    #df['OBV'] = talib.OBV(close, volume)

    df['ATR'] = talib.ATR(high, low, close, timeperiod=14)
    df['NATR'] = talib.NATR(high, low, close, timeperiod=14)
    df['TRANGE'] = talib.TRANGE(high, low, close)

    df['HT_DCPERIOD'] = talib.HT_DCPERIOD(close)
    df['HT_DCPHASE'] = talib.HT_DCPHASE(close)
    df['HT_PHASOR_inphase'], df['HT_PHASOR_quadrature'] = talib.HT_PHASOR(close)
    df['HT_SINE_sine'], df['HT_SINE_leadsine'] = talib.HT_SINE(close)
    df['HT_TRENDMODE'] = talib.HT_TRENDMODE(close)

    df['BETA'] = talib.BETA(high, low, timeperiod=5)
    df['CORREL'] = talib.CORREL(high, low, timeperiod=30)
    df['LINEARREG'] = talib.LINEARREG(close, timeperiod=14) - close
    df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(close, timeperiod=14)
    df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(close, timeperiod=14) - close
    df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(close, timeperiod=14)
    df['STDDEV'] = talib.STDDEV(close, timeperiod=5, nbdev=1)

    return df

In [87]:
def compute_primary_model_ret_buy(df,cond,high,low,sl,tp,fee):
    
    """
    tp:Upper
    sl:Lower
    
    """
    exit_price=sl.copy()
    exit_price[:]=np.nan
    
    for i in range(exit_price.size):
        for j in range(i+1,exit_price.size):
            
            if cond[i]==1:
                if low[j]<sl[j]:
                    exit_price[i]=sl[j]
                    break
                    
                    
                elif high[j]>tp[j]:
                    exit_price[i]=tp[j]
                    break
                                  
    df['exit_price_buy']=exit_price
    df['y_buy']=df['exit_price_buy']/df['buy_price']-1-2*fee
    df.loc[df['y_buy'].isna(),'y_buy']=0
    
    return df

In [88]:
def compute_primary_model_ret_sell(df,cond,high,low,sl,tp,fee):
    
    """
    tp:Lower
    sl:Upper
    
    ・現状、slが優先的に処理されている。tpを経由してからslに到達するprice pathを、
    　＋リターンとして評価していない点が、優れている。
    
    """
    exit_price=sl.copy()
    exit_price[:]=np.nan

    for i in range(exit_price.size):
        for j in range(i+1,exit_price.size):
            
            if cond[i]==1:
                if high[j]>sl[j]:
                    exit_price[i]=sl[j]
                    break
                    
                elif low[j]<tp[j]:
                    exit_price[i]=tp[j]
                    break

    df['exit_price_sell']=exit_price
    df['y_sell']=df['exit_price_sell']/df['sell_price']-1-2*fee
    df.loc[df['y_sell'].isna(),'y_sell']=0
    
    return df

In [67]:
df=pd.read_parquet('btcusdt_2102_raw')
df

Unnamed: 0_level_0,open,high,low,close,volume,maker fee
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-02-21 00:45:00,56123.0,56231.0,55868.0,55967.5,98.816,-0.00025
2021-02-21 01:00:00,55967.5,56266.0,55883.5,56238.5,96.312,-0.00025
2021-02-21 01:15:00,56238.5,56252.5,56023.5,56054.0,88.322,-0.00025
2021-02-21 01:30:00,56054.0,56080.0,55911.5,55997.5,91.413,-0.00025
2021-02-21 01:45:00,55997.5,56356.0,55985.5,56356.0,79.990,-0.00025
...,...,...,...,...,...,...
2022-06-16 01:30:00,22687.0,22706.0,22515.0,22542.0,190.518,0.00010
2022-06-16 01:45:00,22542.0,22600.0,22361.5,22372.5,101.782,0.00010
2022-06-16 02:00:00,22372.5,22600.0,22360.0,22505.5,45.235,0.00010
2022-06-16 02:15:00,22505.5,22600.0,22377.5,22407.0,137.157,0.00010


In [76]:
df=get_dollar_bars(time_bars=df.reset_index("timestamp"), dollar_threshold=df['close'].mean()*df['volume'].mean())
df['daily vol']=get_daily_vol(close=df['close'].drop_duplicates(), lookback=100)

Calculating daily volatility for dynamic thresholds


In [77]:
df=df.dropna()
df

Unnamed: 0_level_0,open,high,low,close,maker fee,daily vol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-02-22 01:35:00,56770.0,57127.0,56658.5,57015.5,-0.00025,0.003399
2021-02-22 01:50:00,57015.5,57062.0,56912.0,56985.0,-0.00025,0.002911
2021-02-22 02:05:00,56985.0,56985.0,56596.0,56780.5,-0.00025,0.004775
2021-02-22 02:20:00,56780.5,57141.5,56712.0,57071.5,-0.00025,0.004136
2021-02-22 02:35:00,57071.5,57313.5,57071.5,57159.0,-0.00025,0.004001
...,...,...,...,...,...,...
2022-06-16 00:20:00,22650.5,22694.0,22398.0,22555.0,0.00010,0.053747
2022-06-16 00:35:00,22555.0,22877.5,22464.5,22527.5,0.00010,0.054057
2022-06-16 01:05:00,22667.5,23032.5,22424.5,22951.5,0.00010,0.054733
2022-06-16 01:20:00,22951.5,23069.0,22600.0,22687.0,0.00010,0.055025


In [78]:
df=df.drop(index=df.loc[df.index.duplicated()].index)

In [79]:
df.index.duplicated().sum()

0

In [None]:
df=calc_features(df)
df=df.dropna()

In [82]:
limit_price_dist = df['ATR']*0.5
pips=0.5
limit_price_dist = np.maximum(1,(limit_price_dist / pips).round().fillna(1)) * pips

In [None]:
df.loc[:,'buy_price']=df.loc[:,'close']-limit_price_dist
df.loc[:,'sell_price']=df.loc[:,'close']+limit_price_dist

In [84]:
upper_bound=((df.loc[df.index[1]:,'close']*0.2*df.loc[df.index[1]:,'daily vol'])/pips).round()*pips
lower_bound=((df.loc[df.index[1]:,'close']*0.4*df.loc[df.index[1]:,'daily vol'])/pips).round()*pips*-1

In [None]:
df['Upper']=df['close']+upper_bound
df['Lower']=df['close']+lower_bound

In [None]:
df['buy_executed']=np.where(df['buy_price']>df['low'].shift(-1),1,0)
df['sell_executed']=np.where(df['sell_price']<df['high'].shift(-1),1,0)

In [None]:
df=compute_primary_model_ret_buy(df=df,cond=df['buy_executed'],high=df['high'],low=df['low'],sl=df['Lower'],tp=df['Upper'],fee=df['maker fee'])

In [None]:
df=compute_primary_model_ret_sell(df=df,cond=df['sell_executed'],high=df['high'],low=df['low'],sl=df['Upper'],tp=df['Lower'],fee=df['maker fee'])

In [92]:
df.loc[:,['y_buy','y_sell']]

Unnamed: 0_level_0,y_buy,y_sell
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-23 05:50:00,0.010622,0.009911
2021-02-23 06:20:00,0.005808,0.000000
2021-02-23 06:35:00,0.010303,0.000000
2021-02-23 07:05:00,0.000000,-0.007683
2021-02-23 07:35:00,0.001309,0.000000
...,...,...
2022-06-16 00:20:00,0.000000,-0.001669
2022-06-16 00:35:00,0.000000,-0.014471
2022-06-16 01:05:00,0.010174,0.000000
2022-06-16 01:20:00,0.007845,0.000000
