# Load data and add indicators

In [4]:
import sys
sys.path.append('..')

from os import environ
import pandas as pd
from indicators import indicators
from datetime import timedelta
from tqdm.auto import tqdm
from config.config import ConfigFactory

# Set environment variable
# (assigned before the factory call below, because `environ` itself is handed to the factory)
environ["ENV"] = "1h_4h"

# Get configs
# NOTE(review): presumably the "ENV" value selects the 1h/4h configuration set —
# confirm against config.config.ConfigFactory
configs = ConfigFactory.factory(environ).configs

def get_file(ticker):
    ''' Load the pickled 1h and 4h dataframes of a ticker.

    File names can be in different formats, so the following stems are tried
    in order and the first one for which BOTH files exist wins:
      1. the ticker as is,
      2. the ticker split before its last four characters with a dash,
      3. the same dashed form with a "-SWAP" suffix.

    Returns a tuple (df_1h, df_4h), or (None, None) if no naming format
    yields both files.
    '''
    # Stems are built lazily, one per attempt, in the order they are tried.
    stem_builders = (
        lambda name: name,
        lambda name: f'{name[:-4]}-{name[-4:]}',
        lambda name: f'{name[:-4]}-{name[-4:]}-SWAP',
    )

    for build_stem in stem_builders:
        stem = build_stem(ticker)
        try:
            frame_1h = pd.read_pickle(f'../optimizer/ticker_dataframes/{stem}_1h.pkl')
            frame_4h = pd.read_pickle(f'../optimizer/ticker_dataframes/{stem}_4h.pkl')
        except FileNotFoundError:
            # at least one of the two files is missing for this format, try the next one
            continue
        return frame_1h, frame_4h

    return None, None

def add_indicators(df, ttype, configs):
    ''' Run the candle dataframe through every indicator and return the result.

    Each indicator class is instantiated with (ttype, configs) and its
    get_indicator() result replaces the dataframe before the next indicator runs.
    NOTE(review): the meaning of the ('', '', 0) arguments passed to
    get_indicator() is not visible here — confirm in the indicators module.
    '''
    # indicators are applied strictly in this order: RSI, STOCH, Trend, MACD, ATR
    for indicator_name in ('RSI', 'STOCH', 'Trend', 'MACD', 'ATR'):
        indicator = getattr(indicators, indicator_name)(ttype, configs)
        df = indicator.get_indicator(df, '', '', 0)
    return df

def create_train_df(df, ttype, configs, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data.

    For every signal (a row of `df`) the 1h candle at the signal time is taken,
    widened with the candles found `first`, `first + step`, ... hours before it
    (their columns get a `_prev_<hours>` suffix) and completed with a `target`
    column read `target_offset` hours after the signal.

    Parameters:
        df: signal statistics; the 'ticker', 'time' and 'pattern' columns are used.
        ttype: 'buy' takes the target from the 'high' column, any other value
            from the 'low' column.
        configs: passed through to add_indicators().
        target_offset: number of hours after the signal at which the target is read.
        first, last, step: look-back offsets in hours, i.e. range(first, last + step, step).

    Returns:
        DataFrame with one row per usable signal, or an empty DataFrame when no
        signal could be used. Tickers without candle files and signals whose own
        candle or any look-back candle is missing are skipped. A missing target
        candle leaves NaN in 'target', so callers still need dropna().
    '''
    collected = []

    for ticker in tqdm(df['ticker'].unique()):
        # get signals with current ticker
        signal_df = df[df['ticker'] == ticker]

        # load candle history of this ticker (only the 1h frame is used here)
        tmp_df_1h, _ = get_file(ticker)
        if tmp_df_1h is None:
            # get_file() found no candle files in any naming format
            continue

        # add indicators
        tmp_df_1h = add_indicators(tmp_df_1h, ttype, configs)
        target_col = 'high' if ttype == 'buy' else 'low'

        signals = zip(signal_df['time'].to_list(), signal_df['pattern'].to_list())
        for t, pattern in signals:
            row = tmp_df_1h.loc[tmp_df_1h['time'] == t, :].reset_index(drop=True)
            if row.empty:
                # no candle at the signal time, nothing to build a sample from
                continue

            # add historical data for current signal
            history_complete = True
            for hours_back in range(first, last + step, step):
                time_prev = t + timedelta(hours=-hours_back)
                row_tmp = tmp_df_1h.loc[tmp_df_1h['time'] == time_prev, :].reset_index(drop=True)
                if row_tmp.empty:
                    # a boolean-mask lookup never raises for a missing time,
                    # so the gap has to be detected explicitly
                    history_complete = False
                    break
                row_tmp.columns = [c + f'_prev_{hours_back}' for c in row_tmp.columns]
                # the first column of the look-back candle is dropped
                row = pd.concat([row, row_tmp.iloc[:, 1:]], axis=1)
                # assigned inside the loop on purpose: this keeps 'ticker' and
                # 'pattern' right after the first look-back block, i.e. the same
                # column order as in previously built datasets
                row['ticker'] = ticker
                row['pattern'] = pattern

            if not history_complete:
                continue

            # add target
            time_next = t + timedelta(hours=target_offset)
            target = tmp_df_1h.loc[tmp_df_1h['time'] == time_next, target_col].reset_index(drop=True)
            target.name = 'target'

            # add data to the dataset
            collected.append(pd.concat([row, target], axis=1))

    if not collected:
        return pd.DataFrame()
    # single concat instead of growing the frame signal by signal
    return pd.concat(collected)

# prediction horizon in hours: the target is taken this many hours after the signal
target_offset = 24
# nearest and farthest previous data points collected for model training
# (values are numbers of hours before the signal point)
first, last = 1, 10
# distance in hours between collected previous data points
# (total number of points to collect is (last - first + step) / step)
step = 1

# Buy: signal statistics -> train dataset without incomplete rows
df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
train_buy = create_train_df(df, 'buy', configs, target_offset, first, last, step).dropna()

# Sell: signal statistics -> train dataset without incomplete rows
df = pd.read_pickle('signal_stat/sell_stat_1h.pkl')
train_sell = create_train_df(df, 'sell', configs, target_offset, first, last, step).dropna()

# stack both datasets and order them by signal time
train_df = pd.concat([train_buy, train_sell])
train_df = train_df.sort_values('time').reset_index(drop=True)
display(train_df.head())
display(train_df.shape)


100%|██████████| 370/370 [00:56<00:00,  6.54it/s]
100%|██████████| 308/308 [00:30<00:00, 10.15it/s]


Unnamed: 0,time,open,high,low,close,volume,rsi,stoch_slowk,stoch_slowd,stoch_slowk_dir,...,linear_reg_prev_10,linear_reg_angle_prev_10,macd_prev_10,macdsignal_prev_10,macdhist_prev_10,macd_dir_prev_10,macdsignal_dir_prev_10,atr_prev_10,close_smooth_prev_10,target
0,2022-12-23 07:00:00,1.074,1.1,1.073,1.085,310312.7,65.341348,85.293979,87.119652,0.026489,...,29.684219,-15.006676,-0.006157,-0.004305,-0.001852,0.159254,0.135557,0.010157,1.04125,1.086
1,2022-12-23 07:00:00,0.693,0.777,0.691,0.768,516397.7,81.721385,85.531532,83.150654,0.067481,...,10.538362,-4.140549,-0.014886,-0.015304,0.000418,0.0,-0.010925,0.014147,0.67875,0.73
2,2022-12-23 19:00:00,0.05791,0.058277,0.05768,0.058167,395965.0,40.206455,18.989544,16.812634,0.026486,...,50.557651,-3.492108,-0.000819,-0.001134,0.000315,0.0,-0.065048,0.001013,0.059634,0.056878
3,2022-12-23 23:00:00,0.00272,0.00272,0.002702,0.002712,11043840.0,26.198774,8.825903,9.923913,0.045659,...,31.974021,-11.381398,-1.5e-05,-2.2e-05,6e-06,0.0,-0.063876,2.1e-05,0.002841,0.002773
4,2022-12-24 03:00:00,0.43,0.433,0.43,0.433,49209.85,44.025328,17.388802,12.18472,0.23539,...,14.829397,-7.155657,0.000279,0.000208,7.1e-05,-0.122295,1.215276,0.004542,0.43675,0.426


(5484, 224)

In [5]:
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Keyword arguments unpacked into lgb.LGBMClassifier by model_train().
# An explicit 'metric': 'multi_logloss' entry was tried and is currently left out.
params = dict(
    objective='multiclass',
    n_estimators=2000,
    learning_rate=0.11,
    early_stopping_round=50,
    max_depth=9,
)


def model_train(df, how, n_folds, stratified):
    ''' Cross-validate a 5-class price-move classifier and return out-of-fold probabilities.

    The label is the percentage move from 'close' to 'target', rounded to whole
    percents, clipped to [-2, 2] and shifted by 2, which gives classes 0..4.
    Folds are grouped by 'ticker', so one ticker never appears in both the
    training and the validation part of a fold.

    Parameters:
        df: dataset with 'time', 'target', 'ticker', 'pattern', 'close' and feature columns.
        how: 'lgbmc' (LightGBM classifier built from the module-level `params`)
            or 'logreg' (logistic regression on standardized features).
        n_folds: number of cross-validation folds.
        stratified: use StratifiedGroupKFold if true, plain GroupKFold otherwise.

    Returns:
        Array of shape (len(df), 5) with the out-of-fold class probabilities.

    Raises:
        ValueError: if `how` is not one of the supported model types.
    '''
    if how not in ('lgbmc', 'logreg'):
        # fail up front instead of hitting an unbound `model` inside the fold loop
        raise ValueError(f"Unsupported model type: {how!r}")

    n_classes = 5  # labels 0..4, see the clipping to [-2, 2] below
    oof = np.zeros([df.shape[0], n_classes])
    features = [c for c in df.columns if c not in ['time', 'target', 'ticker', 'pattern']]
    X, groups = df[features], df['ticker']
    # labels and pattern dummies are built from the `df` argument, not from a notebook global
    X = pd.concat([X, pd.get_dummies(df['pattern'], drop_first=True)], axis=1)
    y = np.clip(np.round((df['target'] - df['close']) / df['close'] * 100, 0), -2, 2) + 2
    # integer labels so that they can be used as column positions in `oof`
    y = y.astype(int)

    oe_enc = OrdinalEncoder()
    # the splitters expect one group label per sample, hence the flattening
    groups = oe_enc.fit_transform(groups.values.reshape(-1, 1)).ravel()

    if stratified:
        kf = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=180820231)
    else:
        kf = GroupKFold(n_splits=n_folds)

    print(f"Training with {len(features)} features")

    for fit_idx, val_idx in kf.split(X, y, groups):
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        if how == 'lgbmc':
            model = lgb.LGBMClassifier(**params)
            # NOTE(review): `verbose` as a fit() argument is kept as it was run;
            # newer LightGBM releases appear to expect
            # callbacks=[lgb.log_evaluation(100)] instead — confirm against the
            # installed version
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      eval_metric='multiclass', verbose=100)
        else:
            # fit the scaler on the training part only, so that validation
            # statistics do not leak into the model
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_val = scaler.transform(X_val)
            model = LogisticRegression(C=0.1, max_iter=100000)
            model.fit(X_train, y_train)

        val_preds = model.predict_proba(X_val)
        # predict_proba has one column per class seen in this fold's training
        # part; place each column at the position of its class label
        class_cols = np.asarray(model.classes_).astype(int)
        oof[np.ix_(val_idx, class_cols)] = val_preds
        val_score = log_loss(y_val, oof[val_idx], labels=np.arange(n_classes))
        print(val_score)

    return oof

# out-of-fold class probabilities of the LightGBM classifier
# (5 stratified folds grouped by ticker)
# value noted next to this call earlier (its meaning is not stated): 77.78546798415482
oof = model_train(train_df, how='lgbmc', n_folds=5, stratified=True)

# same labels as inside model_train(): percentage move from close to target,
# rounded to whole percents, limited to [-2, 2] and shifted to 0..4
y = (train_df['target'] - train_df['close']) / train_df['close'] * 100
y = y.round(0).clip(-2, 2) + 2
log_loss(y, oof)


Training with 220 features




1.411732392116155




1.4616621852036131




1.4583476779843323




1.3988915366877706




1.4348487278787105


1.4339585802642052

In [None]:
# 1.4339585802642052

In [7]:
# NOTE(review): neither `train_cols` nor `lr` is defined in the visible cells, and the
# recorded output of this cell is a NameError. Presumably this selects the feature columns
# whose fitted coefficient is positive — confirm where both names are meant to come from.
train_df[train_cols].loc[:,lr.coef_ > 0]

NameError: name 'train_cols' is not defined

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Binary target: 1 if the target price is above the open price of the signal candle, else 0.
# NOTE(review): `train_cols` is not defined in the visible cells — confirm where it is set.
X, y = train_df[train_cols], (train_df['target'] > train_df['open']).map({True: 1, False: 0})

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only and reuse its statistics for the
# validation part, so that validation data does not leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

logr = LogisticRegression(C=0.1, max_iter=100000, class_weight='balanced')
logr.fit(X_train, y_train)

preds = logr.predict(X_val)

# share of validation signals whose direction was predicted correctly
accuracy_score(y_val, preds)

0.618049225159526

In [33]:
# features and binary target (1 if the target price is above the open price, else 0)
X = train_df[train_cols]
y = (train_df['target'] > train_df['open']).map({True: 1, False: 0})

# random 80/20 hold-out split
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# LightGBM classifier with default settings; fit() hands back the fitted estimator
lgbc = lgb.LGBMClassifier().fit(X_train, y_train)

preds = lgbc.predict(X_val)
accuracy_score(y_val, preds)

0.715587967183227