In [1]:
import numpy as np
import collections
import pandas as pd
import xgboost as xgb
from vecstack import stacking
from lucrum.algo import pyta
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import lucrum.datareader.dataconst as dcons
from sklearn.metrics import precision_score
from sklearn.naive_bayes import GaussianNB
from lucrum.algo.stackedclf import StackedClf, SklearnHelper
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)

In [2]:
# Read the tab-separated candle data (forcing "open" to float) and discard
# the "Unnamed: 0" column in the same expression, then preview the first rows.
load_data = (
    pd.read_csv('dataxrp.csv', sep='\t', dtype={"open": float})
    .drop(columns=["Unnamed: 0"])
)
display(load_data.head())

Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume
0,2018-05-05 02:00:00+02:00,0.8898,0.9,0.888,0.89766,2018-05-05 02:14:59.999000+02:00,274,159373.17
1,2018-05-05 02:15:00+02:00,0.89766,0.9046,0.89601,0.90388,2018-05-05 02:29:59.999000+02:00,290,243422.1
2,2018-05-05 02:30:00+02:00,0.90388,0.9046,0.8954,0.90391,2018-05-05 02:44:59.999000+02:00,206,126523.8
3,2018-05-05 02:45:00+02:00,0.9039,0.905,0.8952,0.89644,2018-05-05 02:59:59.999000+02:00,295,121757.09
4,2018-05-05 03:00:00+02:00,0.89643,0.89922,0.89218,0.8947,2018-05-05 03:14:59.999000+02:00,251,146653.0


In [3]:
# Feature generation: builds the moving-average difference features
# ("ma_0" .. "ma_<lag_delay>") and the shifted "crossover" label on `df`.

# Work on a copy so that `load_data` itself is never modified and this
# cell always starts again from the loaded frame.
df = load_data.copy()

# seed intended to make results reproducible
# NOTE(review): it is only defined here; nothing in this cell reads it.
seed=56

# moving average types: names of the lead and lag indicators
lead_ma = "ema"
lag_ma = "sma"

# hyperparameters
lead_timeperiod = 5  # time period passed to pyta.crossover for the lead indicator
lag_timeperiod = 8  # time period passed to pyta.crossover for the lag indicator
lag_delay = 6  # number of lagged columns used per moving average
crossover_distance = 1  # how many rows the crossover label is shifted towards earlier rows

# generate crossover indicators
# NOTE(review): presumably this adds the two moving-average columns
# ("ema5", "sma8") and a "crossover" column to `df` in place -- confirm
# against lucrum.algo.pyta; the code below reads all three from `df`.
pyta.crossover(data=df,
               indicator_a=lead_ma,
               indicator_a_time=lead_timeperiod,
               indicator_b=lag_ma,
               indicator_b_time=lag_timeperiod)

# now generate lagged moving average indicators
# for the lead moving average (column name = indicator name + time period)
lead_ma_col = lead_ma + str(lead_timeperiod)

# keep reference to moving average column names
ma_cols = [lead_ma_col]
# NOTE(review): lag_col presumably adds "<col>_lag_<k>" columns to `df` and
# returns their names (the loop further down relies on that naming) -- confirm.
ma_cols += pyta.lag_col(data=df,
                        lag=lag_delay,
                        col=lead_ma_col,
                        loc_offset=1)

# same steps for the lag moving average
lag_ma_col = lag_ma + str(lag_timeperiod)
ma_cols.append(lag_ma_col)
ma_cols += pyta.lag_col(data=df,
                        lag=lag_delay,
                        col=lag_ma_col,
                        loc_offset=1)

# normalize row-wise: every moving average value is divided by the sum of
# all moving average columns of the same row
df[ma_cols] = df[ma_cols].div(df[ma_cols].sum(axis=1), axis=0)

# keep reference to difference column names
# as they will be used as features to train our model
ma_diff_cols = []

# feature reduction: subtract pairs (lag - lead, i.e. sma - ema), one pair
# per lag step, turning two columns into a single difference column
for i in range(0, lag_delay + 1):
    
    # i == 0 refers to the unlagged column, otherwise to "<col>_lag_<i>"
    tmp_lag_col = lag_ma_col if i==0 else lag_ma_col + "_lag_" + str(i)
    tmp_lead_col = lead_ma_col if i==0 else lead_ma_col + "_lag_" + str(i)
    
    # calculate the difference and insert it just before the last column
    ma_diff = df[tmp_lag_col] - df[tmp_lead_col]
    tmp_col = "ma_" + str(i)
    df.insert(df.shape[1] - 1, tmp_col, ma_diff)
    ma_diff_cols.append(tmp_col)

# shift the crossover/outcome column so that each row carries the crossover
# value found `crossover_distance` rows later (the model is trained to
# classify that upcoming crossover)
# the rows left empty by the shift are filled with -1 so the column can be cast to int
# NOTE(review): fillna(-1) applies to every NaN in the column, while only the
# trailing `crossover_distance` rows are removed afterwards.
crossover = df["crossover"].shift(-crossover_distance).fillna(-1).astype(int).copy()
df["crossover"] = crossover
df.drop(df.tail(crossover_distance).index, inplace=True) # remove the trailing rows whose label was filled with -1

In [4]:
# splitting data for training and test 
def train_test_split_ts(data, test_size):
    """Split an ordered dataset into a leading training part and a trailing test part.

    The split is purely positional and never shuffles: the last
    ``int(len(data) * test_size)`` rows become the test set and all rows
    before them become the training set, so the two parts never overlap.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the order they should be split.
    test_size : float
        Fraction of rows assigned to the test set; must lie in [0, 1].

    Returns
    -------
    tuple
        ``(training, test)``, each an independent copy of its rows.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1]. Without this check a value above 1
        produced a negative training count, which silently returned
        overlapping training and test rows.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got {!r}".format(test_size))

    total_samples = data.shape[0]
    test_samples = int(total_samples * test_size)
    training_samples = total_samples - test_samples

    # Slice at a single boundary so both parts are complementary by construction.
    training = data.iloc[:training_samples].copy()
    test = data.iloc[training_samples:].copy()
    return training, test

# Keep the last 30% of the rows as the test set; everything before them
# is used for training.
test_size = 0.30
df_training, df_test = train_test_split_ts(df, test_size)

In [5]:
# Class-imbalance check before training the crossover model: label counts
# of the training split first, then of the test split.
print(df_training["crossover"].value_counts(),
      df_test["crossover"].value_counts(),
      sep="\n")

0    19721
1     3552
Name: crossover, dtype: int64
0    8443
1    1530
Name: crossover, dtype: int64


In [6]:
# Separate features and label for both splits: the moving-average
# difference columns are the model inputs, "crossover" is the target.
X_train, y_train = df_training[ma_diff_cols], df_training["crossover"]
X_test, y_test = df_test[ma_diff_cols], df_test["crossover"]

# TODO: oversampling of the training set is not applied yet; it probably
# needs different steps (e.g. try it before the preprocessing):
# X_train_os, y_train_os = SMOTE().fit_resample(X_train, y_train)

In [7]:
# Baseline comparison: fit each candidate classifier on the training split
# and print, for the test split, the precision on the positive class
# (crossover == 1) followed by the distribution of predicted labels.
#
# The random forest and gradient boosting models previously had no
# random_state, so their scores changed on every run even though `seed`
# is defined "to reproduce results"; they now use it. The xgboost and
# lightgbm models keep their existing random_state=0.
candidates = [
    RandomForestClassifier(n_jobs=-1,
                           n_estimators=100,
                           criterion="gini",
                           max_depth=15,
                           min_samples_leaf=1,
                           max_features="sqrt",
                           random_state=seed),

    GradientBoostingClassifier(n_estimators=100,
                               max_depth=5,
                               min_samples_leaf=1,
                               max_features="sqrt",
                               random_state=seed),

    xgb.XGBClassifier(random_state=0,
                      n_jobs=-1,
                      learning_rate=0.3,
                      n_estimators=100,
                      max_depth=10),

    LGBMClassifier(random_state=0,
                   n_jobs=-1,
                   learning_rate=0.2,
                   n_estimators=100,
                   max_depth=15),
]

# One shared fit/predict/score stanza instead of four copies. The loop runs
# at top level on purpose: `clf`, `y_pred` and `score` stay bound to the
# last model (LightGBM) afterwards, exactly as before.
for clf in candidates:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = precision_score(y_test, y_pred, pos_label=1, average="binary")
    print(score)
    print(collections.Counter(y_pred))

0.7128205128205128
Counter({0: 9388, 1: 585})
0.7019667170953101
Counter({0: 9312, 1: 661})
0.6062917063870352
Counter({0: 8924, 1: 1049})
0.6465781409601634
Counter({0: 8994, 1: 979})


In [8]:
# Stacked ensemble: four base classifiers feed their predicted
# probabilities into a logistic-regression meta model.
#
# The random forest and gradient boosting base models previously had no
# random_state, which made the stacked result differ between runs; they now
# use `seed`. The xgboost and lightgbm models keep random_state=0.
models = [
    RandomForestClassifier(n_jobs=-1,
                           n_estimators=100,
                           criterion="gini",
                           max_depth=15,
                           min_samples_leaf=1,
                           max_features="sqrt",
                           random_state=seed),
    
    xgb.XGBClassifier(random_state=0, 
                      n_jobs=-1, 
                      learning_rate=0.3, 
                      n_estimators=100, 
                      max_depth=10),
    
    LGBMClassifier(random_state=0, 
                     n_jobs=-1, 
                     learning_rate=0.2, 
                     n_estimators=100, 
                     max_depth=15),
    
    GradientBoostingClassifier(n_estimators=200,
                           max_depth=3,
                           min_samples_leaf=1,
                           max_features="sqrt",
                           random_state=seed)
]

# First level: S_train holds the out-of-fold predictions for the training
# rows; S_test holds the predictions of each model refitted on the full
# training set (mode 'oof_pred'). needs_proba=True makes the base models
# emit class probabilities instead of labels. The metric is left at the
# default, which the run log reports as log_loss.
S_train, S_test = stacking(models, 
                           X_train, 
                           y_train, 
                           X_test,   
                           regression=False, 
                           mode='oof_pred', 
                           save_dir=None, 
                           #metric=precision_score, 
                           n_folds=5, 
                           stratified=True,
                           shuffle=False,  
                           needs_proba=True,
                           verbose=2)

# Second level: logistic regression trained on the stacked probabilities.
meta_model = LogisticRegression(penalty="l2",
                                solver="lbfgs",
                                max_iter=1000,
                                n_jobs=-1).fit(S_train, y_train)

# Evaluate the ensemble the same way as the single models: precision on the
# positive class plus the distribution of predicted labels.
y_pred = meta_model.predict(S_test)
score = precision_score(y_test, y_pred, pos_label=1, average="binary")
print(score)
print(collections.Counter(y_pred))

task:         [classification]
n_classes:    [2]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [4]

model  0:     [RandomForestClassifier]
    fold  0:  [0.28638152]
    fold  1:  [0.28870036]
    fold  2:  [0.29040325]
    fold  3:  [0.29567340]
    fold  4:  [0.30792970]
    ----
    MEAN:     [0.29381765] + [0.00769070]
    FULL:     [0.29381679]

    Fitting on full train set...

model  1:     [XGBClassifier]
    fold  0:  [0.32149345]
    fold  1:  [0.30734637]
    fold  2:  [0.33579404]
    fold  3:  [0.32396978]
    fold  4:  [0.35347263]
    ----
    MEAN:     [0.32841525] + [0.01545244]
    FULL:     [0.32841375]

    Fitting on full train set...

model  2:     [LGBMClassifier]
    fold  0:  [0.28500545]
    fold  1:  [0.28296854]
    fold  2:  [0.29903503]
    fold  3:  [0.29739868]
    fold  4:  [0.30776734]
    ----
    MEAN:     [0.29443501] + [0.00925301]
    FULL:     [0.29443370]

    Fitting on full train set...

model  3:     [GradientBoostingClassif