In [42]:
import pandas as pd
import numpy as np
from numpy import hstack
import warnings
from datetime import datetime
import import_ipynb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import talib as ta

from redis_cli import RedisClient

from tqdm import tqdm
tqdm.pandas()
warnings.simplefilter('ignore')

from file_loader import get_data, store_to_file
from features import FeatureEngineering
from data_loader import load_crypto_data, load_crypto_data2
import utils as ut

In [43]:
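# Optional Redis client. merge_and_store_data(store_type='redis') reads the
# notebook-level name `r`, so these lines must be uncommented before using it.
# NOTE: load the credentials from the environment instead of hard-coding them.
# r = RedisClient(db=1, username='usr_redis', password='usr_pwd')
# r.test_connection()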

In [44]:
# Symbols to download and the candle size requested from the loader.
tickers = ['BTC-USD', 'ETH-USD']  # , 'SOL-USD', 'XRP-USD'
time_interval = '1d'

# Look-back window handed to load_crypto_data: the NEGATIVE number of whole days
# elapsed between 2019-01-01 and the moment this cell is executed, so the value
# changes from one day to the next.
period = -(datetime.now() - datetime(2019, 1, 1)).days
# Alternative with a fixed end date:
# period = -(datetime(2024, 12, 27) - datetime(2019, 1, 1)).days


In [45]:
# %run crypto_data_loader.ipynb
# Load price history for `tickers` over the configured look-back window and
# candle interval; the loader's return value is kept in `crypto_dir`.
# NOTE(review): presumably the directory the downloaded data was written to —
# confirm in data_loader.load_crypto_data.
crypto_dir = load_crypto_data(tickers, period, time_interval)
# Alternative loader taking an explicit start/end date:
# crypto_dir = load_crypto_data2(tickers, datetime(2019, 1, 1), datetime(2024, 12, 31), time_interval)

Start load crypto data, tickers ['BTC-USD', 'ETH-USD'], interval: 1d, from: 2019-01-01 23:09:01.304945


[*********************100%***********************]  2 of 2 completed

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2198 entries, 2019-01-01 to 2025-01-06
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   (BTC-USD, Open)       2198 non-null   float64
 1   (BTC-USD, High)       2198 non-null   float64
 2   (BTC-USD, Low)        2198 non-null   float64
 3   (BTC-USD, Close)      2198 non-null   float64
 4   (BTC-USD, Adj Close)  2198 non-null   float64
 5   (BTC-USD, Volume)     2198 non-null   int64  
 6   (ETH-USD, Open)       2198 non-null   float64
 7   (ETH-USD, High)       2198 non-null   float64
 8   (ETH-USD, Low)        2198 non-null   float64
 9   (ETH-USD, Close)      2198 non-null   float64
 10  (ETH-USD, Adj Close)  2198 non-null   float64
 11  (ETH-USD, Volume)     2198 non-null   int64  
dtypes: float64(10), int64(2)
memory usage: 223.2 KB





In [46]:
def plot_data(df):
    """Show an OHLC chart of ``df`` with plotly.

    Args:
        df: frame with ``Open``, ``High``, ``Low`` and ``Close`` columns; its
            index is used as the x axis.

    Returns:
        None. The figure is displayed via ``Figure.show()``.
    """
    # go.Ohlc takes the four price series as lower-case keyword arguments.
    price_series = {column.lower(): df[column] for column in ('Open', 'High', 'Low', 'Close')}
    ohlc_trace = go.Ohlc(x=df.index, **price_series)

    figure = go.Figure(data=ohlc_trace)
    figure.show()
    

In [47]:
# store_type = file, redis
def merge_and_store_data(new_df, key, store_type='file', compress=False):
    """Merge ``new_df`` into the data already stored under ``key`` and persist it.

    The previously stored data is loaded, merged with ``new_df`` through
    ``ut.validate_duplicate_and_merge``, written back to the same backend and
    finally returned with rows containing NaN removed. The NaN rows are dropped
    only from the returned frame: the stored copy is written before the drop.

    Args:
        new_df: freshly computed frame to merge into the store.
        key: storage key (file key or Redis key).
        store_type: ``'file'`` (default) or ``'redis'``.
        compress: forwarded to ``get_data`` / ``store_to_file`` for the file backend.

    Returns:
        The merged frame without NaN rows, as a new object.

    Raises:
        ValueError: if ``store_type`` is neither ``'file'`` nor ``'redis'``.
        RuntimeError: if ``store_type='redis'`` but the notebook-level Redis
            client ``r`` has not been created.
    """
    if store_type not in ('file', 'redis'):
        # A typo such as 'Redis' used to fall through to the file backend silently.
        raise ValueError(f"Unknown store_type {store_type!r}; expected 'file' or 'redis'")

    use_redis = store_type == 'redis'
    client = None
    if use_redis:
        # The client lives in the notebook namespace; fail with an actionable
        # message instead of a bare NameError when that cell was not executed.
        try:
            client = r
        except NameError as exc:
            raise RuntimeError(
                "store_type='redis' requires a RedisClient bound to the notebook-level name `r`"
            ) from exc
        saved_data = client.get_key(key)
    else:
        # NOTE(review): data is read from '_data_store' while store_to_file gets no
        # directory argument — presumably it defaults to the same location; confirm.
        saved_data = get_data('_data_store', key, compress=compress)

    merged_df = ut.validate_duplicate_and_merge(saved_data, new_df)

    if use_redis:
        client.set_key(key, merged_df)
    else:
        store_to_file(merged_df, key, compress=compress)

    # Non-mutating drop: the merge helper may hand back `new_df` itself, and an
    # in-place dropna would then silently alter the caller's frame.
    return merged_df.dropna()

### Processing

In [48]:
# Settings handed to FeatureEngineering.
# NOTE(review): the indicator keys presumably hold the look-back windows of the
# matching indicators (fast / medium / slow EMA, RSI, MACD) — confirm in features.py.
params = dict(
    emaf=20,
    emam=100,
    emas=150,
    rsi=14,
    macd=[12, 26, 9],
    # Split sizing; a smaller alternative is kept below for quick experiments.
    max_train_size=180,
    test_size=60,
    # max_train_size=90,
    # test_size=30,
)

fe = FeatureEngineering(params)

In [49]:
def predict_ensemble(model_funcs, data_with_features, features):
    """Fit and apply ``model_funcs`` on every split produced by ``fe.split_data``.

    For each (train, val, test) split the feature columns are scaled with
    ``ut.normalize_StandardScaler``, the models are fitted on the scaled training
    features and then used to predict on all three scaled samples.

    Args:
        model_funcs: model factories passed to ``ut.fit_models``.
        data_with_features: frame holding the ``features`` columns and ``'Target'``.
        features: names of the feature columns.

    Yields:
        ``(predict_dict, y_train, y_val, y_test)`` for each split, where
        ``predict_dict`` is whatever ``ut.predict_models`` returns.
    """
    target_column = 'Target'

    for train_part, val_part, test_part in tqdm(fe.split_data(data_with_features)):
        parts = (train_part, val_part, test_part)
        train_target, val_target, test_target = [part[target_column] for part in parts]
        raw_train, raw_val, raw_test = [part[features] for part in parts]

        # Scaling (alternative: ut.normalize_MinMaxScaler with the same arguments).
        scaled_train, scaled_val, scaled_test = ut.normalize_StandardScaler(raw_train, raw_val, raw_test)

        # Fit on the training sample only.
        fitted_models = ut.fit_models(model_funcs, scaled_train, train_target)

        # Predict on the train, val and test samples.
        fold_predictions = ut.predict_models(fitted_models, scaled_train, scaled_val, scaled_test)

        yield (fold_predictions, train_target, val_target, test_target)


In [55]:
def get_trend_data(symbol, features, params):
    """Load the stored data for ``symbol`` and derive targets and trend features.

    Pipeline: load via ``get_data`` -> drop the ``chg`` / ``vol_chg`` columns ->
    ``fe.enrich_with_indicators`` -> ``fe.add_target`` -> ``fe.clear_invalid_targets``
    -> ``fe.validate_outliers`` on ``'Close'`` -> ``fe.create_trend_features``.

    Args:
        symbol: key of the data set to load (e.g. ``'BTC-USD'``).
        features: column names the trend features are built from.
        params: mapping with the keys ``'data_dir'``, ``'lag_periods'``,
            ``'min_outliers'`` and ``'max_outliers'``.

    Returns:
        ``(data_with_trend, new_trend_features)`` as returned by
        ``fe.create_trend_features``.

    Raises:
        KeyError: if a required key is missing from ``params`` or the loaded data
            has no ``chg`` / ``vol_chg`` column.
    """
    data_dir = params['data_dir']
    lag_periods = params['lag_periods']
    min_outliers = params['min_outliers']
    max_outliers = params['max_outliers']

    # Non-mutating drop: the frame returned by get_data is left untouched, so
    # re-running the cell behaves the same even if the loader hands out a shared object.
    data = get_data(data_dir, symbol).drop(columns=['chg', 'vol_chg'])  # Could it be as features ?

    df = fe.clear_invalid_targets(fe.add_target(fe.enrich_with_indicators(data), lag_periods))
    # df = fe.clear_invalid_targets(fe.add_target2(fe.enrich_with_indicators(data)))
    df = fe.validate_outliers(df, 'Close', min_outliers, max_outliers)
    # # print(df.isnull().sum())

    # Trend features
    data_with_trend, new_trend_features = fe.create_trend_features(df, features, lag_periods)

    return data_with_trend, new_trend_features

In [51]:
def _first_model_output(predict_dict, sample):
    """Return the ``sample`` ('train' / 'val' / 'test') entry of the first element of ``predict_dict``."""
    return list(predict_dict)[0][sample]


def _collect_targets(parts):
    """Concatenate per-fold targets once; a Series result becomes a one-column frame named ``0``."""
    merged = pd.concat(parts, ignore_index=True)
    return merged.to_frame(name=0) if isinstance(merged, pd.Series) else merged


def predict_process(model_funcs, data_with_features, features, use_stacking, use_blending):
    """Run the per-split ensemble, fit a final model on the pooled predictions and report metrics.

    Steps:
      1. Iterate over ``predict_ensemble`` and collect, for every split, the targets
         and the predictions of the first entry returned by ``ut.predict_models``.
      2. Pool the per-split arrays with ``hstack`` and the targets with one ``pd.concat``.
      3. Fit the final model on the pooled train predictions and predict on all samples.
      4. Print ROC AUC and the metrics table for the train, val and test samples.

    Args:
        model_funcs: model factories forwarded to ``predict_ensemble``; the first one
            is also used as the final model when neither stacking nor blending is on.
        data_with_features: frame forwarded to ``predict_ensemble``.
        features: feature column names forwarded to ``predict_ensemble``.
        use_stacking: pass pooled predictions through ``ut.stacking_pred`` and use
            ``ut.ModelFunc.LOGISTIC_REG`` as the final model.
        use_blending: only consulted when ``use_stacking`` is false.
            NOTE(review): it currently triggers exactly the same computation as
            stacking (both call ``ut.stacking_pred``) — confirm whether a distinct
            blending helper was intended.

    Returns:
        dict mapping ``'train'`` / ``'val'`` / ``'test'`` to the ROC AUC of the final
        model on that sample, so runs can be compared programmatically.

    Raises:
        ValueError: if ``predict_ensemble`` yields no split at all.
    """
    samples = ('train', 'val', 'test')
    fold_targets = {sample: [] for sample in samples}
    fold_predictions = {sample: [] for sample in samples}

    ## Split, predict
    for predict_dict, y_train, y_val, y_test in predict_ensemble(model_funcs, data_with_features, features):
        for sample, fold_target in zip(samples, (y_train, y_val, y_test)):
            fold_targets[sample].append(fold_target)
            # NOTE(review): only the first entry of predict_dict is kept; if
            # ut.predict_models returns one entry per model, the other models'
            # predictions are discarded here — confirm this is intended.
            fold_predictions[sample].append(_first_model_output(predict_dict, sample))

    if not fold_predictions['train']:
        raise ValueError('predict_ensemble produced no splits; nothing to evaluate')

    # Concatenate once after the loop instead of growing a frame on every
    # iteration (quadratic copying, plus a deprecated concat with an empty frame).
    y_total = {sample: _collect_targets(fold_targets[sample]) for sample in samples}

    # Per-split prediction arrays joined by hstack into one pooled array per sample.
    pred_prob = {sample: hstack(fold_predictions[sample]) for sample in samples}

    print(f"     Train size: {len(y_total['train'])}, Val size: {len(y_total['val'])},   Test size: {len(y_total['test'])}")
    print(f"Pred Train size: {len(pred_prob['train'])}, Val size: {len(pred_prob['val'])}, Test size: {len(pred_prob['test'])}")

    ## Final model using whole data
    if use_stacking or use_blending:
        # The stacking and blending branches were byte-for-byte duplicates; merged.
        final_inputs = {sample: ut.stacking_pred(pred_prob[sample]).reshape(-1, 1) for sample in samples}
        final_model_func = ut.ModelFunc.LOGISTIC_REG
    else:
        final_inputs = {sample: pred_prob[sample].reshape(-1, 1) for sample in samples}
        final_model_func = model_funcs[0]

    final_model = ut.fit_models([final_model_func], final_inputs['train'], y_total['train'])[0]
    predict_dict = ut.predict_models(
        [final_model], final_inputs['train'], final_inputs['val'], final_inputs['test']
    )
    ensemble = {sample: _first_model_output(predict_dict, sample) for sample in samples}

    ## Display metrics, ROC AUC for train, val and test samples
    roc_auc = {sample: ut.roc_auc_score_metric(y_total[sample], ensemble[sample]) for sample in samples}

    for sample in samples:
        print(f'=== {sample.capitalize()} sample metrics ===')
        print(f'ROC AUC: {roc_auc[sample]:.4f}')
        print(ut.calculate_metrics_table(y_total[sample], ensemble[sample]))
    print('===========================')

    return roc_auc

In [52]:
def get_models():
    """Return the model factories used as base learners, in fitting order.

    Currently enabled: decision-tree classifier and KNN classifier.

    Other ``ut.ModelFunc`` members that were tried and are switched off:
        LOGISTIC_REG,
        LINEAR_REG, KNN_REG, DECISION_TREE_REG, RANDOM_FOREST_REG,
        CATBOOST_REG, XGBOOST_REG,
        XGBOOST_CLASS, CATBOOST_CLASS, RANDOM_FOREST_CLASS
    To enable one, add it to the list below.
    """
    return [
        ut.ModelFunc.DECISION_TREE_CLASS,
        ut.ModelFunc.KNN_CLASS,
    ]

In [53]:
## Train,val,test using whole data
# symbols =['BTC-USD']
# for symbol in symbols:
# # for name in tqdm(tickers):
#     print(f'=== symbol: {symbol}, stacking: {use_stacking}, blending: {use_blending} ===')

#     data = get_data(crypto_dir, symbol)
#     data.drop(columns=['chg', 'vol_chg'], inplace=True) # Could it be as features ?

#     df = fe.clear_invalid_targets(fe.add_target(fe.enrich_with_indicators(data), lag_periods))
#     # df = fe.clear_invalid_targets(fe.add_target2(fe.enrich_with_indicators(data)))
#     df = fe.validate_outliers(df, 'Close', min_outliers, max_outliers)
#     # # print(df.isnull().sum())
    
#     ## Store data
#     # df = merge_and_store_data(df, symbol, compress=True) # Store data
#     # print(df.isnull().sum())

#     ## Add features
#     OHLCV = ['Open', 'High', 'Low', 'Close', 'Volume']

#     # Trend features
#     data_with_trend, new_trend_features = fe.create_trend_features(df, OHLCV, lag_periods)
#     # print(data_with_trend.isnull().sum())
#     features = new_trend_features + trend_indicators
#     data_with_features = data_with_trend[features + ['Target', 'Date']]

#     # Rolling features
#     # window_sizes = [7, 14, 30]
#     # data_with_rolling, new_rolling_features = fe.create_rolling_features(df, OHLCV, window_sizes)
#     # features = new_rolling_features + trend_indicators + ['Target']
#     # data_with_features = data_with_rolling[features + ['Date']]

#     # print(len(data_with_features))
#     # print(data_with_features.isnull().sum())
#     data_with_features.set_index('Date', inplace=True)
#     # display(data_with_features.tail(10))

#     train_data, val_data, test_data = ut.split_data_by_date(data_with_features)

#     X_train = train_data[features]
#     y_train = train_data['Target']

#     X_val = val_data[features]
#     y_val = val_data['Target']

#     X_test = test_data[features]
#     y_test = test_data['Target']

#     print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

#     model_funcs = get_models()
#     print(f'Models: {model_funcs}')

#     ## Data normalization
#     X_train_scaled, X_val_scaled, X_test_scaled = ut.normalize_MinMaxScaler(X_train, X_val, X_test)
#     # X_train_scaled, X_val_scaled, X_test_scaled = ut.normalize_StandardScaler(X_train, X_val, X_test)

#     ## Modeling
#     # models = models_fit(model_funcs, X_train_scaled, y_train, X_val=X_val, y_val=y_val)
#     models = ut.fit_models(model_funcs, X_train_scaled, y_train)

#     ## Prediction on train, val and test samples
#     predict_dict = ut.predict_models(models, X_train_scaled, X_val_scaled, X_test_scaled)

#     stacked_train_X = ut.stacking_pred([d['train'] for d in predict_dict][0])
#     stacked_val_X = ut.stacking_pred([d['val'] for d in predict_dict][0])
#     stacked_test_X = ut.stacking_pred([d['test'] for d in predict_dict][0])
    
#     final_model_func = ut.ModelFunc.LOGISTIC_REG 
#     final_model = ut.fit_models([final_model_func], stacked_train_X, y_train)[0]
#     predict_dict = ut.predict_models([final_model], stacked_train_X, stacked_val_X, stacked_test_X)

#     ensemble_train = [d['train'] for d in predict_dict][0]
#     ensemble_val = [d['val'] for d in predict_dict][0]
#     ensemble_test = [d['test'] for d in predict_dict][0]

#     # ## Display metrics, ROC AUC for train, val and test samples
#     train_roc_auc = ut.roc_auc_score_metric(y_train, ensemble_train)
#     val_roc_auc = ut.roc_auc_score_metric(y_val, ensemble_val)
#     test_roc_auc = ut.roc_auc_score_metric(y_test, ensemble_test)
  
#     print('=== Train sample metrics ===')
#     print(f'ROC AUC: {train_roc_auc:.4f}')
#     print(ut.calculate_metrics_table(y_train, ensemble_train))

#     print('=== Val sample metrics ===')
#     print(f'ROC AUC: {val_roc_auc:.4f}')
#     print(ut.calculate_metrics_table(y_val, ensemble_val))

#     print('=== Test sample metrics ===')
#     print(f'ROC AUC: {test_roc_auc:.4f}')
#     print(ut.calculate_metrics_table(y_test, ensemble_test))
#     print('===========================')


In [None]:
## Train,val,test using WFO
# Driver cell: for every symbol build the trend features, merge them into the
# file store, run the ensemble over the splits and plot the resulting frame.
data_params = {
    'data_dir': 'crypto_data',
    'lag_periods': 7,
    'min_outliers': .23,
    'max_outliers': .77
}
use_stacking = True
use_blending = False
trend_indicators = [ 'emaf', 'emam', 'emas', 'rsi', 'macd', 'adx']
threshold = 0.6 # ??? (not used anywhere in this cell)
symbols =['BTC-USD']

## Features
# Loop-invariant, so defined once instead of on every iteration.
OHLCV = ['Open', 'High', 'Low', 'Close', 'Volume']

for symbol in symbols:
# for name in tqdm(tickers):
    print(f'=== symbol: {symbol}, stacking: {use_stacking}, blending: {use_blending} ===')

    # Trend features
    data_with_trend, new_trend_features = get_trend_data(symbol, OHLCV, data_params)
    features = new_trend_features + trend_indicators

    # The previous column selection into `data_with_features` was overwritten by
    # the merge below before being used; its only effect was to fail early when a
    # column was missing, which this explicit check keeps.
    missing_columns = [
        column for column in features + ['Target', 'Date']
        if column not in data_with_trend.columns
    ]
    if missing_columns:
        raise KeyError(f'{symbol}: columns missing from trend data: {missing_columns}')

    ## Store data
    # The full trend frame (not only the feature columns) is merged and stored,
    # because plot_data below needs the Open/High/Low/Close columns.
    data_with_features = merge_and_store_data(data_with_trend, symbol, compress=True)
    # display(data_with_features.tail(10))

    ## Prediction
    model_funcs = get_models()
    print(f'Models: {model_funcs}')
    predict_process(model_funcs, data_with_features, features, use_stacking, use_blending)

    ## Plot
    plot_data(data_with_features)


=== symbol: BTC-USD, stacking: True, blending: False ===
Outliers detected: 0
Models: [<function decision_tree_classifier_model at 0x7f475baa1ee0>, <function knn_classifier_model at 0x7f475baa2020>]


30it [00:00, 39.33it/s]


     Train size: 2160, Val size: 3240,   Test size: 1800
Pred Train size: 2160, Val size: 3240, Test size: 1800
=== Train sample metrics ===
ROC AUC: 0.9977
   Cutoff  Precision     Recall   Accuracy   F1-Score
0    50.0  97.066437  98.425197  97.592593  97.741095
1    60.0  97.558849  97.900262  97.592593  97.729258
2    70.0  97.807018  97.550306  97.546296  97.678493
3    80.0  98.734177  95.538058  96.990741  97.109827
=== Val sample metrics ===
ROC AUC: 0.5134
   Cutoff  Precision     Recall   Accuracy   F1-Score
0    50.0  54.241338  53.161593  51.666667  53.696038
1    60.0  54.393842  49.648712  51.512346  51.913070
2    70.0  54.393842  49.648712  51.512346  51.913070
3    80.0  54.242820  48.653396  51.296296  51.296296
=== Test sample metrics ===
ROC AUC: 0.5225
   Cutoff  Precision     Recall   Accuracy   F1-Score
0    50.0  53.826851  44.045175  49.277778  48.447205
1    60.0  55.585106  42.915811  50.555556  48.435689
2    70.0  55.585106  42.915811  50.555556  48.435689
