In [77]:
import pandas as pd
import numpy as np
from numpy import hstack
import warnings
from datetime import datetime
import import_ipynb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import talib as ta

from redis_cli import RedisClient

from tqdm import tqdm
tqdm.pandas()
warnings.simplefilter('ignore')

from file_loader import get_data, store_to_file
from features import FeatureEngineering
from data_loader import load_crypto_data, load_crypto_data2
import utils as ut

In [78]:
# r = RedisClient(db=1, username='usr_redis', password='usr_pwd')
# r.test_connection()

In [79]:
period=-(datetime.now() - datetime(2019, 1, 1)).days
# period=-(datetime(2024,12,27) - datetime(2019, 1, 1)).days
time_interval='1d'
tickers = ['BTC-USD', 'ETH-USD'] #, 'SOL-USD', 'XRP-USD'


In [80]:
# %run crypto_data_loader.ipynb
crypto_dir = load_crypto_data(tickers, period, time_interval)
# crypto_dir = load_crypto_data2(tickers, datetime(2019, 1, 1), datetime(2024, 12, 31), time_interval)

Start load crypto data, tickers ['BTC-USD', 'ETH-USD'], interval: 1d, from: 2019-01-01 16:08:51.437605


[*********************100%***********************]  2 of 2 completed

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2198 entries, 2019-01-01 to 2025-01-06
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   (ETH-USD, Open)       2198 non-null   float64
 1   (ETH-USD, High)       2198 non-null   float64
 2   (ETH-USD, Low)        2198 non-null   float64
 3   (ETH-USD, Close)      2198 non-null   float64
 4   (ETH-USD, Adj Close)  2198 non-null   float64
 5   (ETH-USD, Volume)     2198 non-null   int64  
 6   (BTC-USD, Open)       2198 non-null   float64
 7   (BTC-USD, High)       2198 non-null   float64
 8   (BTC-USD, Low)        2198 non-null   float64
 9   (BTC-USD, Close)      2198 non-null   float64
 10  (BTC-USD, Adj Close)  2198 non-null   float64
 11  (BTC-USD, Volume)     2198 non-null   int64  
dtypes: float64(10), int64(2)
memory usage: 223.2 KB





In [81]:
def compare_and_plot(data, column_name1, column_name2):
    df = data.copy()
    X_train = df[column_name1]
    y_train = df[column_name2]

    # without normalization
    model_no_norm = ut.linear_regression(X_train, y_train)
    weights_no_norm = model_no_norm.coef_

    # normalize data
    prices_norm, volume_norm = ut.normalize_MinMax_by_column(X_train, y_train)
    # print(prices_norm)

    model_with_norm = ut.linear_regression(prices_norm, volume_norm)
    weights_with_norm = model_with_norm.coef_

    # Plotting the weights
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    axes[0].bar(['Price', 'Volume'], weights_no_norm, color='red')
    axes[0].set_title('Weigth without normalization')
    axes[0].set_ylabel('Model weghts')

    axes[1].bar(['Price', 'Volume'], weights_with_norm, color='green')
    axes[1].set_title('Weigth after normalization')
    axes[1].set_ylabel('Model weghts')

    plt.tight_layout()
    plt.show()
    

In [82]:
# store_type = file, redis
def merge_and_store_data(new_df, key, store_type='file', compress=False):
    if store_type == 'redis':
        saved_data = r.get_key(key)
    else:
        saved_data = get_data('_data_store', key, compress=compress)

    merged_df = ut.validate_duplicate_and_merge(saved_data, new_df)

    if store_type == 'redis':
        r.set_key(key, merged_df)
    else:
        store_to_file(merged_df, key, compress=compress)

    merged_df.dropna(inplace=True)
    return merged_df

In [83]:
def plot_cv_indices(cv, n_splits, X, y, date_col = None):
    """Create a sample plot for indices of a cross-validation object."""
    
    fig, ax = plt.subplots(1, 1, figsize = (11, 7))
    
    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=10, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)


    # Formatting
    yticklabels = list(range(n_splits))
    
    if date_col is not None:
        tick_locations  = ax.get_xticks()
        tick_dates = [" "] + date_col.iloc[list(tick_locations[1:-1])].astype(str).tolist() + [" "]

        tick_locations_str = [str(int(i)) for i in tick_locations]
        new_labels = ['\n\n'.join(x) for x in zip(list(tick_locations_str), tick_dates) ]
        ax.set_xticks(tick_locations)
        ax.set_xticklabels(new_labels)
    
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+0.2, -.2])
    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)

### Processing

In [84]:
def show_importance(model, model_func, params):
    importance_function = model.coef_[0]
    if model_func in [ut.ModelFunc.XGBOOST_CLASS, ut.ModelFunc.DECISION_TREE_CLASS, ut.ModelFunc.RANDOM_FOREST_CLASS, \
                      ut.ModelFunc.KNN_CLASS, ]:
        importance_function = model.feature_importances_
    if model_func in [ut.ModelFunc.CATBOOST_CLASS, ]:
        importance_function = model.get_feature_importance()

    ut.top_n_weighted_factors(importance_function, params['features'], params['top'])
    return importance_function

In [85]:
### Blending, Stacking, Ensemble
def fit_models(model_funcs, X_train, y_train, X_val=None, y_val=None):
    models = list()
    for model_func in model_funcs:
        params = ut.get_model_params(model_func)
        if X_val is None or y_val is None:
            model = ut.model_fit(model_func, X_train, y_train, params)
        else:
            if model_func is ut.ModelFunc.CATBOOST_CLASS:
                params = dict(params, early_stopping_rounds=50)
                model = ut.model_fit_with_eval(model_func, X_train, y_train, eval_set=(X_val, y_val), params=params)
            else:
                model = ut.model_fit(model_func, X_train, y_train, params)
        models.append(model)

    return models

def predict_models(models, X_train, X_val, X_test):
    models_proba = list()
    for model in models:
        models_proba.append({
            'train': np.array(model.predict_proba(X_train)[:, 1]),
            'val': np.array(model.predict_proba(X_val)[:, 1]),
            'test': np.array(model.predict_proba(X_test)[:, 1])
        })

    return models_proba


def blending_pred(*args):
    return sum(args) / len(args)
    
def stacking_pred(*args):
    return np.column_stack(args)

# stacking_pred(1,2,3)

In [86]:
params = {
    'emaf': 20,
    'emam': 100,
    'emas': 150,
    'rsi': 14,
    'macd': [12, 26, 9],
    'max_train_size': 180,
    'test_size': 60,
    # 'max_train_size': 90,
    # 'test_size': 30
}

trend_indicators = [ 'emaf', 'emam', 'emas', 'rsi', 'macd', 'adx']
lag_periods = 7 #3 # depends on timeframe, 7 days - ???
min_outliers=.23
max_outliers=.77
threshold = 0.6 # ???
use_stacking = True
use_blending = False

fe = FeatureEngineering(params)

In [87]:
def predict_ensemble(model_funcs, data_with_features, features):
    for train_data, val_data, test_data in tqdm(fe.split_data(data_with_features)):
        X_train, y_train = train_data[features], train_data['Target']
        X_val, y_val     = val_data[features], val_data['Target']
        X_test, y_test   = test_data[features], test_data['Target']

        ## Data normalization
        X_train_scaled, X_val_scaled, X_test_scaled = ut.normalize_MinMaxScaler(X_train, X_val, X_test)
        # X_train_scaled, X_val_scaled, X_test_scaled = ut.normalize_StandardScaler(X_train, X_val, X_test)
    
        ## Modeling
        # models = models_fit(model_funcs, X_train_scaled, y_train, X_val=X_val, y_val=y_val)
        models = fit_models(model_funcs, X_train_scaled, y_train)

        ## Prediction on train, val and test samples
        predict_dict = predict_models(models, X_train_scaled, X_val_scaled, X_test_scaled)

        yield ( predict_dict, y_train, y_val, y_test )


In [88]:
def get_models():
    models = list()
    # models.append(ut.ModelFunc.LOGISTIC_REG)
    # models.append(ut.ModelFunc.LINEAR_REG)
    # models.append(ut.ModelFunc.KNN_REG)
    # models.append(ut.ModelFunc.DECISION_TREE_REG)
    # models.append(ut.ModelFunc.RANDOM_FOREST_REG)
    # models.append(ut.ModelFunc.CATBOOST_REG)
    # models.append(ut.ModelFunc.XGBOOST_REG)

    # models.append(ut.ModelFunc.XGBOOST_CLASS)
    # models.append(ut.ModelFunc.CATBOOST_CLASS)

    # models.append(ut.ModelFunc.RANDOM_FOREST_CLASS)
    models.append(ut.ModelFunc.DECISION_TREE_CLASS)
    models.append(ut.ModelFunc.KNN_CLASS)
    return models

In [89]:
symbols =['BTC-USD']
for symbol in symbols:
# for name in tqdm(tickers):
    print(f'=== symbol: {symbol}, stacking: {use_stacking}, blending: {use_blending} ===')

    data = get_data(crypto_dir, symbol)
    data.drop(columns=['chg', 'vol_chg'], inplace=True) # Could it be as features ?

    df = fe.clear_invalid_targets(fe.add_target(fe.enrich_with_indicators(data), lag_periods))
    # df = fe.clear_invalid_targets(fe.add_target2(fe.enrich_with_indicators(data)))
    df = fe.validate_outliers(df, 'Close', min_outliers, max_outliers)
    # # print(df.isnull().sum())
    
    ## Store data
    # df = merge_and_store_data(df, symbol, compress=True) # Store data
    # print(df.isnull().sum())

    ## Add features
    OHLCV = ['Open', 'High', 'Low', 'Close', 'Volume']

    # Trend features
    data_with_trend, new_trend_features = fe.create_trend_features(df, OHLCV, lag_periods)
    # print(data_with_trend.isnull().sum())
    features = new_trend_features + trend_indicators  + ['Target']
    data_with_features = data_with_trend[features + ['Date']]

    # Rolling features
    # window_sizes = [7, 14, 30]
    # data_with_rolling, new_rolling_features = fe.create_rolling_features(df, OHLCV, window_sizes)
    # features = new_rolling_features + trend_indicators + ['Target']
    # data_with_features = data_with_rolling[features + ['Date']]

    # print(len(data_with_features))
    # print(data_with_features.isnull().sum())
    data_with_features.set_index('Date', inplace=True)
    # display(data_with_features.tail(10))

    test_start_date = pd.to_datetime(data_with_features.index.max()) - pd.DateOffset(months=1)
    val_start_date = pd.to_datetime(data_with_features.index.max()) - pd.DateOffset(months=2)

    train_data = data_with_features[pd.to_datetime(data_with_features.index) < val_start_date]  # все, что до предпоследнего месяца
    val_data = data_with_features[(pd.to_datetime(data_with_features.index) >= val_start_date) & (pd.to_datetime(data_with_features.index) < test_start_date)]  # предпоследний месяц
    test_data = data_with_features[pd.to_datetime(data_with_features.index) >= test_start_date]  # последний месяц
    # print(data_with_features.index[-1], train_data.index[-1], val_data.index[-1], test_data.index[-1])


    X_train = train_data[new_trend_features]
    y_train = train_data['Target']

    X_val = val_data[new_trend_features]
    y_val = val_data['Target']

    X_test = test_data[new_trend_features]
    y_test = test_data['Target']

    # Проверим размерности
    print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

    model_funcs = get_models()
    print(f'Models: {model_funcs}')

    ## Data normalization
    X_train_scaled, X_val_scaled, X_test_scaled = ut.normalize_MinMaxScaler(X_train, X_val, X_test)
    # X_train_scaled, X_val_scaled, X_test_scaled = ut.normalize_StandardScaler(X_train, X_val, X_test)

    ## Modeling
    # models = models_fit(model_funcs, X_train_scaled, y_train, X_val=X_val, y_val=y_val)
    models = fit_models(model_funcs, X_train_scaled, y_train)

    ## Prediction on train, val and test samples
    predict_dict = predict_models(models, X_train_scaled, X_val_scaled, X_test_scaled)

    stacked_train_X = stacking_pred([d['train'] for d in predict_dict][0])
    stacked_val_X = stacking_pred([d['val'] for d in predict_dict][0])
    stacked_test_X = stacking_pred([d['test'] for d in predict_dict][0])
    
    final_model_func = ut.ModelFunc.LOGISTIC_REG 
    final_model = fit_models([final_model_func], stacked_train_X, y_train)[0]
    predict_dict = predict_models([final_model], stacked_train_X, stacked_val_X, stacked_test_X)

    ensemble_train = [d['train'] for d in predict_dict][0]
    ensemble_val = [d['val'] for d in predict_dict][0]
    ensemble_test = [d['test'] for d in predict_dict][0]

    # ## Display metrics, ROC AUC for train, val and test samples
    train_roc_auc = ut.roc_auc_score_metric(y_train, ensemble_train)
    val_roc_auc = ut.roc_auc_score_metric(y_val, ensemble_val)
    test_roc_auc = ut.roc_auc_score_metric(y_test, ensemble_test)
  
    print('=== Train sample metrics ===')
    print(f'ROC AUC: {train_roc_auc:.4f}')
    print(ut.calculate_metrics_table(y_train, ensemble_train))

    print('=== Val sample metrics ===')
    print(f'ROC AUC: {val_roc_auc:.4f}')
    print(ut.calculate_metrics_table(y_val, ensemble_val))

    print('=== Test sample metrics ===')
    print(f'ROC AUC: {test_roc_auc:.4f}')
    print(ut.calculate_metrics_table(y_test, ensemble_test))
    print('===========================')


=== symbol: BTC-USD, stacking: True, blending: False ===
Outliers detected: 0
Train size: 1973, Val size: 31, Test size: 31
Models: [<function decision_tree_classifier_model at 0x7fe792c99f80>, <function knn_classifier_model at 0x7fe792c9a0c0>]
=== Train sample metrics ===
ROC AUC: 0.6078
   Cutoff   Precision     Recall   Accuracy   F1-Score
0    50.0   56.161864  87.992315  57.425241  68.562874
1    60.0   96.629213   8.261287  51.444501  15.221239
2    70.0  100.000000   3.842459  49.265079   7.400555
3    80.0    0.000000   0.000000  47.237709   0.000000
=== Val sample metrics ===
ROC AUC: 0.5192
   Cutoff  Precision    Recall   Accuracy  F1-Score
0    50.0      100.0  3.846154  19.354839  7.407407
1    60.0      100.0  3.846154  19.354839  7.407407
2    70.0      100.0  3.846154  19.354839  7.407407
3    80.0        0.0  0.000000  16.129032  0.000000
=== Test sample metrics ===
ROC AUC: 0.4979
   Cutoff  Precision  Recall   Accuracy   F1-Score
0    50.0       50.0    6.25  48.3870

In [38]:
symbols =['BTC-USD']
for symbol in symbols:
# for name in tqdm(tickers):
    print(f'=== symbol: {symbol}, stacking: {use_stacking}, blending: {use_blending} ===')

    data = get_data(crypto_dir, symbol)
    data.drop(columns=['chg', 'vol_chg'], inplace=True) # Could it be as features ?

    df = fe.clear_invalid_targets(fe.add_target(fe.enrich_with_indicators(data), lag_periods))
    # df = fe.clear_invalid_targets(fe.add_target2(fe.enrich_with_indicators(data)))
    df = fe.validate_outliers(df, 'Close', min_outliers, max_outliers)
    # # print(df.isnull().sum())
    
    ## Store data
    # df = merge_and_store_data(df, symbol, compress=True) # Store data
    # print(df.isnull().sum())

    ## Add features
    OHLCV = ['Open', 'High', 'Low', 'Close', 'Volume']

    # Trend features
    data_with_trend, new_trend_features = fe.create_trend_features(df, OHLCV, lag_periods)
    # print(data_with_trend.isnull().sum())
    features = new_trend_features + trend_indicators  + ['Target']
    data_with_features = data_with_trend[features + ['Date']]

    # Rolling features
    # window_sizes = [7, 14, 30]
    # data_with_rolling, new_rolling_features = fe.create_rolling_features(df, OHLCV, window_sizes)
    # features = new_rolling_features + trend_indicators + ['Target']
    # data_with_features = data_with_rolling[features + ['Date']]

    # print(len(data_with_features))
    # print(data_with_features.isnull().sum())
    data_with_features.set_index('Date', inplace=True)
    # display(data_with_features.tail(10))


    model_funcs = get_models()
    print(f'Models: {model_funcs}')

    ## Split, predict
    y_train_pred_prob = list()
    y_val_pred_prob = list()
    y_test_pred_prob = list()
    y_train_total = pd.DataFrame()
    y_val_total = pd.DataFrame()
    y_test_total = pd.DataFrame()

    for predict_dict, y_train, y_val, y_test in predict_ensemble(model_funcs, data_with_features, features):
        y_train_total = pd.concat([y_train_total, y_train], ignore_index=True)
        y_val_total = pd.concat([y_val_total, y_val], ignore_index=True)
        y_test_total = pd.concat([y_test_total, y_test], ignore_index=True)

        y_train_pred_prob.append([d['train'] for d in predict_dict][0])
        y_val_pred_prob.append([d['val'] for d in predict_dict][0])
        y_test_pred_prob.append([d['test'] for d in predict_dict][0])

    ## 2D-array
    train_pred_prob = hstack(y_train_pred_prob)
    val_pred_prob = hstack(y_val_pred_prob)
    test_pred_prob = hstack(y_test_pred_prob)

    print(len(train_pred_prob))


    ## Final model using whole data
    if use_stacking:
        stacked_train_X = stacking_pred(train_pred_prob).reshape(-1,1)
        stacked_val_X = stacking_pred(val_pred_prob).reshape(-1,1)
        stacked_test_X = stacking_pred(test_pred_prob).reshape(-1,1)
        
        final_model_func = ut.ModelFunc.LOGISTIC_REG 
        # final_model = fit_models([final_model_func], stacked_train_X, y_train_total)[0]
        final_model = fit_models([final_model_func], stacked_train_X, data_with_features['Target'])[0]
        predict_dict = predict_models([final_model], stacked_train_X, stacked_val_X, stacked_test_X)

    elif use_blending:
        blended_train_X = stacking_pred(train_pred_prob).reshape(-1,1)
        blended_val_X = stacking_pred(val_pred_prob).reshape(-1,1)
        blended_test_X = stacking_pred(test_pred_prob).reshape(-1,1)

        final_model_func = ut.ModelFunc.LOGISTIC_REG
        final_model = fit_models([final_model_func], blended_train_X, y_train_total)[0]
        predict_dict = predict_models([final_model], blended_train_X, blended_val_X, blended_test_X)

    else:
        final_model_func = model_funcs[0]
        final_model = fit_models([final_model_func], train_pred_prob.reshape(-1,1), y_train_total)[0]
        predict_dict = predict_models([final_model], train_pred_prob.reshape(-1,1), val_pred_prob.reshape(-1,1),\
                                       test_pred_prob.reshape(-1,1))

        # params = {
        #     'features': features,
        #     'top': 5,
        # }
        # importance_features = show_importance(final_model, final_model_func, params)

    ensemble_train = [d['train'] for d in predict_dict][0]
    ensemble_val = [d['val'] for d in predict_dict][0]
    ensemble_test = [d['test'] for d in predict_dict][0]

    # ## Display metrics, ROC AUC for train, val and test samples
    train_roc_auc = ut.roc_auc_score_metric(y_train_total, ensemble_train)
    val_roc_auc = ut.roc_auc_score_metric(y_val_total, ensemble_val)
    test_roc_auc = ut.roc_auc_score_metric(y_test_total, ensemble_test)
  
    print('=== Train sample metrics ===')
    print(f'ROC AUC: {train_roc_auc:.4f}')
    print(ut.calculate_metrics_table(y_train_total, ensemble_train))

    print('=== Val sample metrics ===')
    print(f'ROC AUC: {val_roc_auc:.4f}')
    print(ut.calculate_metrics_table(y_val_total, ensemble_val))

    print('=== Test sample metrics ===')
    print(f'ROC AUC: {test_roc_auc:.4f}')
    print(ut.calculate_metrics_table(y_test_total, ensemble_test))
    print('===========================')


=== symbol: BTC-USD, stacking: True, blending: False ===
Outliers detected: 1
Dropping 1 upper outliers
Dropping 0 lower outliers
Models: [<function decision_tree_classifier_model at 0x7fe792c99f80>, <function knn_classifier_model at 0x7fe792c9a0c0>]


30it [00:00, 40.40it/s]


2160


ValueError: Found input variables with inconsistent numbers of samples: [2160, 2027]