In [77]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [78]:
market_features = pd.read_csv('all_features_BTCUSDT.csv', index_col=0, parse_dates=[0], low_memory=False)
market_features = market_features.resample('D').ffill()
market_features = market_features.loc[:, market_features.isna().mean() <= 0.7]

strategies_data = {}
returns_features_path = 'returns_features'
for file in os.listdir(returns_features_path):
    if file.endswith('_returns_features.csv'):
        strategy_name = file.replace('_returns_features.csv', '')
        df = pd.read_csv(os.path.join(returns_features_path, file), index_col=0, parse_dates=[0], low_memory=False)
        df = df.resample('D').ffill()
        df['Sharpe_Ratio_Target'] = (df['Sharpe_Ratio_1_30'].shift(-1) > 0).astype(int)
        df['Mean_Returns_Target'] = (df['Mean_Returns_1'].shift(-1) > 0).astype(int)
        df = df[:-1]
        strategies_data[strategy_name] = df




merged_data = {}
for strategy, df in strategies_data.items():
    merged_df = pd.merge(market_features, df, left_index=True, right_index=True, how='inner')
    merged_df = merged_df.dropna(axis=1, thresh=0.5 * len(merged_df))
    merged_df = merged_df.dropna()
    merged_data[strategy] = merged_df

In [79]:
#target = 'Sharpe_Ratio_Target' 
target = 'Mean_Returns_Target'

In [80]:
strategies = merged_data.keys()
dfs = {}

for strategy in strategies:
    df = merged_data[strategy]
    X = df.drop(['Sharpe_Ratio_Target', 'Mean_Returns_Target'], axis=1)
    y = df[f'{target}']
    
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    importances = pd.Series(model.feature_importances_, index=X.columns)
    importances.sort_values(ascending=False, inplace=True)
    
    top_features = importances.nlargest(15).index
    
    X_selected = X[top_features]
    dfs[strategy] = pd.merge(X_selected, y, left_index=True, right_index=True, how='inner')

In [81]:
# frequency for walk-forward steps
train_end = '2023-12-31'
predict_start = '2024-01-01'
predict_end = '2024-03-31'

step = pd.DateOffset(months=3)

In [82]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
cat = CatBoostClassifier(verbose=0, random_state=42)
lgbm = LGBMClassifier(n_estimators=100, random_state=42)

In [84]:
import warnings
warnings.filterwarnings('ignore')

models = {
        'RandomForest': rf,
        'CatBoost': cat,
        'LightGBM': lgbm
    }
results = {strategy: {model: [] for model in models.keys()} for strategy in strategies}
for strategy in strategies:
    print(strategy)
    df = dfs[strategy]
    
    current_train_end = pd.to_datetime(train_end)
    current_predict_start = pd.to_datetime(predict_start)
    current_predict_end = pd.to_datetime(predict_end)
    
    while current_predict_start <= df.index.max():
        print(f"\nTraining up to {current_train_end.date()}, predicting from {current_predict_start.date()} to {current_predict_end.date()}")
        
        train = df.loc[:current_train_end]
        predict = df.loc[current_predict_start:current_predict_end]
        
        X_train = train.drop(target, axis=1)
        y_train = train[f'{target}']
        
        X_predict = predict.drop(target, axis=1)
        y_true = predict[f'{target}']

        
        for model_name, model in models.items():
            try:
                model.fit(X_train, y_train)
                
                y_pred = model.predict(X_predict)
                
                acc = accuracy_score(y_true, y_pred)
                prec = precision_score(y_true, y_pred, zero_division=0)
                rec = recall_score(y_true, y_pred, zero_division=0)
                f1 = f1_score(y_true, y_pred, zero_division=0)
                
                results[strategy][model_name].append({
                    'Model': model_name,
                    'Train_End': current_train_end.date(),
                    'Predict_Start': current_predict_start.date(),
                    'Predict_End': current_predict_end.date(),
                    'Accuracy': acc,
                    'Precision': prec,
                    'Recall': rec,
                    'F1_Score': f1
                })
            except:
                continue
        
        current_train_end = current_predict_end
        current_predict_start = current_train_end + pd.Timedelta(days=1)
        current_predict_end = current_predict_start + step - pd.Timedelta(days=1)
        
        if current_predict_end > df.index.max():
            current_predict_end = df.index.max()

G44

Training up to 2023-12-31, predicting from 2024-01-01 to 2024-03-31
[LightGBM] [Info] Number of positive: 17, number of negative: 16
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 33, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.515152 -> initscore=0.060625
[LightGBM] [Info] Start training from score 0.060625

Training up to 2024-03-31, predicting from 2024-04-01 to 2024-06-30
[LightGBM] [Info] Number of positive: 74, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 123, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.601626 -> initscore=0.412245
[LightGBM] [Info] Start training from score 0.412245

Training up to 2024-06-30, predicting from 2024-07-01 to 

No objects info loaded


[LightGBM] [Info] Number of positive: 118, number of negative: 168
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 286, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.412587 -> initscore=-0.353279
[LightGBM] [Info] Start training from score -0.353279

Training up to 2024-06-30, predicting from 2024-07-01 to 2024-08-27
[LightGBM] [Info] Number of positive: 118, number of negative: 168
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 286, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.412587 -> initscore=-0.353279
[LightGBM

In [85]:
for strategy in strategies:
    for model in models.keys():
        for dic in results[strategy][model]:
            try:
                print(f"{strategy}, Accuracy: {dic['Accuracy']:.2f}, F1: {dic['F1_Score']:.2f}")
            except:
                continue

G44, Accuracy: 0.51, F1: 0.39
G44, Accuracy: 0.48, F1: 0.59
G44, Accuracy: 0.54, F1: 0.57
G44, Accuracy: 0.40, F1: 0.10
G44, Accuracy: 0.52, F1: 0.68
G44, Accuracy: 0.48, F1: 0.60
G44, Accuracy: 0.63, F1: 0.78
G44, Accuracy: 0.49, F1: 0.63
G44, Accuracy: 0.59, F1: 0.67
G24, Accuracy: 0.46, F1: 0.51
G24, Accuracy: 0.54, F1: 0.53
G24, Accuracy: 0.52, F1: 0.50
G24, Accuracy: 0.49, F1: 0.61
G24, Accuracy: 0.62, F1: 0.70
G24, Accuracy: 0.52, F1: 0.51
G24, Accuracy: 0.45, F1: 0.52
G24, Accuracy: 0.59, F1: 0.67
G24, Accuracy: 0.46, F1: 0.43
G59_V2, Accuracy: 0.53, F1: 0.52
G59_V2, Accuracy: 0.61, F1: 0.58
G59_V2, Accuracy: 0.49, F1: 0.53
G59_V2, Accuracy: nan, F1: 0.00
G59_V2, Accuracy: 0.57, F1: 0.58
G59_V2, Accuracy: 0.53, F1: 0.57
G59_V2, Accuracy: 0.59, F1: 0.60
G70_V1, Accuracy: 0.62, F1: 0.65
G70_V1, Accuracy: 0.54, F1: 0.56
G70_V1, Accuracy: 0.43, F1: 0.47
G70_V1, Accuracy: 0.53, F1: 0.65
G70_V1, Accuracy: 0.59, F1: 0.63
G70_V1, Accuracy: 0.43, F1: 0.47
G70_V1, Accuracy: 0.49, F1: 0.59