In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [26]:
# Input locations. An empty ticker directory makes os.path.join() yield the
# bare file name, so ticker feature files are looked up relative to the
# current working directory.
ticker_features_dir = ""
returns_features_dir = "returns_features/"

# Output locations; these are created below if they do not exist yet.
merged_features_dir = "merged_data/"
models_dir = "models/"
evaluations_dir = "evaluations/"

# Ticker symbols whose feature files are processed.
tickers = ["BTCUSDT"]

# exist_ok=True keeps this cell safe to re-run.
for directory in (merged_features_dir, models_dir, evaluations_dir):
    os.makedirs(directory, exist_ok=True)

In [51]:
def load_csv(filepath):
    """Read a CSV into a DataFrame indexed by its first column, sorted by index.

    The first column is used as the index and parsed as dates.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame sorted by index, or ``None`` if reading failed for
        any reason. Callers are expected to check for ``None`` and skip.
    """
    try:
        df = pd.read_csv(
            filepath,
            index_col=0,
            parse_dates=[0],
            low_memory=False
        )
    except Exception as e:
        # Best-effort loader: report which file failed (the bare exception text
        # alone does not always identify it) and let the caller skip it.
        print(f"Failed to load {filepath}: {e}")
        return None

    return df.sort_index()

def merge_features(strategy_returns_df, ticker_features_df, max_nan_fraction=0.3):
    """Align strategy and ticker features on a daily grid and clean up NaNs.

    Both frames are resampled to daily frequency with forward fill, then
    inner-joined on their index. Columns whose NaN share exceeds
    ``max_nan_fraction`` are dropped; remaining gaps are forward-filled, then
    back-filled, and any row still containing NaN is removed.

    Parameters
    ----------
    strategy_returns_df : pandas.DataFrame
        Strategy features; must have an index that supports ``resample('D')``.
    ticker_features_df : pandas.DataFrame
        Ticker features; same index requirement.
    max_nan_fraction : float, default 0.3
        Columns with a larger fraction of NaN values are discarded.

    Returns
    -------
    pandas.DataFrame
        The merged, NaN-free frame. The inputs are not modified.
    """
    strategy_daily = strategy_returns_df.resample('D').ffill()
    ticker_daily = ticker_features_df.resample('D').ffill()

    merged_df = pd.merge(strategy_daily, ticker_daily, left_index=True, right_index=True, how='inner')

    # Remove features that have too many NaNs.
    nan_fraction = merged_df.isna().mean()
    cols_to_drop = nan_fraction[nan_fraction > max_nan_fraction].index.tolist()
    if cols_to_drop:
        merged_df = merged_df.drop(columns=cols_to_drop)

    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    # NOTE: the back-fill copies later observations into earlier rows (only
    # leading gaps remain after the forward fill).
    merged_df = merged_df.ffill().bfill().dropna()

    return merged_df

def preprocess_data(df, target_column='Sharpe_Ratio_30', correlation_threshold=0.9):
    """Split a frame into standardized, de-correlated features and a target.

    For every pair of features whose absolute correlation exceeds
    ``correlation_threshold``, the later column (in column order) is dropped.
    The remaining features are standardized with ``StandardScaler``.

    Feature selection happens before the scaler is fitted so that the returned
    scaler matches the columns of the returned feature frame and can be reused
    on new data with the same reduced layout. Correlation is unaffected by
    per-column standardization, so selecting on the unscaled values picks the
    same columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``target_column``.
    target_column : str, default 'Sharpe_Ratio_30'
        Name of the column to predict. A ``KeyError`` is raised if it is absent.
    correlation_threshold : float, default 0.9
        Absolute correlation above which the later feature of a pair is removed.

    Returns
    -------
    (pandas.DataFrame, pandas.Series, StandardScaler)
        Scaled reduced features, the target series, and the fitted scaler.
    """
    X = df.drop(columns=[target_column], errors='ignore')
    y = df[target_column]

    # Keep only the strict upper triangle so each pair is inspected once and
    # the first column of a correlated pair survives.
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
    to_drop = [column for column in upper.columns if (upper[column] > correlation_threshold).any()]
    X_reduced = X.drop(columns=to_drop)

    # NOTE(review): the scaler is fitted on the full frame, i.e. before any
    # train/test split performed by the caller.
    scaler = StandardScaler()
    X_scaled_reduced = pd.DataFrame(
        scaler.fit_transform(X_reduced),
        index=X_reduced.index,
        columns=X_reduced.columns,
    )

    print(f"Removed {to_drop}")

    return X_scaled_reduced, y, scaler

def split_data(X, y, test_size=0.2):
    """Split features and target positionally, without shuffling.

    The leading ``1 - test_size`` share of rows becomes the training set and
    the remaining trailing rows become the test set. Row counts of both
    parts are printed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    cut = int(len(X) * (1 - test_size))

    feature_parts = (X.iloc[:cut], X.iloc[cut:])
    target_parts = (y.iloc[:cut], y.iloc[cut:])

    for label, part in zip(("Train", "Test"), feature_parts):
        print(f"{label}: {part.shape[0]} rows")

    return feature_parts[0], feature_parts[1], target_parts[0], target_parts[1]

In [52]:
 # Merge every strategy's return features with each ticker's features and
 # write the result to merged_features_dir.
 for ticker in tickers:
     ticker_feature_file = f"all_features_{ticker}.csv"
     ticker_feature_path = os.path.join(ticker_features_dir, ticker_feature_file)

     if not os.path.isfile(ticker_feature_path):
         print(f"{ticker_feature_path} not found")
         continue

     ticker_df = load_csv(ticker_feature_path)
     if ticker_df is None:
         continue

     strategy_files = [f for f in os.listdir(returns_features_dir) if f.endswith(".csv")]

     for strategy_file in strategy_files:
         strategy_name = strategy_file.replace("_returns_features.csv", "")
         strategy_feature_path = os.path.join(returns_features_dir, strategy_file)

         strategy_df = load_csv(strategy_feature_path)
         if strategy_df is None:
             continue

         # merge_features() already returns a NaN-free frame, so no further
         # filling is needed before saving.
         merged_df = merge_features(strategy_df, ticker_df)

         # NOTE(review): the output name does not include the ticker, so with
         # more than one ticker later merges overwrite earlier ones.
         merged_feature_file = f"merged_features_{strategy_name}.csv"
         merged_feature_path = os.path.join(merged_features_dir, merged_feature_file)
         merged_df.to_csv(merged_feature_path)

 # Load the merged dataset of each strategy back from disk, keyed by strategy name.
 merged_files = [f for f in os.listdir(merged_features_dir) if f.startswith("merged_features_") and f.endswith(".csv")]
 datasets = {}
 for merged_file in merged_files:
     strategy_name = merged_file.replace("merged_features_", "").replace(".csv", "")
     merged_feature_path = os.path.join(merged_features_dir, merged_file)
     df = load_csv(merged_feature_path)
     if df is not None:
         datasets[strategy_name] = df


In [53]:
def train_models(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a linear regression and a random forest on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training target.
    n_estimators : int, default 100
        Number of trees in the random forest.
    random_state : int or None, default 42
        Seed for the random forest so repeated runs give the same model and
        metrics. Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    dict
        Fitted estimators keyed by ``'Linear Regression'`` and
        ``'Random Forest'``.
    """
    models = {}

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    models['Linear Regression'] = lr

    # Seeded: an unseeded forest yields different scores on every re-run.
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_train, y_train)
    models['Random Forest'] = rf

    return models

def evaluate_models(models, X_test, y_test):
    """Score each fitted model on the test data.

    For every entry in ``models`` the mean squared error and R² of its
    predictions on ``X_test`` against ``y_test`` are computed, printed, and
    collected.

    Returns
    -------
    dict
        ``{model name: {'MSE': float, 'R²': float}}``.
    """
    scores = {}
    for label in models:
        predictions = models[label].predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        scores[label] = dict([('MSE', mse), ('R²', r2)])
        print(f"{label} - MSE: {mse:.4f}, R²: {r2:.4f}")
    return scores

In [54]:
def train_and_evaluate_all_strategies(datasets, models_dir, evaluations_dir, target='Sharpe_Ratio_30'):
    """Preprocess, split, train, and evaluate models for every strategy dataset.

    Parameters
    ----------
    datasets : dict
        Mapping of strategy name to its feature DataFrame.
    models_dir : str
        Currently unused; kept so existing calls keep working.
    evaluations_dir : str
        Currently unused; kept so existing calls keep working.
    target : str, default 'Sharpe_Ratio_30'
        Column to predict. Strategies whose frame lacks it are skipped.

    Returns
    -------
    dict
        ``{strategy name: evaluation results}`` for every strategy that was
        processed; skipped strategies are absent.
    """
    all_results = {}
    for strategy, df in datasets.items():
        print(f"\n=== Processing Strategy: {strategy} ===")

        if target not in df.columns:
            # Say why nothing follows the header instead of skipping silently.
            print(f"Skipping {strategy}: target column '{target}' not found")
            continue

        X_scaled_reduced, y, scaler = preprocess_data(df, target_column=target)
        X_train, X_test, y_train, y_test = split_data(X_scaled_reduced, y, test_size=0.2)

        models = train_models(X_train, y_train)

        # Keep the metrics so callers can compare strategies afterwards.
        all_results[strategy] = evaluate_models(models, X_test, y_test)

    return all_results

In [55]:
# Run the pipeline for every loaded strategy dataset; the header, removed
# features, split sizes, and model metrics are printed per strategy.
train_and_evaluate_all_strategies(datasets, models_dir, evaluations_dir, target='Sharpe_Ratio_30')


=== Processing Strategy: G44 ===
Removed ['CVaR_95_10', 'Kurtosis_30', 'Realized_Volatility_10', 'Garman_Klass_Volatility_10', 'Max_Drawdown_Duration_10', 'Realized_Volatility_30', 'Parkinson_Volatility_30', 'Garman_Klass_Volatility_30', 'Max_Drawdown_Duration_30_y', 'MACD_Signal_10', 'MACD_30', 'MACD_Signal_30', 'MACD_Diff_30']
Train: 659 rows
Test: 165 rows
Linear Regression - MSE: 1.8957, R²: 0.8028
Random Forest - MSE: 1.2915, R²: 0.8656

=== Processing Strategy: G43 ===
Removed ['CVaR_95_10', 'CVaR_95_30', 'Realized_Volatility_10', 'Garman_Klass_Volatility_10', 'Max_Drawdown_Duration_10', 'Realized_Volatility_30', 'Parkinson_Volatility_30', 'Garman_Klass_Volatility_30', 'Max_Drawdown_Duration_30_y', 'MACD_Signal_10', 'RSI_10', 'MACD_30', 'MACD_Signal_30', 'MACD_Diff_30', 'RSI_30']
Train: 316 rows
Test: 79 rows
Linear Regression - MSE: 3.9157, R²: 0.4833
Random Forest - MSE: 0.1249, R²: 0.9835

=== Processing Strategy: G59_V1 ===
Removed ['CVaR_95_10', 'CVaR_95_30', 'Realized_Vola