In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

print("Loading labeled data for BTC...")
data_path = '../../data/btc_usd_h4_labeled.csv'
df = pd.read_csv(data_path, index_col='timestamp', parse_dates=True)
df.dropna(inplace=True)
print("Data loaded successfully.")

Loading labeled data for BTC...
Data loaded successfully.


In [2]:
features = [col for col in df.columns if col not in ['open', 'high', 'low', 'close', 'volume', 'target']]
X = df[features]
y = df['target']

# Split data (we'll use a smaller portion for faster tuning)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
y_train_mapped = y_train.replace({-1: 2})
y_test_mapped = y_test.replace({-1: 2})
print("Data prepared for tuning.")

param_grid = {
    'n_estimators': [200, 500],             # More trees
    'learning_rate': [0.01, 0.05],          # Smaller learning rate
    'max_depth': [3, 5],                    # Shallower, more robust trees
    'gamma': [1, 5],                        # Stronger regularization
    'reg_lambda': [5, 10],                  # Stronger L2 regularization
    'subsample': [0.7, 0.9],                # Randomly sample data
    'colsample_bytree': [0.7, 0.9]          # Randomly sample features/indicators
}

Data prepared for tuning.


In [3]:
model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',
    n_jobs=-1,
    cv=3,
    verbose=2
)

print("Starting Hyperparameter Search... This may take a long time.")
grid_search.fit(X_train, y_train_mapped)
print("Search complete.")

Starting Hyperparameter Search... This may take a long time.
Fitting 3 folds for each of 128 candidates, totalling 384 fits




[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=5, subsample=0.7; total time=   3.4s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=5, subsample=0.9; total time=   3.5s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=5, subsample=0.7; total time=   3.6s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=5, subsample=0.9; total time=   3.7s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=10, subsample=0.7; total time=   3.6s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=10, subsample=0.7; total time=   3.6s
[CV] END colsample_bytree=0.7, gamma=1, learning_rate=0.01, max_depth=3, n_estimators=200, reg_lambda=5, subsample=0.9; total time=   3.8s
[CV] END colsample_bytree

In [4]:
print("\n--- Best Parameters Found ---")
print(grid_search.best_params_)


--- Best Parameters Found ---
{'colsample_bytree': 0.9, 'gamma': 5, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500, 'reg_lambda': 10, 'subsample': 0.7}


In [5]:
print("\n--- Performance of the Best Model on the Test Set ---")
best_model = grid_search.best_estimator_
y_pred_mapped = best_model.predict(X_test)
y_pred = pd.Series(y_pred_mapped).replace({2: -1}).values
print(classification_report(y_test, y_pred, zero_division=0))


--- Performance of the Best Model on the Test Set ---
              precision    recall  f1-score   support

          -1       0.34      0.52      0.41      1366
           0       0.42      0.50      0.46      1964
           1       0.59      0.20      0.30      1645

    accuracy                           0.41      4975
   macro avg       0.45      0.41      0.39      4975
weighted avg       0.46      0.41      0.39      4975

