In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

print("Loading labeled data for BTC...")
data_path = '../data/btc_usd_h4_labeled.csv'
df = pd.read_csv(data_path, index_col='timestamp', parse_dates=True)
df.dropna(inplace=True)
print("Data loaded successfully.")

Loading labeled data for BTC...
Data loaded successfully.


In [2]:
features = [col for col in df.columns if col not in ['open', 'high', 'low', 'close', 'volume', 'target']]
X = df[features]
y = df['target']

# Split data (we'll use a smaller portion for faster tuning)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
y_train_mapped = y_train.replace({-1: 2})
y_test_mapped = y_test.replace({-1: 2})
print("Data prepared for tuning.")

param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'gamma': [0.5, 1],
    'reg_lambda': [1, 2]
}

Data prepared for tuning.


In [3]:
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',
    n_jobs=-1,
    cv=3,
    verbose=2
)

print("Starting Hyperparameter Search... This may take a long time.")
grid_search.fit(X_train, y_train_mapped)
print("Search complete.")

Starting Hyperparameter Search... This may take a long time.
Fitting 3 folds for each of 48 candidates, totalling 144 fits




[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=2; total time=   2.2s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=1; total time=   2.4s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=2; total time=   2.4s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=1; total time=   2.4s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=1; total time=   2.5s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=100, reg_lambda=2; total time=   2.4s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=200, reg_lambda=1; total time=   4.9s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=3, n_estimators=200, reg_lambda=1; total time=   5.1s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=5, n_estimators=100, reg_lambda=1; total time=   4.4s
[CV] END gamma=0.5, learning_rate=0.05, max_depth=5, n_estimator

In [4]:
print("\n--- Best Parameters Found ---")
print(grid_search.best_params_)


--- Best Parameters Found ---
{'gamma': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 1}


In [5]:
print("\n--- Performance of the Best Model on the Test Set ---")
best_model = grid_search.best_estimator_
y_pred_mapped = best_model.predict(X_test)
y_pred = pd.Series(y_pred_mapped).replace({2: -1}).values
print(classification_report(y_test, y_pred, zero_division=0))


--- Performance of the Best Model on the Test Set ---
              precision    recall  f1-score   support

          -1       0.34      0.62      0.44      1103
           0       0.39      0.39      0.39      1457
           1       0.56      0.15      0.23      1276

    accuracy                           0.38      3836
   macro avg       0.43      0.39      0.35      3836
weighted avg       0.43      0.38      0.35      3836

