In [87]:
import json

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [109]:
# Read the persisted scaling statistics (a JSON object keyed by column name).
with open("../data/scaling_params.json", mode="r") as params_file:
    scaling_params = json.load(params_file)

# Keep the target column's mean and standard deviation at hand.
# NOTE(review): presumably used to map normalised `next_close` values back to
# their original scale (value * std + mean) -- confirm where they are consumed.
target_stats = scaling_params["next_close"]
mean = target_stats["mean"]
std = target_stats["std"]

In [19]:
def _load_split(csv_path):
    """Read one dataset split from CSV, discarding its 'timestamp' column if present."""
    # errors="ignore" makes the drop a no-op for files without a timestamp column.
    return pd.read_csv(csv_path).drop(columns=["timestamp"], errors="ignore")


# The three splits live in separate files under ../data.
train_df = _load_split("../data/train.csv")
validation_df = _load_split("../data/validation.csv")
test_df = _load_split("../data/test.csv")

In [119]:
# Column the model is trained to predict.
target_col = "next_close"

# Model inputs, listed in the column order the feature matrices will carry.
feature_cols = [
    "open",
    "high",
    "low",
    "close",
    "volume",
    "hour",
    "avg_100_candles",
    "std",
    "ema_20",
    "rsi_20",
    "macd_line",
    "macd_signal",
    "bollinger_upper",
    "bollinger_lower",
    "bollinger_bandwidth",
    "volume_100",
]

# Slice every split into its feature matrix (X) and target vector (y).
X_train, y_train = train_df[feature_cols], train_df[target_col]
X_val, y_val = validation_df[feature_cols], validation_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]

In [121]:
# Wrap each split's features and labels in XGBoost's DMatrix container.
# Pairs are listed in train / validation / test order to match the unpacking.
dtrain, dval, dtest = (
    xgb.DMatrix(data=features, label=labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [123]:
# Candidate hyperparameters for the exhaustive search (3 * 3 * 3 * 3 = 81 combinations).
# `objective` and `eval_metric` are single-valued, so they do not enlarge the grid
# but are still reported in `best_params_`.
param_grid = {
    "max_depth": [4, 6, 8],
    "eta": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "objective": ["reg:squarederror"],
    "eval_metric": ["rmse"]
}

# Base estimator; GridSearchCV clones it and overrides its parameters per candidate.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", eval_metric="rmse")

# The rows are candle data (the raw files may carry a timestamp column), so score
# every candidate only on observations that come after the ones it was trained on.
# A plain integer `cv` would use contiguous, unshuffled K-fold blocks for a
# regressor, which lets a model train on rows from after its evaluation fold and
# makes the cross-validated RMSE optimistic.
# NOTE(review): assumes train.csv is stored in chronological order -- confirm.
time_series_cv = TimeSeriesSplit(n_splits=3)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=time_series_cv,
    scoring="neg_root_mean_squared_error",
    verbose=2,
)

# Run the search. With GridSearchCV's default refit=True, the best candidate is
# retrained on all of X_train once the search finishes.
grid_search.fit(X_train, y_train)

# Best parameters from grid search
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.7, eta=0.01, eval_metric=rmse, max_depth=4, objective=reg:squarederror, subsample=0.9; total time=   0.3s


In [124]:
# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so the best estimator is already trained here.
best_model = grid_search.best_estimator_

# Score the tuned model on the validation split, which was not passed to the search.
val_predictions = best_model.predict(X_val)
# Take the square root explicitly instead of relying on a version-specific RMSE option.
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
val_r2 = r2_score(y_val, val_predictions)

# NOTE(review): both metrics are in the units of `next_close` as stored in
# validation.csv -- presumably normalised values; confirm before reading the RMSE
# as a price error.
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Validation R² Score: {val_r2:.2f}")

Validation RMSE: 0.01
Validation R² Score: 1.00


In [None]:
# Serialise the tuned estimator with joblib.
# The file is written one directory above the working directory, while the
# confirmation message prints only the bare file name.
model_path = "../xgb_model.pkl"
joblib.dump(value=best_model, filename=model_path)
print("Model saved as 'xgb_model.pkl'.")