In [1]:
import joblib
import numpy as np
import optuna
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor

ModuleNotFoundError: No module named 'optuna'

## Variables

In [2]:
# Columns removed from the loaded frame before it is split into features and target.
cols_to_drop = [
    'Unnamed: 0',
    'averageRating',
    'numVotes',
    '_orig_order',
]

In [None]:
# Search budget: number of Optuna trials handed to study.optimize().
N_TRIALS = 50
# Patience (in boosting rounds) used for early stopping when fitting XGBoost.
EARLY_STOPPING_ROUNDS = 200
# Number of folds produced by the time-ordered splitter below.
N_SPLITS = 5

# Shared splitter; every trial iterates over the same fold layout.
tscv = TimeSeriesSplit(N_SPLITS)

### Retrieve data

In [3]:
# Load the semicolon-separated training table. Forward slashes keep the relative
# path valid on POSIX systems as well as on Windows (a backslash path is read as
# a single file name outside Windows).
df = (
    pd.read_csv("./data/training_dataset.csv", sep=";")
    # Order rows by startYear, then _orig_order: TimeSeriesSplit splits by row
    # position, so the row order defines what counts as "earlier" and "later".
    .sort_values(by=['startYear', '_orig_order'])
    # '_orig_order' is only needed for the sort above, so it is dropped here
    # together with the other excluded columns.
    .drop(columns=cols_to_drop)
)

# Target as a NumPy array; features stay a DataFrame (index rows with .iloc).
y = df["movie_score"].values
X = df.drop(columns=["movie_score"])

In [None]:
def objective(trial):
    """Optuna objective: mean validation RMSE of an XGBoost regressor over ``tscv`` folds.

    Samples one hyperparameter configuration from ``trial``, fits a fresh
    ``XGBRegressor`` on every fold of ``tscv`` with early stopping on that fold's
    validation rows, and reports the running mean RMSE after each fold so the
    study's pruner can abandon the trial.

    Reads the notebook-level names ``X`` (feature DataFrame), ``y`` (target
    array), ``tscv`` and ``EARLY_STOPPING_ROUNDS``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        float: mean RMSE across all folds.

    Raises:
        optuna.TrialPruned: if ``trial.should_prune()`` is true after a fold.
    """
    params = {
        # Large number - early stopping decides actual trees
        "n_estimators": 50_000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),

        # Tree structure
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

        # Sampling
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),

        # Regularization
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 2.0),

        # Long-tail friendly loss
        "objective": "reg:pseudohubererror",

        # Early stopping is configured on the estimator: newer XGBoost releases
        # no longer accept it as a fit() keyword.
        "early_stopping_rounds": EARLY_STOPPING_ROUNDS,

        # Performance / reproducibility
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
        "device": "cuda",
    }

    fold_rmses = []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X), start=1):
        # X is a DataFrame, so positional row selection needs .iloc;
        # X[train_idx] would look the integers up as column labels.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = XGBRegressor(**params)

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False,
        )

        preds = model.predict(X_val)
        # Explicit square root instead of squared=False, which recent
        # scikit-learn releases removed from mean_squared_error.
        rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
        fold_rmses.append(rmse)

        # Report intermediate result (enables pruning)
        trial.report(np.mean(fold_rmses), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_rmses))


In [None]:
optuna.visualization.plot_optimization_history(study)


In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=N_TRIALS)


In [None]:
# study.best_params contains only the values sampled through `trial`; every
# fixed setting used inside objective() has to be repeated here.
best_params = study.best_params

final_model = XGBRegressor(
    **best_params,
    n_estimators=50_000,
    # Same loss as during tuning. Without it the model would fall back to
    # XGBoost's default objective and the tuned parameters would be applied to
    # a different problem than the one they were selected on.
    objective="reg:pseudohubererror",
    # Configured on the estimator: newer XGBoost releases no longer accept
    # early_stopping_rounds as a fit() keyword.
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    device="cuda",
)

# Hold out the last 15% of the (already sorted) rows to drive early stopping.
split = int(len(X) * 0.85)

final_model.fit(
    X.iloc[:split], y[:split],
    eval_set=[(X.iloc[split:], y[split:])],
    verbose=True,
)


### Refit the model with the best parameters on all data and save it

In [None]:
# `best_iteration` is the 0-based index of the best boosting round found by
# early stopping (an int, not a model), so the number of trees to keep is one more.
best_n_estimators = final_model.best_iteration + 1

# Rebuild the estimator with the exact configuration of `final_model`, but with
# a fixed tree count and without early stopping: the refit below uses every row,
# so there is no validation set left to stop on.
refit_params = final_model.get_params()
refit_params["n_estimators"] = best_n_estimators
refit_params.pop("early_stopping_rounds", None)
best_model = XGBRegressor(**refit_params)

# Refit best model on ALL data
best_model.fit(X, y, verbose=False)
# Save the model
joblib.dump(best_model, "xgb_reg_movie_log_transformed.joblib")

# Save every trial (sampled parameters, objective value, state) to a csv file.
# The search was run with Optuna, so the history lives on `study`.
trials_df = study.trials_dataframe()
trials_df.to_csv("./data/xgb_reg_movie_log_transformed.csv", index=False)