In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.metrics import get_scorer
from xgboost import  XGBRegressor, callback
import joblib
from scipy.stats import randint, uniform
import warnings

In [None]:
# Toggle for the path separator style used by the hard-coded relative paths
# in this notebook (True = Windows-style backslashes).
useBackslash = False

if useBackslash:
    trainingDatasetPath = r'..\data\training_dataset.csv'
else:
    trainingDatasetPath = r'../data/training_dataset.csv'

# The training CSV is semicolon-separated.
dataset_training = pd.read_csv(trainingDatasetPath, sep=';')

In [None]:
# Regression target, extracted as a NumPy array before the column is dropped.
y = dataset_training['movie_score'].values

# Everything listed here is removed from the frame and therefore never reaches
# the model as a feature. NOTE: this rebinds `dataset_training`, so running the
# cell a second time fails with a KeyError on 'movie_score'.
dataset_training = dataset_training.drop(
    columns=[
        'movie_score',
        'Unnamed: 0',
        'Unnamed: 0.1',
        'averageRating',
        'numVotes',
        '_orig_order',
    ]
)

In [None]:
# Feature matrix: all remaining columns of the frame as a plain NumPy array.
# Column names are not carried along with the array.
x = dataset_training.values

In [None]:
# Shared configuration for the search and the final fit.
random_state = 42               # seed for randomized steps such as the hold-out split
n_jobs = -1                     # worker count setting (-1 means "all cores" in scikit-learn's convention)
cv = 3                          # number of cross-validation folds used by the search
early_stopping_rounds = 50      # patience, in boosting rounds, for early stopping
test_size_for_earlystop = 0.10  # fraction of the data held out as the early-stopping validation set

In [None]:
# Fixed settings of the estimator that the hyper-parameter search clones and
# tunes; the searched parameters are supplied by the search itself.
search_base_params = dict(
    objective='reg:squarederror',
    tree_method='hist',  # older XGBoost (<2.0) would use 'gpu_hist' here instead
    device='cuda',       # train on the GPU
    verbosity=1,
    n_jobs=1,
)
model = XGBRegressor(**search_base_params)

In [None]:
# Sampling distributions for the randomized search.
# scipy's `uniform(loc, scale)` covers [loc, loc + scale] (the second number is
# a width, not an upper bound) and `randint(low, high)` excludes `high`.
param_dist = dict(
    max_depth=randint(low=4, high=12),               # integers 4..11
    learning_rate=uniform(loc=0.01, scale=0.3),      # 0.01 .. 0.31
    subsample=uniform(loc=0.6, scale=0.4),           # 0.6 .. 1.0
    colsample_bytree=uniform(loc=0.5, scale=0.5),    # 0.5 .. 1.0
    gamma=uniform(loc=0, scale=5),                   # 0 .. 5
    n_estimators=randint(low=100, high=1000),        # integers 100..999
)

In [None]:
# Metric the search optimises. scikit-learn maximises scores, so RMSE is used
# in its negated form (values closer to 0 are better).
scoring = 'neg_root_mean_squared_error'

In [None]:
# Randomized hyper-parameter search over `param_dist`, scored with `scoring`
# on `cv` folds.
search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=40,                  # try 30-100 depending on budget
    scoring=scoring,
    cv=cv,
    verbose=2,
    n_jobs=1,                   # 1 when GPU used
    random_state=random_state   # fixes the sampled candidates so a re-run reproduces the same search
)

In [None]:
# Run the search: every sampled candidate is cross-validated on (x, y).
# This is the expensive cell of the notebook.
search.fit(x, y)

In [None]:
# Tabulate the per-candidate search results, best-ranked candidate first,
# with a fresh 0..n-1 index.
cv_results = (
    pd.DataFrame(search.cv_results_)
    .sort_values(by='rank_test_score')
    .reset_index(drop=True)
)

In [None]:
# Destination of the ranked search results. Both separator variants now point
# at the same '.csv' file name (the forward-slash variant used to lack the
# extension).
saveResultsAt = r'..\paramter_foerste_iteration.csv' if useBackslash else r'../paramter_foerste_iteration.csv'

# index=False: the index is just 0..n-1 after the sort/reset, and writing it
# would come back as a spurious 'Unnamed: 0' column when the file is re-read.
cv_results.to_csv(saveResultsAt, index=False)

In [None]:
# Reload the stored search results from disk; this rebinds `cv_results` to the
# CSV round-tripped version of the table.
cv_results = pd.read_csv(saveResultsAt, sep=',')

In [None]:
# Winning configuration and its cross-validated score, taken from the fitted search.
best_params, best_score = search.best_params_, search.best_score_

# Persist both so they can be restored later without repeating the search.
results = dict(best_params=best_params, best_score=best_score)
joblib.dump(results, "best_model_results.pkl")

In [None]:
# Restore the stored search outcome from disk.
# joblib files are pickles: only load files this notebook wrote itself.
results = joblib.load("best_model_results.pkl")
best_params, best_score = (results[key] for key in ("best_params", "best_score"))


In [None]:
def cast_simple_types(d):
    """Return a copy of ``d`` with NumPy scalar values turned into built-ins.

    NumPy integers become ``int`` and NumPy floats become ``float``; every
    other value is passed through untouched. Only top-level values are
    inspected, and the input mapping is not modified.

    Parameters
    ----------
    d : dict
        Mapping of parameter names to values.

    Returns
    -------
    dict
        New dictionary with the same keys in the same order.
    """
    def _to_builtin(value):
        # Check order mirrors the scalar kinds handled: integers, then floats.
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        return value

    return {key: _to_builtin(value) for key, value in d.items()}

In [None]:
# Start from the best searched parameters, with NumPy scalars converted to
# plain Python numbers.
final_params = cast_simple_types(best_params)

# Raise the tree budget to at least 2000 (200 is the fallback if the search
# did not tune n_estimators at all).
searched_n_estimators = int(final_params.get('n_estimators', 200))
final_params['n_estimators'] = max(searched_n_estimators, 2000)

In [None]:
# Final estimator: same fixed settings as the search model, plus RMSE
# monitoring and early stopping on a validation set.
# Uses `final_params` (type-cast, with the raised n_estimators ceiling) rather
# than the raw `best_params`; otherwise the enlarged tree budget prepared for
# early stopping would never take effect.
final_model = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    device='cuda',
    verbosity=1,
    n_jobs=1,
    eval_metric='rmse',
    early_stopping_rounds=early_stopping_rounds,  # shared config constant instead of a repeated literal
    **final_params
)

In [None]:
# Hold out a fraction of the data (test_size_for_earlystop) as the validation
# set for early stopping; the split is seeded so it is repeatable.
X_train_full, X_val, y_train_full, y_val = train_test_split(
    x, y, test_size=test_size_for_earlystop, random_state=random_state
)

In [None]:
# Fit on the training split while evaluating on the hold-out set; the
# validation metric drives early stopping.
final_model.fit(
    X_train_full,
    y_train_full,
    eval_set=[(X_val, y_val)]
)

In [None]:
# Work out how many trees the early-stopped model should keep.
# Older XGBoost releases expose `best_ntree_limit`, which already is a tree
# count. Newer releases only expose `best_iteration`, a 0-based index of the
# best boosting round, so the matching tree count is best_iteration + 1.
best_ntree_limit = getattr(final_model, "best_ntree_limit", None)
if best_ntree_limit is None:
    best_iteration = getattr(final_model, "best_iteration", None)
    if best_iteration is not None:
        best_ntree_limit = int(best_iteration) + 1

if best_ntree_limit is not None:
    print(f"\nBest ntree limit found via early stopping: {best_ntree_limit}")
    print("Refitting on entire dataset using best number of trees...")
    # Early stopping is switched off for the refit: it is trained on all data
    # with no eval_set, and XGBoost needs a validation set whenever early
    # stopping is enabled.
    final_model.set_params(n_estimators=int(best_ntree_limit), early_stopping_rounds=None)
    final_model.fit(x, y, verbose=False)


In [None]:
# Persist the fitted model and the search result table next to the notebook.
joblib.dump(final_model, "xgb_reg_movie_score_first_iteration.joblib")
cv_results.to_csv("xgb_cv_results_movie_score.csv", index=False)

In [None]:
# Gain-based feature importance of the final model, highest gain first.
booster = final_model.get_booster()
fi = booster.get_score(importance_type='gain')

# The model is trained on a bare NumPy array, so the booster only knows
# positional names ('f0', 'f1', ...). Translate them back to the DataFrame's
# column names when the booster has no names of its own and the column count
# matches; otherwise the keys are left as reported.
column_names = list(dataset_training.columns)
if booster.feature_names is None and booster.num_features() == len(column_names):
    feature_labels = [column_names[int(key[1:])] for key in fi]
else:
    feature_labels = list(fi)

fi_df = (
    pd.DataFrame({'feature': feature_labels, 'gain': list(fi.values())})
    .sort_values('gain', ascending=False)
    .reset_index(drop=True)
)