In [8]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform

## Variables

In [9]:
# Columns removed from the loaded dataset before it is split into features and target.
# NOTE(review): presumably 'averageRating' / 'numVotes' would leak the target and
# 'Unnamed: 0' is a stray CSV index column — confirm against the dataset.
cols_to_drop = [
    'Unnamed: 0',
    'averageRating',
    'numVotes',
    '_orig_order',
]

### Retrieve data

In [10]:
# Load the semicolon-separated training set, order it chronologically, and drop
# the columns that must not be used as features.
# The path uses forward slashes so it resolves on Windows, Linux and macOS alike;
# a backslash path is treated as a single literal file name on POSIX systems.
# '_orig_order' is used as the tie-breaker within a year and is only dropped
# afterwards (it is listed in cols_to_drop).
df = (
    pd.read_csv("data/training_dataset.csv", sep=";")
    .sort_values(by=['startYear', '_orig_order'])
    .drop(columns=cols_to_drop)
)

# Target vector (NumPy array) and feature frame (every remaining column).
y = df["movie_score"].values
X = df.drop(columns=["movie_score"])

### Base model and parameter distribution for randomized search

In [11]:
# Base XGBoost regressor handed to the hyperparameter search; every
# hyperparameter not listed here keeps its library default.
base = XGBRegressor(
    device="cuda",                 # train on an NVIDIA GPU
    tree_method="hist",            # histogram-based tree construction
    objective="reg:squarederror",  # squared-error regression loss
    n_jobs=-1,                     # let XGBoost use all available threads
    random_state=42,               # fixed seed for reproducible runs
)

# Sampling distributions for the randomized search, one entry per XGBoost
# hyperparameter. scipy conventions used below:
#   randint(low, high)  -> integers in [low, high - 1]
#   uniform(loc, scale) -> floats in [loc, loc + scale]
param_dist = dict(
    # Number of boosting rounds (trees), 200..1999.
    # More rounds can fit better but cost training time and may overfit.
    n_estimators=randint(200, 2000),

    # Shrinkage applied to each boosting step, 0.01..0.21.
    # Smaller steps are more conservative and usually need more trees.
    learning_rate=uniform(0.01, 0.2),

    # Maximum depth of a single tree, 2..9.
    # Deeper trees model more feature interactions but overfit more easily.
    max_depth=randint(2, 10),

    # Minimum summed instance weight (Hessian) required in a child node, 1..9.
    # Larger values make splitting more conservative.
    min_child_weight=randint(1, 10),

    # Fraction of training rows sampled for each tree, 0.5..1.0.
    subsample=uniform(0.5, 0.5),

    # Fraction of feature columns sampled for each tree, 0.5..1.0.
    # Decorrelates the trees and helps limit overfitting.
    colsample_bytree=uniform(0.5, 0.5),

    # L1 (lasso-style) penalty on leaf weights, 0.0..1.0.
    reg_alpha=uniform(0.0, 1.0),

    # L2 (ridge-style) penalty on leaf weights, 0.5..2.5.
    reg_lambda=uniform(0.5, 2.0),

    # Minimum loss reduction needed to make a further split, 0.0..1.0.
    # Larger values prune tree growth more aggressively.
    gamma=uniform(0.0, 1.0),
)



### Tune the model for best hyperparameters

In [None]:
# Time-ordered cross-validation: in every one of the 5 folds the validation rows
# come after the training rows, so the row order of X is respected.
cv = TimeSeriesSplit(n_splits=5)

# Randomized search: draw 50 hyperparameter candidates from param_dist and
# score each by negated mean absolute error (higher is better).
search = RandomizedSearchCV(
    base,
    param_dist,
    cv=cv,
    scoring="neg_mean_absolute_error",
    n_iter=50,
    random_state=42,  # reproducible candidate sampling
    n_jobs=1,         # candidates are evaluated one at a time
    verbose=1,
)

search.fit(X, y)

### Train the best model with parameters and save it.

In [13]:
# RandomizedSearchCV was created with its default refit=True, so best_estimator_
# is already trained on ALL of X, y with the best hyperparameters found.
# Fitting it again here would only repeat that training run.
best_model = search.best_estimator_

# Persist the fitted model next to the notebook.
joblib.dump(best_model, "xgb_reg_movie_log_transformed.joblib")

# Save the full search log (every sampled parameter set with its CV scores and
# ranking) as CSV. Forward slashes keep the path valid on Windows and POSIX.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.to_csv("data/xgb_reg_movie_log_transformed.csv", index=False)