## Imports

In [4]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform

## Variables

In [5]:
# Columns stripped from the loaded frame before it is split into features and target.
cols_to_drop = [
    "Unnamed: 0",
    "averageRating",
    "numVotes",
    "_orig_order",  # still used as a sort tie-breaker before being dropped
]

### Retrieve data

In [6]:
# Load the semicolon-separated training set. The relative path uses forward
# slashes so it resolves on Windows, Linux and macOS alike (a backslash path
# is only understood as a directory separator on Windows).
df = (
    pd.read_csv("./data/training_dataset.csv", sep=";")
    # Order rows chronologically (ties broken by _orig_order); the
    # TimeSeriesSplit used during tuning depends on this row order.
    .sort_values(by=["startYear", "_orig_order"])
    # Remove the helper / excluded columns only after sorting, because
    # _orig_order is one of the sort keys.
    .drop(columns=cols_to_drop)
)

# Target vector (NumPy array) and feature frame (everything except the target).
y = df["movie_score"].values
X = df.drop(columns=["movie_score"])

### Base model and parameter distribution for randomized search

In [7]:
# Untuned regressor handed to the hyperparameter search as its estimator.
base = XGBRegressor(
    # Learning task: Tweedie regression objective, MAE as evaluation metric.
    objective="reg:tweedie",
    eval_metric="mae",
    # Execution: histogram tree builder, trained on an NVIDIA GPU.
    tree_method="hist",
    device="cuda",
    n_jobs=-1,
    # Fixed seed for reproducibility.
    random_state=42,
)

# Sampling distributions for the randomized search.
#   randint(low, high)  -> integers in [low, high)      (upper bound excluded)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
param_dist = dict(
    # Ensemble size and step size
    n_estimators=randint(300, 1200),            # 300 .. 1199 trees
    learning_rate=uniform(0.01, 0.08),          # 0.01 .. 0.09
    # Tree complexity
    max_depth=randint(3, 6),                    # 3, 4 or 5
    min_child_weight=randint(5, 50),            # 5 .. 49
    # Row / column subsampling
    subsample=uniform(0.6, 0.4),                # 0.6 .. 1.0
    colsample_bytree=uniform(0.6, 0.4),         # 0.6 .. 1.0
    # Regularisation
    reg_alpha=uniform(0.1, 5.0),                # 0.1 .. 5.1
    reg_lambda=uniform(1.0, 5.0),               # 1.0 .. 6.0
    gamma=uniform(0.5, 5.0),                    # 0.5 .. 5.5
    # Tweedie objective shape parameter
    tweedie_variance_power=uniform(1.1, 0.6),   # 1.1 .. 1.7
)




### Tune the model for best hyperparameters

In [None]:
# Cross-validation that respects the temporal ordering of the rows:
# five splits, each validating on rows that come after its training rows.
cv = TimeSeriesSplit(n_splits=5)

# Randomized search for the best hyperparameters: 50 candidates sampled from
# param_dist, ranked by negated mean squared log error (higher is better).
search = RandomizedSearchCV(
    base,
    param_dist,
    n_iter=50,
    cv=cv,
    scoring="neg_mean_squared_log_error",
    random_state=42,
    n_jobs=1,  # candidates run one at a time; presumably because the estimator already trains on the GPU
    verbose=1,
)

search.fit(X, y)

### Train the best model with parameters and save it.

In [9]:
# RandomizedSearchCV is constructed with its default refit=True, so
# best_estimator_ has already been retrained on the complete X, y using the
# winning hyperparameters. Calling fit() on it again would only repeat that
# (expensive) training run, so the model is used as-is.
best_model = search.best_estimator_

# Persist the fitted model.
joblib.dump(best_model, "xgb_reg_movie_number_votes_tweedie.joblib")

# Export every sampled candidate together with its cross-validation scores
# (this table also identifies the best parameter set). Forward slashes keep
# the relative path valid on Windows, Linux and macOS.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.to_csv("./data/xgb_reg_movie_number_votes_tweedie.csv", index=False)