# Try out Ray Tune with scikit-learn & Keras

In [None]:
import random

import numpy as np
import tensorflow as tf
from ray import tune
from ray.air import session
from ray.tune.schedulers import AsyncHyperBandScheduler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tensorflow.keras import callbacks, layers, models, optimizers

from q2_time.model import split_data_by_host
from q2_time.simulate_data import simulate_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Simulate a dataset and carve it into train / validation / test sets
host_id = "host_id"
target = "age_days"
train_size = 0.8
seed = 12

ft, md = simulate_data(100)
# attach the features to the metadata, then order rows by host and target
data = md.join(ft, how="left").sort_values([host_id, target])

# todo: consider to split train into train - val during training
# first split: (train + val) vs. test
train_val, test = split_data_by_host(data, host_id, train_size, seed)
# second split: train vs. val, reusing the same fraction and seed
train, val = split_data_by_host(train_val, host_id, train_size, seed)

# features are the columns of `ft`; the label is the `target` column
X_train, y_train = train[ft.columns], train[target]
X_val, y_val = val[ft.columns], val[target]
X_test, y_test = test[ft.columns], test[target]

In [None]:
# ! define training functions


# Define a training function for RandomForest
def train_rf(config, X_train, y_train, X_val, y_val, seed):
    """Fit a random forest for one trial and report its train/validation MSE.

    Args:
        config: Hyperparameters of this trial. ``n_estimators`` and
            ``max_depth`` are required; ``min_samples_split``,
            ``min_samples_leaf``, ``max_features``, ``min_impurity_decrease``
            and ``bootstrap`` are forwarded to the model when present.
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.
        seed: Seed applied to NumPy's global random state before fitting.

    Reports:
        ``mse_val`` and ``mse_train`` through ``session.report``.
    """
    # setting seed for scikit library
    np.random.seed(seed)

    # Forward every sampled hyperparameter the search space may define.
    # Previously only n_estimators and max_depth reached the model, so the
    # remaining sampled values had no effect on the trial.
    optional_keys = (
        "min_samples_split",
        "min_samples_leaf",
        "max_features",
        "min_impurity_decrease",
        "bootstrap",
    )
    optional_params = {key: config[key] for key in optional_keys if key in config}

    rf = RandomForestRegressor(
        n_estimators=config["n_estimators"],
        max_depth=config["max_depth"],
        **optional_params,
    )
    rf.fit(X_train, y_train)
    # train score
    y_train_pred = rf.predict(X_train)
    score_train = mean_squared_error(y_train, y_train_pred)
    # val score
    y_val_pred = rf.predict(X_val)
    score_val = mean_squared_error(y_val, y_val_pred)

    session.report({"mse_val": score_val, "mse_train": score_train})


# Define a training function for Keras neural network
def train_nn(config, X_train, y_train, X_val, y_val, seed):
    """Train a dense Keras regressor for one trial and report its MSE.

    The network has ``config["n_layers"]`` hidden ReLU layers whose widths are
    read from ``config["n_units_l<i>"]``, followed by a single linear output.
    It is optimised with Adam (``config["learning_rate"]``) on mean squared
    error, in batches of ``config["batch_size"]``, for at most 100 epochs with
    early stopping. ``mse_val`` and ``mse_train`` are sent to
    ``session.report``.
    """
    # make numpy and tensorflow random draws repeatable
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # assemble the network: input -> hidden layers -> one output unit
    network = models.Sequential()
    network.add(layers.Input(shape=(X_train.shape[1],)))
    for layer_idx in range(config["n_layers"]):
        width = config[f"n_units_l{layer_idx}"]
        network.add(layers.Dense(width, activation="relu"))
    network.add(layers.Dense(1))

    # configure the optimisation
    network.compile(
        optimizer=optimizers.Adam(learning_rate=config["learning_rate"]),
        loss="mean_squared_error",
    )

    # Keras holds out 20% of the training data (validation_split) while
    # fitting; the early-stopping callback is attached to that fit
    stopper = callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    fit_options = dict(
        epochs=100,
        batch_size=config["batch_size"],
        validation_split=0.2,
        callbacks=[stopper],
        verbose=0,
    )
    network.fit(X_train, y_train, **fit_options)

    # score on the training data first, then on the validation data
    score_train = mean_squared_error(y_train, network.predict(X_train))
    score_val = mean_squared_error(y_val, network.predict(X_val))

    session.report({"mse_val": score_val, "mse_train": score_train})

In [None]:
# ! define search spaces
# hyperparameter space for the scikit random forest
rf_space = dict(
    n_estimators=tune.randint(100, 1000),
    max_depth=tune.randint(2, 32),
    min_samples_split=tune.choice([0.0001, 0.001, 0.01, 0.1]),
    min_samples_leaf=tune.choice([0.00001, 0.0001, 0.001]),
    max_features=tune.choice([None, "sqrt", "log2", 0.1, 0.2, 0.5, 0.8]),
    min_impurity_decrease=tune.choice([0.0001, 0.001, 0.01]),
    bootstrap=tune.choice([True, False]),
)

# hyperparameter space for the keras neural network
nn_space = dict(
    # Sample random uniformly between [1,9] rounding to multiples of 3
    n_layers=tune.qrandint(1, 9, 3),
    learning_rate=tune.loguniform(1e-5, 1e-1),
    batch_size=tune.choice([32, 64, 128]),
)
# one width entry per possible hidden layer (indices 0 to 8)
nn_space.update({f"n_units_l{i}": tune.randint(3, 64) for i in range(9)})

In [None]:
def run_trials(
    trainable,
    search_space,
    X_train,
    y_train,
    X_val,
    y_val,
    seed,
    num_trials=5,
    scheduler_grace_period=5,
    scheduler_max_t=100,
    resources=None,
):
    """Run a Ray Tune hyperparameter search minimising ``mse_val``.

    Args:
        trainable: Training function called as
            ``trainable(config, X_train=..., y_train=..., X_val=..., y_val=...,
            seed=...)``; it must report a ``mse_val`` metric.
        search_space: Dict of Ray Tune sampling domains passed as
            ``param_space``.
        X_train: Training features handed to ``trainable``.
        y_train: Training targets handed to ``trainable``.
        X_val: Validation features handed to ``trainable``.
        y_val: Validation targets handed to ``trainable``.
        seed: Seed applied to ``random`` and NumPy here and forwarded to
            ``trainable``.
        num_trials: Number of configurations to sample.
        scheduler_grace_period: ``grace_period`` of the
            ``AsyncHyperBandScheduler``.
        scheduler_max_t: ``max_t`` of the ``AsyncHyperBandScheduler``.
        resources: Resources requested per trial; ``None`` means
            ``{"cpu": 1}``.

    Returns:
        Whatever ``tune.Tuner.fit()`` returns; callers in this notebook use
        its ``get_best_result()`` method.
    """
    # Build the default per call instead of sharing one dict object between
    # all calls through a mutable default argument.
    if resources is None:
        resources = {"cpu": 1}

    # set seed for search algorithms/schedulers
    random.seed(seed)
    np.random.seed(seed)

    # AsyncHyperBand enables aggressive early stopping of bad trials.
    # NOTE(review): a trainable that reports only once per trial probably
    # never reaches the grace period, so no trial would be stopped early -
    # confirm against the Ray Tune scheduler documentation.
    scheduler = AsyncHyperBandScheduler(
        # Only stop trials at least this old in time (measured in training iteration)
        grace_period=scheduler_grace_period,
        # stopping trials after max_t iterations have passed
        max_t=scheduler_max_t,
    )

    tuner = tune.Tuner(
        # trainable with input parameters passed and set resources
        tune.with_resources(
            tune.with_parameters(
                trainable,
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val,
                seed=seed,
            ),
            resources,
        ),
        # hyperparameter space
        param_space=search_space,
        tune_config=tune.TuneConfig(
            # todo: consider taking RMSE loss
            metric="mse_val",
            mode="min",
            # define the scheduler
            scheduler=scheduler,
            # number of trials to run
            num_samples=num_trials,
        ),
    )
    return tuner.fit()

In [None]:
# tune the random forest and show the configuration with the lowest mse_val
results_rf = run_trials(
    trainable=train_rf,
    search_space=rf_space,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    seed=seed,
)
print(
    "Best hyperparameters found were: ",
    results_rf.get_best_result().config,
)

In [None]:
# tune the neural network and show the configuration with the lowest mse_val
results_nn = run_trials(
    trainable=train_nn,
    search_space=nn_space,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    seed=seed,
)
print(
    "Best hyperparameters found were: ",
    results_nn.get_best_result().config,
)