In [None]:
# | default_exp _experiments.auto

# Auto MPG

The Auto MPG Dataset is a regression dataset [1] with 7 features:

- Cylinders

- Displacement

- Horsepower

- Weight

- Acceleration

- Model Year

- Origin.

The dependent variable MPG is monotonically decreasing with respect to the features Weight, Displacement, and Horsepower. The `monotonicity_indicator` values corresponding to these features are set to -1, since MPG decreases monotonically as each of them increases.

This is part of a comparison with the methods and datasets from COMET [2].

References:

1. Ross Quinlan. Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann, 1993.

    https://archive.ics.uci.edu/ml/datasets/auto+mpg

2. Aishwarya Sivaraman, Golnoosh Farnadi, Todd Millstein, and Guy Van den Broeck. Counterexample-guided learning of monotonic neural networks. Advances in Neural Information Processing Systems, 33:11936–11948, 2020.

    Github repo: https://github.com/AishwaryaSivaraman/COMET


In [None]:
# Per-feature monotonicity constraint: -1 marks a feature with respect to
# which the prediction must be monotonically decreasing, 0 marks a feature
# with no monotonicity constraint.
monotonicity_indicator = dict(
    Cylinders=0,
    Displacement=-1,
    Horsepower=-1,
    Weight=-1,
    Acceleration=0,
    Model_Year=0,
    Origin=0,
)

In [None]:
# | include: false

from airt.keras.experiments import (
    create_tuner_stats,
    find_hyperparameters,
    get_train_n_test_data,
)

In [None]:
# | include: false
import shutil
from os import environ

import tensorflow as tf

In [None]:
# | include: false

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all
# up front (see the TensorFlow GPU guide for TF_FORCE_GPU_ALLOW_GROWTH).
environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

gpus = tf.config.list_physical_devices("GPU")
if gpus:
    # Restrict TensorFlow to a single GPU. The experiments were run on the
    # third device (index 2); on hosts with fewer GPUs fall back to the last
    # available one instead of failing with an IndexError.
    gpu_index = min(2, len(gpus) - 1)
    try:
        tf.config.set_visible_devices(gpus[gpu_index], "GPU")
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

3 Physical GPUs, 1 Logical GPU


These are a few examples of the dataset:

In [None]:
# | echo: false

# Load the train/test split of the "auto" dataset and show the leading rows
# of the training frame, transposed so that each column is one sample.
train_df, test_df = get_train_n_test_data(dataset_name="auto")
display(train_df.head().transpose().style)

Unnamed: 0,0,1,2,3,4
Cylinders,1.482807,1.482807,1.482807,1.482807,1.482807
Displacement,1.073028,1.482902,1.044432,1.025368,2.235927
Horsepower,0.650564,1.548993,1.163952,0.907258,2.396084
Weight,0.606625,0.828131,0.523413,0.542165,1.587581
Acceleration,-1.275546,-1.452517,-1.275546,-1.80646,-1.983431
Model_Year,-1.631803,-1.631803,-1.631803,-1.631803,-1.631803
Origin,-0.701669,-0.701669,-0.701669,-0.701669,-0.701669
ground_truth,18.0,15.0,16.0,17.0,15.0


## Hyperparameter search

The choice of the batch size and the maximum number of epochs depends on the dataset size. For this dataset, we use the following values:

In [None]:
# Mini-batch size and the upper bound on training epochs for this dataset.
batch_size, max_epochs = 16, 50

We use the Type-2 architecture built using the `MonoDense` layer with the following hyperparameter ranges:

In [None]:
def hp_params_f(hp):
    """Declare the hyperparameter search space for the Auto MPG model.

    Args:
        hp: Hyperparameter container exposing ``Int``, ``Choice`` and
            ``Float`` (presumably a KerasTuner ``HyperParameters`` object;
            confirm against ``find_hyperparameters``).

    Returns:
        A dict mapping each argument name to the value produced by the
        corresponding ``hp`` declaration.
    """
    # Learning rate and weight decay are searched over the same log-scaled range.
    log_range = {"min_value": 1e-2, "max_value": 0.3, "sampling": "log"}

    return {
        "units": hp.Int("units", min_value=16, max_value=24, step=1),
        # Depth and activation are pinned: each has a single admissible value.
        "n_layers": hp.Int("n_layers", min_value=2, max_value=2),
        "activation": hp.Choice("activation", values=["elu"]),
        "learning_rate": hp.Float("learning_rate", **log_range),
        "weight_decay": hp.Float("weight_decay", **log_range),
        "dropout": hp.Float(
            "dropout", min_value=0.0, max_value=0.5, sampling="linear"
        ),
        "decay_rate": hp.Float(
            "decay_rate", min_value=0.8, max_value=1.0, sampling="reverse_log"
        ),
    }

The following fixed parameters are used to build the Type-2 architecture for this dataset:

- `final_activation` is used to build the final layer for a regression problem (set to `None`) or for a classification problem (`"sigmoid"`),

- `loss` is used for training a regression (`"mse"`) or classification (`"binary_crossentropy"`) problem, and

- `metrics` denotes the metrics used to compare with previously published results: `"accuracy"` for classification and `"mse"` or `"rmse"` for regression.

Parameters `objective` and `direction` are used by the tuner such that `objective=f"val_{metrics}"` and `direction` is either `"min"` or `"max"`.

The parameter `max_trials` denotes the number of trials performed by the tuner, and `patience` is the number of epochs allowed to perform worse than the best one before the current trial is stopped. The parameter `executions_per_trial` denotes the number of runs performed before the results of a trial are calculated; it should be set to a value greater than 1 for small datasets that have high variance in results.

In [None]:
# Regression setup: no final activation, mean squared error as both the
# training loss and the reported metric.
final_activation = None
loss = metrics = "mse"

# The tuner minimises the validation value of the reported metric.
objective = f"val_{metrics}"
direction = "min"

# Search budget: number of trials, early-stopping patience (in epochs) and
# number of runs per trial.
max_trials, patience, executions_per_trial = 200, 5, 3

The following code runs the tuner using the hyperparameter ranges defined above:

In [None]:
# | include: false
# | notest

# Launch the hyperparameter search for the "auto" dataset using the search
# space and the fixed settings defined in the cells above.
tuner = find_hyperparameters(
    "auto",
    # model definition
    monotonicity_indicator=monotonicity_indicator,
    hp_params_f=hp_params_f,
    final_activation=final_activation,
    # optimisation target
    loss=loss,
    metrics=metrics,
    objective=objective,
    direction=direction,
    # search and training budget
    max_trials=max_trials,
    executions_per_trial=executions_per_trial,
    patience=patience,
    max_epochs=max_epochs,
    batch_size=batch_size,
)

INFO:tensorflow:Reloading Tuner from tuner/auto/tuner0.json
INFO:tensorflow:Oracle triggered exit


In [None]:
# | include: false
# | notest

# Collect per-trial statistics from the finished search, using the same batch
# size and epoch cap as the search itself.
stats = create_tuner_stats(tuner, batch_size=batch_size, max_epochs=max_epochs)

Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
4,22,2,elu,0.194285,0.120804,0.074635,0.88955,8.431914,0.073258,8.322106,8.512444,885
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
4,22,2,elu,0.194285,0.120804,0.074635,0.88955,8.431914,0.073258,8.322106,8.512444,885
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597
5,20,2,elu,0.07086,0.012791,0.096718,0.800337,8.525143,0.155735,8.337971,8.68341,811


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
4,22,2,elu,0.194285,0.120804,0.074635,0.88955,8.431914,0.073258,8.322106,8.512444,885
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597
6,22,2,elu,0.031049,0.050126,0.310785,0.970615,8.497766,0.115313,8.343637,8.620289,885
5,20,2,elu,0.07086,0.012791,0.096718,0.800337,8.525143,0.155735,8.337971,8.68341,811


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
7,21,2,elu,0.042817,0.04505,0.324661,0.988544,8.421339,0.063357,8.352478,8.520736,848
4,22,2,elu,0.194285,0.120804,0.074635,0.88955,8.431914,0.073258,8.322106,8.512444,885
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597
6,22,2,elu,0.031049,0.050126,0.310785,0.970615,8.497766,0.115313,8.343637,8.620289,885
5,20,2,elu,0.07086,0.012791,0.096718,0.800337,8.525143,0.155735,8.337971,8.68341,811


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
7,21,2,elu,0.042817,0.04505,0.324661,0.988544,8.421339,0.063357,8.352478,8.520736,848
8,22,2,elu,0.107845,0.032343,0.237459,0.886158,8.430901,0.115722,8.297507,8.565886,885
4,22,2,elu,0.194285,0.120804,0.074635,0.88955,8.431914,0.073258,8.322106,8.512444,885
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597
6,22,2,elu,0.031049,0.050126,0.310785,0.970615,8.497766,0.115313,8.343637,8.620289,885
5,20,2,elu,0.07086,0.012791,0.096718,0.800337,8.525143,0.155735,8.337971,8.68341,811


Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371161,0.084437,8.251875,8.476566,848
9,17,2,elu,0.105021,0.064151,0.18983,0.82854,8.404634,0.149566,8.255271,8.614701,567
1,19,2,elu,0.080618,0.023706,0.149354,0.8,8.420449,0.11067,8.294801,8.576631,627
7,21,2,elu,0.042817,0.04505,0.324661,0.988544,8.421339,0.063357,8.352478,8.520736,848
8,22,2,elu,0.107845,0.032343,0.237459,0.886158,8.430901,0.115722,8.297507,8.565886,885
4,22,2,elu,0.194285,0.120804,0.074635,0.88955,8.431914,0.073258,8.322106,8.512444,885
3,19,2,elu,0.243362,0.094957,0.038402,0.876091,8.45762,0.105302,8.330505,8.592981,627
2,18,2,elu,0.063714,0.017734,0.380232,0.997305,8.489175,0.029429,8.458106,8.52313,597
6,22,2,elu,0.031049,0.050126,0.310785,0.970615,8.497766,0.115313,8.343637,8.620289,885
5,20,2,elu,0.07086,0.012791,0.096718,0.800337,8.525143,0.155735,8.337971,8.68341,811


The following table describes the best models and their hyperparameters found by the tuner:

In [None]:
# | echo: false
# | notest

# Order the trials by the mean of the tuned objective (ascending when the
# objective is minimised, descending otherwise) and keep the top five.
df = stats.sort_values(
    by=objective + "_mean",
    ascending=direction == "min",
).head(5)

df.reset_index(drop=True).transpose().style

Unnamed: 0,0,1,2,3,4
units,21,17,19,21,22
n_layers,2,2,2,2,2
activation,elu,elu,elu,elu,elu
learning_rate,0.073407,0.105021,0.080618,0.042817,0.107845
weight_decay,0.058583,0.064151,0.023706,0.045050,0.032343
dropout,0.157718,0.189830,0.149354,0.324661,0.237459
decay_rate,0.887923,0.828540,0.800000,0.988544,0.886158
val_mse_mean,8.371161,8.404634,8.420449,8.421339,8.430901
val_mse_std,0.084437,0.149566,0.110670,0.063357,0.115722
val_mse_min,8.251875,8.255271,8.294801,8.352478,8.297507


In [None]:
# | include: false
# | notest

# Emit the table of best trials as LaTeX source, without the row index.
latex_table = df.to_latex(index=False)
print(latex_table)

\begin{tabular}{rrlrrrrrrrrr}
\toprule
units & n_layers & activation & learning_rate & weight_decay & dropout & decay_rate & val_mse_mean & val_mse_std & val_mse_min & val_mse_max & params \\
\midrule
21 & 2 & elu & 0.073407 & 0.058583 & 0.157718 & 0.887923 & 8.371161 & 0.084437 & 8.251875 & 8.476566 & 848 \\
17 & 2 & elu & 0.105021 & 0.064151 & 0.189830 & 0.828540 & 8.404634 & 0.149566 & 8.255271 & 8.614701 & 567 \\
19 & 2 & elu & 0.080618 & 0.023706 & 0.149354 & 0.800000 & 8.420449 & 0.110670 & 8.294801 & 8.576631 & 627 \\
21 & 2 & elu & 0.042817 & 0.045050 & 0.324661 & 0.988544 & 8.421339 & 0.063357 & 8.352478 & 8.520736 & 848 \\
22 & 2 & elu & 0.107845 & 0.032343 & 0.237459 & 0.886158 & 8.430901 & 0.115722 & 8.297507 & 8.565886 & 885 \\
\bottomrule
\end{tabular}



## The optimal model

These are the best hyperparameters found by previous runs of the tuner:

In [None]:
def final_hp_params_f(hp):
    """Pin every hyperparameter to the best values found by the tuner.

    Args:
        hp: Hyperparameter container exposing ``Fixed`` (presumably a
            KerasTuner ``HyperParameters`` object; confirm against
            ``find_hyperparameters``).

    Returns:
        A dict mapping each argument name to the value produced by the
        corresponding ``hp.Fixed`` declaration.
    """
    fixed = hp.Fixed
    return {
        "units": fixed("units", value=21),
        "n_layers": fixed("n_layers", 2),
        "activation": fixed("activation", value="elu"),
        "learning_rate": fixed("learning_rate", value=0.073407),
        "weight_decay": fixed("weight_decay", value=0.058583),
        "dropout": fixed("dropout", value=0.157718),
        "decay_rate": fixed("decay_rate", value=0.887923),
    }

In [None]:
# | include: false
# | notest


# Start from a clean slate: drop any results left by an earlier final run.
shutil.rmtree("tuner_final/auto", ignore_errors=True)

# Run a single trial with the fixed, best-found hyperparameters.
# NOTE(review): this run is limited to one epoch and one execution; the
# full-length evaluation appears to be done by create_tuner_stats, which is
# called with the full max_epochs in the next cell. Confirm.
final_tuner = find_hyperparameters(
    "auto",
    dir_root="tuner_final",
    # model definition
    monotonicity_indicator=monotonicity_indicator,
    hp_params_f=final_hp_params_f,
    final_activation=final_activation,
    # optimisation target
    loss=loss,
    metrics=metrics,
    objective=objective,
    direction=direction,
    # search and training budget
    max_trials=1,
    executions_per_trial=1,
    patience=patience,
    max_epochs=1,
    batch_size=batch_size,
)

Trial 1 Complete [00h 00m 03s]
val_mse: 15.842103958129883

Best val_mse So Far: 15.842103958129883
Total elapsed time: 00h 00m 03s
INFO:tensorflow:Oracle triggered exit


In [None]:
# | include: false
# | notest

# Compute the statistics of the final model with the full epoch budget.
final_stats = create_tuner_stats(
    final_tuner, batch_size=batch_size, max_epochs=max_epochs
)

Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_mse_mean,val_mse_std,val_mse_min,val_mse_max,params
0,21,2,elu,0.073407,0.058583,0.157718,0.887923,8.371155,0.08444,8.251865,8.476567,848


The final evaluation of the optimal model:

In [None]:
# | echo: false
# | notest

# Show the final statistics with one row per hyperparameter/metric.
final_stats.transpose().style

Unnamed: 0,0
units,21
n_layers,2
activation,elu
learning_rate,0.073407
weight_decay,0.058583
dropout,0.157718
decay_rate,0.887923
val_mse_mean,8.371155
val_mse_std,0.084440
val_mse_min,8.251865
