In [None]:
# | default_exp _experiments.blog

# Blog

## Running in Google Colab

You can run this experiment in Google Colab by clicking the button below:

<a href="https://colab.research.google.com/github/airtai/monotonic-nn/blob/main/nbs/experiments/Blog.ipynb" target="_blank">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" />
</a>


## Dataset

In [None]:
# | hide

from IPython.display import Markdown, display_markdown

# Detect Google Colab by probing for a package that is importable there.
try:
    import google.colab  # noqa: F401  (imported only to test availability)

    in_colab = True
except ImportError:
    # Only a failed import means "not in Colab". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    in_colab = False

# Show a banner only to Colab users, explaining the directive comments
# that appear at the top of many cells.
if in_colab:
    colab_notice = Markdown(
        """
### If you see this message, you are running in Google Colab
Along with this interactive tutorial the content of this notebook is organized and formatted for documentation purpuoses. 

You can ignore the '# | hide', '# | notest' and '# | echo: false' comments, they are not important for the tutorial.
    """
    )
    display(colab_notice)

Blog Feedback [1] is a dataset containing 54,270 data points from
blog posts. The raw HTML-documents of the blog posts were crawled and processed. The prediction
task associated with the data is the prediction of the number of comments in the upcoming 24 hours.
The feature vector has 276 dimensions, and 8 of its attributes should be monotonically
non-decreasing with respect to the prediction: A51, A52, A53, A54, A56, A57, A58 and A59. The `monotonicity_indicator` entries corresponding to these features (columns `feature_50`–`feature_53` and `feature_55`–`feature_58`, since column names are zero-based) are therefore set to 1. As done in [2], we only use the data points with targets smaller than the 90th percentile.




References:

1.   Krisztian Buza. Feedback prediction for blogs. In Data analysis, machine learning and knowledge discovery, pages 145–152. Springer, 2014
2.   Xingchao Liu, Xing Han, Na Zhang, and Qiang Liu. Certified monotonic neural networks. Advances in Neural Information Processing Systems, 33:15427–15438, 2020



In [None]:
# One entry per input column: 1 marks a feature constrained to be
# monotonically non-decreasing, 0 leaves it unconstrained. The constrained
# columns are the zero-based indices 50-53 and 55-58 (index 54 is excluded).
monotonicity_indicator = {
    f"feature_{i}": int(50 <= i <= 58 and i != 54) for i in range(276)
}

In [None]:
# | hide

if in_colab:
    !pip install "monotonic-nn[experiments]"

In [None]:
# | include: false

from airt.keras.experiments import (
    create_tuner_stats,
    find_hyperparameters,
    get_train_n_test_data,
)

In [None]:
# | include: false
import shutil
from os import environ

import tensorflow as tf

In [None]:
# | include: false

# Set the TensorFlow GPU memory-growth flag before any GPU is queried.
environ.update(TF_FORCE_GPU_ALLOW_GROWTH="true")

gpus = tf.config.list_physical_devices("GPU")
if len(gpus) > 0:
    try:
        # Make only the first physical GPU visible to TensorFlow.
        tf.config.set_visible_devices(gpus[0], "GPU")
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as init_error:
        # Raised when the visible devices are changed after the GPUs have
        # already been initialized; report it and carry on.
        print(init_error)

3 Physical GPUs, 1 Logical GPU


These are a few examples of the dataset:

In [None]:
# | echo: false

# Fetch the train and test frames for the blog dataset, then preview the
# first training rows transposed (one column per sample).
train_df, test_df = get_train_n_test_data(dataset_name="blog")
display(train_df.head().transpose().style)

Unnamed: 0,0,1,2,3,4
feature_0,0.00192,0.00192,0.00064,0.00192,0.00192
feature_1,0.001825,0.001825,0.001825,0.0,0.0
feature_2,0.00292,0.00292,0.0,0.00146,0.00146
feature_3,0.001627,0.001627,0.000651,0.001627,0.001627
feature_4,0.0,0.0,0.0,0.0,0.0
feature_5,0.0,0.0,0.0,0.0,0.0
feature_6,0.0,0.0,0.0,0.0,0.0
feature_7,0.0,0.0,0.0,0.0,0.0
feature_8,0.035901,0.035901,0.035901,0.035901,0.035901
feature_9,0.09625,0.09625,0.09625,0.09625,0.09625


## Hyperparameter search

The choice of the batch size and the maximum number of epochs depends on the dataset size. For this dataset, we use the following values:

In [None]:
# Mini-batch size and upper bound on training epochs used for this dataset.
batch_size, max_epochs = 256, 30

We use the Type-2 architecture built using the `MonoDense` layer with the following hyperparameter ranges:

In [None]:
def hp_params_f(hp):
    """Declare the hyperparameter search space on ``hp``.

    Each hyperparameter is registered through ``hp.Int``, ``hp.Choice`` or
    ``hp.Float``. The result maps the hyperparameter name to whatever the
    corresponding call returned.
    """
    search_space = {}

    # Network shape
    search_space["units"] = hp.Int("units", min_value=16, max_value=32, step=1)
    search_space["n_layers"] = hp.Int("n_layers", min_value=2, max_value=2)
    search_space["activation"] = hp.Choice("activation", values=["elu"])

    # Optimisation and regularisation
    search_space["learning_rate"] = hp.Float(
        "learning_rate", min_value=1e-4, max_value=1e-2, sampling="log"
    )
    search_space["weight_decay"] = hp.Float(
        "weight_decay", min_value=3e-2, max_value=0.3, sampling="log"
    )
    search_space["dropout"] = hp.Float(
        "dropout", min_value=0.0, max_value=0.5, sampling="linear"
    )
    search_space["decay_rate"] = hp.Float(
        "decay_rate", min_value=0.8, max_value=1.0, sampling="reverse_log"
    )

    return search_space

The following fixed parameters are used to build the Type-2 architecture for this dataset:

- `final_activation` is used to build the final layer: `None` for a regression problem or `"sigmoid"` for a classification problem,

- `loss` is used for training: `"mse"` for a regression problem or `"binary_crossentropy"` for a classification problem, and

- `metrics` denotes the metrics used to compare with previously published results: `"accuracy"` for classification and `"mse"` or `"rmse"` for regression.

Parameters `objective` and `direction` are used by the tuner such that `objective=f"val_{metrics}"` and `direction` is either `"min"` or `"max"`.

The parameter `max_trials` denotes the number of trials performed by the tuner, and `patience` is the number of epochs allowed to perform worse than the best one before stopping the current trial. The parameter `executions_per_trial` denotes the number of runs before calculating the results of a trial; it should be set to a value greater than 1 for small datasets that have high variance in results.

In [None]:
# Model output and training objective for this (regression) task.
loss = "mse"
final_activation = None
metrics = tf.keras.metrics.RootMeanSquaredError()

# Tuner settings: which logged quantity to optimise, and in which direction.
objective, direction = "val_root_mean_squared_error", "min"

# Search budget and early-stopping patience.
max_trials, executions_per_trial, patience = 50, 1, 10

In [None]:
# | include: false

# Set to True to run the full hyperparameter search (this takes a long time).
find_hyperparams = False

# `tuner` stays None when the search is skipped; the cells that use it
# check for that.
tuner = (
    find_hyperparameters(
        "blog",
        dir_root="tuner-2",
        monotonicity_indicator=monotonicity_indicator,
        hp_params_f=hp_params_f,
        final_activation=final_activation,
        loss=loss,
        metrics=metrics,
        objective=objective,
        direction=direction,
        max_trials=max_trials,
        patience=patience,
        executions_per_trial=executions_per_trial,
        batch_size=batch_size,
        max_epochs=max_epochs,
    )
    if find_hyperparams
    else None
)

In [None]:
# | include: false

# Build the statistics table only when the search above was actually run.
if tuner is not None:
    stats = create_tuner_stats(tuner, batch_size=batch_size, max_epochs=max_epochs)

The following table describes the best models and their hyperparameters found by the tuner:

In [None]:
# | echo: false

if tuner is not None:
    # Best models first: sort ascending when the objective is minimised,
    # descending otherwise, and keep the top rows.
    df = stats.sort_values(
        by=objective + "_mean", ascending=direction == "min"
    ).head()
    display(df.reset_index(drop=True).transpose().style)

In [None]:
# | include: false

# Emit the best-models table as LaTeX source (without the index column).
if tuner is not None:
    latex_table = df.to_latex(index=False)
    print(latex_table)

## The optimal model

These are the best hyperparameters found by previous runs of the tuner:

In [None]:
def final_hp_params_f(hp):
    """Pin every hyperparameter to a single value via ``hp.Fixed``.

    Returns a mapping from hyperparameter name to whatever ``hp.Fixed``
    returned for it, in the order the hyperparameters are listed below.
    """
    fixed_values = {
        "units": 4,
        "n_layers": 2,
        "activation": "elu",
        "learning_rate": 0.01,
        "weight_decay": 0.0,
        "dropout": 0.0,
        "decay_rate": 0.95,
    }
    return {
        name: hp.Fixed(name, value=value) for name, value in fixed_values.items()
    }

In [None]:
# | include: false
# | notest


# Remove any existing `tuner_final/blog` directory before the run
# (a missing directory is not an error).
shutil.rmtree("tuner_final/blog", ignore_errors=True)

# A single trial with the fixed hyperparameters defined above.
final_tuner = find_hyperparameters(
    "blog",
    dir_root="tuner_final",
    monotonicity_indicator=monotonicity_indicator,
    hp_params_f=final_hp_params_f,
    final_activation=final_activation,
    loss=loss,
    metrics=metrics,
    objective=objective,
    direction=direction,
    max_trials=1,
    patience=patience,
    executions_per_trial=1,
    batch_size=batch_size,
    max_epochs=max_epochs,
)

Trial 1 Complete [00h 08m 49s]
val_root_mean_squared_error: 0.15556064248085022

Best val_root_mean_squared_error So Far: 0.15556064248085022
Total elapsed time: 00h 08m 49s
INFO:tensorflow:Oracle triggered exit


In [None]:
# | include: false
# | notest

# Build the statistics table for the final (fixed-hyperparameter) tuner.
final_stats = create_tuner_stats(
    final_tuner, batch_size=batch_size, max_epochs=max_epochs
)

Unnamed: 0,units,n_layers,activation,learning_rate,weight_decay,dropout,decay_rate,val_root_mean_squared_error_mean,val_root_mean_squared_error_std,val_root_mean_squared_error_min,val_root_mean_squared_error_max,params
0,4,2,elu,0.01,0.0,0.0,0.95,0.154109,0.000568,0.153669,0.154894,1665


The final evaluation of the optimal model:

In [None]:
# | echo: false
# | notest

# Show the final statistics transposed, one row per field.
final_stats.transpose().style

Unnamed: 0,0
units,4
n_layers,2
activation,elu
learning_rate,0.010000
weight_decay,0.000000
dropout,0.000000
decay_rate,0.950000
val_root_mean_squared_error_mean,0.154109
val_root_mean_squared_error_std,0.000568
val_root_mean_squared_error_min,0.153669
