12 changes: 11 additions & 1 deletion README.md
@@ -7,13 +7,14 @@
This repository contains the official code for our [ICML 2024 paper](https://openreview.net/forum?id=VyoY3Wh9Wd). `ifBO` is an efficient Bayesian Optimization algorithm that dynamically selects and incrementally evaluates candidates during the optimization process. It uses a model called the `Freeze-Thaw surrogate (FT-PFN)` to predict the performance of candidate configurations as more resources are allocated. The `main` branch includes the necessary API to use `FT-PFN`. Refer to the following sections:
- [Surrogate API](#surrogate-api): to learn how to initialize and use the surrogate model.
- [Bayesian Optimization with ifBO](#bayesian-optimization-with-ifbo): to understand how to use `ifBO` for Hyperparameter Optimization.
- [Training your own model](#training-your-own-model): to understand the ifBO training pipeline.


> To reproduce experiments from the above paper version, please refer to the branch [`icml-2024`](https://github.com/automl/ifBO/tree/icml-2024).

# Installation

Requires Python 3.11.
Requires Python 3.11 or later.

```bash
pip install -U ifBO
@@ -139,6 +140,15 @@ neps.run(
)
```

## Training your own model

Train your own FT-PFN surrogate from scratch with the following command:

```bash
python -m ifbo.train --epochs 20 --output_path your_own_ifbo.model --seq_len 1000
```

For more training options, run `python -m ifbo.train -h` or inspect `ifbo/train.py`.
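
As an illustration (the values below are arbitrary placeholders, not recommended settings), a run that overrides more of the options defined in `ifbo/train.py` might look like:

```bash
python -m ifbo.train \
    --epochs 50 \
    --seq_len 1000 \
    --batch_size 25 \
    --lr 0.0001 \
    --nlayers 6 --emsize 512 --nhead 4 \
    --num_borders 1000 \
    --train_mixed_precision \
    --output_path your_own_ifbo.model
```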


# Citation
25 changes: 23 additions & 2 deletions ifbo/priors/ftpfn_prior.py
@@ -505,9 +505,30 @@ def forward(self, *x, **kwargs) -> torch.Tensor:
        )
        return out

class MultiCurvesEncoderSeqLen(torch.nn.Module):
    """Curve encoder whose curve-id embedding table is sized by ``seq_len``.

    Per datapoint, column 0 is the curve id, column 1 is the epoch/budget
    (normalized as if uniform on [0, 1]), and the remaining columns are the
    hyperparameter configuration.
    """

    def __init__(self, in_dim: int, out_dim: int, seq_len: int) -> None:
        super().__init__()
        self.normalizer = torch.nn.Sequential(
            encoders.Normalize(0.5, math.sqrt(1 / 12)),
        )
        self.epoch_enc = torch.nn.Linear(1, out_dim, bias=False)
        # One embedding per possible curve id (0 .. seq_len).
        self.idcurve_enc = torch.nn.Embedding(seq_len + 1, out_dim)
        self.configuration_enc = encoders.get_variable_num_features_encoder(encoders.Linear)(
            in_dim - 2, out_dim
        )

    def forward(self, *x, **kwargs) -> torch.Tensor:
        x = torch.cat(x, dim=-1)
        # Sum the epoch, curve-id, and configuration embeddings.
        out = (
            self.epoch_enc(self.normalizer(x[..., 1:2]))
            + self.idcurve_enc(x[..., :1].int()).squeeze(2)
            + self.configuration_enc(x[..., 2:])
        )
        return out


def get_encoder() -> Callable[[int, int], torch.nn.Module]:
return lambda num_features, emsize: MultiCurvesEncoder(num_features, emsize)
def get_encoder(seq_len: int) -> Callable[[int, int], torch.nn.Module]:
    return lambda num_features, emsize: MultiCurvesEncoderSeqLen(num_features, emsize, seq_len)
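
A minimal usage sketch for the new sequence-length-aware encoder, assuming this module path and the usual PFN tensor layout of `(seq_len, batch, num_features)`; the sizes below are arbitrary examples:

```python
import torch

from ifbo.priors.ftpfn_prior import MultiCurvesEncoderSeqLen

seq_len, batch, num_features, emsize = 1000, 8, 12, 512
encoder = MultiCurvesEncoderSeqLen(num_features, emsize, seq_len)
# Equivalently, via the factory added in this PR:
# encoder = get_encoder(seq_len)(num_features, emsize)

x = torch.rand(seq_len, batch, num_features)
# Column 0: integer curve id, embedded with nn.Embedding(seq_len + 1, emsize).
x[..., 0] = torch.randint(0, seq_len + 1, (seq_len, batch)).float()
# Column 1: epoch/budget in [0, 1], normalized and linearly embedded.
# Columns 2+: hyperparameter configuration features.

out = encoder(x)
print(out.shape)  # torch.Size([1000, 8, 512])
```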


def sample_curves(
116 changes: 111 additions & 5 deletions ifbo/train.py
@@ -6,22 +6,26 @@
import time
from typing import Any

import argparse
import torch
from torch import nn
from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler
from tqdm import tqdm

from ifbo import positional_encodings
from ifbo import utils
from ifbo import utils, encoders, bar_distribution
from ifbo.bar_distribution import BarDistribution
from ifbo.bar_distribution import get_custom_bar_dist
from ifbo.priors import prior
from ifbo.priors import prior, ftpfn_prior
from ifbo.priors.utils import get_batch_to_dataloader
from ifbo.transformer import TransformerModel
from ifbo.utils import get_cosine_schedule_with_warmup
from ifbo.utils import get_openai_lr
from ifbo.utils import init_dist

from ifbo.utils import default_device


class Losses:
def get_cross_entropy_loss(self, num_classes: int) -> nn.CrossEntropyLoss:
@@ -205,8 +209,8 @@ def train_epoch() -> tuple[float, list[float], float, float, float, float, float
total_loss = 0.0
total_positional_losses = torch.zeros(bptt)
total_positional_losses_recorded = torch.zeros(bptt)
nan_steps = torch.zeros(1)
ignore_steps = torch.zeros(1)
nan_steps = torch.zeros(1).to(device)
ignore_steps = torch.zeros(1).to(device)
before_get_batch = time.time()
assert (
len(dl) % aggregate_k_gradients == 0
@@ -384,7 +388,7 @@ def apply_batch_wise_criterion(i: int) -> torch.Tensor:
}
if step_callback is not None and rank == 0:
step_callback(metrics_to_log)
nan_steps += nan_share
nan_steps += nan_share.detach()
ignore_steps += (targets == -100).float().mean()
except Exception as e:
print("Invalid step encountered, skipping...")
@@ -459,3 +463,105 @@ def apply_batch_wise_criterion(i: int) -> torch.Tensor:
return total_loss, total_positional_losses, model.to("cpu"), dl

return None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train an ifBO model")

    # transformer model parameters
    parser.add_argument("--nlayers", type=int, help="Number of layers", default=6)
    parser.add_argument("--emsize", type=int, default=512, help="Size of Embeddings")
    parser.add_argument("--nhead", type=int, default=4, help="Number of heads")

    # PFN parameters
    parser.add_argument(
        "--num_borders",
        type=int,
        default=1000,
        help="Number of borders considered in Bar distribution",
    )

    # Prior parameters
    parser.add_argument("--seq_len", type=int, required=True, help="Maximum sequence length")
    parser.add_argument(
        "--num_features",
        type=int,
        required=False,
        help="The total number of features for each datapoint in an example.",
        default=12,  # has to be at least 3
    )
    parser.add_argument(
        "--power_single_eval_pos_sampler",
        type=int,
        required=False,
        help="Power of an exponential distribution to weight sampling of single eval pos.",
        default=-2,
    )

    # training parameters
    parser.add_argument("--epochs", type=int, required=True, help="Number of Training Epochs")
    parser.add_argument("--batch_size", type=int, default=25, help="Batch Size for Training")
    parser.add_argument("--lr", type=float, default=0.0001, help="Learning Rate")
    parser.add_argument("--steps_per_epoch", type=int, default=100, help="Number of Steps per Epoch")
    parser.add_argument(
        "--train_mixed_precision",
        action="store_true",
        help="Enable Mixed Precision Training",
    )
    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use")

    # other parameters
    parser.add_argument("--output_path", type=str, required=True, help="Path to save the model")

    args = parser.parse_args()

    seq_len = args.seq_len

    bucket_limits = torch.linspace(0.0, 1.0, args.num_borders).to(default_device)
    criterion = bar_distribution.BarDistribution(bucket_limits)

    single_eval_pos_gen = utils.get_weighted_single_eval_pos_sampler(
        max_len=seq_len,
        min_len=0,
        p=args.power_single_eval_pos_sampler,
    )

    configs_train = {
        "nlayers": args.nlayers,
        "emsize": args.emsize,
        "epochs": args.epochs,
        "lr": args.lr,
        "nhead": args.nhead,
        "bptt": seq_len,
        "steps_per_epoch": args.steps_per_epoch,
        "train_mixed_precision": args.train_mixed_precision,
        "batch_size": args.batch_size,
    }
    configs_train["nhid"] = args.emsize * 2
    configs_train["warmup_epochs"] = args.epochs // 4
    configs_train.update(
        dict(
            priordataloader_class=get_batch_to_dataloader(ftpfn_prior.get_batch),
            criterion=criterion,
            encoder_generator=ftpfn_prior.get_encoder(seq_len),
            y_encoder_generator=encoders.get_normalized_uniform_encoder(
                encoders.Linear
            ),
            extra_prior_kwargs_dict={
                "num_features": args.num_features,
            },
            single_eval_pos_gen=single_eval_pos_gen,
            style_encoder_generator=None,
        )
    )

    total_loss, total_positional_losses, model, dl = train(
        **configs_train
    )
    print(f"Total loss: {total_loss}, Total positional losses: {total_positional_losses}")
    torch.save(
        model,
        args.output_path,
    )
    print(f"Model saved to {args.output_path}")