From bf31c70cf9835728add2f4dc3d930429b6be991a Mon Sep 17 00:00:00 2001
From: herilalaina
Date: Mon, 25 Nov 2024 12:33:24 +0100
Subject: [PATCH 1/2] Bumping version to v0.3.11

---
 ifbo/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ifbo/version.py b/ifbo/version.py
index aa4cd15..91c67cc 100644
--- a/ifbo/version.py
+++ b/ifbo/version.py
@@ -1 +1 @@
-__version__ = "0.3.10"
+__version__ = "0.3.11"

From f10efede2ee8ffacedba04f960e0e197e4d7478b Mon Sep 17 00:00:00 2001
From: herilalaina
Date: Wed, 4 Jun 2025 12:12:28 +0200
Subject: [PATCH 2/2] add previous training code in train.py

---
 README.md                  |  12 +++-
 ifbo/priors/ftpfn_prior.py |  25 +++++++-
 ifbo/train.py              | 116 +++++++++++++++++++++++++++++++++++--
 3 files changed, 145 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index bd96edf..b87d040 100644
--- a/README.md
+++ b/README.md
@@ -7,13 +7,14 @@
 This repository contains the official code for our [ICML 2024 paper](https://openreview.net/forum?id=VyoY3Wh9Wd). `ifBO` is an efficient Bayesian Optimization algorithm that dynamically selects and incrementally evaluates candidates during the optimization process. It uses a model called the `Freeze-Thaw surrogate (FT-PFN)` to predict the performance of candidate configurations as more resources are allocated. The `main` branch includes the necessary API to use `FT-PFN`. Refer to the following sections:
 - [Surrogate API](#surrogate-api): to learn how to initialize and use the surrogate model.
 - [Bayesian Optimization with ifBO](#bayesian-optimization-with-ifbo): to understand how to use `ifBO` for Hyperparameter Optimization.
+- [Training your own model](#training-your-own-model): to understand the ifBO training pipeline.
 
 > To reproduce experiments from the above paper version, please refer to the branch [`icml-2024`](https://github.com/automl/ifBO/tree/icml-2024).
 
 # Installation
 
-Requires Python 3.11.
+Requires Python 3.11 or later.
 
 ```bash
 pip install -U ifBO
 ```
@@ -139,6 +140,15 @@ neps.run(
 )
 ```
 
+## Training your own model
+
+Train ifBO from scratch with the following command:
+
+```bash
+python -m ifbo.train --epochs 20 --output_path your_own_ifbo.model --seq_len 1000
+```
+
+For more training options, run `python -m ifbo.train -h` or inspect `ifbo/train.py`.
 
 # Citation
 
diff --git a/ifbo/priors/ftpfn_prior.py b/ifbo/priors/ftpfn_prior.py
index fa52ae1..e2d2a81 100644
--- a/ifbo/priors/ftpfn_prior.py
+++ b/ifbo/priors/ftpfn_prior.py
@@ -505,9 +505,30 @@ def forward(self, *x, **kwargs) -> torch.Tensor:
         )
         return out
 
 
+class MultiCurvesEncoderSeqLen(torch.nn.Module):
+    def __init__(self, in_dim: int, out_dim: int, seq_len: int) -> None:
+        super().__init__()
+        self.normalizer = torch.nn.Sequential(
+            encoders.Normalize(0.5, math.sqrt(1 / 12)),
+        )
+        self.epoch_enc = torch.nn.Linear(1, out_dim, bias=False)
+        self.idcurve_enc = torch.nn.Embedding(seq_len + 1, out_dim)
+        self.configuration_enc = encoders.get_variable_num_features_encoder(encoders.Linear)(
+            in_dim - 2, out_dim
+        )
+
+    def forward(self, *x, **kwargs) -> torch.Tensor:
+        x = torch.cat(x, dim=-1)
+        out = (
+            self.epoch_enc(self.normalizer(x[..., 1:2]))
+            + self.idcurve_enc(x[..., :1].int()).squeeze(2)
+            + self.configuration_enc(x[..., 2:])
+        )
+        return out
+
-def get_encoder() -> Callable[[int, int], torch.nn.Module]:
-    return lambda num_features, emsize: MultiCurvesEncoder(num_features, emsize)
+def get_encoder(seq_len: int) -> Callable[[int, int], torch.nn.Module]:
+    return lambda num_features, emsize: MultiCurvesEncoderSeqLen(num_features, emsize, seq_len)
 
 
 def sample_curves(
diff --git a/ifbo/train.py b/ifbo/train.py
index cbd5d81..4951c44 100755
--- a/ifbo/train.py
+++ b/ifbo/train.py
@@ -6,6 +6,7 @@
 import time
 from typing import Any
 
+import argparse
 import torch
 from torch import nn
 from torch.cuda.amp import autocast
@@ -13,15 +14,18 @@
 from tqdm import tqdm
 
 from ifbo import positional_encodings
-from ifbo import utils
+from ifbo import utils, encoders, bar_distribution
 from ifbo.bar_distribution import BarDistribution
 from ifbo.bar_distribution import get_custom_bar_dist
-from ifbo.priors import prior
+from ifbo.priors import prior, ftpfn_prior
+from ifbo.priors.utils import get_batch_to_dataloader
 from ifbo.transformer import TransformerModel
 from ifbo.utils import get_cosine_schedule_with_warmup
 from ifbo.utils import get_openai_lr
 from ifbo.utils import init_dist
+from ifbo.utils import default_device
+
 
 class Losses:
     def get_cross_entropy_loss(self, num_classes: int) -> nn.CrossEntropyLoss:
@@ -205,8 +209,8 @@ def train_epoch() -> tuple[float, list[float], float, float, float, float, float
         total_loss = 0.0
         total_positional_losses = torch.zeros(bptt)
         total_positional_losses_recorded = torch.zeros(bptt)
-        nan_steps = torch.zeros(1)
-        ignore_steps = torch.zeros(1)
+        nan_steps = torch.zeros(1).to(device)
+        ignore_steps = torch.zeros(1).to(device)
         before_get_batch = time.time()
         assert (
             len(dl) % aggregate_k_gradients == 0
@@ -384,7 +388,7 @@ def apply_batch_wise_criterion(i: int) -> torch.Tensor:
                 }
                 if step_callback is not None and rank == 0:
                     step_callback(metrics_to_log)
-                nan_steps += nan_share
+                nan_steps += nan_share.detach()
                 ignore_steps += (targets == -100).float().mean()
             except Exception as e:
                 print("Invalid step encountered, skipping...")
@@ -459,3 +463,105 @@ def apply_batch_wise_criterion(i: int) -> torch.Tensor:
         return total_loss, total_positional_losses, model.to("cpu"), dl
 
     return None
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Train an ifBO model")
+
+    # transformer model parameters
+    parser.add_argument("--nlayers", type=int, help="Number of layers", default=6)
+    parser.add_argument("--emsize", type=int, default=512, help="Size of Embeddings")
+    parser.add_argument("--nhead", type=int, default=4, help="Number of heads")
+
+    # PFN parameters
+    parser.add_argument(
+        "--num_borders",
+        type=int,
+        default=1000,
+        help="Number of borders considered in Bar distribution",
+    )
+
+    # Prior parameters
+    parser.add_argument("--seq_len", type=int, required=True, help="Maximum sequence length")
+    parser.add_argument(
+        "--num_features",
+        type=int,
+        required=False,
+        help="The total number of features for each datapoint in an example.",
+        default=12, # has to be at least 3
+    )
+    parser.add_argument(
+        "--power_single_eval_pos_sampler",
+        type=int,
+        required=False,
+        help="Power of an exponential distribution to weight sampling of single eval pos.",
+        default=-2,
+    )
+
+    # training parameters
+    parser.add_argument("--epochs", type=int, required=True, help="Number of Training Epochs")
+    parser.add_argument("--batch_size", type=int, default=25, help="Batch Size for Training")
+    parser.add_argument("--lr", type=float, default=0.0001, help="Learning Rate")
+    parser.add_argument("--steps_per_epoch", type=int, default=100, help="Number of Steps per Epoch")
+    parser.add_argument(
+        "--train_mixed_precision",
+        action="store_true",
+        help="Enable Mixed Precision Training",
+    )
+    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use")
+
+    # other parameters
+    parser.add_argument("--output_path", type=str, required=True, help="Path to save the model")
+
+    args = parser.parse_args()
+
+    seq_len = args.seq_len
+
+    bucket_limits = torch.linspace(0.0, 1.0, args.num_borders).to(default_device)
+    criterion = bar_distribution.BarDistribution(bucket_limits)
+
+    single_eval_pos_gen = utils.get_weighted_single_eval_pos_sampler(
+        max_len=seq_len,
+        min_len=0,
+        p=args.power_single_eval_pos_sampler,
+    )
+
+    configs_train = {
+        "nlayers": args.nlayers,
+        "emsize": args.emsize,
+        "epochs": args.epochs,
+        "lr": args.lr,
+        "nhead": args.nhead,
+        "bptt": seq_len,
+        "steps_per_epoch": args.steps_per_epoch,
+        "train_mixed_precision": args.train_mixed_precision,
+        "batch_size": args.batch_size,
+    }
+    configs_train["bptt"] = seq_len
+    configs_train["nhid"] = args.emsize * 2
+    configs_train["warmup_epochs"] = args.epochs // 4
+    configs_train.update(
+        dict(
+            priordataloader_class=get_batch_to_dataloader(ftpfn_prior.get_batch),
+            criterion=criterion,
+            encoder_generator=ftpfn_prior.get_encoder(seq_len),
+            y_encoder_generator=encoders.get_normalized_uniform_encoder(
+                encoders.Linear
+            ),
+            extra_prior_kwargs_dict={
+                "num_features": args.num_features,
+            },
+            single_eval_pos_gen=single_eval_pos_gen,
+            style_encoder_generator=None
+        )
+    )
+
+    total_loss, total_positional_losses, model, dl = train(
+        **configs_train
+    )
+    print(f"Total loss: {total_loss}, Total positional losses: {total_positional_losses}")
+    torch.save(
+        model,
+        args.output_path,
+    )
+    print(f"Model saved to {args.output_path}")
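
After applying the patch, the training entry point added above ends with `torch.save(model, args.output_path)`, so the checkpoint on disk is the full model object rather than a plain state dict. As a quick sanity check it can be reloaded with standard PyTorch; a minimal sketch, assuming the example `--output_path` from the README (`your_own_ifbo.model`) and a PyTorch version whose `torch.load` accepts the `weights_only` argument:

```python
# Minimal sketch: reload the model object written by `python -m ifbo.train ...`.
# "your_own_ifbo.model" is the example --output_path from the README; substitute your own path.
import torch

model = torch.load("your_own_ifbo.model", map_location="cpu", weights_only=False)
model.eval()  # inference mode for the loaded surrogate
print(type(model).__name__)
```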