From 28628987ca2e89be02761286ad5b2e83683b375f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Tue, 2 Sep 2025 12:22:01 +0200
Subject: [PATCH 1/3] fix: make entrypoint tests compatible with old python versions
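
importlib.metadata.entry_points() only gained the .select(group=...) API
in Python 3.10; on 3.9 and earlier it returns a plain mapping from group
name to a list of EntryPoint objects, so the test now branches on the
shape of the returned value.

For illustration only (not part of this change), resolving one reader
entry point works the same on both sides of the branch:

    from importlib.metadata import entry_points

    eps = entry_points()
    readers = (
        eps.select(group="edsnlp_readers")  # Python >= 3.10
        if hasattr(eps, "select")
        else eps.get("edsnlp_readers", [])  # Python <= 3.9
    )
    # each entry point resolves to the actual reader function
    from_parquet = next(ep for ep in readers if ep.name == "parquet").load()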
---
 tests/test_entrypoints.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tests/test_entrypoints.py b/tests/test_entrypoints.py
index ceced778d..04df6812e 100644
--- a/tests/test_entrypoints.py
+++ b/tests/test_entrypoints.py
@@ -25,8 +25,6 @@ def test_entrypoints():
 
 
 def test_readers_and_writers_entrypoints():
-    import importlib.metadata
-
     # Map of expected entry points for readers and writers
     expected_readers = {
         "spark": "from_spark",
@@ -47,9 +45,16 @@
         "polars": "to_polars",
         "parquet": "write_parquet",
     }
-    eps = importlib.metadata.entry_points()
-    readers = {ep.name for ep in eps.select(group="edsnlp_readers")}
-    writers = {ep.name for ep in eps.select(group="edsnlp_writers")}
+    eps = entry_points()
+    if hasattr(eps, "select"):
+        readers_eps = eps.select(group="edsnlp_readers")
+        writers_eps = eps.select(group="edsnlp_writers")
+    else:
+        readers_eps = eps.get("edsnlp_readers", [])
+        writers_eps = eps.get("edsnlp_writers", [])
+
+    readers = {ep.name for ep in readers_eps}
+    writers = {ep.name for ep in writers_eps}
     for name in expected_readers:
         assert name in readers, f"Reader entry point '{name}' is missing"
     for name in expected_writers:

From be4c2ff77f30da569650b6c883385b174d4d5d97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Tue, 2 Sep 2025 19:49:40 +0200
Subject: [PATCH 2/3] docs: fix docs tutorials and warn in training func for mismatched pipe names
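
ScheduledOptimizer expects parameter groups as a list of dicts, each
carrying an explicit "selector" regex, rather than a dict keyed by the
pattern; the tutorials now show that form. A minimal sketch, with plain
float learning rates standing in for the schedule dicts used in the
tutorials:

    optimizer = ScheduledOptimizer(
        optim=torch.optim.AdamW,
        module=nlp,
        total_steps=max_steps,
        groups=[
            {"selector": "transformer", "lr": 5e-5},  # embedding weights
            {"selector": ".*", "lr": 3e-4},           # everything else
        ],
    )

train() also fails fast now: a TrainingData whose pipe_names is not a
subset of the trainable pipe names raises a ValueError, as does a phase
that no dataset covers. Hence the explicit name="ner" in the NER
tutorial, so that the pipe name matches the name the training data and
metrics refer to.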
---
 docs/tutorials/training-ner.md             | 19 ++++++++++-----
 docs/tutorials/training-span-classifier.md | 10 +++--
 edsnlp/training/trainer.py                 | 43 ++++++++++++++--------
 3 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/docs/tutorials/training-ner.md b/docs/tutorials/training-ner.md
index c3c151e82..50ddc5614 100644
--- a/docs/tutorials/training-ner.md
+++ b/docs/tutorials/training-ner.md
@@ -233,7 +233,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
     import edsnlp
     from edsnlp.training import train, ScheduledOptimizer, TrainingData
     from edsnlp.metrics.ner import NerExactMetric
-    from edsnlp.training.loggers import CSVLogger, RichLogger, WandbLogger
+    from edsnlp.training.loggers import CSVLogger, RichLogger, WandBLogger
     import edsnlp.pipes as eds
     import torch
 
@@ -242,6 +242,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
     nlp.add_pipe(
         # The NER pipe will be a CRF model
         eds.ner_crf(
+            name="ner",
             mode="joint",
             target_span_getter="gold_spans",
             # Set spans as both to ents and in separate `ent.label` groups
@@ -280,19 +281,21 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
         optim=torch.optim.Adam,
         module=nlp,
         total_steps=max_steps,
-        groups={
-            "^transformer": {
-                "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0 "max_value": 5e-5,},
+        groups=[
+            {
+                "selector": "transformer",
+                "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0, "max_value": 5e-5,},
             },
-            "": {
-                "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4 "max_value": 3e-4,},
+            {
+                "selector": ".*",
+                "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4, "max_value": 3e-4,},
             },
-        },
+        ],
     )
 
     #
     loggers = [
-        CSVLogger(),
+        CSVLogger.draft(),  # draft as we will let the train function specify the logging_dir
         RichLogger(
             fields={
                 "step": {},
diff --git a/docs/tutorials/training-span-classifier.md b/docs/tutorials/training-span-classifier.md
index ce8cd61d2..83aa88b17 100644
--- a/docs/tutorials/training-span-classifier.md
+++ b/docs/tutorials/training-span-classifier.md
@@ -265,8 +265,9 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
     # 🎛️ OPTIMIZER (here it will be the same as the default one)
     optimizer = ScheduledOptimizer.draft(  # (2)!
         optim=torch.optim.AdamW,
-        groups={
-            "biopsy_classifier[.]embedding": {
+        groups=[
+            {
+                "selector": "biopsy_classifier[.]embedding",
                 "lr": {
                     "@schedules": "linear",
                     "warmup_rate": 0.1,
@@ -274,7 +275,8 @@
                     "start_value": 0,
                     "max_value": 5e-5,
                 },
             },
-            ".*": {
+            {
+                "selector": ".*",
                 "lr": {
                     "@schedules": "linear",
                     "warmup_rate": 0.1,
@@ -282,7 +284,7 @@
                     "start_value": 3e-4,
                     "max_value": 3e-4,
                 },
             },
-        }
+        ]
     )
     # 🚀 TRAIN
diff --git a/edsnlp/training/trainer.py b/edsnlp/training/trainer.py
index 2400aa2b1..3c199599a 100644
--- a/edsnlp/training/trainer.py
+++ b/edsnlp/training/trainer.py
@@ -676,6 +676,14 @@ def train(
             total_steps=max_steps,
         )
 
+    for td in train_data:
+        if not (td.pipe_names is None or td.pipe_names <= trainable_pipe_names):
+            raise ValueError(
+                f"Training data pipe names {td.pipe_names} should be a subset of "
+                f"the trainable pipe names {trainable_pipe_names}, or left to None "
+                f"to use this dataset for all trainable components."
+            )
+
     for phase_i, pipe_names in enumerate(phases):
         trained_pipes_local: Dict[str, TorchComponent] = {
             n: nlp.get_pipe(n) for n in pipe_names
@@ -688,6 +696,14 @@
             if td.pipe_names is None or set(td.pipe_names) & set(pipe_names)
         ]
 
+        if len(phase_training_data) == 0:
+            raise ValueError(
+                f"No training data found for phase {phase_i + 1} with components "
+                f"{', '.join(pipe_names)}. Make sure that these components are "
+                f"listed in the 'pipe_names' attribute of at least one of the "
+                f"provided training data."
+            )
+
         with nlp.select_pipes(disable=trainable_pipe_names - set(pipe_names)):
             accelerator.print(f"Phase {phase_i + 1}: training {', '.join(pipe_names)}")
             set_seed(seed)
@@ -700,19 +716,17 @@
                     grad_params.add(param)
                 param.requires_grad_(has_grad_param)
 
-            accelerator.print(
-                "Optimizing groups:"
-                + "".join(
-                    "\n - {} weight tensors ({:,} parameters){}".format(
+            accelerator.print("Optimizing groups:")
+            for g in optim.param_groups:
+                accelerator.print(
+                    " - {} weight tensors ({:,} parameters){}".format(
                         len([p for p in g["params"] if p in grad_params]),
                         sum([p.numel() for p in g["params"] if p in grad_params]),
                         ": " + " & ".join(g.get("selectors", "*"))
                         if "selectors" in g
                         else "",
                     )
-                    for g in optim.param_groups
                 )
-            )
             accelerator.print(
                 f"Keeping frozen {len(all_params - grad_params):} weight tensors "
                 f"({sum(p.numel() for p in all_params - grad_params):,} parameters)"
@@ -720,17 +734,14 @@
 
             nlp.train(True)
 
-            iterator = iter(
-                zip(
-                    *(
-                        td(nlp, device).set_processing(
-                            num_cpu_workers=num_workers,
-                            process_start_method="spawn",
-                        )
-                        for td in phase_training_data
-                    )
+            phase_datasets = [
+                td(nlp, device).set_processing(
+                    num_cpu_workers=num_workers,
+                    process_start_method="spawn",
                 )
-            )
+                for td in phase_training_data
+            ]
+            iterator = iter(zip(*(phase_datasets)))
             (accel_optim, trained_pipes) = accelerator.prepare(optim, trained_pipes)
             if hasattr(accel_optim.optimizer, "initialize"):
                 accel_optim.optimizer.initialize()
From 4fa377da85f3f0560f04f12613fbd6aa5b76c9b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Tue, 2 Sep 2025 10:05:00 +0200
Subject: [PATCH 3/3] chore: bump version to 0.18.0

---
 README.md          | 4 ++--
 changelog.md       | 8 ++++----
 docs/index.md      | 4 ++--
 edsnlp/__init__.py | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index e27a7838d..a09d870a5 100644
--- a/README.md
+++ b/README.md
@@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
 You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/).
 
 ```shell
-pip install edsnlp==0.17.2
+pip install edsnlp==0.18.0
 ```
 
 or if you want to use the trainable components (using pytorch)
 
 ```shell
-pip install "edsnlp[ml]==0.17.2"
+pip install "edsnlp[ml]==0.18.0"
 ```
 
 ### A first pipeline
diff --git a/changelog.md b/changelog.md
index ee6b31512..37ddb27da 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,8 +1,8 @@
 # Changelog
 
-## Unreleased
+## v0.18.0 (2025-09-02)
 
-## Added
+### Added
 
 - Added support for multiple loggers (`tensorboard`, `wandb`, `comet_ml`, `aim`, `mlflow`, `clearml`, `dvclive`, `csv`, `json`, `rich`) in `edsnlp.train` via the `logger` parameter. Default is [`json` and `rich`] for backward compatibility.
 - Sub batch sizes for gradient accumulation can now be defined as simple "splits" of the original batch, e.g. `batch_size = 10000 tokens` and `sub_batch_size = 5 splits` to accumulate batches of 2000 tokens.
@@ -12,7 +12,7 @@
 - New `Training a span classifier` tutorial, and reorganized deep-learning docs
 - `ScheduledOptimizer` now warns when a parameter selector does not match any parameter.
 
-## Fixed
+### Fixed
 
 - `use_section` in `eds.history` should now correctly handle cases when there are other sections following history sections.
 - Added clickable snippets in the documentation for more registered functions
@@ -22,7 +22,7 @@
 - :ambulance: Until now, `post_init` was applied **after** the instantiation of the optimizer : if the model discovered new labels, and therefore changed its parameter tensors to reflect that, these new tensors were not taken into account by the optimizer, which could likely lead to subpar performance. Now, `post_init` is applied **before** the optimizer is instantiated, so that the optimizer can correctly handle the new tensors.
 - Added missing entry points for readers and writers in the registry, including `write_parquet` and support for `polars` in `pyproject.toml`. Now all implemented readers and writers are correctly registered as entry points.
 
-## Changed
+### Changed
 
 - Sections cues in `eds.history` are now section titles, and not the full section.
 - :boom: Validation metrics are now found under the root field `validation` in the training logs (e.g. `metrics['validation']['ner']['micro']['f']`)
diff --git a/docs/index.md b/docs/index.md
index 3e6152a4a..c1869d29b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
 You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/).
 
 ```{: data-md-color-scheme="slate" }
-pip install edsnlp==0.17.2
+pip install edsnlp==0.18.0
 ```
 
 or if you want to use the trainable components (using pytorch)
 
 ```{: data-md-color-scheme="slate" }
-pip install "edsnlp[ml]==0.17.2"
+pip install "edsnlp[ml]==0.18.0"
 ```
 
 ### A first pipeline
diff --git a/edsnlp/__init__.py b/edsnlp/__init__.py
index 31cce4bd8..372d40bd6 100644
--- a/edsnlp/__init__.py
+++ b/edsnlp/__init__.py
@@ -15,7 +15,7 @@
 import edsnlp.pipes
 from . import reducers
 
-__version__ = "0.17.2"
+__version__ = "0.18.0"
 
 BASE_DIR = Path(__file__).parent