README.md: 2 additions & 2 deletions
@@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).

```shell
-pip install edsnlp==0.17.2
+pip install edsnlp==0.18.0
```

or, if you want to use the trainable components (using PyTorch):

```shell
pip install "edsnlp[ml]==0.17.2"
pip install "edsnlp[ml]==0.18.0"
```

### A first pipeline
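After pinning the new release, a quick sanity check that the upgrade took effect (a minimal sketch, assuming `edsnlp` is importable in the current environment):

```python
import edsnlp

# Should print "0.18.0" once the pinned upgrade is installed
print(edsnlp.__version__)
```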
changelog.md: 4 additions & 4 deletions
@@ -1,8 +1,8 @@
# Changelog

-## Unreleased
+## v0.18.0 (2025-07-02)

-## Added
+### Added

- Added support for multiple loggers (`tensorboard`, `wandb`, `comet_ml`, `aim`, `mlflow`, `clearml`, `dvclive`, `csv`, `json`, `rich`) in `edsnlp.train` via the `logger` parameter. Default is [`json` and `rich`] for backward compatibility.
- Sub batch sizes for gradient accumulation can now be defined as simple "splits" of the original batch, e.g. `batch_size = 10000 tokens` and `sub_batch_size = 5 splits` to accumulate batches of 2000 tokens.
@@ -12,7 +12,7 @@
- New `Training a span classifier` tutorial, and reorganized deep-learning docs
- `ScheduledOptimizer` now warns when a parameter selector does not match any parameter.

-## Fixed
+### Fixed

- `use_section` in `eds.history` should now correctly handle cases when there are other sections following history sections.
- Added clickable snippets in the documentation for more registered functions
@@ -22,7 +22,7 @@
- :ambulance: Until now, `post_init` was applied **after** the instantiation of the optimizer: if the model discovered new labels, and therefore changed its parameter tensors to reflect that, these new tensors were not taken into account by the optimizer, which could lead to subpar performance. Now, `post_init` is applied **before** the optimizer is instantiated, so that the optimizer can correctly handle the new tensors.
- Added missing entry points for readers and writers in the registry, including `write_parquet` and support for `polars` in `pyproject.toml`. Now all implemented readers and writers are correctly registered as entry points.

-## Changed
+### Changed

- Section cues in `eds.history` are now the section titles, not the full section.
- :boom: Validation metrics are now found under the root field `validation` in the training logs (e.g. `metrics['validation']['ner']['micro']['f']`)
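In practice, the multi-logger entry above might look like the following. This is a hedged sketch rather than the PR's own example; `nlp`, `train_data` and `val_data` are assumed to be built as in the training tutorials:

```python
from edsnlp.training import train

# Per the changelog, `logger` accepts any of: tensorboard, wandb, comet_ml,
# aim, mlflow, clearml, dvclive, csv, json, rich.
# The default, ["json", "rich"], preserves the previous behavior.
nlp = train(
    nlp=nlp,                # assumed: a pipeline built as in the tutorials
    train_data=train_data,  # assumed: a TrainingData instance
    val_data=val_data,      # assumed: validation documents
    logger=["csv", "tensorboard", "rich"],
)
```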
docs/index.md: 2 additions & 2 deletions
@@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) !
You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or using a strict package manager like [Poetry](https://python-poetry.org/).

```{: data-md-color-scheme="slate" }
-pip install edsnlp==0.17.2
+pip install edsnlp==0.18.0
```

or, if you want to use the trainable components (using PyTorch):

```{: data-md-color-scheme="slate" }
pip install "edsnlp[ml]==0.17.2"
pip install "edsnlp[ml]==0.18.0"
```

### A first pipeline
docs/tutorials/training-ner.md: 11 additions & 8 deletions
@@ -233,7 +233,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train]…
import edsnlp
from edsnlp.training import train, ScheduledOptimizer, TrainingData
from edsnlp.metrics.ner import NerExactMetric
-from edsnlp.training.loggers import CSVLogger, RichLogger, WandbLogger
+from edsnlp.training.loggers import CSVLogger, RichLogger, WandBLogger
import edsnlp.pipes as eds
import torch

@@ -242,6 +242,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train]…
nlp.add_pipe(
    # The NER pipe will be a CRF model
    eds.ner_crf(
+        name="ner",
        mode="joint",
        target_span_getter="gold_spans",
        # Set spans as both to ents and in separate `ent.label` groups
@@ -280,19 +281,21 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train]…
    optim=torch.optim.Adam,
    module=nlp,
    total_steps=max_steps,
-    groups={
-        "^transformer": {
-            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0 "max_value": 5e-5,},
+    groups=[
+        {
+            "selector": "transformer",
+            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0, "max_value": 5e-5,},
        },
-        "": {
-            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4 "max_value": 3e-4,},
+        {
+            "selector": ".*",
+            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4, "max_value": 3e-4,},
        },
-    },
+    ],
)

#
loggers = [
-    CSVLogger(),
+    CSVLogger.draft(),  # draft as we will let the train function specify the logging_dir
    RichLogger(
        fields={
            "step": {},
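Taken together, the new `groups` format replaces the regex-keyed dict with a list of dicts carrying an explicit `selector` field. Below is a standalone sketch of the resulting call, with schedule values copied from the diff; `nlp` and `max_steps` are assumed from the surrounding tutorial:

```python
import torch
from edsnlp.training import ScheduledOptimizer

optimizer = ScheduledOptimizer(
    optim=torch.optim.Adam,
    module=nlp,             # assumed: the pipeline defined earlier in the tutorial
    total_steps=max_steps,  # assumed: defined earlier in the tutorial
    groups=[
        {
            # Warmed-up, lower learning rate for the pretrained transformer
            "selector": "transformer",
            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0, "max_value": 5e-5},
        },
        {
            # Catch-all group for every remaining trainable parameter
            "selector": ".*",
            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4, "max_value": 3e-4},
        },
    ],
)
```

Presumably parameters are claimed by the first selector that matches them, which is why the catch-all `".*"` group comes last.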
docs/tutorials/training-span-classifier.md: 6 additions & 4 deletions
@@ -265,24 +265,26 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train]…
# 🎛️ OPTIMIZER (here it will be the same as the default one)
optimizer = ScheduledOptimizer.draft( # (2)!
optim=torch.optim.AdamW,
-    groups={
-        "biopsy_classifier[.]embedding": {
+    groups=[
+        {
+            "selector": "biopsy_classifier[.]embedding",
            "lr": {
                "@schedules": "linear",
                "warmup_rate": 0.1,
                "start_value": 0.,
                "max_value": 5e-5,
            },
        },
-        ".*": {
+        {
+            "selector": ".*",
            "lr": {
                "@schedules": "linear",
                "warmup_rate": 0.1,
                "start_value": 3e-4,
                "max_value": 3e-4,
            },
        },
-    }
+    ]
)

# 🚀 TRAIN
edsnlp/__init__.py: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
import edsnlp.pipes
from . import reducers

__version__ = "0.17.2"
__version__ = "0.18.0"

BASE_DIR = Path(__file__).parent

edsnlp/training/trainer.py: 27 additions & 16 deletions
@@ -676,6 +676,14 @@ def train(
        total_steps=max_steps,
    )

+    for td in train_data:
+        if not (td.pipe_names is None or td.pipe_names <= trainable_pipe_names):
+            raise ValueError(
+                f"Training data pipe names {td.pipe_names} should be a subset of "
+                f"the trainable pipe names {trainable_pipe_names}, or left to None "
+                f"to use this dataset for all trainable components."
+            )

    for phase_i, pipe_names in enumerate(phases):
        trained_pipes_local: Dict[str, TorchComponent] = {
            n: nlp.get_pipe(n) for n in pipe_names
@@ -688,6 +696,14 @@
            if td.pipe_names is None or set(td.pipe_names) & set(pipe_names)
        ]

+        if len(phase_training_data) == 0:
+            raise ValueError(
+                f"No training data found for phase {phase_i + 1} with components "
+                f"{', '.join(pipe_names)}. Make sure that these components are "
+                f"listed in the 'pipe_names' attribute of at least one of the "
+                f"provided training datasets."
+            )

        with nlp.select_pipes(disable=trainable_pipe_names - set(pipe_names)):
            accelerator.print(f"Phase {phase_i + 1}: training {', '.join(pipe_names)}")
            set_seed(seed)
@@ -700,37 +716,32 @@
                grad_params.add(param)
                param.requires_grad_(has_grad_param)

-            accelerator.print(
-                "Optimizing groups:"
-                + "".join(
-                    "\n - {} weight tensors ({:,} parameters){}".format(
+            accelerator.print("Optimizing groups:")
+            for g in optim.param_groups:
+                accelerator.print(
+                    " - {} weight tensors ({:,} parameters){}".format(
                        len([p for p in g["params"] if p in grad_params]),
                        sum([p.numel() for p in g["params"] if p in grad_params]),
                        ": " + " & ".join(g.get("selectors", "*"))
                        if "selectors" in g
                        else "",
                    )
-                    for g in optim.param_groups
                )
-            )
            accelerator.print(
                f"Keeping frozen {len(all_params - grad_params):} weight tensors "
                f"({sum(p.numel() for p in all_params - grad_params):,} parameters)"
            )

            nlp.train(True)

-            iterator = iter(
-                zip(
-                    *(
-                        td(nlp, device).set_processing(
-                            num_cpu_workers=num_workers,
-                            process_start_method="spawn",
-                        )
-                        for td in phase_training_data
-                    )
-                )
-            )
+            phase_datasets = [
+                td(nlp, device).set_processing(
+                    num_cpu_workers=num_workers,
+                    process_start_method="spawn",
+                )
+                for td in phase_training_data
+            ]
+            iterator = iter(zip(*(phase_datasets)))
            (accel_optim, trained_pipes) = accelerator.prepare(optim, trained_pipes)
            if hasattr(accel_optim.optimizer, "initialize"):
                accel_optim.optimizer.initialize()
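The two new checks fail fast on misconfiguration: each dataset's `pipe_names` must be a subset of the pipeline's trainable pipes (or `None` to target all of them), and every training phase must be covered by at least one dataset. A hedged sketch of a conforming dataset follows; parameter names besides `pipe_names` are sketched from the tutorials and should be treated as assumptions:

```python
from edsnlp.training import TrainingData

# `pipe_names` must name trainable components of `nlp`, or be None
# to use this dataset for all of them (enforced by the new check).
td = TrainingData(
    data=train_docs,          # assumed: a stream of annotated documents
    batch_size="2000 words",  # assumed: batching spec as in the tutorials
    shuffle="dataset",        # assumed
    pipe_names=["ner"],
)
```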
tests/test_entrypoints.py: 10 additions & 5 deletions
@@ -25,8 +25,6 @@ def test_entrypoints():


def test_readers_and_writers_entrypoints():
-    import importlib.metadata
-
    # Map of expected entry points for readers and writers
    expected_readers = {
        "spark": "from_spark",
@@ -47,9 +45,16 @@
"polars": "to_polars",
"parquet": "write_parquet",
}
eps = importlib.metadata.entry_points()
readers = {ep.name for ep in eps.select(group="edsnlp_readers")}
writers = {ep.name for ep in eps.select(group="edsnlp_writers")}
eps = entry_points()
if hasattr(eps, "select"):
readers_eps = eps.select(group="edsnlp_readers")
writers_eps = eps.select(group="edsnlp_writers")
else:
readers_eps = eps.get("edsnlp_readers", [])
writers_eps = eps.get("edsnlp_writers", [])

readers = {ep.name for ep in readers_eps}
writers = {ep.name for ep in writers_eps}
for name in expected_readers:
assert name in readers, f"Reader entry point '{name}' is missing"
for name in expected_writers:
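With these entry points registered, every implemented reader and writer is discoverable by name, and the `parquet` pair called out in the changelog can be used directly. A short sketch, with hypothetical file paths:

```python
import edsnlp

# "notes.parquet" and "out.parquet" are hypothetical paths.
docs = edsnlp.data.read_parquet("notes.parquet", converter="omop")
edsnlp.data.write_parquet(docs, "out.parquet")
```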