Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.7", "3.8", "3.9"]
python-version: ["3.7", "3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v2

Expand Down Expand Up @@ -75,10 +75,17 @@ jobs:
run: |
pip install poetry
pip install -e '.[dev,setup]' pytest-xdist pip
if: matrix.python-version != '3.10'
# uv venv
# source .venv/bin/activate
# uv pip install -e '.[dev,setup]' pytest-xdist pip

- name: Install dependencies
run: |
pip install poetry
pip install -e '.[dev-no-ml,setup]' pytest-xdist pip
if: matrix.python-version == '3.10'

- name: Test with Pytest on Python ${{ matrix.python-version }}
env:
UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }}
Expand Down
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
2. in multiprocessing mode, ensure that the same data is shuffled in the same way in all workers
- Bubble BaseComponent instantiation errors correctly
- Improved support for multi-gpu gradient accumulation (only sync the gradients at the end of the accumulation), now controlled by the optional `sub_batch_size` argument of `TrainingData`.
- Restored support for using edsnlp without pytorch installed
- We now test that edsnlp works without pytorch installed

## v0.14.0 (2024-11-14)

Expand Down
11 changes: 5 additions & 6 deletions edsnlp/processing/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
from tqdm import tqdm

from edsnlp.core.stream import Stage, Stream, StreamSentinel
from edsnlp.core.torch_component import _caches
from edsnlp.data.base import BatchWriter
from edsnlp.utils.collections import (
batch_compress_dict,
Expand Down Expand Up @@ -229,7 +228,7 @@ def cpu_count(): # pragma: no cover


try:
import torch
import torch.multiprocessing

from edsnlp.utils.torch import dump, load

Expand All @@ -242,7 +241,8 @@ def load(file, *args, map_location=None, **kwargs):
return dill.load(f, *args, **kwargs)
return dill.load(file, *args, **kwargs)

dump = dill.dump
# Serialize `obj` to `file` using dill only.
# NOTE(review): this looks like the fallback used when the
# `from edsnlp.utils.torch import dump, load` import in the `try` branch above
# fails — confirm against the full file.
# `skip_tensors` is accepted so callers can pass it, but it is ignored here:
# it is not forwarded to `dill.dump`. Extra positional/keyword arguments are
# passed through to `dill.dump` unchanged.
def dump(obj, file, skip_tensors=False, *args, **kwargs):
return dill.dump(obj, file, *args, **kwargs)


if os.environ.get("TORCH_SHARING_STRATEGY"): # pragma: no cover
Expand Down Expand Up @@ -698,7 +698,6 @@ class GPUWorker(Worker):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.lock = threading.Lock()
self.max_cache = 0

def process_items(self, stage):
autocast = self.stream.autocast
Expand Down Expand Up @@ -727,7 +726,6 @@ def process_items(self, stage):
name = f"from-{self.uid}_to-stage-{stage + 1}_of-{cpu_id}"
queue = self.data_queues[name]
item = (batch_id, batch)
self.max_cache = max(self.max_cache, len(_caches))

# Do NOT put during lock, otherwise this may lead to a deadlock
# in multi-stage (n + 1 where n > 1) scenarios where stage 1
Expand Down Expand Up @@ -1185,7 +1183,8 @@ def feed_queue(self, queue, items):
for q in queues:
if q is queue:
queue.put(STOP)
queue.close()
if hasattr(queue, "close"):
queue.close()
if hasattr(queue, "join_thread"):
queue.join_thread()

Expand Down
6 changes: 3 additions & 3 deletions edsnlp/training/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,9 @@ def __init__(
group["selectors"] = sources
group["params"] = params
cliques.append(group)
cliques = [
{k: v for k, v in group.items() if v is not None} for group in cliques
]
cliques = reversed(
[{k: v for k, v in group.items() if v is not None} for group in cliques]
)

if isinstance(optim, str):
optim = (
Expand Down
47 changes: 37 additions & 10 deletions edsnlp/training/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,17 @@ def set_flat_stats(x, stats):

@validate_arguments
class GenericScorer:
def __init__(self, speed=True, batch_size: Union[int, str] = 1, **scorers):
# Store the scorer configuration; no computation happens here.
#   speed      -- when truthy, __call__ times a full `nlp.pipe` pass over
#                 copies of the docs and reports it under scores["speed"].
#   batch_size -- forwarded as `batch_size=` to `set_processing(...)` on the
#                 `nlp.pipe` calls made by __call__.
#   autocast   -- forwarded as `autocast=` to the same `set_processing(...)`
#                 calls; defaults to None.
#   **scorers  -- remaining keyword arguments, kept as a name -> scorer mapping.
def __init__(
self,
speed: bool = True,
batch_size: Union[int, str] = 1,
autocast: Union[bool, Any] = None,
**scorers,
):
self.scorers = scorers
self.speed = speed
self.batch_size = batch_size
self.autocast = autocast

def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
scores = {}
Expand All @@ -115,7 +122,14 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
# Speed
if self.speed:
t0 = time.time()
list(nlp.pipe(d.copy() for d in tqdm(docs, desc="Computing model speed")))
list(
nlp.pipe(
d.copy() for d in tqdm(docs, desc="Computing model speed")
).set_processing(
batch_size=self.batch_size,
autocast=self.autocast,
)
)
duration = time.time() - t0
scores["speed"] = dict(
wps=sum(len(d) for d in docs) / duration,
Expand All @@ -139,7 +153,8 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
with nlp.select_pipes(enable=ner_pipes):
ner_preds = list(
nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")).set_processing(
batch_size=self.batch_size
batch_size=self.batch_size,
autocast=self.autocast,
)
)
for name, scorer in ner_scorers.items():
Expand Down Expand Up @@ -167,7 +182,8 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
with nlp.select_pipes(disable=ner_pipes):
qlf_preds = list(
nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")).set_processing(
batch_size=self.batch_size
batch_size=self.batch_size,
autocast=self.autocast,
)
)
for name, scorer in span_attr_scorers.items():
Expand All @@ -176,7 +192,12 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
# Custom scorers
for name, scorer in scorers.items():
pred_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
preds = list(nlp.pipe(tqdm(pred_docs, desc="Predicting")))
preds = list(
nlp.pipe(tqdm(pred_docs, desc="Predicting")).set_processing(
batch_size=self.batch_size,
autocast=self.autocast,
)
)
scores[name] = scorer(docs, preds)

return scores
Expand Down Expand Up @@ -242,7 +263,7 @@ def __init__(
self,
data: Stream,
batch_size: BatchSizeArg,
shuffle: str,
shuffle: Union[str, Literal[False]],
sub_batch_size: Optional[BatchSizeArg] = None,
pipe_names: Optional[Collection[str]] = None,
post_init: bool = True,
Expand Down Expand Up @@ -453,7 +474,7 @@ def train(
os.makedirs(output_dir, exist_ok=True)
if config_meta is not None: # pragma: no cover
print(config_meta["unresolved_config"].to_yaml_str())
config_meta["unresolved_config"].to_disk(output_dir / "training_config.yml")
config_meta["unresolved_config"].to_disk(output_dir / "train_config.yml")

validation_interval = validation_interval or max_steps // 10
checkpoint_interval = checkpoint_interval or validation_interval
Expand Down Expand Up @@ -515,10 +536,12 @@ def train(
accelerator.print(
"Optimizing groups:"
+ "".join(
"\n - {} {} weight tensors ({:,} parameters)".format(
g.get("selector", "*") + ":" if "selector" in g else "",
"\n - {} weight tensors ({:,} parameters){}".format(
len([p for p in g["params"] if p in grad_params]),
sum([p.numel() for p in g["params"] if p in grad_params]),
": " + " & ".join(g.get("selectors", "*"))
if "selectors" in g
else "",
)
for g in optim.param_groups
)
Expand Down Expand Up @@ -563,7 +586,11 @@ def train(
disable=not is_main_process,
smoothing=0.3,
):
if is_main_process and (step % validation_interval) == 0:
if (
is_main_process
and step > 0
and (step % validation_interval) == 0
):
scores = scorer(nlp, val_docs) if val_docs else {}
all_metrics.append(
{
Expand Down
51 changes: 15 additions & 36 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies = [
"pysimstring>=1.2.1",
"regex",
"spacy>=3.2,<3.8",
"confit>=0.7.0",
"confit>=0.7.3",
"tqdm",
"umls-downloader>=0.1.1",
"numpy>=1.15.0,<1.23.2; python_version<'3.8'",
Expand All @@ -36,7 +36,7 @@ dependencies = [
"pydantic-core<2.0.0; python_version<'3.8'",
]
[project.optional-dependencies]
dev = [
dev-no-ml = [
"pre-commit>=2.0.0; python_version<'3.8'",
"pre-commit>=2.21.0; python_version>='3.8'",
"pytest>=7.1.0",
Expand All @@ -48,35 +48,12 @@ dev = [
"pyspark",
"polars",

# Machine Learning
"rich-logger>=0.3.1",
"torch>=1.13.0",
"foldedtensor>=0.3.2",
"safetensors>=0.3.0",
"transformers>=4.0.0,<5.0.0",
"accelerate>=0.20.3,<1.0.0",
"mlconjug3<3.9.0",
"scikit-learn>=1.0.0",

# Docs (same as docs group)
"mike~=1.1.2",
"mkdocs-charts-plugin==0.0.8",
"mkdocs-img2fig-plugin==0.9.3",
"mkdocs-material~=9.2.0",
"mkdocs-section-index==0.3.4",
"mkdocs~=1.5.2",
"mkdocstrings~=0.20",
"mkdocstrings-python~=1.1",
"mkdocs-minify-plugin",
"mkdocs-redirects>=1.2.1;python_version>='3.8'",
"pybtex~=0.24.0",
"pathspec>=0.11.1", # required by vendored mkdocs-autorefs PR
"astunparse",
"griffe<0.39",
"jedi",
"html5lib",
"edsnlp[docs-no-ml]",
]
docs = [
docs-no-ml = [
"mike~=1.1.2",
"mkdocs-charts-plugin==0.0.8",
"mkdocs-img2fig-plugin==0.9.3",
Expand All @@ -93,15 +70,6 @@ docs = [
"griffe<0.39",
"jedi",
"html5lib",

"torch>=1.13.0",
"foldedtensor>=0.3.2",
"transformers>=4.0.0,<5.0.0",
"safetensors>=0.3.0",
"rich-logger>=0.3.1",
]
setup = [
"typer"
]
ml = [
"rich-logger>=0.3.1",
Expand All @@ -111,6 +79,17 @@ ml = [
"transformers>=4.0.0,<5.0.0",
"accelerate>=0.20.3,<1.0.0",
]
docs = [
"edsnlp[docs-no-ml]",
"edsnlp[ml]",
]
dev = [
"edsnlp[dev-no-ml]",
"edsnlp[ml]",
]
setup = [
"typer"
]

[project.urls]
"Source Code" = "https://github.com/aphp/edsnlp"
Expand Down
10 changes: 10 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
except AttributeError:
pass
logging.basicConfig(level=logging.INFO)
try:
import torch.nn
except ImportError:
torch = None

pytest.importorskip("rich")


@fixture(scope="session", params=["eds", "fr"])
Expand Down Expand Up @@ -75,11 +81,15 @@ def make_ml_pipeline():

# Fixture returning the pipeline built by `make_ml_pipeline()`.
# Declared with `@fixture()` and no explicit scope.
# The requesting test is skipped when the module-level `torch` name is None,
# i.e. when the `import torch.nn` at the top of this file raised ImportError.
@fixture()
def ml_nlp():
if torch is None:
# Skip at test level, not module level (allow_module_level=False).
pytest.skip("torch not installed", allow_module_level=False)
return make_ml_pipeline()


# Session-scoped fixture (scope="session") returning the pipeline built by
# `make_ml_pipeline()`.
# NOTE(review): the name suggests tests are expected not to mutate the shared
# pipeline — confirm with the tests that use it.
# The requesting test is skipped when the module-level `torch` name is None,
# i.e. when the `import torch.nn` at the top of this file raised ImportError.
@fixture(scope="session")
def frozen_ml_nlp():
if torch is None:
# Skip at test level, not module level (allow_module_level=False).
pytest.skip("torch not installed", allow_module_level=False)
return make_ml_pipeline()


Expand Down
6 changes: 6 additions & 0 deletions tests/data/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
import edsnlp
from edsnlp.utils.collections import ld_to_dl

try:
import torch.nn
except ImportError:
torch = None


def test_map_batches():
items = [1, 2, 3, 4, 5]
Expand Down Expand Up @@ -30,6 +35,7 @@ def test_flat_iterable(num_cpu_workers):


@pytest.mark.parametrize("num_gpu_workers", [0, 1, 2])
@pytest.mark.skipif(torch is None, reason="torch not installed")
def test_map_gpu(num_gpu_workers):
import torch

Expand Down
6 changes: 5 additions & 1 deletion tests/pipelines/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,8 @@ def test_import_all():

for name in dir(edsnlp.pipes):
if not name.startswith("_") and "endlines" not in name:
getattr(edsnlp.pipes, name)
try:
    # Attribute access on `edsnlp.pipes` is what triggers the import of
    # each pipe, so a failure here means the pipe could not be imported.
    getattr(edsnlp.pipes, name)
except (ImportError, AttributeError) as e:
    # Only failures caused by the optional torch dependency are expected
    # when running without the ML extras. Anything else is a genuine
    # import problem and must fail the test instead of being silently
    # swallowed (the previous `if "torch" in str(e): pass` had no `else`
    # branch, so every error was ignored).
    if "torch" not in str(e):
        raise
10 changes: 10 additions & 0 deletions tests/pipelines/trainable/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pytest

# Probe for the optional torch dependency: when `import torch.nn` raises
# ImportError, bind `torch` to None so the check below can test for it.
try:
import torch.nn
except ImportError:
torch = None

# Without torch, skip at module level (allow_module_level=True is what permits
# calling pytest.skip outside of a test function).
if torch is None:
pytest.skip("torch not installed", allow_module_level=True)
# Likewise skip when `rich` cannot be imported.
pytest.importorskip("rich")
Loading
Loading