aphp · percevalw · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -36,7 +36,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
     steps:
       - uses: actions/checkout@v2
 
@@ -75,10 +75,17 @@ jobs:
         run: |
           pip install poetry
           pip install -e '.[dev,setup]' pytest-xdist pip
+        if: matrix.python-version != '3.10'
 #          uv venv
 #          source .venv/bin/activate
 #          uv pip install -e '.[dev,setup]' pytest-xdist pip
 
+      - name: Install dependencies
+        run: |
+          pip install poetry
+          pip install -e '.[dev-no-ml,setup]' pytest-xdist pip
+        if: matrix.python-version == '3.10'
+
       - name: Test with Pytest on Python ${{ matrix.python-version }}
         env:
           UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }}

diff --git a/changelog.md b/changelog.md
@@ -22,6 +22,8 @@
   2. in multiprocessing mode, ensure that the same data is shuffled in the same way in all workers
 - Bubble BaseComponent instantiation errors correctly
 - Improved support for multi-gpu gradient accumulation (only sync the gradients at the end of the accumulation), now controled by the optiona `sub_batch_size` argument of `TrainingData`.
+- Restored support for running edsnlp without pytorch installed
+- We now test that edsnlp works without pytorch installed
 
 ## v0.14.0 (2024-11-14)
 

diff --git a/edsnlp/processing/multiprocessing.py b/edsnlp/processing/multiprocessing.py
@@ -32,7 +32,6 @@
 from tqdm import tqdm
 
 from edsnlp.core.stream import Stage, Stream, StreamSentinel
-from edsnlp.core.torch_component import _caches
 from edsnlp.data.base import BatchWriter
 from edsnlp.utils.collections import (
     batch_compress_dict,
@@ -229,7 +228,7 @@ def cpu_count():  # pragma: no cover
 
 
 try:
-    import torch
+    import torch.multiprocessing
 
     from edsnlp.utils.torch import dump, load
 
@@ -242,7 +241,8 @@ def load(file, *args, map_location=None, **kwargs):
                 return dill.load(f, *args, **kwargs)
         return dill.load(file, *args, **kwargs)
 
-    dump = dill.dump
+    def dump(obj, file, skip_tensors=False, *args, **kwargs):
+        return dill.dump(obj, file, *args, **kwargs)  # skip_tensors unused
 
 
 if os.environ.get("TORCH_SHARING_STRATEGY"):  # pragma: no cover
@@ -698,7 +698,6 @@ class GPUWorker(Worker):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.lock = threading.Lock()
-        self.max_cache = 0
 
     def process_items(self, stage):
         autocast = self.stream.autocast
@@ -727,7 +726,6 @@ def process_items(self, stage):
                     name = f"from-{self.uid}_to-stage-{stage + 1}_of-{cpu_id}"
                     queue = self.data_queues[name]
                     item = (batch_id, batch)
-                    self.max_cache = max(self.max_cache, len(_caches))
 
                 # Do NOT put during lock, otherwise this may lead to a deadlock
                 # in multi-stage (n + 1 where n > 1) scenarios where stage 1
@@ -1185,7 +1183,8 @@ def feed_queue(self, queue, items):
             for q in queues:
                 if q is queue:
                     queue.put(STOP)
-            queue.close()
+            if hasattr(queue, "close"):
+                queue.close()
             if hasattr(queue, "join_thread"):
                 queue.join_thread()
 

diff --git a/edsnlp/training/optimizer.py b/edsnlp/training/optimizer.py
@@ -260,9 +260,9 @@ def __init__(
                         group["selectors"] = sources
                         group["params"] = params
                         cliques.append(group)
-            cliques = [
-                {k: v for k, v in group.items() if v is not None} for group in cliques
-            ]
+            cliques = reversed(
+                [{k: v for k, v in group.items() if v is not None} for group in cliques]
+            )
 
             if isinstance(optim, str):
                 optim = (

diff --git a/edsnlp/training/trainer.py b/edsnlp/training/trainer.py
@@ -102,10 +102,17 @@ def set_flat_stats(x, stats):
 
 @validate_arguments
 class GenericScorer:
-    def __init__(self, speed=True, batch_size: Union[int, str] = 1, **scorers):
+    def __init__(
+        self,
+        speed: bool = True,
+        batch_size: Union[int, str] = 1,
+        autocast: Union[bool, Any] = None,
+        **scorers,
+    ):
         self.scorers = scorers
         self.speed = speed
         self.batch_size = batch_size
+        self.autocast = autocast  # passed to set_processing() by __call__
 
     def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
         scores = {}
@@ -115,7 +122,14 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
         # Speed
         if self.speed:
             t0 = time.time()
-            list(nlp.pipe(d.copy() for d in tqdm(docs, desc="Computing model speed")))
+            list(
+                nlp.pipe(
+                    d.copy() for d in tqdm(docs, desc="Computing model speed")
+                ).set_processing(
+                    batch_size=self.batch_size,
+                    autocast=self.autocast,
+                )
+            )
             duration = time.time() - t0
             scores["speed"] = dict(
                 wps=sum(len(d) for d in docs) / duration,
@@ -139,7 +153,8 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
             with nlp.select_pipes(enable=ner_pipes):
                 ner_preds = list(
                     nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")).set_processing(
-                        batch_size=self.batch_size
+                        batch_size=self.batch_size,
+                        autocast=self.autocast,
                     )
                 )
             for name, scorer in ner_scorers.items():
@@ -167,7 +182,8 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
             with nlp.select_pipes(disable=ner_pipes):
                 qlf_preds = list(
                     nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")).set_processing(
-                        batch_size=self.batch_size
+                        batch_size=self.batch_size,
+                        autocast=self.autocast,
                     )
                 )
             for name, scorer in span_attr_scorers.items():
@@ -176,7 +192,12 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
         # Custom scorers
         for name, scorer in scorers.items():
             pred_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
-            preds = list(nlp.pipe(tqdm(pred_docs, desc="Predicting")))
+            preds = list(
+                nlp.pipe(tqdm(pred_docs, desc="Predicting")).set_processing(
+                    batch_size=self.batch_size,
+                    autocast=self.autocast,
+                )
+            )
             scores[name] = scorer(docs, preds)
 
         return scores
@@ -242,7 +263,7 @@ def __init__(
         self,
         data: Stream,
         batch_size: BatchSizeArg,
-        shuffle: str,
+        shuffle: Union[str, Literal[False]],
         sub_batch_size: Optional[BatchSizeArg] = None,
         pipe_names: Optional[Collection[str]] = None,
         post_init: bool = True,
@@ -453,7 +474,7 @@ def train(
         os.makedirs(output_dir, exist_ok=True)
         if config_meta is not None:  # pragma: no cover
             print(config_meta["unresolved_config"].to_yaml_str())
-            config_meta["unresolved_config"].to_disk(output_dir / "training_config.yml")
+            config_meta["unresolved_config"].to_disk(output_dir / "train_config.yml")
 
     validation_interval = validation_interval or max_steps // 10
     checkpoint_interval = checkpoint_interval or validation_interval
@@ -515,10 +536,12 @@ def train(
             accelerator.print(
                 "Optimizing groups:"
                 + "".join(
-                    "\n - {} {} weight tensors ({:,} parameters)".format(
-                        g.get("selector", "*") + ":" if "selector" in g else "",
+                    "\n - {} weight tensors ({:,} parameters){}".format(
                         len([p for p in g["params"] if p in grad_params]),
                         sum([p.numel() for p in g["params"] if p in grad_params]),
+                        ": " + " & ".join(g.get("selectors", "*"))
+                        if "selectors" in g
+                        else "",
                     )
                     for g in optim.param_groups
                 )
@@ -563,7 +586,11 @@ def train(
                     disable=not is_main_process,
                     smoothing=0.3,
                 ):
-                    if is_main_process and (step % validation_interval) == 0:
+                    if (
+                        is_main_process
+                        and step > 0
+                        and (step % validation_interval) == 0
+                    ):
                         scores = scorer(nlp, val_docs) if val_docs else {}
                         all_metrics.append(
                             {

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
     "pysimstring>=1.2.1",
     "regex",
     "spacy>=3.2,<3.8",
-    "confit>=0.7.0",
+    "confit>=0.7.3",
     "tqdm",
     "umls-downloader>=0.1.1",
     "numpy>=1.15.0,<1.23.2; python_version<'3.8'",
@@ -36,7 +36,7 @@ dependencies = [
     "pydantic-core<2.0.0; python_version<'3.8'",
 ]
 [project.optional-dependencies]
-dev = [
+dev-no-ml = [
     "pre-commit>=2.0.0; python_version<'3.8'",
     "pre-commit>=2.21.0; python_version>='3.8'",
     "pytest>=7.1.0",
@@ -48,35 +48,12 @@ dev = [
     "pyspark",
     "polars",
 
-    # Machine Learning
-    "rich-logger>=0.3.1",
-    "torch>=1.13.0",
-    "foldedtensor>=0.3.2",
-    "safetensors>=0.3.0",
-    "transformers>=4.0.0,<5.0.0",
-    "accelerate>=0.20.3,<1.0.0",
     "mlconjug3<3.9.0",
     "scikit-learn>=1.0.0",
 
-    # Docs (same as docs group)
-    "mike~=1.1.2",
-    "mkdocs-charts-plugin==0.0.8",
-    "mkdocs-img2fig-plugin==0.9.3",
-    "mkdocs-material~=9.2.0",
-    "mkdocs-section-index==0.3.4",
-    "mkdocs~=1.5.2",
-    "mkdocstrings~=0.20",
-    "mkdocstrings-python~=1.1",
-    "mkdocs-minify-plugin",
-    "mkdocs-redirects>=1.2.1;python_version>='3.8'",
-    "pybtex~=0.24.0",
-    "pathspec>=0.11.1",  # required by vendored mkdocs-autorefs PR
-    "astunparse",
-    "griffe<0.39",
-    "jedi",
-    "html5lib",
+    "edsnlp[docs-no-ml]",
 ]
-docs = [
+docs-no-ml = [
     "mike~=1.1.2",
     "mkdocs-charts-plugin==0.0.8",
     "mkdocs-img2fig-plugin==0.9.3",
@@ -93,15 +70,6 @@ docs = [
     "griffe<0.39",
     "jedi",
     "html5lib",
-
-    "torch>=1.13.0",
-    "foldedtensor>=0.3.2",
-    "transformers>=4.0.0,<5.0.0",
-    "safetensors>=0.3.0",
-    "rich-logger>=0.3.1",
-]
-setup = [
-    "typer"
 ]
 ml = [
     "rich-logger>=0.3.1",
@@ -111,6 +79,17 @@ ml = [
     "transformers>=4.0.0,<5.0.0",
     "accelerate>=0.20.3,<1.0.0",
 ]
+docs = [
+    "edsnlp[docs-no-ml]",
+    "edsnlp[ml]",
+]
+dev = [
+    "edsnlp[dev-no-ml]",
+    "edsnlp[ml]",
+]
+setup = [
+    "typer"
+]
 
 [project.urls]
 "Source Code" = "https://github.com/aphp/edsnlp"

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -19,6 +19,12 @@
 except AttributeError:
     pass
 logging.basicConfig(level=logging.INFO)
+try:
+    import torch.nn
+except ImportError:
+    torch = None
+
+pytest.importorskip("rich")
 
 
 @fixture(scope="session", params=["eds", "fr"])
@@ -75,11 +81,15 @@ def make_ml_pipeline():
 
 @fixture()
 def ml_nlp():
+    if torch is None:
+        pytest.skip("torch not installed", allow_module_level=False)
     return make_ml_pipeline()
 
 
 @fixture(scope="session")
 def frozen_ml_nlp():
+    if torch is None:
+        pytest.skip("torch not installed", allow_module_level=False)
     return make_ml_pipeline()
 
 

diff --git a/tests/data/test_stream.py b/tests/data/test_stream.py
@@ -3,6 +3,11 @@
 import edsnlp
 from edsnlp.utils.collections import ld_to_dl
 
+try:
+    import torch.nn
+except ImportError:
+    torch = None
+
 
 def test_map_batches():
     items = [1, 2, 3, 4, 5]
@@ -30,6 +35,7 @@ def test_flat_iterable(num_cpu_workers):
 
 
 @pytest.mark.parametrize("num_gpu_workers", [0, 1, 2])
+@pytest.mark.skipif(torch is None, reason="torch not installed")
 def test_map_gpu(num_gpu_workers):
     import torch
 

diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
@@ -13,4 +13,8 @@ def test_import_all():
 
     for name in dir(edsnlp.pipes):
         if not name.startswith("_") and "endlines" not in name:
-            getattr(edsnlp.pipes, name)
+            try:
+                getattr(edsnlp.pipes, name)
+            except (ImportError, AttributeError) as e:
+                if "torch" not in str(e):
+                    raise  # unrelated import failures must fail the test
diff --git a/tests/pipelines/trainable/conftest.py b/tests/pipelines/trainable/conftest.py
@@ -0,0 +1,10 @@
+import pytest
+
+try:
+    import torch.nn
+except ImportError:
+    torch = None  # torch is unavailable; keep a None placeholder
+
+if torch is None:
+    pytest.skip("torch not installed", allow_module_level=True)
+pytest.importorskip("rich")  # skip as well when rich cannot be imported