From 7cfcf4e8582d9f4fdf21bff9b85fc8ccc75e6d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 14 Nov 2024 15:35:01 +0100 Subject: [PATCH 1/4] fix: support nlp-less components --- docs/concepts/pipeline.md | 34 +++++++++++++- docs/scripts/clickable_snippets.py | 4 +- docs/tutorials/make-a-training-script.md | 1 + edsnlp/core/pipeline.py | 33 +++++-------- edsnlp/core/registries.py | 33 ++++++++++++- edsnlp/data/converters.py | 3 +- tests/test_pipeline.py | 60 +++++++++++++++++++++++- tests/training/qlf_config.yml | 2 + 8 files changed, 140 insertions(+), 30 deletions(-) diff --git a/docs/concepts/pipeline.md b/docs/concepts/pipeline.md index 23b83919b..8d9fcda5f 100644 --- a/docs/concepts/pipeline.md +++ b/docs/concepts/pipeline.md @@ -60,12 +60,42 @@ To create your first EDS-NLP pipeline, run the following code. We provide severa nlp.add_pipe("eds.negation") ``` -=== "From a config file" +=== "From a YAML config file" + + You can also create a pipeline from a configuration file. This is useful when you plan on changing the pipeline configuration often. + + ```{ .yaml title="config.yml" } + nlp: + "@core": pipeline + lang: eds + components: + sentences: + "@factory": eds.sentences + + matcher: + "@factory": eds.matcher + regex: + smoker: ["fume", "clope"] + + negation: + "@factory": eds.negation + ``` + + and then load the pipeline with: + + ```{ .python .no-check } + import edsnlp + + nlp = edsnlp.load("config.yml") + ``` + +=== "From an INI config file" You can also create a pipeline from a configuration file. This is useful when you plan on changing the pipeline configuration often. 
```{ .cfg title="config.cfg" } [nlp] + @core = "pipeline" lang = "eds" pipeline = ["sentences", "matcher", "negation"] @@ -100,7 +130,7 @@ from pathlib import Path nlp("Le patient ne fume pas") # Processing multiple documents -model.pipe([text1, text2]) +nlp.pipe([text1, text2]) ``` For more information on how to use the pipeline, refer to the [Inference](/inference) page. diff --git a/docs/scripts/clickable_snippets.py b/docs/scripts/clickable_snippets.py index 4704d7cff..2b901448a 100644 --- a/docs/scripts/clickable_snippets.py +++ b/docs/scripts/clickable_snippets.py @@ -184,7 +184,7 @@ def replace_link(match): # Re-insert soups into the output for soup, start, end in reversed(soups): - output = output[:start] + str(soup) + output[end:] + output = output[:start] + str(soup.find("code")) + output[end:] output = regex.sub(HREF_REGEX, replace_link, output) @@ -202,7 +202,7 @@ def convert_html_to_code( cls, html_content: str ) -> Tuple[BeautifulSoup, str, list, list]: pre_html_content = "
" + html_content + "
" - soup = BeautifulSoup(pre_html_content, "html5lib") + soup = list(BeautifulSoup(pre_html_content, "html5lib").children)[0] code_element = soup.find("code") line_lengths = [0] diff --git a/docs/tutorials/make-a-training-script.md b/docs/tutorials/make-a-training-script.md index 806122872..29b4130ef 100644 --- a/docs/tutorials/make-a-training-script.md +++ b/docs/tutorials/make-a-training-script.md @@ -395,6 +395,7 @@ print(nlp.config.to_yaml_str()) ```yaml title="config.yml" nlp: + "@core": "pipeline" lang: "eds" components: ner: diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py index 74f47a726..d92f99925 100644 --- a/edsnlp/core/pipeline.py +++ b/edsnlp/core/pipeline.py @@ -1,7 +1,6 @@ import contextlib import functools import importlib -import inspect import os import re import shutil @@ -10,6 +9,7 @@ import sysconfig import warnings from enum import Enum +from inspect import Parameter, signature from pathlib import Path from types import FunctionType from typing import ( @@ -105,7 +105,7 @@ def __init__( vocab_config: Type[BaseDefaults] = None, meta: Dict[str, Any] = None, pipeline: Optional[Sequence[str]] = None, - components: Dict[str, CurriedFactory] = {}, + components: Dict[str, Any] = {}, disable: AsList[str] = EMPTY_LIST, enable: AsList[str] = EMPTY_LIST, exclude: AsList = EMPTY_LIST, @@ -232,17 +232,18 @@ def create_pipe( Pipe """ try: - curried: CurriedFactory = Config( + pipe = Config( { "@factory": factory, **(config if config is not None else {}), } ).resolve(registry=registry) - if name is None: - name = inspect.signature(curried.factory).parameters.get("name").default - if name is None or name == inspect.Parameter.empty: - name = factory - pipe = curried.instantiate(nlp=self, path=(name,)) + if isinstance(pipe, CurriedFactory): + if name is None: + name = signature(pipe.factory).parameters.get("name").default + if name is None or name == Parameter.empty: + name = factory + pipe = pipe.instantiate(nlp=self, path=(name,)) except 
ConfitValidationError as e: raise e.with_traceback(None) return pipe @@ -413,8 +414,8 @@ def pipe( inputs: Iterable[Union[str, Doc]] The inputs to create the Docs from, or Docs directly. n_process: int - Deprecated. Use the ".set(num_cpu_workers=n_process)" method on the returned - data stream instead. + Deprecated. Use the ".set_processing(num_cpu_workers=n_process)" method + on the returned data stream instead. The number of parallel workers to use. If 0, the operations will be executed sequentially. @@ -589,16 +590,6 @@ def _add_pipes( enable: Container[str], disable: Container[str], ): - # Since components are actually resolved as curried factories, - # we need to instantiate them here - for name, component in components.items(): - if not isinstance(component, CurriedFactory): - raise ValueError( - f"Component {repr(name)} is not instantiable (got {component}). " - f"Please make sure that you didn't forget to add a '@factory' " - f"key to the component config." - ) - try: components = CurriedFactory.instantiate(components, nlp=self) except ConfitValidationError as e: @@ -1215,7 +1206,7 @@ def load( elif is_package: # Load as package available_kwargs = {"overrides": overrides, **pipe_selection} - signature_kwargs = inspect.signature(module.load).parameters + signature_kwargs = signature(module.load).parameters kwargs = { name: available_kwargs[name] for name in signature_kwargs diff --git a/edsnlp/core/registries.py b/edsnlp/core/registries.py index c3b9a5409..8628b5f4a 100644 --- a/edsnlp/core/registries.py +++ b/edsnlp/core/registries.py @@ -75,12 +75,38 @@ def maybe_nlp(self) -> Union["CurriedFactory", Any]: ------- Union["CurriedFactory", Any] """ + from edsnlp.core.pipeline import Pipeline, PipelineProtocol + sig = inspect.signature(self.factory) - # and sig.parameters["nlp"].default is sig.empty - if "nlp" not in sig.parameters or "nlp" in self.kwargs: + if ( + not ( + "nlp" in sig.parameters + and ( + sig.parameters["nlp"].default is sig.empty + or 
sig.parameters["nlp"].annotation in (Pipeline, PipelineProtocol) + ) + ) + or "nlp" in self.kwargs + ) and not self.search_curried_factory(self.kwargs): return self.factory(**self.kwargs) return self + @classmethod + def search_curried_factory(cls, obj): + if isinstance(obj, CurriedFactory): + return obj + elif isinstance(obj, dict): + for value in obj.values(): + result = cls.search_curried_factory(value) + if result is not None: + return result + elif isinstance(obj, (tuple, list, set)): + for value in obj: + result = cls.search_curried_factory(value) + if result is not None: + return result + return None + def instantiate( obj: Any, nlp: "edsnlp.Pipeline", @@ -177,6 +203,9 @@ def __getattr__(self, name): raise AttributeError(name) self._raise_curried_factory_error() + def __repr__(self): + return f"CurriedFactory({self.factory})" + glob = [] diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py index c8c262354..1bf1e6d2b 100644 --- a/edsnlp/data/converters.py +++ b/edsnlp/data/converters.py @@ -644,7 +644,6 @@ def __call__(self, doc): def get_dict2doc_converter( converter: Union[str, Callable], kwargs ) -> Tuple[Callable, Dict]: - kwargs_to_init = False if not callable(converter): available = edsnlp.registry.factory.get_available() try: @@ -666,7 +665,7 @@ def get_dict2doc_converter( f"Cannot find converter for format {converter}. 
" f"Available converters are {', '.join(available)}" ) - if isinstance(converter, type) or kwargs_to_init: + if isinstance(converter, type): return converter(**kwargs), {} return converter, validate_kwargs(converter, kwargs) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 9c0ff82c1..d4734498d 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -130,8 +130,8 @@ def test_disk_serialization(tmp_path, ml_nlp): [components.ner] @factory = "eds.ner_crf" embedding = ${components.transformer} -target_span_getter = ["ents", "ner-preds"] mode = "independent" +target_span_getter = ["ents", "ner-preds"] labels = ["PERSON", "GIFT"] infer_span_setter = false window = 40 @@ -254,6 +254,41 @@ def test_config_validation_error(): assert "got 'error-mode'" in str(e.value) +@edsnlp.registry.factory.register("test_wrapper", spacy_compatible=False) +class WrapperComponent: + def __init__(self, *, copy_list, copy_dict, sub): + pass + + +fail_config_sub = """ +nlp: + lang: "eds" + components: + wrapper: + "@factory": "test_wrapper" + + copy_list: + - ${nlp.components.wrapper.sub} + + copy_dict: + key: ${nlp.components.wrapper.sub} + + sub: + "@factory": "eds.matcher" + terms: 100.0 # clearly wrong + + matcher_copy: ${nlp.components.wrapper.sub} +""" + + +def test_config_sub_validation_error(): + with pytest.raises(ConfitValidationError): + Pipeline.from_config(Config.from_yaml_str(fail_config_sub)) + + fix = {"nlp": {"components": {"wrapper": {"sub": {"terms": {"pattern": ["ok"]}}}}}} + Pipeline.from_config(Config.from_yaml_str(fail_config_sub).merge(fix)) + + def test_add_pipe_validation_error(): model = edsnlp.blank("eds") with pytest.raises(ConfitValidationError) as e: @@ -407,3 +442,26 @@ def test_repr(frozen_ml_nlp): "ner": eds.ner_crf })""" ) + + +@edsnlp.registry.factory.register("test_nlp_less", spacy_compatible=False) +class NlpLessComponent: + def __init__(self, nlp=None, name: str = "nlp_less", *, value: int): + self.value = value + self.name 
= name + + def __call__(self, doc): + return doc + + +def test_nlp_less_component(): + component = NlpLessComponent(value=42) + assert component.value == 42 + + config = """ +[component] +@factory = "test_nlp_less" +value = 42 +""" + component = Config.from_str(config).resolve(registry=registry)["component"] + assert component.value == 42 diff --git a/tests/training/qlf_config.yml b/tests/training/qlf_config.yml index 960ad857b..884a8e349 100644 --- a/tests/training/qlf_config.yml +++ b/tests/training/qlf_config.yml @@ -1,5 +1,7 @@ # 🤖 PIPELINE DEFINITION nlp: + "@core": pipeline + lang: eds components: From 359c9d5b62ee99d903c3ab59d7e892b166e562e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Fri, 15 Nov 2024 02:08:09 +0100 Subject: [PATCH 2/4] fix: redirect measurements import to quantities --- edsnlp/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/edsnlp/__init__.py b/edsnlp/__init__.py index 152cfa253..3ab460806 100644 --- a/edsnlp/__init__.py +++ b/edsnlp/__init__.py @@ -52,6 +52,10 @@ def find_spec(self, fullname, path, target=None): # pragma: no cover new_name = fullname.replace("span_qualifier", "span_classifier") spec = importlib.util.spec_from_loader(fullname, AliasLoader(new_name)) return spec + if "measurements" in fullname.split("."): + new_name = fullname.replace("measurements", "quantities") + spec = importlib.util.spec_from_loader(fullname, AliasLoader(new_name)) + return spec class AliasLoader(importlib.abc.Loader): From f33afd770c8c40358564b1cb5ce95e86cb7296f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 14 Nov 2024 21:17:05 +0100 Subject: [PATCH 3/4] ci: test build only with pytorch cpu --- .github/workflows/release.yml | 2 +- .github/workflows/test-build.yml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b6b9198d0..a2b733941 100644 --- a/.github/workflows/release.yml +++ 
b/.github/workflows/release.yml @@ -33,7 +33,7 @@ jobs: uses: pypa/cibuildwheel@v2.21.3 env: CIBW_ARCHS_MACOS: "x86_64 arm64" - PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" + CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu - uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index ac64a28b3..569849669 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -4,6 +4,7 @@ name: Test Build + on: workflow_dispatch: pull_request: @@ -27,6 +28,8 @@ jobs: uses: pypa/cibuildwheel@v2.16.5 env: CIBW_ARCHS_MACOS: "x86_64 arm64" + CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu + build_sdist: name: Build source distribution From 733a7fe2e3d737e95514ba3e3cf70a5762688445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 14 Nov 2024 12:53:11 +0100 Subject: [PATCH 4/4] chore: bump version to 0.14.0 --- README.md | 4 ++-- changelog.md | 2 +- docs/index.md | 4 ++-- edsnlp/__init__.py | 2 +- pyproject.toml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6f5602224..f30dca7d7 100644 --- a/README.md +++ b/README.md @@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) ! You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/). 
```shell -pip install edsnlp==0.13.1 +pip install edsnlp==0.14.0 ``` or if you want to use the trainable components (using pytorch) ```shell -pip install "edsnlp[ml]==0.13.1" +pip install "edsnlp[ml]==0.14.0" ``` ### A first pipeline diff --git a/changelog.md b/changelog.md index 9836cadd0..133f20fc3 100644 --- a/changelog.md +++ b/changelog.md @@ -1,6 +1,6 @@ # Changelog -## Unreleased +## v0.14.0 (2024-11-14) ### Added diff --git a/docs/index.md b/docs/index.md index 546abc9fe..e3ac71610 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) ! You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/). ```{: data-md-color-scheme="slate" } -pip install edsnlp==0.13.1 +pip install edsnlp==0.14.0 ``` or if you want to use the trainable components (using pytorch) ```{: data-md-color-scheme="slate" } -pip install "edsnlp[ml]==0.13.1" +pip install "edsnlp[ml]==0.14.0" ``` ### A first pipeline diff --git a/edsnlp/__init__.py b/edsnlp/__init__.py index 3ab460806..620685214 100644 --- a/edsnlp/__init__.py +++ b/edsnlp/__init__.py @@ -15,7 +15,7 @@ import edsnlp.pipes from . import reducers -__version__ = "0.13.1" +__version__ = "0.14.0" BASE_DIR = Path(__file__).parent diff --git a/pyproject.toml b/pyproject.toml index a071d4088..eb8e7dfb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "pysimstring>=1.2.1", "regex", "spacy>=3.2,<3.8", - "confit>=0.5.5", + "confit>=0.7.0", "tqdm", "umls-downloader>=0.1.1", "numpy>=1.15.0,<1.23.2; python_version<'3.8'",