From a0ae653e434c4d8da7e81d50f6f869686134ad68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 10 Oct 2024 10:39:14 +0200 Subject: [PATCH 1/2] chore: bump version to 0.13.1 --- README.md | 4 ++-- changelog.md | 8 ++++---- docs/index.md | 4 ++-- pyproject.toml | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a27044b72d..6f56022241 100644 --- a/README.md +++ b/README.md @@ -34,13 +34,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) ! You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/). ```shell -pip install edsnlp==0.13.0 +pip install edsnlp==0.13.1 ``` or if you want to use the trainable components (using pytorch) ```shell -pip install "edsnlp[ml]==0.13.0" +pip install "edsnlp[ml]==0.13.1" ``` ### A first pipeline diff --git a/changelog.md b/changelog.md index 71dac6ec13..b4ec18b139 100644 --- a/changelog.md +++ b/changelog.md @@ -1,19 +1,19 @@ # Changelog -## Unreleased +## v0.13.1 ### Added - `eds.tables` accepts a minimum_table_size (default 2) argument to reduce pollution -- `RuleBasedQualifier` now expose a `process` method that only returns qualified entities and token without actually tagging them, defering this task to the `__call__` method. +- `RuleBasedQualifier` now expose a `process` method that only returns qualified entities and token without actually tagging them, deferring this task to the `__call__` method. - Added new patterns for metastasis detection. Developed on CT-Scan reports. - Added citation of articles ### Fixed -- Disorder and Behavor pipes don't use a "PRESENT" or "ABSENT" `status` anymore. Instead, `status=None` by default, +- Disorder and Behavior pipes don't use a "PRESENT" or "ABSENT" `status` anymore. Instead, `status=None` by default, and `ent._.negation` is set to True instead of setting `status` to "ABSENT". To this end, the *tobacco* and *alcohol* - now use the `NegationQualifier` internaly. + now use the `NegationQualifier` internally. - Numbers are now only detected without trying to remove the pollution in between digits, ie `55 @ 77777` could be detected as a full number before, but not anymore. - Fix fsspec open file encoding to "utf-8". diff --git a/docs/index.md b/docs/index.md index 68ad9babb4..546abc9fe3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,13 +15,13 @@ Check out our interactive [demo](https://aphp.github.io/edsnlp/demo/) ! You can install EDS-NLP via `pip`. We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/). ```{: data-md-color-scheme="slate" } -pip install edsnlp==0.13.0 +pip install edsnlp==0.13.1 ``` or if you want to use the trainable components (using pytorch) ```{: data-md-color-scheme="slate" } -pip install "edsnlp[ml]==0.13.0" +pip install "edsnlp[ml]==0.13.1" ``` ### A first pipeline diff --git a/pyproject.toml b/pyproject.toml index 83e4e85c75..7aa3491df7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "pytz", "pysimstring>=1.2.1", "regex", - "spacy>=3.1,<3.8", + "spacy>=3.2,<3.8", "confit>=0.5.5", "tqdm", "umls-downloader>=0.1.1", From b75bc4ffa1392e44d230e3695a949b517fa899ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 10 Oct 2024 12:30:55 +0200 Subject: [PATCH 2/2] ci: test imports up to python 3.12 and remove scikit-learn dependency --- .github/workflows/release.yml | 1 + .github/workflows/tests.yml | 5 +-- changelog.md | 1 + edsnlp/pipes/core/endlines/endlines.py | 12 ++++--- edsnlp/pipes/misc/quantities/quantities.py | 40 ++++++++++------------ pyproject.toml | 1 - tests/pipelines/test_pipelines.py | 2 +- 7 files changed, 33 insertions(+), 29 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 31f0028f5c..46f98bcc51 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,6 +33,7 @@ jobs: uses: pypa/cibuildwheel@v2.16.5 env: CIBW_ARCHS_MACOS: "x86_64 arm64" + PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" - uses: actions/upload-artifact@v2 with: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d0a0093823..728547434b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -154,7 +154,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.7", "3.8", "3.9"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 @@ -172,6 +172,7 @@ jobs: - name: Install library run: | - pip install . + pip install ".[ml]" pytest + pytest tests/pipelines/test_pipelines.py # uv venv # uv pip install . diff --git a/changelog.md b/changelog.md index b4ec18b139..c88ce1d74b 100644 --- a/changelog.md +++ b/changelog.md @@ -20,6 +20,7 @@ ### Changed - Rename `eds.measurements` to `eds.quantities` +- scikit-learn (used in `eds.endlines`) is no longer installed by default when installing `edsnlp[ml]` ## v0.13.0 diff --git a/edsnlp/pipes/core/endlines/endlines.py b/edsnlp/pipes/core/endlines/endlines.py index 42003017d3..6d0deb2752 100644 --- a/edsnlp/pipes/core/endlines/endlines.py +++ b/edsnlp/pipes/core/endlines/endlines.py @@ -22,6 +22,10 @@ class EndLinesMatcher(GenericMatcher): Behind the scenes, it uses a `endlinesmodel` instance, which is an unsupervised algorithm based on the work of [@zweigenbaum2016]. + !!! warning "Installation" + + To use this component, you need to install the `scikit-learn` library. + Training -------- ```python @@ -93,12 +97,12 @@ class EndLinesMatcher(GenericMatcher): Extensions ---------- - The `eds.endlines` pipeline declares one extension, on both `Span` and `Token` - objects. The `end_line` attribute is a boolean, set to `True` if the pipeline + The `eds.endlines` pipe declares one extension, on both `Span` and `Token` + objects. The `end_line` attribute is a boolean, set to `True` if the pipe predicts that the new line is an end line character. Otherwise, it is set to `False` if the new line is classified as a space. - The pipeline also sets the `excluded` custom attribute on newlines that are + The pipe also sets the `excluded` custom attribute on newlines that are classified as spaces. It lets downstream matchers skip excluded tokens (see [normalisation](/pipes/core/normalisation/)) for more detail. @@ -113,7 +117,7 @@ class EndLinesMatcher(GenericMatcher): Authors and citation -------------------- - The `eds.endlines` pipeline was developed by AP-HP's Data Science team based on + The `eds.endlines` pipe was developed by AP-HP's Data Science team based on the work of [@zweigenbaum2016]. ''' diff --git a/edsnlp/pipes/misc/quantities/quantities.py b/edsnlp/pipes/misc/quantities/quantities.py index fee085220f..bc57d4d0cb 100644 --- a/edsnlp/pipes/misc/quantities/quantities.py +++ b/edsnlp/pipes/misc/quantities/quantities.py @@ -612,7 +612,7 @@ def __init__( as_ents: bool = False, span_setter: Optional[SpanSetterArg] = None, use_tables: bool = True, - measurements: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = None # deprecated # noqa: E501 + measurements: Optional[Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]]] = None, # deprecated # noqa: E501 ): if measurements: @@ -632,7 +632,7 @@ def __init__( "Skipping that step." ) - self.all_quantities = (quantities == "all") + self.all_quantities = quantities == "all" if self.all_quantities: quantities = [] @@ -659,9 +659,7 @@ def __init__( self.extract_ranges = extract_ranges self.range_patterns = range_patterns self.span_getter = ( - validate_span_getter(span_getter) - if span_getter is not None - else None + validate_span_getter(span_getter) if span_getter is not None else None ) self.merge_mode = merge_mode self.before_snippet_limit = before_snippet_limit @@ -676,10 +674,7 @@ def __init__( "ents": as_ents, "measurements": True, "quantities": True, - **{ - name: [name] - for name in self.measure_names.values() - } + **{name: [name] for name in self.measure_names.values()}, } super().__init__(nlp=nlp, name=name, span_setter=span_setter) @@ -1033,10 +1028,17 @@ def get_matches_before(i): table_pd = table._.to_pd_table(as_spans=True) # Find out the number's row for _, row in table_pd.iterrows(): - start_line = next((item.start for item in row - if item is not None), None) - end_line = next((item.end for item in reversed(row) - if item is not None), None) + start_line = next( + (item.start for item in row if item is not None), None + ) + end_line = next( + ( + item.end + for item in reversed(row) + if item is not None + ), + None, + ) if start_line is None: continue @@ -1136,10 +1138,7 @@ def is_within_row(x): else: ent.label_ = self.measure_names[dims] - ent._.set( - ent.label_, - SimpleQuantity(value, unit_norm, self.unit_registry) - ) + ent._.set(ent.label_, SimpleQuantity(value, unit_norm, self.unit_registry)) quantities.append(ent) @@ -1224,9 +1223,7 @@ def merge_quantities_in_ranges(self, quantities: List[Span]) -> List[Span]: ] if len(matching_patterns): try: - new_value = RangeQuantity.from_quantities( - last._.value, ent._.value - ) + new_value = RangeQuantity.from_quantities(last._.value, ent._.value) merged[-1] = last = last.doc[ last.start if matching_patterns[0][0] is None @@ -1296,7 +1293,8 @@ def __call__(self, doc): existing = ( list(get_spans(doc, self.span_getter)) if self.span_getter is not None - else ()) + else () + ) snippets = ( dict.fromkeys(ent.sent for ent in existing) if self.span_getter is not None diff --git a/pyproject.toml b/pyproject.toml index 7aa3491df7..9f2bacf645 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,7 +105,6 @@ ml = [ "safetensors>=0.3.0", "transformers>=4.0.0,<5.0.0", "accelerate>=0.20.3,<1.0.0", - "scikit-learn>=1.0.0", ] [project.urls] diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index fa09cc8e64..5962669501 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -12,5 +12,5 @@ def test_import_all(): import edsnlp.pipes for name in dir(edsnlp.pipes): - if not name.startswith("_"): + if not name.startswith("_") and "endlines" not in name: getattr(edsnlp.pipes, name)