From 9fa4541fa27a62a074f94ab457c16389e65b989e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Tue, 15 Apr 2025 09:11:04 +0200 Subject: [PATCH 01/11] chore: update ruff --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfaa2cd705..efdf4e9ecd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: # ruff - repo: https://github.com/charliermarsh/ruff-pre-commit # Ruff version. - rev: 'v0.6.4' + rev: 'v0.9.6' hooks: - id: ruff args: ['--config', 'pyproject.toml', '--fix', '--show-fixes'] From 22f6568bdef1ffefaac01fea43802096e3ba2454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 2 May 2024 16:33:36 +0200 Subject: [PATCH 02/11] feat: expose qualifiers default patterns as a default_patterns class attr --- changelog.md | 1 + edsnlp/pipes/qualifiers/family/family.py | 2 ++ edsnlp/pipes/qualifiers/history/history.py | 2 ++ edsnlp/pipes/qualifiers/hypothesis/hypothesis.py | 2 ++ edsnlp/pipes/qualifiers/negation/negation.py | 2 ++ edsnlp/pipes/qualifiers/reported_speech/reported_speech.py | 2 ++ 6 files changed, 11 insertions(+) diff --git a/changelog.md b/changelog.md index d1121ef2d7..b6f27b7c7a 100644 --- a/changelog.md +++ b/changelog.md @@ -5,6 +5,7 @@ ### Added - Support for numpy>2.0, and formal support for Python 3.11 and Python 3.12 +- Expose the defaults patterns of `eds.negation`, `eds.hypothesis`, `eds.family`, `eds.history` and `eds.reported_speech` under a `eds.negation.default_patterns` attribute ### Fixed diff --git a/edsnlp/pipes/qualifiers/family/family.py b/edsnlp/pipes/qualifiers/family/family.py index e979e71dec..6f2150585a 100644 --- a/edsnlp/pipes/qualifiers/family/family.py +++ b/edsnlp/pipes/qualifiers/family/family.py @@ -133,6 +133,8 @@ class FamilyContextQualifier(RuleBasedQualifier): The `eds.family` component was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py index 2dc8d56d85..2b4f3c7623 100644 --- a/edsnlp/pipes/qualifiers/history/history.py +++ b/edsnlp/pipes/qualifiers/history/history.py @@ -197,6 +197,8 @@ class HistoryQualifier(RuleBasedQualifier): The `eds.history` component was developed by AP-HP's Data Science team. """ + default_patterns = patterns + history_limit: timedelta def __init__( diff --git a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py index 924d2cf63f..156016a011 100644 --- a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py +++ b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py @@ -168,6 +168,8 @@ class HypothesisQualifier(RuleBasedQualifier): The `eds.hypothesis` pipeline was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/pipes/qualifiers/negation/negation.py b/edsnlp/pipes/qualifiers/negation/negation.py index fb2c7878fc..98c0a78a25 100644 --- a/edsnlp/pipes/qualifiers/negation/negation.py +++ b/edsnlp/pipes/qualifiers/negation/negation.py @@ -170,6 +170,8 @@ class NegationQualifier(RuleBasedQualifier): The `eds.negation` component was developed by AP-HP's Data Science team. 
""" + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py index 77b0cbe913..d87352e20c 100644 --- a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py +++ b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py @@ -134,6 +134,8 @@ class ReportedSpeechQualifier(RuleBasedQualifier): The `eds.reported_speech` component was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, From 7a7eee82069dd5e714a8e596439a5e478692ac07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 2 May 2024 16:36:36 +0200 Subject: [PATCH 03/11] feat: add context_getter argument to eds.matcher --- changelog.md | 1 + edsnlp/pipes/core/matcher/matcher.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/changelog.md b/changelog.md index b6f27b7c7a..5489bafce5 100644 --- a/changelog.md +++ b/changelog.md @@ -6,6 +6,7 @@ - Support for numpy>2.0, and formal support for Python 3.11 and Python 3.12 - Expose the defaults patterns of `eds.negation`, `eds.hypothesis`, `eds.family`, `eds.history` and `eds.reported_speech` under a `eds.negation.default_patterns` attribute +- Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter ### Fixed diff --git a/edsnlp/pipes/core/matcher/matcher.py b/edsnlp/pipes/core/matcher/matcher.py index e824e9f9dd..fe9874b312 100644 --- a/edsnlp/pipes/core/matcher/matcher.py +++ b/edsnlp/pipes/core/matcher/matcher.py @@ -9,6 +9,7 @@ from edsnlp.matchers.simstring import SimstringMatcher from edsnlp.matchers.utils import Patterns from edsnlp.pipes.base import BaseNERComponent, SpanSetterArg +from edsnlp.utils.span_getters import SpanGetterArg, get_spans class GenericMatcher(BaseNERComponent): @@ -102,6 +103,7 @@ def __init__( term_matcher: Literal["exact", "simstring"] = "exact", term_matcher_config: Dict[str, Any] = {}, span_setter: SpanSetterArg = {"ents": True}, + context_getter: Optional[SpanGetterArg] = None, ): super().__init__(nlp=nlp, name=name, span_setter=span_setter) @@ -114,6 +116,7 @@ def __init__( regex = regex or {} self.attr = attr + self.context_getter = context_getter if term_matcher == "exact": self.phrase_matcher = EDSPhraseMatcher( @@ -163,10 +166,16 @@ def process(self, doc: Doc) -> List[Span]: List of Spans returned by the matchers. 
""" - matches = self.phrase_matcher(doc, as_spans=True) - regex_matches = self.regex_matcher(doc, as_spans=True) - - spans = list(matches) + list(regex_matches) + contexts = ( + list(get_spans(doc, self.context_getter)) + if self.context_getter is not None + else [doc] + ) + spans: List[Span] = [] + for context in contexts: + matches = self.phrase_matcher(context, as_spans=True) + regex_matches = self.regex_matcher(context, as_spans=True) + spans.extend(list(matches) + list(regex_matches)) return spans From ada9871f1190b8fbf6cbf7be65ef28cd17f06f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Mon, 14 Apr 2025 09:47:47 +0200 Subject: [PATCH 04/11] fix: better error messages --- edsnlp/pipes/core/contextual_matcher/factory.py | 4 ++-- edsnlp/utils/bindings.py | 4 ++-- edsnlp/utils/lazy_module.py | 3 ++- tests/pipelines/test_pipelines.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/edsnlp/pipes/core/contextual_matcher/factory.py b/edsnlp/pipes/core/contextual_matcher/factory.py index 38badbb6fa..a75422ad5c 100644 --- a/edsnlp/pipes/core/contextual_matcher/factory.py +++ b/edsnlp/pipes/core/contextual_matcher/factory.py @@ -15,6 +15,6 @@ ) create_component = registry.factory.register( - "eds.contextual-matcher", - deprecated=["contextual-matcher"], + "eds.contextual_matcher", + deprecated=["eds.contextual-matcher", "contextual-matcher"], )(ContextualMatcher) diff --git a/edsnlp/utils/bindings.py b/edsnlp/utils/bindings.py index 17933cd5df..d02e769a4a 100644 --- a/edsnlp/utils/bindings.py +++ b/edsnlp/utils/bindings.py @@ -53,7 +53,7 @@ def make_binding_getter(attribute: Union[str, Binding]): f"def getter(span):\n" f" try:\n" f" return {path} == value\n" - f" except AttributeError:\n" + f" except (AttributeError, KeyError):\n" f" return False\n", ctx, ctx, @@ -66,7 +66,7 @@ def make_binding_getter(attribute: Union[str, Binding]): f"def getter(span):\n" f" try:\n" f" return {path}\n" - f" except AttributeError:\n" + f" except (AttributeError, KeyError):\n" f" return None\n", ctx, ctx, diff --git a/edsnlp/utils/lazy_module.py b/edsnlp/utils/lazy_module.py index 6378760b64..ef2b98e40a 100644 --- a/edsnlp/utils/lazy_module.py +++ b/edsnlp/utils/lazy_module.py @@ -67,6 +67,7 @@ def __getattr__(name): ------- """ + imported_module_name = module_globals["__name__"] if name in module_paths: module_path, module_name = module_paths[name] result = getattr( @@ -80,7 +81,7 @@ def __getattr__(name): ) module_globals[name] = result return result - raise AttributeError(f"module {__name__} has no attribute {name}") + raise AttributeError(f"module {imported_module_name} has no attribute {name}") def __dir__(): """ diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 8c12941c01..9500b46407 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1,3 +1,8 @@ +import pytest + +import edsnlp + + def test_pipelines(doc): assert len(doc.ents) == 3 patient, _, anomalie = doc.ents @@ -18,3 +23,10 @@ def test_import_all(): except (ImportError, AttributeError) as e: if "torch" in str(e): pass + + +def test_non_existing_pipe(): + with pytest.raises(AttributeError) as e: + getattr(edsnlp.pipes, "non_existing_pipe") + + assert str(e.value) == "module edsnlp.pipes has no attribute non_existing_pipe" From e0b2e48178128e39bfa35fe2c0de09ebec3825ad Mon Sep 17 00:00:00 2001 From: Perceval Wajsburt Date: Fri, 3 May 2024 08:53:23 +0000 Subject: [PATCH 05/11] feat: add required and include args & 
support for span_getter in contextual_matcher and context window syntax --- changelog.md | 3 + docs/assets/stylesheets/extra.css | 17 + docs/pipes/core/contextual-matcher.md | 335 +++-------- edsnlp/matchers/regex.py | 4 +- .../contextual_matcher/contextual_matcher.py | 541 +++++++++--------- .../pipes/core/contextual_matcher/models.py | 374 +++++++----- .../pipes/ner/disorders/diabetes/diabetes.py | 4 +- edsnlp/utils/span_getters.py | 245 +++++++- edsnlp/utils/typing.py | 3 +- pyproject.toml | 2 +- .../pipelines/core/test_contextual_matcher.py | 55 +- tests/utils/test_span_getters.py | 76 ++- 12 files changed, 948 insertions(+), 711 deletions(-) diff --git a/changelog.md b/changelog.md index 5489bafce5..1dccd03113 100644 --- a/changelog.md +++ b/changelog.md @@ -7,6 +7,9 @@ - Support for numpy>2.0, and formal support for Python 3.11 and Python 3.12 - Expose the defaults patterns of `eds.negation`, `eds.hypothesis`, `eds.family`, `eds.history` and `eds.reported_speech` under a `eds.negation.default_patterns` attribute - Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter +- Added a `filter_expr` parameter to scorers to filter the documents to score +- Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity +- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans ### Fixed diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index ef6c443e43..da8fe9706e 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -190,3 +190,20 @@ a.discrete-link { font-size: 1rem; align-content: center; } + +.doc-param-details .subdoc { + padding: 0; + box-shadow: none; + border-color: var(--md-typeset-table-color); +} + +.doc-param-details .subdoc > div > div > div> table { + padding: 0; + box-shadow: none; + border: none; +} + +.doc-param-details .subdoc > summary { + margin: 0; + font-weight: normal; +} diff --git a/docs/pipes/core/contextual-matcher.md b/docs/pipes/core/contextual-matcher.md index 30f4afb25b..245750885e 100644 --- a/docs/pipes/core/contextual-matcher.md +++ b/docs/pipes/core/contextual-matcher.md @@ -1,148 +1,8 @@ - # Contextual Matcher {: #edsnlp.pipes.core.contextual_matcher.factory.create_component } -During feature extraction, it may be necessary to search for additional patterns in their neighborhood, namely: - -- patterns to discard irrelevant entities -- patterns to enrich these entities and store some information - -For example, to extract mentions of non-benign cancers, we need to discard all extractions that mention "benin" in their immediate neighborhood. -Although such a filtering is feasible using a regular expression, it essentially requires modifying each of the regular expressions. - -The ContextualMatcher allows to perform this extraction in a clear and concise way. - -## The configuration file - -The whole ContextualMatcher pipeline component is basically defined as a list of **pattern dictionaries**. -Let us see step by step how to build such a list using the example stated just above. - -### a. Finding mentions of cancer - -To do this, we can build either a set of `terms` or a set of `regex`. 
`terms` will be used to search for exact matches in the text. While less flexible, -it is faster than using regex. In our case we could use the following lists (which are of course absolutely not exhaustives): - -```python -terms = [ - "cancer", - "tumeur", -] - -regex = [ - r"adeno(carcinom|[\s-]?k)", - "neoplas", - "melanom", -] -``` - -Maybe we want to exclude mentions of benign cancers: - -```python -benign = "benign|benin" -``` - -### b. Find mention of a *stage* and extract its value - -For this we will forge a RegEx with one capturing group (basically a pattern enclosed in parentheses): - -```python -stage = "stade (I{1,3}V?|[1234])" -``` - -This will extract stage between 1 and 4 - -We can add a second regex to try to capture if the cancer is in a metastasis stage or not: - -```python -metastase = "(metasta)" -``` - -### c. The complete configuration - -We can now put everything together: - -```python -cancer = dict( - source="Cancer solide", - regex=regex, - terms=terms, - regex_attr="NORM", - exclude=dict( - regex=benign, - window=3, - ), - assign=[ - dict( - name="stage", - regex=stage, - window=(-10, 10), - replace_entity=False, - reduce_mode=None, - ), - dict( - name="metastase", - regex=metastase, - window=10, - replace_entity=False, - reduce_mode="keep_last", - ), - ], -) -``` - -Here the configuration consists of a single dictionary. We might want to also include lymphoma in the matcher: - -```python -lymphome = dict( - source="Lymphome", - regex=["lymphom", "lymphangio"], - regex_attr="NORM", - exclude=dict( - regex=["hodgkin"], # (1) - window=3, - ), -) -``` - -1. We are excluding "Lymphome de Hodgkin" here - -In this case, the configuration can be concatenated in a list: +EDS-NLP provides simple pattern matchers like `eds.matcher` to extract regular expressions, specific phrases, or perform lexical similarity matching on documents. However, certain use cases require examining the context around matched entities to filter out irrelevant matches or enrich them with additional information. For example, to extract mentions of malignant cancers, we need to exclude matches that have “benin” mentioned nearby : `eds.contextual_matcher` was built to address such needs. -```python -patterns = [cancer, lymphome] -``` - -## Available parameters for more flexibility - -3 main parameters can be used to refine how entities will be formed - -### The `include_assigned` parameter - -Following the previous example, you might want your extracted entities to **include**, if found, the cancer stage and the metastasis status. This can be achieved by setting `include_assigned=True` in the pipe configuration. - -For instance, from the sentence "Le patient a un cancer au stade 3", the extracted entity will be: - -- "cancer" if `include_assigned=False` -- "cancer au stade 3" if `include_assigned=True` - -### The `reduce_mode` parameter - -It may happen that an assignment matches more than once. For instance, in the (nonsensical) sentence "Le patient a un cancer au stade 3 et au stade 4", both "stade 3" and "stade 4" will be matched by the `stage` assign key. Depending on your use case, you may want to keep all the extractions, or just one. - -- If `reduce_mode=None` (default), all extractions are kept in a list -- If `reduce_mode="keep_first"`, only the extraction closest to the main matched entity will be kept (in this case, it would be "stade 3" since it is the closest to "cancer") -- If `reduce_mode=="keep_last"`, only the furthest extraction is kept. 
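
To make this concrete, here is a minimal sketch, assuming a matcher built with the `stage` assign pattern above (the outputs are illustrative):

```python
txt = "Le patient a un cancer au stade 3 et au stade 4"

# reduce_mode=None (default): all matches are kept in a list
#   ent._.assigned["stage"] == ["3", "4"]
# reduce_mode="keep_first": the match closest to "cancer" is kept
#   ent._.assigned["stage"] == "3"
# reduce_mode="keep_last": the furthest match is kept
#   ent._.assigned["stage"] == "4"
```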
- -### The `replace_entity` parameter - -This parameter can be se to `True` **only for a single assign key per dictionary**. This limitation comes from the purpose of this parameter: If set to `True`, the corresponding `assign` key will be returned as the entity, instead of the match itself. For clarity, let's take the same sentence "Le patient a un cancer au stade 3" as an example: - -- if `replace_entity=True` in the `stage` assign key, then the extracted entity will be "stade 3" instead of "cancer" -- if `replace_entity=False` for every assign key, the returned entity will be, as expected, "cancer" - -**Please notice** that with `replace_entity` set to True, if the correponding assign key matches nothing, the entity will be discarded. - - -## Examples +## Example ```python import edsnlp, edsnlp.pipes as eds @@ -153,14 +13,69 @@ nlp.add_pipe(eds.sentences()) nlp.add_pipe(eds.normalizer()) nlp.add_pipe( eds.contextual_matcher( - patterns=patterns, + patterns=[ + dict( + terms=["cancer", "tumeur"], # (1)! + regex=[r"adeno(carcinom|[\s-]?k)", "neoplas", "melanom"], # (2)! + regex_attr="NORM", # (3)! + exclude=dict( + regex="benign|benin", # (4)! + window=3, # (5)! + ), + assign=[ + dict( + name="stage", # (6)! + regex="stade (I{1,3}V?|[1234])", # (7)! + window="words[-10:10]", # (8)! + replace_entity=False, # (9)! + reduce_mode=None, # (10)! + ), + dict( + name="metastase", # (11)! + regex="(metasta)", # (12)! + window=10, # (13)! + replace_entity=False, # (14)! + reduce_mode="keep_last", # (15)! + ), + ], + ), + dict( + source="Lymphome", # (16)! + regex=["lymphom", "lymphangio"], # (17)! + regex_attr="NORM", # (18)! + exclude=dict( + regex=["hodgkin"], # (19)! + window=3, # (20)! + ), + ), + ], label="cancer", ), ) ``` -Let us see what we can get from this pipeline with a few examples - +1. Exact match terms (faster than regex, but less flexible) +2. Regex for flexible matching +3. Apply regex on normalized text +4. Regex to exclude benign mentions +5. Window size for exclusion check +6. Extract cancer stage +7. Stage regex pattern +8. Window range for stage extraction. Visit the documentation of [ContextWindow][edsnlp.utils.span_getters.ContextWindow] for more information about this syntax. +9. Do not use these matches as replacement for the anchor (default behavior) +10. Keep all matches +11. Detect metastasis +12. Regex for metastasis detection +13. Window size for detection +14. Keep main entity +15. Keep furthest extraction +16. Source label for lymphoma +17. Regex patterns for lymphoma +18. Apply regex on normalized text +19. Exclude Hodgkin lymphoma +20. Window size for exclusion + +Let's explore some examples using this pipeline: === "Simple match" @@ -181,7 +96,7 @@ Let us see what we can get from this pipeline with a few examples === "Exclusion rule" - Let us check that when a *benign* mention is present, the extraction is excluded: + Check exclusion with a benign mention: ```python txt = "Le patient a eu un cancer relativement bénin il y a 5 ans" @@ -193,8 +108,7 @@ Let us see what we can get from this pipeline with a few examples === "Extracting additional infos" - All informations extracted from the provided `assign` configuration can be found in the `assigned` attribute - under the form of a dictionary: + Additional information extracted via `assign` configurations is available in the `assigned` attribute: ```python txt = "Le patient a eu un cancer de stade 3." 
@@ -204,124 +118,39 @@ Let us see what we can get from this pipeline with a few examples # Out: {'stage': '3'} ``` -However, most of the configuration is provided in the `patterns` key, as a **pattern dictionary** or a **list of pattern dictionaries** - -## The pattern dictionary - -### Description +## Better control over the final extracted entities -A patterr is a nested dictionary with the following keys: +Three main parameters refine how entities are extracted: -=== "`source`" +#### `include_assigned` - A label describing the pattern +Following the previous example, if you want extracted entities to include the cancer stage or metastasis status (if found), set `include_assigned=True` in the pipe configuration. -=== "`regex`" +For instance, from the sentence "Le patient a un cancer au stade 3": - A single Regex or a list of Regexes +- If `include_assigned=False`, the extracted entity is "cancer" +- If `include_assigned=True`, the extracted entity is "cancer au stade 3" -=== "`regex_attr`" +#### `reduce_mode` - An attributes to overwrite the given `attr` when matching with Regexes. +Sometimes, an assignment matches multiple times. For example, in the sentence "Le patient a un cancer au stade 3 et au stade 4", both "stade 3" and "stade 4" match the `stage` key. Depending on your use case: -=== "`terms`" +- `reduce_mode=None` (default): Keeps all matched extractions in a list +- `reduce_mode="keep_first"`: Keeps only the extraction closest to the main matched entity ("stade 3" in this case) +- `reduce_mode="keep_last"`: Keeps only the furthest extraction - A single term or a list of terms (for exact matches) +#### `replace_entity` -=== "`exclude`" +This parameter can be set to `True` **for only one assign key per dictionary**. If set to `True`, the matched assignment replaces the main entity. - A dictionary (or list of dictionaries) to define exclusion rules. Exclusion rules are given as Regexes, and if a - match is found in the surrounding context of an extraction, the extraction is removed. Each dictionary should have the following keys: +Example using "Le patient a un cancer au stade 3": - === "`window`" +- With `replace_entity=True` for the `stage` key, the entity extracted is "stade 3" +- With `replace_entity=False`, the entity extracted remains "cancer" - Size of the context to use (in number of words). You can provide the window as: +**Note**: With `replace_entity=True`, if the corresponding assign key matches nothing, the entity is discarded. - - A positive integer, in this case the used context will be taken **after** the extraction - - A negative integer, in this case the used context will be taken **before** the extraction - - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction - - === "`regex`" - - A single Regex or a list of Regexes. - -=== "`assign`" - - A dictionary to refine the extraction. Similarily to the `exclude` key, you can provide a dictionary to - use on the context **before** and **after** the extraction. - - === "`name`" - - A name (string) - - === "`window`" - - Size of the context to use (in number of words). 
You can provide the window as: - - - A positive integer, in this case the used context will be taken **after** the extraction - - A negative integer, in this case the used context will be taken **before** the extraction - - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction - - === "`regex`" - - A dictionary where keys are labels and values are **Regexes with a single capturing group** - - === "`replace_entity`" - - If set to `True`, the match from the corresponding assign key will be used as entity, instead of the main match. See [this paragraph][the-replace_entity-parameter] - - === "`reduce_mode`" - - Set how multiple assign matches are handled. See the documentation of the [`reduce_mode` parameter][the-reduce_mode-parameter] - -### A full pattern dictionary example - -```python -dict( - source="AVC", - regex=[ - "accidents? vasculaires? cerebr", - ], - terms="avc", - regex_attr="NORM", - exclude=[ - dict( - regex=["service"], - window=3, - ), - dict( - regex=[" a "], - window=-2, - ), - ], - assign=[ - dict( - name="neo", - regex=r"(neonatal)", - expand_entity=True, - window=3, - ), - dict( - name="trans", - regex="(transitoire)", - expand_entity=True, - window=3, - ), - dict( - name="hemo", - regex=r"(hemorragique)", - expand_entity=True, - window=3, - ), - dict( - name="risk", - regex=r"(risque)", - expand_entity=False, - window=-3, - ), - ], -) -``` +The primary configuration is provided in the `patterns` key as either a **pattern dictionary** or a **list of pattern dictionaries**. ::: edsnlp.pipes.core.contextual_matcher.factory.create_component options: @@ -329,4 +158,4 @@ dict( ## Authors and citation -The `eds.matcher` pipeline component was developed by AP-HP's Data Science team. +The `eds.contextual_matcher` pipeline component was developed by AP-HP's Data Science team. diff --git a/edsnlp/matchers/regex.py b/edsnlp/matchers/regex.py index 681788535a..4c19212386 100644 --- a/edsnlp/matchers/regex.py +++ b/edsnlp/matchers/regex.py @@ -1,6 +1,6 @@ import re from bisect import bisect_left, bisect_right -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from loguru import logger from spacy.tokens import Doc, Span @@ -465,7 +465,7 @@ def __call__( doclike: Union[Doc, Span], as_spans=False, return_groupdict=False, - ) -> Union[Span, Tuple[Span, Dict[str, Any]]]: + ) -> Iterator[Union[Span, Tuple[Span, Dict[str, Any]]]]: """ Performs matching. Yields matches. 
diff --git a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py index 88db08e418..58cc5fe310 100644 --- a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py +++ b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py @@ -1,9 +1,6 @@ import re import warnings -from collections import defaultdict -from functools import lru_cache -from operator import attrgetter -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Generator, Iterable, Optional, Union from confit import VisibleDeprecationWarning from loguru import logger @@ -12,25 +9,11 @@ from edsnlp.core import PipelineProtocol from edsnlp.matchers.phrase import EDSPhraseMatcher from edsnlp.matchers.regex import RegexMatcher, create_span -from edsnlp.matchers.utils import get_text from edsnlp.pipes.base import BaseNERComponent, SpanSetterArg -from edsnlp.utils.collections import flatten_once -from edsnlp.utils.typing import cast +from edsnlp.utils.doc_to_text import get_text +from edsnlp.utils.span_getters import get_spans -from . import models - - -@lru_cache(64) -def get_window( - doclike: Union[Doc, Span], window: Tuple[int, int], limit_to_sentence: bool -): - start_limit = doclike.sent.start if limit_to_sentence else 0 - end_limit = doclike.sent.end if limit_to_sentence else len(doclike.doc) - - start = max(doclike.start + window[0], start_limit) - end = min(doclike.end + window[1], end_limit) - - return doclike.doc[start:end] +from .models import FullConfig, SingleAssignModel, SingleConfig class ContextualMatcher(BaseNERComponent): @@ -44,8 +27,13 @@ class ContextualMatcher(BaseNERComponent): spaCy `Language` object. name : Optional[str] The name of the pipe - patterns : Union[Dict[str, Any], List[Dict[str, Any]]] - The configuration dictionary + patterns : AsList[SingleConfig] + ??? 
subdoc "The patterns to match" + + ::: edsnlp.pipes.core.contextual_matcher.models.SingleConfig + options: + only_parameters: "no-header" + show_toc: false assign_as_span : bool Whether to store eventual extractions defined via the `assign` key as Spans or as string @@ -75,7 +63,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: Optional[str] = "contextual_matcher", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]], + patterns: FullConfig, assign_as_span: bool = False, alignment_mode: str = "expand", attr: str = "NORM", @@ -87,7 +75,7 @@ def __init__( label: Optional[str] = None, span_setter: SpanSetterArg = {"ents": True}, ): - if label is None and label_name is not None: + if label is None and label_name is not None: # pragma: no cover warnings.warn( "`label_name` is deprecated, use `label` instead.", VisibleDeprecationWarning, @@ -104,136 +92,103 @@ def __init__( self.ignore_excluded = ignore_excluded self.ignore_space_tokens = ignore_space_tokens self.alignment_mode = alignment_mode - self.regex_flags = regex_flags + self.regex_flags: Union[re.RegexFlag, int] = regex_flags self.include_assigned = include_assigned - # Configuration parsing - patterns = cast(models.FullConfig, patterns) - self.patterns = {pattern.source: pattern for pattern in patterns} - - # Matchers for the anchors - self.phrase_matcher = EDSPhraseMatcher( - nlp.vocab, - attr=attr, - ignore_excluded=ignore_excluded, - ignore_space_tokens=ignore_space_tokens, - ) - self.regex_matcher = RegexMatcher( - attr=attr, - flags=regex_flags, - ignore_excluded=ignore_excluded, - ignore_space_tokens=ignore_space_tokens, - alignment_mode=alignment_mode, - ) - - self.phrase_matcher.build_patterns( - nlp=nlp, - terms={ - source: { - "patterns": p.terms, - } - for source, p in self.patterns.items() - }, - ) - self.regex_matcher.build_patterns( - regex={ - source: { - "regex": p.regex, - "attr": p.regex_attr, - "flags": p.regex_flags, + for pattern in patterns: + phrase_matcher = EDSPhraseMatcher( + nlp.vocab, + attr=attr, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + ) + phrase_matcher.build_patterns( + nlp=nlp, + terms={ + "terms": { + "patterns": pattern.terms, + } + }, + ) + pattern.phrase_matcher = phrase_matcher + + regex_matcher = RegexMatcher( + attr=attr, + flags=regex_flags, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + alignment_mode=alignment_mode, + ) + regex_matcher.build_patterns( + regex={ + "regex": { + "regex": pattern.regex, + "attr": pattern.regex_attr, + "flags": pattern.regex_flags, + } } - for source, p in self.patterns.items() - } - ) - - self.exclude_matchers = defaultdict( - list - ) # Will contain all the exclusion matchers - self.assign_matchers = defaultdict(list) # Will contain all the assign matchers - - # Will contain the reduce mode (for each source and assign matcher) - self.reduce_mode = {} - - # Will contain the name of the assign matcher from which - # entity will be replaced (for each source) - self.replace_key = {} - - for source, p in self.patterns.items(): - p = p.model_dump() - - for exclude in p["exclude"]: - exclude_matcher = RegexMatcher( - attr=exclude["regex_attr"] or p["regex_attr"] or self.attr, - flags=exclude["regex_flags"] - or p["regex_flags"] - or self.regex_flags, - ignore_excluded=ignore_excluded, - ignore_space_tokens=ignore_space_tokens, - alignment_mode="expand", - ) - - exclude_matcher.build_patterns(regex={"exclude": exclude["regex"]}) - - self.exclude_matchers[source].append( - dict( - 
matcher=exclude_matcher, - window=exclude["window"], - limit_to_sentence=exclude["limit_to_sentence"], + ) + pattern.regex_matcher = regex_matcher + + for exclude in pattern.exclude: + if exclude.regex is not None: + matcher = RegexMatcher( + attr=exclude.regex_attr or pattern.regex_attr or self.attr, + flags=exclude.regex_flags + or pattern.regex_flags + or self.regex_flags, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + alignment_mode="expand", ) - ) + matcher.build_patterns(regex={"exclude": exclude.regex}) + exclude.regex_matcher = matcher + + for include in pattern.include: + if include.regex is not None: + matcher = RegexMatcher( + attr=include.regex_attr or pattern.regex_attr or self.attr, + flags=include.regex_flags + or pattern.regex_flags + or self.regex_flags, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + alignment_mode="expand", + ) + matcher.build_patterns(regex={"include": include.regex}) + include.regex_matcher = matcher - replace_key = None + # replace_key = None - for assign in p["assign"]: - assign_matcher = RegexMatcher( - attr=assign["regex_attr"] or p["regex_attr"] or self.attr, - flags=assign["regex_flags"] or p["regex_flags"] or self.regex_flags, + for assign in pattern.assign: + assign.regex_matcher = RegexMatcher( + attr=assign.regex_attr or pattern.regex_attr or self.attr, + flags=assign.regex_flags or pattern.regex_flags or self.regex_flags, ignore_excluded=ignore_excluded, ignore_space_tokens=ignore_space_tokens, alignment_mode=alignment_mode, span_from_group=True, ) - - assign_matcher.build_patterns( - regex={assign["name"]: assign["regex"]}, + assign.regex_matcher.build_patterns( + regex={assign.name: assign.regex}, ) - self.assign_matchers[source].append( - dict( - name=assign["name"], - matcher=assign_matcher, - window=assign["window"], - limit_to_sentence=assign["limit_to_sentence"], - replace_entity=assign["replace_entity"], - reduce_mode=assign["reduce_mode"], - ) - ) - - if assign["replace_entity"]: - # We know that there is only one assign name - # with `replace_entity==True` - # from PyDantic validation - replace_key = assign["name"] - - self.replace_key[source] = replace_key - - self.reduce_mode[source] = { - d["name"]: d["reduce_mode"] for d in self.assign_matchers[source] - } - - self.set_extensions() + self.patterns = patterns def set_extensions(self) -> None: + """ + Define the extensions used by the component + """ super().set_extensions() if not Span.has_extension("assigned"): Span.set_extension("assigned", default=dict()) if not Span.has_extension("source"): Span.set_extension("source", default=None) - def filter_one(self, span: Span) -> Span: + def filter_one(self, span: Span, pattern) -> Optional[Span]: """ - Filter extracted entity based on the "exclusion filter" mentioned - in the configuration + Filter extracted entity based on the exclusion and inclusion filters of + the configuration. 
Parameters
        ----------
        span : Span

        Returns
        -------
        Optional[Span]
            None if the span was filtered, the span else
        """
-        source = span.label_
         to_keep = True
-        for matcher in self.exclude_matchers[source]:
-            window = matcher["window"]
-            limit_to_sentence = matcher["limit_to_sentence"]
-            snippet = get_window(
-                doclike=span,
-                window=window,
-                limit_to_sentence=limit_to_sentence,
-            )
+        for exclude in pattern.exclude:
+            snippet = exclude.window(span)

             if (
-                next(
-                    matcher["matcher"](snippet, as_spans=True),
-                    None,
-                )
-                is not None
+                exclude.regex_matcher is not None
+                and next(exclude.regex_matcher(snippet), None) is not None
+                or exclude.span_getter is not None
+                and next(get_spans(snippet, exclude.span_getter), None) is not None
+            ):
+                to_keep = False
+                break
+
+        for include in pattern.include:
+            snippet = include.window(span)
+
+            if (
+                include.regex_matcher is not None
+                and next(include.regex_matcher(snippet), None) is None
+                or include.span_getter is not None
+                and next(get_spans(snippet, include.span_getter), None) is None
             ):
                 to_keep = False
-                logger.trace(f"Entity {span} was filtered out")
                 break

         if to_keep:
             return span

-    def assign_one(self, span: Span) -> Span:
+    def assign_one(self, span: Span, pattern) -> Iterable[Span]:
         """
         Get additional information in the context of each entity. This function
         will populate two custom attributes:
         `ent._.source` and `ent._.assigned`.

         Parameters
         ----------
         span : Span
             Span object

         Returns
         -------
-        Span
-            Span with additional information
+        List[Span]
+            Spans with additional information
         """
-
-        if span is None:
-            yield from []
-            return
-
-        source = span.label_
-        assigned_dict = models.AssignDict(reduce_mode=self.reduce_mode[source])
         replace_key = None
-        for matcher in self.assign_matchers[source]:
-            attr = self.patterns[source].regex_attr or matcher["matcher"].default_attr
-            window = matcher["window"]
-            limit_to_sentence = matcher["limit_to_sentence"]
-            replace_entity = matcher["replace_entity"]  # Boolean
-
-            snippet = get_window(
-                doclike=span,
-                window=window,
-                limit_to_sentence=limit_to_sentence,
-            )
-
-            # Getting the matches
-            assigned_list = list(matcher["matcher"].match(snippet))
-
-            assigned_list = [
-                (span, span, matcher["matcher"].regex[0][0])
-                if not match.groups()
-                else (
-                    span,
-                    create_span(
-                        doclike=snippet,
-                        start_char=match.start(0),
-                        end_char=match.end(0),
-                        key=matcher["matcher"].regex[0][0],
-                        attr=matcher["matcher"].regex[0][2],
-                        alignment_mode=matcher["matcher"].regex[0][5],
-                        ignore_excluded=matcher["matcher"].regex[0][3],
-                        ignore_space_tokens=matcher["matcher"].regex[0][4],
-                    ),
-                    matcher["matcher"].regex[0][0],
-                )
-                for (span, match) in assigned_list
-            ]
-
-            # assigned_list now contains tuples with
-            # - the first element being the span extracted from the group
-            # - the second element being the full match
+        # Assigned matches is a list of tuples, each containing:
+        # - the span matched by the "assign" regex (or returned by the span getter)
+        # - the span corresponding to the match group of the regex (or the full match,
+        #   i.e. same as above)
+        assigned_dict = {}
+        reduce_modes = {}
+        attrs = {}
+
+        for assign in pattern.assign:
+            assign: SingleAssignModel
+            window = assign.window
+            snippet = window(span)
+            reduce_modes[assign.name] = assign.reduce_mode
+            matcher: RegexMatcher = assign.regex_matcher
+            attrs[assign.name] = matcher.regex[0][2]
+            if matcher is not None:
+                # Getting the matches
+                matches = list(matcher.match(snippet))
+                assigned = [
+                    (matched_span, matched_span)
+                    if not
re_match.groups() + else ( + matched_span, + create_span( + doclike=snippet, + start_char=re_match.start(0), + end_char=re_match.end(0), + key=matcher.regex[0][0], + attr=matcher.regex[0][2], + alignment_mode=matcher.regex[0][5], + ignore_excluded=matcher.regex[0][3], + ignore_space_tokens=matcher.regex[0][4], + ), + # matcher.regex[0][0], + ) + for (matched_span, re_match) in matches + ] + if assign.span_getter is not None: + assigned = [ + (matched_span, matched_span) + for matched_span in get_spans(snippet, assign.span_getter) + # if matched_span.start >= snippet.start + # and matched_span.end <= snippet.end + ] + + if assign.required and not assigned: + logger.trace(f"Entity {span} was filtered out") + return [] - if not assigned_list: # No match was found + if len(assigned) == 0: continue - for assigned in assigned_list: - if assigned is None: - continue - if replace_entity: - replace_key = assigned[2] - - # Using he overrid `__setitem__` method from AssignDict here: - assigned_dict[assigned[2]] = { - "span": assigned[1], # Full span - "value_span": assigned[0], # Span of the group - "value_text": get_text( - assigned[0], - attr=attr, - ignore_excluded=self.ignore_excluded, - ), # Text of the group - } - logger.trace(f"Assign key {matcher['name']} matched on entity {span}") - if replace_key is None and self.replace_key[source] is not None: - # There should have been a replacement, but none was found - # So we discard the entity - return + if assign.replace_entity: + replace_key = assign.name + if assign.reduce_mode == "keep_first": # closest + assigned = [min(assigned, key=lambda e: abs(e[0].start - span.start))] + elif assign.reduce_mode == "keep_last": + assigned = [max(assigned, key=lambda e: abs(e[0].start - span.start))] - # Entity replacement - if replace_key is not None: - replacables = assigned_dict[replace_key]["span"] - kept_ents = ( - replacables if isinstance(replacables, list) else [replacables] - ).copy() + assigned_dict[assign.name] = assigned - if self.include_assigned: - # We look for the closest - closest = min( - kept_ents, - key=lambda e: abs(e.start - span.start), + # Several cases: + # 1. 
should_have_replacement and include_assigned is True + # -> pick closest assigned span where replace = True + # -> + if replace_key is not None: + replacements = sorted( + assigned_dict[replace_key], + key=lambda e: abs(e[0].start - span.start), + ) + assigned_dict[replace_key] = replacements + + ext = { + n: None + if reduce_modes[n] is not None and len(g) == 0 + else [s[0] for s in g][slice(None) if reduce_modes[n] is None else 0] + if self.assign_as_span + else [ + get_text( + s[0], + attr=attrs[n], + ignore_excluded=self.ignore_excluded, + ignore_space_tokens=self.ignore_space_tokens, ) - kept_ents.remove(closest) - - expandables = list( - flatten_once( - [ - a["span"] - for k, a in assigned_dict.items() - if k != replace_key - ] - ) - ) + [span, closest] + for s in g + ][slice(None) if reduce_modes[n] is None else 0] + for n, g in assigned_dict.items() + } - closest = Span( + if replace_key is None: + if self.include_assigned: + merged = [span, *(x[1] for name, g in assigned_dict.items() for x in g)] + span = Span( span.doc, - min(expandables, key=attrgetter("start")).start, - max(expandables, key=attrgetter("end")).end, + min(s.start for s in merged), + max(s.end for s in merged), span.label_, ) - - kept_ents.append(closest) - kept_ents.sort(key=attrgetter("start")) - - for replaced in kept_ents: - # Propagating attributes from the anchor - replaced._.source = source - replaced.label_ = self.label - + span._.source = pattern.source + span.label_ = self.label + span._.assigned = ext + new_spans = [span] else: - # Entity expansion - expandables = [ - s - for s in flatten_once([a["span"] for a in assigned_dict.values()]) - if s is not None - ] - - if self.include_assigned and expandables: + if self.include_assigned: + # we will merge spans from other assign groups + the main span + # to the closest "most central" assign span. 
+ [closest_replacement, *rest_replacements] = assigned_dict[replace_key] + other_spans = [ + x[1] + for name, g in assigned_dict.items() + if name != replace_key + for x in g + ] + merged = [closest_replacement[1], span, *other_spans] span = Span( span.doc, - min(s.start for s in expandables + [span] if s is not None), - max(s.end for s in expandables + [span] if s is not None), + min(s.start for s in merged), + max(s.end for s in merged), span.label_, ) + new_spans = [span, *(s[1] for s in rest_replacements)] + else: + new_spans = [x[1] for x in assigned_dict[replace_key]] + for idx, span in enumerate(new_spans): + span._.source = pattern.source + span.label_ = self.label + span._.assigned = { + k: v[idx] if ((k == replace_key) and reduce_modes[k] is None) else v + for k, v in ext.items() + } - span._.source = source - span.label_ = self.label - kept_ents = [span] - - key = "value_span" if self.assign_as_span else "value_text" - - for idx, e in enumerate(kept_ents): - e._.assigned = { - k: v[key][idx] - if ((k == replace_key) and self.reduce_mode[source][k] is None) - else v[key] - for k, v in assigned_dict.items() - } + return new_spans - yield from kept_ents + def process_one(self, span: Span, pattern: SingleConfig): + """ + Processes one span, applying both the filters and the assignments - def process_one(self, span): - filtered = self.filter_one(span) - yield from self.assign_one(filtered) + Parameters + ---------- + span: Span + Span object + pattern: SingleConfig + + Yields + ------ + span: + Filtered spans, with optional assignments + """ + span = self.filter_one(span, pattern) + if span is not None: + yield from self.assign_one(span, pattern) - def process(self, doc: Doc) -> List[Span]: + def process(self, doc: Doc) -> Generator[Span, None, None]: """ Process the document, looking for named entities. @@ -452,12 +415,18 @@ def process(self, doc: Doc) -> List[Span]: List of detected spans. 
""" - matches = self.phrase_matcher(doc, as_spans=True) - regex_matches = list(self.regex_matcher(doc, as_spans=True)) - - spans = (*matches, *regex_matches) - for span in spans: - yield from self.process_one(span) + for pattern in self.patterns: + for span in ( + *pattern.phrase_matcher(doc, as_spans=True), + *pattern.regex_matcher(doc, as_spans=True), + *( + get_spans(doc, pattern.span_getter) + if pattern.span_getter is not None + else [] + ), + ): + spans = list(self.process_one(span, pattern)) + yield from spans def __call__(self, doc: Doc) -> Doc: """ diff --git a/edsnlp/pipes/core/contextual_matcher/models.py b/edsnlp/pipes/core/contextual_matcher/models.py index 4d8a6e1c51..1107463288 100644 --- a/edsnlp/pipes/core/contextual_matcher/models.py +++ b/edsnlp/pipes/core/contextual_matcher/models.py @@ -1,167 +1,203 @@ import re -from typing import List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Union -import pydantic import regex from pydantic import BaseModel, Extra from edsnlp.matchers.utils import ListOrStr -from edsnlp.utils.typing import Validated, cast +from edsnlp.utils.span_getters import ( + ContextWindow, + SentenceContextWindow, + SpanGetterArg, +) +from edsnlp.utils.typing import AsList Flags = Union[re.RegexFlag, int] -Window = Union[ - Tuple[int, int], - List[int], - int, -] try: - from pydantic import field_validator + from pydantic import field_validator, model_validator def validator(x, allow_reuse=True, pre=False): return field_validator(x, mode="before" if pre else "after") + + def root_validator(allow_reuse=True, pre=False): + return model_validator(mode="before" if pre else "after") + + except ImportError: - from pydantic import validator - - -def normalize_window(cls, v): - if isinstance(v, list): - assert ( - len(v) == 2 - ), "`window` should be a tuple/list of two integer, or a single integer" - v = tuple(v) - if isinstance(v, int): - assert v != 0, "The provided `window` should not be 0" - if v < 0: - return (v, 0) - if v > 0: - return (0, v) - assert v[0] < v[1], "The provided `window` should contain at least 1 token" - return v - - -class AssignDict(dict): - """ - Custom dictionary that overrides the __setitem__ method - depending on the reduce_mode - """ - - def __init__(self, reduce_mode: dict): - super().__init__() - self.reduce_mode = reduce_mode - self._setitem_ = self.__setitem_options__() - - def __missing__(self, key): - return ( - { - "span": [], - "value_span": [], - "value_text": [], - } - if self.reduce_mode[key] is None - else {} + from pydantic import root_validator, validator + + +def validate_window(cls, values): + if isinstance(values.get("regex"), str): + values["regex"] = [values["regex"]] + window = values.get("window") + if window is None or isinstance(window, (int, tuple, list)): + values["limit_to_sentence"] = True + window = values.get("window") + if window is not None: + values["window"] = ContextWindow.validate(window) + if values.get("limit_to_sentence"): + values["window"] = ( + SentenceContextWindow(0, 0) & values.get("window") + if window is not None + else SentenceContextWindow(0, 0) ) - - def __setitem__(self, key, value): - self._setitem_[self.reduce_mode[key]](key, value) - - def __setitem_options__(self): - def keep_list(key, value): - old_values = self.__getitem__(key) - value["span"] = old_values["span"] + [value["span"]] - value["value_span"] = old_values["value_span"] + [value["value_span"]] - value["value_text"] = old_values["value_text"] + [value["value_text"]] - - 
dict.__setitem__(self, key, value) - - def keep_first(key, value): - old_values = self.__getitem__(key) - if ( - old_values.get("span") is None - or value["span"].start <= old_values["span"].start - ): - dict.__setitem__(self, key, value) - - def keep_last(key, value): - old_values = self.__getitem__(key) - if ( - old_values.get("span") is None - or value["span"].start >= old_values["span"].start - ): - dict.__setitem__(self, key, value) - - return { - None: keep_list, - "keep_first": keep_first, - "keep_last": keep_last, - } + return values class SingleExcludeModel(BaseModel): + """ + A dictionary to define exclusion rules. Exclusion rules are given as Regexes, and + if a match is found in the surrounding context of an extraction, the extraction is + removed. Each dictionary should have the following keys: + + Parameters + ---------- + regex : ListOrStr + A single Regex or a list of Regexes + regex_attr : Optional[str] + An attributes to overwrite the given `attr` when matching with Regexes. + regex_flags : re.RegexFlag + Regex flags + span_getter : Optional[SpanGetterArg] + A span getter to pick the assigned spans from already extracted entities. + window : Optional[ContextWindow] + Context window to search for patterns around the anchor. Defaults to "sent" ( + i.e. the sentence of the anchor span). + """ + + span_getter: Optional[SpanGetterArg] = None regex: ListOrStr = [] - window: Window - limit_to_sentence: Optional[bool] = True - regex_flags: Optional[Flags] = None regex_attr: Optional[str] = None + regex_flags: Union[re.RegexFlag, int] = None - @validator("regex") - def exclude_regex_validation(cls, v): - if isinstance(v, str): - v = [v] - return v + limit_to_sentence: Optional[bool] = None + window: Optional[ContextWindow] = None + regex_matcher: Optional[Any] = None - _normalize_window = validator("window", allow_reuse=True)(normalize_window) - if pydantic.VERSION < "2": - model_dump = BaseModel.dict + validate_window = root_validator(pre=True, allow_reuse=True)(validate_window) -class ExcludeModel(Validated): - @classmethod - def validate(cls, v, config=None): - if not isinstance(v, list): - v = [v] - return [cast(SingleExcludeModel, x) for x in v] +class SingleIncludeModel(BaseModel): + """ + A dictionary to define inclusion rules. Inclusion rules are given as Regexes, and + if a match isn't found in the surrounding context of an extraction, the extraction + is removed. Each dictionary should have the following keys: + + Parameters + ---------- + regex : ListOrStr + A single Regex or a list of Regexes + regex_attr : Optional[str] + An attributes to overwrite the given `attr` when matching with Regexes. + regex_flags : re.RegexFlag + Regex flags + span_getter : Optional[SpanGetterArg] + A span getter to pick the assigned spans from already extracted entities. + window : Optional[ContextWindow] + Context window to search for patterns around the anchor. Defaults to "sent" ( + i.e. the sentence of the anchor span). + """ + + span_getter: Optional[SpanGetterArg] = None + regex: ListOrStr = [] + regex_attr: Optional[str] = None + regex_flags: Union[re.RegexFlag, int] = None + + limit_to_sentence: Optional[bool] = None + window: Optional[ContextWindow] = None + + regex_matcher: Optional[Any] = None + + validate_window = root_validator(pre=True, allow_reuse=True)(validate_window) + - if pydantic.VERSION < "2": - model_dump = BaseModel.dict +class ExcludeModel(AsList[SingleExcludeModel]): + """ + A list of `SingleExcludeModel` objects. 
If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """
+
+
+class IncludeModel(AsList[SingleIncludeModel]):
+    """
+    A list of `SingleIncludeModel` objects. If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """


 class SingleAssignModel(BaseModel):
+    """
+    A dictionary to refine the extraction. Similarly to the `exclude` key, you can
+    provide a dictionary to use on the context **before** and **after** the extraction.
+
+    Parameters
+    ----------
+    span_getter : Optional[SpanGetterArg]
+        A span getter to pick the assigned spans from already extracted entities
+        in the doc.
+    regex : ListOrStr
+        A single Regex or a list of Regexes
+    regex_attr : Optional[str]
+        An attribute to overwrite the given `attr` when matching with Regexes.
+    regex_flags : re.RegexFlag
+        Regex flags
+    window : Optional[ContextWindow]
+        Context window to search for patterns around the anchor. Defaults to "sent" (
+        i.e. the sentence of the anchor span).
+    replace_entity : Optional[bool]
+        If set to `True`, the match from the corresponding assign key will be used as
+        entity, instead of the main match.
+        See [this paragraph][the-replace_entity-parameter]
+    reduce_mode : Optional[str]
+        Set how multiple assign matches are handled. See the documentation of the
+        [`reduce_mode` parameter][the-reduce_mode-parameter]
+    required : Optional[bool]
+        If set to `True`, the assign key must match for the extraction to be kept. If
+        it does not match, the extraction is discarded.
+    name : str
+        A name (string)
+    """
+
     name: str
-    regex: str
-    window: Window
-    limit_to_sentence: Optional[bool] = True
-    regex_flags: Optional[Flags] = None
+
+    span_getter: Optional[SpanGetterArg] = None
+    regex: ListOrStr = []
     regex_attr: Optional[str] = None
+    regex_flags: Union[re.RegexFlag, int] = None
+
+    limit_to_sentence: Optional[bool] = None
+    window: Optional[ContextWindow] = None
     replace_entity: bool = False
     reduce_mode: Optional[str] = None
+    required: Optional[bool] = False

-    @validator("regex")
+    regex_matcher: Optional[Any] = None
+
+    @validator("regex", allow_reuse=True)
     def check_single_regex_group(cls, pat):
-        compiled_pat = regex.compile(
-            pat
-        )  # Using regex to allow multiple fgroups with same name
-        n_groups = compiled_pat.groups
-        assert n_groups == 1, (
-            "The pattern {pat} should have only one capturing group, not {n_groups}"
-        ).format(
-            pat=pat,
-            n_groups=n_groups,
-        )
+        for single_pat in pat:
+            compiled_pat = regex.compile(
+                single_pat
+            )  # Using regex to allow multiple fgroups with same name
+            n_groups = compiled_pat.groups
+            assert n_groups == 1, (
+                f"The pattern {single_pat} should have exactly one capturing group, "
+                f"not {n_groups}"
+            )
         return pat

-    _normalize_window = validator("window", allow_reuse=True)(normalize_window)
-    if pydantic.VERSION < "2":
-        model_dump = BaseModel.dict
+    validate_window = root_validator(pre=True, allow_reuse=True)(validate_window)


-class AssignModel(Validated):
-    @classmethod
-    def item_to_list(cls, v, config=None):
-        if not isinstance(v, list):
-            v = [v]
-        return [cast(SingleAssignModel, x) for x in v]
+class AssignModel(AsList[SingleAssignModel]):
+    """
+    A list of `SingleAssignModel` objects that should have at most
+    one element with `replace_entity=True`. If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """

     @classmethod
     def name_uniqueness(cls, v, config=None):
@@ -172,39 +208,86 @@ def name_uniqueness(cls, v, config=None):
         names = [item.name for item in v]
         assert len(names) == len(set(names)), "Each `name` field should be unique"
         return v

     @classmethod
     def replace_uniqueness(cls, v, config=None):
         replace = [item for item in v if item.replace_entity]
-        assert (
-            len(replace) <= 1
-        ), "Only 1 assign element can be set with `replace_entity=True`"
+        assert len(replace) <= 1, (
+            "Only 1 assign element can be set with `replace_entity=True`"
+        )
         return v

     @classmethod
     def __get_validators__(cls):
-        yield cls.item_to_list
+        yield cls.validate
         yield cls.name_uniqueness
         yield cls.replace_uniqueness

-    if pydantic.VERSION < "2":
-        model_dump = BaseModel.dict
+
+if TYPE_CHECKING:
+    ExcludeModel = List[SingleExcludeModel]  # noqa: F811
+    IncludeModel = List[SingleIncludeModel]  # noqa: F811
+    AssignModel = List[SingleAssignModel]  # noqa: F811


 class SingleConfig(BaseModel, extra=Extra.forbid):
-    source: str
+    """
+    A single configuration for the contextual matcher.
+
+    Parameters
+    ----------
+    span_getter : Optional[SpanGetterArg]
+        A span getter to pick the assigned spans from already extracted entities
+        in the doc.
+    regex : ListOrStr
+        A single Regex or a list of Regexes
+    regex_attr : Optional[str]
+        An attribute to overwrite the given `attr` when matching with Regexes.
+    regex_flags : Union[re.RegexFlag, int]
+        Regex flags
+    terms : ListOrStr
+        A single term or a list of terms (for exact matches)
+    exclude : AsList[SingleExcludeModel]
+        ??? subdoc "One or more exclusion patterns"

+            ::: edsnlp.pipes.core.contextual_matcher.models.SingleExcludeModel
+                options:
+                    only_parameters: "no-header"
+    include : AsList[SingleIncludeModel]
+        ??? subdoc "One or more inclusion patterns"

+            ::: edsnlp.pipes.core.contextual_matcher.models.SingleIncludeModel
+                options:
+                    only_parameters: "no-header"
+    assign : AsList[SingleAssignModel]
+        ??? subdoc "One or more assignment patterns"

+            ::: edsnlp.pipes.core.contextual_matcher.models.SingleAssignModel
+                options:
+                    only_parameters: "no-header"
+    source : str
+        A label describing the pattern
+
+    """
+
+    source: Optional[str] = None
+
+    span_getter: Optional[SpanGetterArg] = None
     terms: ListOrStr = []
     regex: ListOrStr = []
     regex_attr: Optional[str] = None
     regex_flags: Union[re.RegexFlag, int] = None
-    exclude: Optional[ExcludeModel] = []
-    assign: Optional[AssignModel] = []

-    if pydantic.VERSION < "2":
-        model_dump = BaseModel.dict
+    exclude: ExcludeModel = []
+    include: IncludeModel = []
+    assign: AssignModel = []
+
+    regex_matcher: Optional[Any] = None
+    phrase_matcher: Optional[Any] = None


-class FullConfig(Validated):
-    @classmethod
-    def pattern_to_list(cls, v, config=None):
-        if not isinstance(v, list):
-            v = [v]
-        return [cast(SingleConfig, item) for item in v]
+class FullConfig(AsList[SingleConfig]):
+    """
+    A list of `SingleConfig` objects that should have distinct `source` fields.
+    If a single config is passed, it will be automatically converted to a list of
+    a single element.
+ """ @classmethod def source_uniqueness(cls, v, config=None): @@ -214,8 +297,9 @@ def source_uniqueness(cls, v, config=None): @classmethod def __get_validators__(cls): - yield cls.pattern_to_list + yield cls.validate yield cls.source_uniqueness - if pydantic.VERSION < "2": - model_dump = BaseModel.dict + +if TYPE_CHECKING: + FullConfig = List[SingleConfig] # noqa: F811 diff --git a/edsnlp/pipes/ner/disorders/diabetes/diabetes.py b/edsnlp/pipes/ner/disorders/diabetes/diabetes.py index 1eef8a885a..f2d2467ec5 100644 --- a/edsnlp/pipes/ner/disorders/diabetes/diabetes.py +++ b/edsnlp/pipes/ner/disorders/diabetes/diabetes.py @@ -8,7 +8,6 @@ from edsnlp.matchers.regex import RegexMatcher from edsnlp.matchers.utils import get_text from edsnlp.pipes.base import SpanSetterArg -from edsnlp.pipes.core.contextual_matcher.contextual_matcher import get_window from ..base import DisorderMatcher from .patterns import COMPLICATIONS, default_patterns @@ -141,8 +140,7 @@ def has_far_complications(self, span: Span): Handles the common case where complications are listed as bullet points, sometimes fairly far from the anchor. """ - window = (0, 50) - context = get_window(span, window, limit_to_sentence=False) + context = span.doc[span.start : span.end + 50] if next(iter(self.complication_matcher(context)), None) is not None: return True return False diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py index b1b8a156b3..23f2fb8ecd 100644 --- a/edsnlp/utils/span_getters.py +++ b/edsnlp/utils/span_getters.py @@ -1,3 +1,4 @@ +import abc from collections import defaultdict from typing import ( TYPE_CHECKING, @@ -12,6 +13,7 @@ Union, ) +import numpy as np from pydantic import NonNegativeInt from spacy.tokens import Doc, Span @@ -35,18 +37,34 @@ ] -def get_spans(doc, span_getter): +def get_spans(doclike, span_getter): if span_getter is None: - yield doc[:] + yield doclike[:] return if callable(span_getter): - yield from span_getter(doc) + yield from span_getter(doclike) return - for key, span_filter in span_getter.items(): - if key == "*": - candidates = (span for group in doc.spans.values() for span in group) + for k, span_filter in span_getter.items(): + if isinstance(doclike, Doc): + if k == "*": + candidates = (s for grp in doclike.spans.values() for s in grp) + else: + candidates = doclike.spans.get(k, ()) if k != "ents" else doclike.ents else: - candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents + doc = doclike.doc + if k == "*": + candidates = ( + s + for grp in doc.spans.values() + for s in grp + if not (s.end < doclike.start or s.start > doclike.end) + ) + else: + candidates = ( + s + for s in (doc.spans.get(k, ()) if k != "ents" else doc.ents) + if not (s.end < doclike.start or s.start > doclike.end) + ) if span_filter is True: yield from candidates else: @@ -251,8 +269,9 @@ class make_span_context_getter: Parameters ---------- context_words : Union[NonNegativeInt, Tuple[NonNegativeInt, NonNegativeInt]] - Minimum number of words to include on each side of the span. It could be asymmetric. - For example (5,2) will include 5 words before the start of the span and 2 after the end of the span + Minimum number of words to include on each side of the span. It could be + asymmetric. 
For example (5,2) will include 5 words before the start of the + span and 2 after the end of the span context_sents : Optional[ Union[NonNegativeInt, Tuple[NonNegativeInt, NonNegativeInt]] ] = 1 @@ -264,7 +283,7 @@ class make_span_context_getter: By default, 0 if the document has no sentence annotations, 1 otherwise. - """ # noqa: E501 + """ def __init__( self, @@ -284,9 +303,9 @@ def __init__( ) else: self.context_sents_left, self.context_sents_right = context_sents - assert ( - sum(context_sents) != 1 - ), "Asymmetric sentence context should not be (0,1) or (1,0)" + assert sum(context_sents) != 1, ( + "Asymmetric sentence context should not be (0,1) or (1,0)" + ) self.span_getter = validate_span_getter(span_getter, optional=True) def __call__(self, span: Union[Doc, Span]) -> Union[Span, List[Span]]: @@ -321,3 +340,203 @@ def __call__(self, span: Union[Doc, Span]) -> Union[Span, List[Span]]: end = max(end, max_end_sent) return span.doc[start:end] + + +class ContextWindowMeta(abc.ABCMeta): + pass + + +class ContextWindow(abc.ABC, metaclass=ContextWindowMeta): + """ + A ContextWindow specifies how much additional context (such as sentences or words) + should be included relative to an anchor span. For example, one might define a + context window that extracts the sentence immediately preceding and following the + anchor span, or one that extends the span by a given number of words before and + after. + + ContextWindow objects can be combined using logical operations to create more + complex context windows. For example, one can create a context window that includes + either words from a -10 to +10 range or words from the sentence. + + + Examples + -------- + ```python + from confit import validate_arguments + from edsnlp.utils.span_getters import ContextWindow + from spacy.tokens import Span + + + @validate_arguments + def apply_context(span: Span, ctx: ContextWindow): + # ctx will be parsed and cast as a ContextWindow + return ctx(span) + + + # Will return a span with the 10 words before and after the span + # and words of the current sentence and the next sentence. + apply_context(span, "words[-10:10] | sents[0:1]") + + # Will return the span covering at most the -5 and +5 words + # around the span and the current sentence of the span. + apply_context(span, "words[-5:5] & sent") + ``` + + !!! warning "Indexing" + + Unlike standard Python sequence slicing, `sents[0:0]` returns + the current sentence, not an empty span. 
+ """ + + @abc.abstractmethod + def __call__(self, span: Span) -> Span: + pass + + # logical ops + def __and__(self, other: "ContextWindow"): + # fmt: off + return IntersectionContextWindow([ + *(self.contexts if isinstance(self, IntersectionContextWindow) else (self,)), # noqa: E501 + *(other.contexts if isinstance(other, IntersectionContextWindow) else (other,)) # noqa: E501 + ]) + # fmt: on + + def __or__(self, other: "ContextWindow"): + # fmt: off + return UnionContextWindow([ + *(self.contexts if isinstance(self, UnionContextWindow) else (self,)), + *(other.contexts if isinstance(other, UnionContextWindow) else (other,)) + ]) + # fmt: on + + @classmethod + def parse(cls, query): + try: + return eval( + query, + {}, + { + "words": WordContextWindow, + "sents": SentenceContextWindow, + "sent": SentenceContextWindow(0, 0), + }, + ) + except NameError: + raise ValueError( + "Only queries containing vars `words[before:after]`, " + "`sents[before:after]` and `sent` are allowed to " + f"define a context getter, got {query!r}" + ) + + @classmethod + def validate(cls, obj, config=None): + if isinstance(obj, cls): + return obj + if isinstance(obj, str): + return cls.parse(obj) + if isinstance(obj, tuple): + assert len(obj) == 2 + return WordContextWindow(*obj) + if isinstance(obj, int): + assert obj != 0, "The provided `window` should not be 0" + return WordContextWindow(obj, 0) if obj < 0 else WordContextWindow(0, obj) + raise ValueError(f"Invalid context: {obj}") + + @classmethod + def __get_validators__(cls): + yield cls.validate + + +class LeafContextWindowMeta(ContextWindowMeta): + def __getitem__(cls, item) -> Span: + assert isinstance(item, slice) + before = item.start + after = item.stop + return cls(before, after) + + +class LeafContextWindow(ContextWindow, metaclass=LeafContextWindowMeta): + pass + + +class WordContextWindow(LeafContextWindow): + def __init__( + self, + before: Optional[int] = None, + after: Optional[int] = None, + ): + self.before = before + self.after = after + + def __call__(self, span): + start = span.start + self.before if self.before is not None else 0 + end = span.end + self.after if self.after is not None else len(span.doc) + return span.doc[max(0, start) : min(len(span.doc), end)] + + def __repr__(self): + return "words[{}:{}]".format(self.before, self.after) + + +class SentenceContextWindow(LeafContextWindow): + def __init__( + self, + before: Optional[int] = None, + after: Optional[int] = None, + ): + self.before = before + self.after = after + + def __call__(self, span): + sent_starts = span.doc.to_array("SENT_START") == 1 + sent_indices = sent_starts.cumsum() + sent_indices = sent_indices - sent_indices[span.start] + + start_idx = end_idx = None + if self.before is not None: + start = sent_starts & (sent_indices == self.before) + x = np.flatnonzero(start) + start_idx = x[-1] if len(x) else 0 + + if self.after is not None: + end = sent_starts & (sent_indices == self.after + 1) + x = np.flatnonzero(end) + end_idx = x[0] - 1 if len(x) else len(span.doc) + + return span.doc[start_idx:end_idx] + + def __repr__(self): + return "sents[{}:{}]".format(self.before, self.after) + + +class UnionContextWindow(ContextWindow): + def __init__( + self, + contexts: AsList[ContextWindow], + ): + self.contexts = contexts + + def __call__(self, span): + results = [context(span) for context in self.contexts] + min_word = min([span.start for span in results]) + max_word = max([span.end for span in results]) + return span.doc[min_word:max_word] + + def __repr__(self): + return " 
| ".join(repr(context) for context in self.contexts) + + +class IntersectionContextWindow(ContextWindow): + def __init__( + self, + contexts: AsList[ContextWindow], + ): + self.contexts = contexts + + def __call__(self, span): + results = [context(span) for context in self.contexts] + min_word = max([span.start for span in results]) + max_word = min([span.end for span in results]) + return span.doc[min_word:max_word] + + def __repr__(self): + return " & ".join(repr(context) for context in self.contexts) diff --git a/edsnlp/utils/typing.py b/edsnlp/utils/typing.py index 5dd675c213..5b017a60e5 100644 --- a/edsnlp/utils/typing.py +++ b/edsnlp/utils/typing.py @@ -29,7 +29,8 @@ def __get_pydantic_core_schema__(cls, source, handler): class MetaAsList(type): def __init__(cls, name, bases, dct): super().__init__(name, bases, dct) - cls.type_ = Any + type_ = next((base.type_ for base in bases if hasattr(base, "type_")), Any) + cls.type_ = type_ @functools.lru_cache(maxsize=None) def __getitem__(self, item): diff --git a/pyproject.toml b/pyproject.toml index 16a2bfcc86..fa115c1fca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -386,7 +386,7 @@ ignore-nested-functions = true ignore-nested-classes = true ignore-setters = true fail-under = 40 -exclude = ["setup.py", "docs", "build", "tests"] +exclude = ["setup.py", "docs", "build", "tests", "edsnlp/pipes/core/contextual_matcher/models.py"] verbose = 0 quiet = false whitelist-regex = [] diff --git a/tests/pipelines/core/test_contextual_matcher.py b/tests/pipelines/core/test_contextual_matcher.py index 7f4aaf6e79..4d022031c5 100644 --- a/tests/pipelines/core/test_contextual_matcher.py +++ b/tests/pipelines/core/test_contextual_matcher.py @@ -1,8 +1,14 @@ +import os + import pytest +import edsnlp +import edsnlp.pipes as eds from edsnlp.utils.examples import parse_example from edsnlp.utils.extensions import rgetattr +os.environ["CONFIT_DEBUG"] = "1" + EXAMPLES = [ """ Le patient présente une métastasis sur un cancer métastasé au stade 3 voire au stade 4. @@ -156,7 +162,6 @@ @pytest.mark.parametrize("params,example", list(zip(ALL_PARAMS, EXAMPLES))) def test_contextual(blank_nlp, params, example): - include_assigned, replace_entity, reduce_mode_stage, reduce_mode_metastase = params blank_nlp.add_pipe( @@ -225,9 +230,49 @@ def test_contextual(blank_nlp, params, example): assert len(doc.ents) == len(entities) for entity, ent in zip(entities, doc.ents): - for modifier in entity.modifiers: + assert rgetattr(ent, modifier.key) == modifier.value, ( + f"{modifier.key} labels don't match." + ) + - assert ( - rgetattr(ent, modifier.key) == modifier.value - ), f"{modifier.key} labels don't match." +def test_contextual_matcher_include(blank_nlp): + if not isinstance(blank_nlp, edsnlp.Pipeline): + pytest.skip("Only running for edsnlp.Pipeline") + blank_nlp.add_pipe( + eds.quantities( + span_setter=["sizes"], + quantities=["size"], + ), + ) + blank_nlp.add_pipe( + eds.contextual_matcher( + name="tumor_size", + label="tumor_size", + assign_as_span=True, + patterns=[ + dict( + source="tumor_size", + terms=["cancer", "tumeur"], + regex_attr="NORM", + include=dict(regex="mamm", window="sents[-1:1]"), + assign=dict( + name="size", + span_getter="sizes", + reduce_mode="first", + required=True, + ), + ) + ], + ), + ) + doc = blank_nlp("""\ +Bilan mammaire: +La tumeur est de 3 cm. +Tumeur au pied sans changement. +Tumeur mammaire benigne. 
+""") + assert len(doc.ents) == 1 + ent = doc.ents[0] + assert ent.label_ == "tumor_size" + assert ent._.assigned["size"]._.value.cm == 3 diff --git a/tests/utils/test_span_getters.py b/tests/utils/test_span_getters.py index 6535141812..b401fb3bfc 100644 --- a/tests/utils/test_span_getters.py +++ b/tests/utils/test_span_getters.py @@ -1,8 +1,17 @@ +import pytest +from confit import validate_arguments + import edsnlp -from edsnlp.utils.span_getters import make_span_context_getter +import edsnlp.pipes as eds +from edsnlp.utils.span_getters import ( + ContextWindow, + get_spans, + make_span_context_getter, + validate_span_setter, +) -def test_span_context_getter_symmetric(lang): +def test_span_context_getter(lang): nlp = edsnlp.blank(lang) nlp.add_pipe("eds.normalizer") nlp.add_pipe("eds.sentences") @@ -45,6 +54,28 @@ def test_span_context_getter_symmetric(lang): ] +def test_span_getter_on_span(): + nlp = edsnlp.blank("eds") + nlp.add_pipe(eds.sentences()) + nlp.add_pipe( + eds.matcher( + terms={"animal": ["snake", "dog"]}, + span_setter=["ents", "animals"], + ) + ) + doc = nlp( + "There was a snake. " + "His friend was a dog. " + "He liked baking cakes. " + "But since he had no hands, he was a bad baker. " + ) + sents = list(doc.sents) + assert str(list(get_spans(sents[0], validate_span_setter("ents")))) == "[snake]" + assert str(list(get_spans(sents[0], validate_span_setter("animals")))) == "[snake]" + assert str(list(get_spans(doc[5:], validate_span_setter("animals")))) == "[dog]" + assert str(list(get_spans(doc[5:], validate_span_setter("*")))) == "[dog]" + + def test_span_context_getter_asymmetric(lang): nlp = edsnlp.blank(lang) nlp.add_pipe("eds.normalizer") @@ -98,3 +129,44 @@ def test_span_context_getter_asymmetric(lang): assert [span_getter(s).text for s in doc.ents] == [ "This is a sentence. This is another sentence with a kangaroo. This is a third one." # noqa: E501 ] + + +def test_context_getter_syntax(): + @validate_arguments + def get_snippet(span, context: ContextWindow): + return context(span) + + nlp = edsnlp.blank("eds") + nlp.add_pipe("eds.normalizer") + nlp.add_pipe("eds.sentences") + nlp.add_pipe("eds.matcher", config={"terms": {"dog": "dog"}}) + doc = nlp( + "There was a snake. " + "His friend was a dog. " + "He liked baking cakes. " + "But since he had no hands, he was a bad baker. " + ) + + assert ( + get_snippet(doc.ents[0], "words[-5:5]").text + == ". His friend was a dog. He liked baking cakes" + ) + + assert get_snippet(doc.ents[0], "words[-5:5] & sent").text == "His friend was a dog" + + assert ( + get_snippet(doc.ents[0], "words[-5:8] | sents[-1:1]").text + == "There was a snake. His friend was a dog. He liked baking cakes. 
" + "But since" + ) + + +def test_invalid_context_getter_syntax(): + @validate_arguments + def apply_context(context: ContextWindow): + pass + + apply_context("sents[-2:2]") + + with pytest.raises(ValueError): + apply_context("stuff[-2:2]") From 6b2a91818487206c3e22b53b985cc44b8428a7d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Mon, 14 Apr 2025 19:55:35 +0200 Subject: [PATCH 06/11] test: fix assert insertion in docs code blocks tests, and fix tests and warnings --- changelog.md | 3 +- docs/assets/fragments/alcohol-examples.md | 8 +- .../peptic-ulcer-disease-examples.md | 2 +- docs/assets/fragments/tobacco-examples.md | 4 +- docs/pipes/core/contextual-matcher.md | 16 ++- docs/tutorials/detecting-dates.md | 25 ++-- docs/tutorials/reason.md | 3 +- docs/utilities/tests/blocs.md | 8 +- .../contextual_matcher/contextual_matcher.py | 5 +- .../pipes/core/contextual_matcher/models.py | 8 +- edsnlp/pipes/core/endlines/endlines.py | 4 + .../consultation_dates/consultation_dates.py | 2 +- edsnlp/pipes/misc/dates/dates.py | 10 +- edsnlp/pipes/misc/dates/models.py | 5 +- edsnlp/pipes/misc/split/split.py | 3 +- edsnlp/pipes/misc/tables/tables.py | 2 +- edsnlp/pipes/ner/behaviors/alcohol/alcohol.py | 9 +- .../pipes/ner/behaviors/alcohol/patterns.py | 3 +- edsnlp/pipes/ner/behaviors/tobacco/tobacco.py | 35 ++++-- edsnlp/pipes/ner/disorders/aids/aids.py | 7 +- edsnlp/pipes/ner/disorders/base.py | 7 +- .../cerebrovascular_accident.py | 7 +- edsnlp/pipes/ner/disorders/ckd/ckd.py | 7 +- .../congestive_heart_failure.py | 7 +- .../connective_tissue_disease.py | 7 +- edsnlp/pipes/ner/disorders/copd/copd.py | 7 +- .../pipes/ner/disorders/dementia/dementia.py | 7 +- .../pipes/ner/disorders/diabetes/diabetes.py | 7 +- .../ner/disorders/hemiplegia/hemiplegia.py | 7 +- .../pipes/ner/disorders/leukemia/leukemia.py | 7 +- .../disorders/liver_disease/liver_disease.py | 7 +- .../pipes/ner/disorders/lymphoma/lymphoma.py | 7 +- .../myocardial_infarction.py | 7 +- .../peptic_ulcer_disease.py | 7 +- .../peripheral_vascular_disease.py | 7 +- .../ner/disorders/solid_tumor/solid_tumor.py | 7 +- edsnlp/pipes/qualifiers/history/history.py | 16 ++- edsnlp/tune.py | 5 +- edsnlp/utils/span_getters.py | 23 ++-- edsnlp/utils/typing.py | 5 +- tests/conftest.py | 12 ++ tests/test_docs.py | 114 +++++++++++++++++- 42 files changed, 311 insertions(+), 138 deletions(-) diff --git a/changelog.md b/changelog.md index 1dccd03113..790d2452e7 100644 --- a/changelog.md +++ b/changelog.md @@ -9,7 +9,8 @@ - Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter - Added a `filter_expr` parameter to scorers to filter the documents to score - Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity -- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans +- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans. +- Improve the contextual matcher documentation. 
### Fixed diff --git a/docs/assets/fragments/alcohol-examples.md b/docs/assets/fragments/alcohol-examples.md index 5a5a812799..f563040a37 100644 --- a/docs/assets/fragments/alcohol-examples.md +++ b/docs/assets/fragments/alcohol-examples.md @@ -73,12 +73,12 @@ spans = doc.spans["alcohol"] spans - # Out: [Alcoolism non sevré] + # Out: [Alcoolisme non sevré] span = spans[0] - span._.detailed_status - # Out: None # "sevré" is negated, so no "ABTINENCE" status + span._.detailed_status # "sevré" is negated, so no "ABTINENCE" status + # Out: None ``` @@ -90,7 +90,7 @@ spans = doc.spans["alcohol"] spans - # Out: [Alcool: 0] + # Out: [Alcool] span = spans[0] diff --git a/docs/assets/fragments/peptic-ulcer-disease-examples.md b/docs/assets/fragments/peptic-ulcer-disease-examples.md index c2a7ac52fc..2f1a793eef 100644 --- a/docs/assets/fragments/peptic-ulcer-disease-examples.md +++ b/docs/assets/fragments/peptic-ulcer-disease-examples.md @@ -41,7 +41,7 @@ spans = doc.spans["peptic_ulcer_disease"] spans - # Out: [ulcères] + # Out: [gastrique: blabla blabla blabla blabla blabla quelques ulcères] span = spans[0] diff --git a/docs/assets/fragments/tobacco-examples.md b/docs/assets/fragments/tobacco-examples.md index 80f65ff624..8e7d29c946 100644 --- a/docs/assets/fragments/tobacco-examples.md +++ b/docs/assets/fragments/tobacco-examples.md @@ -66,7 +66,7 @@ spans = doc.spans["tobacco"] spans - # Out: [Tabac: 0] + # Out: [Tabac] span = spans[0] @@ -77,7 +77,7 @@ # Out: True span._.assigned - # Out: {'zero_after': [0]} + # Out: {'zero_after': 0} ``` diff --git a/docs/pipes/core/contextual-matcher.md b/docs/pipes/core/contextual-matcher.md index 245750885e..9f81cbb6be 100644 --- a/docs/pipes/core/contextual-matcher.md +++ b/docs/pipes/core/contextual-matcher.md @@ -4,6 +4,10 @@ EDS-NLP provides simple pattern matchers like `eds.matcher` to extract regular e ## Example +The following example demonstrates how to configure and use `eds.contextual_matcher` to extract mentions of solid cancers and lymphomas, while filtering out irrelevant mentions (e.g., benign tumors) and enriching entities with contextual information such as stage or metastasis status. + +Let's dive in with the full code example: + ```python import edsnlp, edsnlp.pipes as eds @@ -38,15 +42,16 @@ nlp.add_pipe( reduce_mode="keep_last", # (15)! ), ], + source="Cancer solide", # (16)! ), dict( - source="Lymphome", # (16)! regex=["lymphom", "lymphangio"], # (17)! regex_attr="NORM", # (18)! exclude=dict( regex=["hodgkin"], # (19)! window=3, # (20)! ), + source="Lymphome", # (21)! ), ], label="cancer", @@ -69,11 +74,12 @@ nlp.add_pipe( 13. Window size for detection 14. Keep main entity 15. Keep furthest extraction -16. Source label for lymphoma +16. Optional source label for solid tumor. This can be useful to know which pattern matched the entity. 17. Regex patterns for lymphoma 18. Apply regex on normalized text 19. Exclude Hodgkin lymphoma 20. Window size for exclusion +21. Optional source label for lymphoma. This can be useful to know which pattern matched the entity. Let's explore some examples using this pipeline: @@ -114,10 +120,12 @@ Let's explore some examples using this pipeline: txt = "Le patient a eu un cancer de stade 3." doc = nlp(txt) - doc.ents[0]._.assigned - # Out: {'stage': '3'} + doc.ents[0]._.assigned # (1)! + # Out: {'stage': ['3']} ``` + 1. We get a list for 'stage' because `reduce_mode` is set to `None` (default). If you want to keep only the first or last match, set `reduce_mode="keep_first"` or `reduce_mode="keep_last"`. 
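The new `include` and `required` keys can be combined with `assign` patterns to keep only matches with the right context. A minimal sketch, adapted from `test_contextual_matcher_include` added earlier in this patch series (the sentencizer is added here so that the `sents[-1:1]` window can be resolved, and the texts are illustrative):

```python
import edsnlp, edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.sentences())
nlp.add_pipe(eds.quantities(span_setter=["sizes"], quantities=["size"]))
nlp.add_pipe(
    eds.contextual_matcher(
        label="tumor_size",
        assign_as_span=True,
        patterns=dict(
            source="tumor_size",
            terms=["cancer", "tumeur"],
            regex_attr="NORM",
            # keep an anchor only if "mamm..." occurs in the previous,
            # current or next sentence...
            include=dict(regex="mamm", window="sents[-1:1]"),
            # ...and only if a size can be assigned to it
            assign=dict(
                name="size",
                span_getter="sizes",
                reduce_mode="first",
                required=True,
            ),
        ),
    )
)

doc = nlp("Bilan mammaire :\nLa tumeur est de 3 cm.\nTumeur au pied sans changement.")
assert len(doc.ents) == 1  # only the mammary tumor with a measured size is kept
```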
+ ## Better control over the final extracted entities Three main parameters refine how entities are extracted: diff --git a/docs/tutorials/detecting-dates.md b/docs/tutorials/detecting-dates.md index e3b576828b..e61f3e3c89 100644 --- a/docs/tutorials/detecting-dates.md +++ b/docs/tutorials/detecting-dates.md @@ -160,10 +160,10 @@ for i, date in enumerate(doc.spans["dates"]): note_datetime=note_datetime, infer_from_context=False, tz=None ), ) - # Out: 0 - 12 avril - None - # Out: 1 - il y a trois jours - 1999-08-24 00:00:00 - # Out: 2 - l'année dernière - 1998-08-27 00:00:00 - # Out: 3 - mai 1995 - None +# Out: 0 - 12 avril - None +# Out: 1 - il y a trois jours - 1999-08-24 00:00:00 +# Out: 2 - l'année dernière - 1998-08-27 00:00:00 +# Out: 3 - mai 1995 - None for i, date in enumerate(doc.spans["dates"]): @@ -179,17 +179,17 @@ for i, date in enumerate(doc.spans["dates"]): default_day=15, ), ) - # Out: 0 - 12 avril - 1999-04-12T00:00:00 - # Out: 1 - il y a trois jours - 1999-08-24 00:00:00 - # Out: 2 - l'année dernière - 1998-08-27 00:00:00 - # Out: 3 - mai 1995 - 1995-05-15T00:00:00 +# Out: 0 - 12 avril - 1999-04-12 00:00:00 +# Out: 1 - il y a trois jours - 1999-08-24 00:00:00 +# Out: 2 - l'année dernière - 1998-08-27 00:00:00 +# Out: 3 - mai 1995 - 1995-05-15 00:00:00 ``` As a first heuristic, let's consider that an entity can be linked to a date if the two are in the same sentence. In the case where multiple dates are present, we'll select the closest one. ```python title="utils.py" -from edsnlp.tokens import Span +from spacy.tokens import Span from typing import List, Optional @@ -219,9 +219,8 @@ def get_event_date(ent: Span) -> Optional[Span]: We can apply this simple function: -```{ .python .no-check } +```python import edsnlp, edsnlp.pipes as eds -from utils import get_event_date from datetime import datetime nlp = edsnlp.blank("eds") @@ -247,7 +246,9 @@ for ent in doc.ents: if ent.label_ != "admission": continue date = get_event_date(ent) - print(f"{ent.text:<20}{date.text:<20}{date._.date.to_datetime(now).strftime('%d/%m/%Y'):<15}{date._.date.to_duration(now)}") + print( + f"{ent.text:<20}{date.text:<20}{date._.date.to_datetime(now).strftime('%d/%m/%Y'):<15}{date._.date.to_duration(now)}" + ) # Out: admis 12 avril 12/04/2023 21 weeks 4 days 6 hours 3 minutes 26 seconds # Out: pris en charge l'année dernière 10/09/2022 -1 year ``` diff --git a/docs/tutorials/reason.md b/docs/tutorials/reason.md index e87e155f48..bc06eaef5e 100644 --- a/docs/tutorials/reason.md +++ b/docs/tutorials/reason.md @@ -63,8 +63,7 @@ reason._.is_reason ```python # ↑ Omitted code above ↑ -entities = reason._.ents_reason # (1) -for e in entities: +for e in reason._.ents_reason: # (1) print( "Entity:", e.text, diff --git a/docs/utilities/tests/blocs.md b/docs/utilities/tests/blocs.md index 42a6edb0a3..19f81e1b71 100644 --- a/docs/utilities/tests/blocs.md +++ b/docs/utilities/tests/blocs.md @@ -1,6 +1,6 @@ # Testing Code Blocs -We created a utility that scans through markdown files, extracts code blocs and executes them to check that everything is indeed functional. +We created a utility that scans through the documentation, extracts code blocs and executes them to check that everything is indeed functional. There is more! Whenever the utility comes across an example (denoted by `# Out: `, see example below), an `assert` statement is dynamically added to the snippet to check that the output matches. 
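To make this concrete, here is a rough sketch of the rewriting the utility performs; the exact logic lives in `insert_assert_statements` in `tests/test_docs.py` (added later in this patch), so the transformed string below is only illustrative:

```python
# a documented snippet...
snippet = "v = 1\nrepr(v)\n# Out: '1'"

# ...is conceptually rewritten so the expected output becomes an assertion:
transformed = "v = 1\nvalue = repr(v)\nassert '1' == str(value)"

exec(transformed)  # raises AssertionError if the documented output drifts
```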
@@ -22,12 +22,12 @@ v = a
 assert repr(v) == "1"
 ```
 
-We can disable code checking for a specific code bloc by adding `` above it:
+We can disable code checking for a specific code bloc by adding a `.no-check` class to the code bloc:
 
 ````md
-```{ .python .no-check }
+```python { .no-check }
 test = undeclared_function(42)
 ```
 ````
 
-See the [dedicated reference][edsnlp.utils.blocs.check_md_file] for more information
+Visit the source code of [test_docs.py](https://github.com/aphp/edsnlp/blob/master/tests/test_docs.py) for more information.
diff --git a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
index 58cc5fe310..016acc9b98 100644
--- a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
+++ b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
@@ -2,7 +2,7 @@
 import warnings
 from typing import Generator, Iterable, Optional, Union
 
-from confit import VisibleDeprecationWarning
+from confit import VisibleDeprecationWarning, validate_arguments
 from loguru import logger
 from spacy.tokens import Doc, Span
 
@@ -16,6 +16,7 @@
 from .models import FullConfig, SingleAssignModel, SingleConfig
 
 
+@validate_arguments()
 class ContextualMatcher(BaseNERComponent):
     """
     Allows additional matching in the surrounding context of the main match group,
@@ -27,7 +28,7 @@ class ContextualMatcher(BaseNERComponent):
         spaCy `Language` object.
     name : Optional[str]
         The name of the pipe
-    patterns : AsList[SingleConfig]
+    patterns : FullConfig
         ??? subdoc "The patterns to match"
 
             ::: edsnlp.pipes.core.contextual_matcher.models.SingleConfig
diff --git a/edsnlp/pipes/core/contextual_matcher/models.py b/edsnlp/pipes/core/contextual_matcher/models.py
index 1107463288..b643b84b4b 100644
--- a/edsnlp/pipes/core/contextual_matcher/models.py
+++ b/edsnlp/pipes/core/contextual_matcher/models.py
@@ -2,7 +2,7 @@
 from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import regex
-from pydantic import BaseModel, Extra
+from pydantic import BaseModel
 
 from edsnlp.matchers.utils import ListOrStr
 from edsnlp.utils.span_getters import (
@@ -149,10 +149,10 @@ class SingleAssignModel(BaseModel):
     replace_entity : Optional[bool]
         If set to `True`, the match from the corresponding assign key will be used as
         entity, instead of the main match.
-        See [this paragraph][the-replace_entity-parameter]
+        See [this paragraph][replace_entity]
     reduce_mode : Optional[Flags]
         Set how multiple assign matches are handled. See the documentation of the
-        [`reduce_mode` parameter][the-reduce_mode-parameter]
+        [`reduce_mode` parameter][reduce_mode]
     required : Optional[bool]
         If set to `True`, the assign key must match for the extraction to be kept. If
         it does not match, the extraction is discarded.
@@ -226,7 +226,7 @@ def __get_validators__(cls):
     AssignModel = List[SingleAssignModel]  # noqa: F811
 
 
-class SingleConfig(BaseModel, extra=Extra.forbid):
+class SingleConfig(BaseModel, extra="forbid"):
     """
     A single configuration for the contextual matcher.
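Since `ExcludeModel`, `IncludeModel`, `AssignModel` and `FullConfig` are now all `AsList` subclasses, a single pattern dict is transparently coerced into a one-element list of validated models. A minimal sketch, assuming the `cast` helper of `edsnlp.utils.typing` (patched below) runs the pydantic validators:

```python
from edsnlp.pipes.core.contextual_matcher.models import FullConfig
from edsnlp.utils.typing import cast

# a single pattern dict...
patterns = cast(FullConfig, dict(source="cancer", regex=["cancer"]))

# ...comes out as a list containing one validated SingleConfig object
assert isinstance(patterns, list) and len(patterns) == 1
assert patterns[0].source == "cancer"
```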
diff --git a/edsnlp/pipes/core/endlines/endlines.py b/edsnlp/pipes/core/endlines/endlines.py index b2ad186acc..2d91c11550 100644 --- a/edsnlp/pipes/core/endlines/endlines.py +++ b/edsnlp/pipes/core/endlines/endlines.py @@ -143,6 +143,10 @@ def __init__( self._read_model(model_path) + def set_extensions(self): + if not Token.has_extension("excluded"): + Token.set_extension("excluded", default=False) + def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]): """ Parameters diff --git a/edsnlp/pipes/misc/consultation_dates/consultation_dates.py b/edsnlp/pipes/misc/consultation_dates/consultation_dates.py index 2f62fd2a7c..7f0cf8fe29 100644 --- a/edsnlp/pipes/misc/consultation_dates/consultation_dates.py +++ b/edsnlp/pipes/misc/consultation_dates/consultation_dates.py @@ -52,7 +52,7 @@ class ConsultationDatesMatcher(GenericMatcher): # Out: [Consultation du 03/10/2018] doc.spans["consultation_dates"][0]._.consultation_date.to_datetime() - # Out: DateTime(2018, 10, 3, 0, 0, 0) + # Out: 2018-10-03 00:00:00 ``` Extensions diff --git a/edsnlp/pipes/misc/dates/dates.py b/edsnlp/pipes/misc/dates/dates.py index 3b00719a84..bc9b5b764b 100644 --- a/edsnlp/pipes/misc/dates/dates.py +++ b/edsnlp/pipes/misc/dates/dates.py @@ -41,7 +41,7 @@ class DatesMatcher(BaseNERComponent): | `relative` | `hier`, `la semaine dernière` | | `duration` | `pendant quatre jours` | - See the [tutorial](../../tutorials/detecting-dates.md) for a presentation of a + See the [tutorial](/tutorials/detecting-dates.md) for a presentation of a full pipeline featuring the `eds.dates` component. ## Usage @@ -67,7 +67,7 @@ class DatesMatcher(BaseNERComponent): # Out: [23 août 2021, il y a un an, mai 1995] dates[0]._.date.to_datetime() - # Out: 2021-08-23T00:00:00+02:00 + # Out: 2021-08-23 00:00:00 dates[1]._.date.to_datetime() # Out: None @@ -76,7 +76,7 @@ class DatesMatcher(BaseNERComponent): doc._.note_datetime = note_datetime dates[1]._.date.to_datetime() - # Out: 2020-08-27T00:00:00+02:00 + # Out: 2020-08-27 00:00:00+00:09 date_2_output = dates[2]._.date.to_datetime( note_datetime=note_datetime, @@ -85,7 +85,7 @@ class DatesMatcher(BaseNERComponent): default_day=15, ) date_2_output - # Out: 1995-05-15T00:00:00+02:00 + # Out: 1995-05-15 00:00:00+02:00 doc.spans["durations"] # Out: [pendant une semaine] @@ -260,7 +260,7 @@ def __init__( if on_ents_only: assert span_getter is None, ( - "Cannot use both `on_ents_only` and " "`span_getter`" + "Cannot use both `on_ents_only` and `span_getter`" ) def span_getter(doc): diff --git a/edsnlp/pipes/misc/dates/models.py b/edsnlp/pipes/misc/dates/models.py index 506af3f958..0c94707765 100644 --- a/edsnlp/pipes/misc/dates/models.py +++ b/edsnlp/pipes/misc/dates/models.py @@ -40,14 +40,11 @@ class Mode(str, Enum): DURATION = "duration" -class Period(BaseModel): +class Period(BaseModel, arbitrary_types_allowed=True): FROM: Optional[Span] = None UNTIL: Optional[Span] = None DURATION: Optional[Span] = None - class Config: - arbitrary_types_allowed = True - class BaseDate(BaseModel): mode: Mode = None diff --git a/edsnlp/pipes/misc/split/split.py b/edsnlp/pipes/misc/split/split.py index 17db63384e..d26b115686 100644 --- a/edsnlp/pipes/misc/split/split.py +++ b/edsnlp/pipes/misc/split/split.py @@ -128,8 +128,7 @@ def __init__( stream = stream.map(eds.split(max_length=5, regex="\\n{2,}")) print(" | ".join(doc.text.strip() for doc in stream)) - # Out: - # Sentence 1 | This is another longer sentence | more than 5 words + # Out: Sentence 1 | This is another longer sentence | more than 5 
words ``` Parameters diff --git a/edsnlp/pipes/misc/tables/tables.py b/edsnlp/pipes/misc/tables/tables.py index ede5a91b4e..9d0dcc9ac0 100644 --- a/edsnlp/pipes/misc/tables/tables.py +++ b/edsnlp/pipes/misc/tables/tables.py @@ -78,7 +78,7 @@ class TablesMatcher(BaseComponent): index=False, # set True to use the first column as index ) type(df) - # Out: pandas.core.frame.DataFrame + # Out: ``` The pandas DataFrame: diff --git a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py index ca543fa6e1..a22e1c993d 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py @@ -1,10 +1,11 @@ """`eds.alcohol` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.qualifiers.negation import NegationQualifier from ...disorders.base import DisorderMatcher @@ -59,7 +60,7 @@ class AlcoholMatcher(DisorderMatcher): ), ), ) - nlp.add_pipe(f"eds.alcohol") + nlp.add_pipe(eds.alcohol()) ``` Below are a few examples: @@ -72,7 +73,7 @@ class AlcoholMatcher(DisorderMatcher): The pipeline object name : Optional[str] The name of the component - patterns : Union[Dict[str, Any], List[Dict[str, Any]]] + patterns : FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -91,7 +92,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "alcohol", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label="alcohol", span_setter={"ents": True, "alcohol": True}, ): diff --git a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py index 38c795926f..0d041722fa 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py @@ -1,4 +1,4 @@ -default_patterns = dict( +default_pattern = dict( source="alcohol", regex=[ r"\balco[ol]", @@ -40,3 +40,4 @@ ), ], ) +default_patterns = [default_pattern] diff --git a/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py b/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py index 36c0701cce..3010478c0f 100644 --- a/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py +++ b/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py @@ -1,18 +1,20 @@ """`eds.tobacco` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Iterable, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig +from edsnlp.pipes.qualifiers.negation import NegationQualifier from edsnlp.utils.numbers import parse_digit -from ..alcohol.alcohol import AlcoholMatcher +from ...disorders.base import DisorderMatcher from .patterns import default_patterns -class TobaccoMatcher(AlcoholMatcher): +class TobaccoMatcher(DisorderMatcher): """ The `eds.tobacco` pipeline component extracts mentions of tobacco consumption. 
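The `process` override added further down in this diff re-runs a negation qualifier on "stopped" mentions and flags null consumptions (e.g., zero pack-years) as negated. A small sketch on illustrative texts (the exact pattern coverage is an assumption):

```python
import edsnlp, edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.sentences())
nlp.add_pipe(eds.normalizer())
nlp.add_pipe(eds.tobacco())

# a non-negated "stopped" mention yields the ABSTINENCE status
span = nlp("Tabagisme sevré depuis 2020").spans["tobacco"][0]
span._.detailed_status
# Out: 'ABSTINENCE'

# a null consumption is flagged as a negation
span = nlp("Tabac: 0").spans["tobacco"][0]
span._.negation
# Out: True
```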
@@ -80,7 +82,7 @@ class TobaccoMatcher(AlcoholMatcher): The pipeline object name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -99,7 +101,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "tobacco", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "tobacco", span_setter: SpanSetterArg = {"ents": True, "tobacco": True}, ): @@ -108,21 +110,34 @@ def __init__( name=name, patterns=patterns, label=label, + detailed_status_mapping={ + 1: None, + 2: "ABSTINENCE", + }, span_setter=span_setter, + include_assigned=True, ) + self.negation = NegationQualifier(nlp) - def process(self, doc: Doc) -> List[Span]: + def process(self, doc: Doc) -> Iterable[Span]: for span in super().process(doc): - if "secondhand" in span._.assigned.keys(): + if "stopped" in span._.assigned: + # using nlp(text) so that we don't assign negation flags on + # the original document + stopped = self.negation.process(span) + if not any(stopped_token.negation for stopped_token in stopped.tokens): + span._.status = 2 + if "zero_after" in span._.assigned: span._.negation = True - - elif "PA" in span._.assigned.keys(): + if "secondhand" in span._.assigned: + span._.negation = True + if "PA" in span._.assigned and ("stopped" not in span._.assigned): pa = parse_digit( span._.assigned["PA"], atttr="NORM", ignore_excluded=True, ) - if (pa == 0) and ("stopped" not in span._.assigned.keys()): + if pa == 0: span._.negation = True yield span diff --git a/edsnlp/pipes/ner/disorders/aids/aids.py b/edsnlp/pipes/ner/disorders/aids/aids.py index f12e7a911a..2e5e24d2bd 100644 --- a/edsnlp/pipes/ner/disorders/aids/aids.py +++ b/edsnlp/pipes/ner/disorders/aids/aids.py @@ -1,10 +1,11 @@ """`eds.aids` pipeline""" import itertools -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from edsnlp.pipes.qualifiers.hypothesis import HypothesisQualifier from edsnlp.pipes.qualifiers.hypothesis.factory import ( @@ -82,7 +83,7 @@ class AIDSMatcher(DisorderMatcher): The pipeline object name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -101,7 +102,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "aids", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "aids", span_setter: SpanSetterArg = {"ents": True, "aids": True}, ): diff --git a/edsnlp/pipes/ner/disorders/base.py b/edsnlp/pipes/ner/disorders/base.py index 06d25358d7..dcceec302b 100644 --- a/edsnlp/pipes/ner/disorders/base.py +++ b/edsnlp/pipes/ner/disorders/base.py @@ -1,11 +1,12 @@ import re -from typing import Any, Dict, List, Union +from typing import Dict, Union from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg from edsnlp.pipes.core.contextual_matcher import ContextualMatcher +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from 
edsnlp.utils.deprecation import deprecated_getter_factory from edsnlp.utils.filter import filter_spans @@ -20,7 +21,7 @@ class DisorderMatcher(ContextualMatcher): spaCy `Language` object. name : str The name of the pipe - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The configuration dictionary include_assigned : bool Whether to include (eventual) assign matches to the final entity @@ -44,7 +45,7 @@ def __init__( name: str, *, label: str, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]], + patterns: FullConfig, include_assigned: bool = True, ignore_excluded: bool = True, ignore_space_tokens: bool = True, diff --git a/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py b/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py index 6d562b8e09..2c60db4140 100644 --- a/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py +++ b/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py @@ -1,11 +1,12 @@ """`eds.cerebrovascular_accident` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -69,7 +70,7 @@ class CerebrovascularAccidentMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -88,7 +89,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "cerebrovascular_accident", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "cerebrovascular_accident", span_setter: SpanSetterArg = {"ents": True, "cerebrovascular_accident": True}, ): diff --git a/edsnlp/pipes/ner/disorders/ckd/ckd.py b/edsnlp/pipes/ner/disorders/ckd/ckd.py index e4a4ca20d0..3a0dc7f35d 100644 --- a/edsnlp/pipes/ner/disorders/ckd/ckd.py +++ b/edsnlp/pipes/ner/disorders/ckd/ckd.py @@ -1,12 +1,13 @@ """`eds.ckd` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from loguru import logger from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -82,7 +83,7 @@ class CKDMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -101,7 +102,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "ckd", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "ckd", span_setter: SpanSetterArg = {"ents": True, "ckd": True}, ): diff --git a/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py b/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py index 2ad2753368..42a7d30bb7 
100644 --- a/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py +++ b/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py @@ -1,9 +1,10 @@ """`eds.congestive_heart_failure` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class CongestiveHeartFailureMatcher(DisorderMatcher): The pipeline object name : str, The name of the component - patterns : Optional[Dict[str, Any]] + patterns : FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "congestive_heart_failure", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "congestive_heart_failure", span_setter: SpanSetterArg = {"ents": True, "congestive_heart_failure": True}, ): diff --git a/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py b/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py index 1819c4a13f..18547c8918 100644 --- a/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py +++ b/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py @@ -1,11 +1,12 @@ """`eds.connective_tissue_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -66,7 +67,7 @@ class ConnectiveTissueDiseaseMatcher(DisorderMatcher): The pipeline object name : str The name of the component - patterns : Optional[Dict[str, Any]] + patterns : FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -85,7 +86,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "connective_tissue_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "connective_tissue_disease", span_setter: SpanSetterArg = {"ents": True, "connective_tissue_disease": True}, ): diff --git a/edsnlp/pipes/ner/disorders/copd/copd.py b/edsnlp/pipes/ner/disorders/copd/copd.py index 9ebc6fd009..f6a1e3a559 100644 --- a/edsnlp/pipes/ner/disorders/copd/copd.py +++ b/edsnlp/pipes/ner/disorders/copd/copd.py @@ -1,11 +1,12 @@ """`eds.copd` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from spacy.tokens import Doc from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -69,7 +70,7 @@ class COPDMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ 
-88,7 +89,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "copd", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "copd", span_setter: SpanSetterArg = {"ents": True, "copd": True}, ): diff --git a/edsnlp/pipes/ner/disorders/dementia/dementia.py b/edsnlp/pipes/ner/disorders/dementia/dementia.py index a33c4824e7..7de6438809 100644 --- a/edsnlp/pipes/ner/disorders/dementia/dementia.py +++ b/edsnlp/pipes/ner/disorders/dementia/dementia.py @@ -1,9 +1,10 @@ """`eds.dementia` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class DementiaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "dementia", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "dementia", span_setter: SpanSetterArg = {"ents": True, "dementia": True}, ): diff --git a/edsnlp/pipes/ner/disorders/diabetes/diabetes.py b/edsnlp/pipes/ner/disorders/diabetes/diabetes.py index f2d2467ec5..c32f2cb00b 100644 --- a/edsnlp/pipes/ner/disorders/diabetes/diabetes.py +++ b/edsnlp/pipes/ner/disorders/diabetes/diabetes.py @@ -1,6 +1,6 @@ """`eds.diabetes` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span @@ -8,6 +8,7 @@ from edsnlp.matchers.regex import RegexMatcher from edsnlp.matchers.utils import get_text from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import COMPLICATIONS, default_patterns @@ -74,7 +75,7 @@ class DiabetesMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -93,7 +94,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "diabetes", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "diabetes", span_setter: SpanSetterArg = {"ents": True, "diabetes": True}, ): diff --git a/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py b/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py index 7baa64f075..c391223f9c 100644 --- a/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py +++ b/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py @@ -1,9 +1,10 @@ """`eds.hemiplegia` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class HemiplegiaMatcher(DisorderMatcher): The pipeline name : Optional[str] 
The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "hemiplegia", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "hemiplegia", span_setter: SpanSetterArg = {"ents": True, "hemiplegia": True}, ): diff --git a/edsnlp/pipes/ner/disorders/leukemia/leukemia.py b/edsnlp/pipes/ner/disorders/leukemia/leukemia.py index 7da1533ccf..cc723eb56b 100644 --- a/edsnlp/pipes/ner/disorders/leukemia/leukemia.py +++ b/edsnlp/pipes/ner/disorders/leukemia/leukemia.py @@ -1,9 +1,10 @@ """`eds.leukemia` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class LeukemiaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "leukemia", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "leukemia", span_setter: SpanSetterArg = {"ents": True, "leukemia": True}, ): diff --git a/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py b/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py index 1c5f1b76a7..9e567c70d6 100644 --- a/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py +++ b/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py @@ -1,11 +1,12 @@ """`eds.liver_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -67,7 +68,7 @@ class LiverDiseaseMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -86,7 +87,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "liver_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "liver_disease", span_setter: SpanSetterArg = {"ents": True, "liver_disease": True}, ): diff --git a/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py b/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py index b4e130ddad..fc491621ac 100644 --- a/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py +++ b/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py @@ -1,9 +1,10 @@ """`eds.lymphoma` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from 
edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -67,7 +68,7 @@ class LymphomaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -86,7 +87,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "lymphoma", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "lymphoma", span_setter: SpanSetterArg = {"ents": True, "lymphoma": True}, ): diff --git a/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py b/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py index 51f0f41db5..3aed7d3b53 100644 --- a/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py +++ b/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py @@ -1,11 +1,12 @@ """`eds.myocardial_infarction` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -71,7 +72,7 @@ class MyocardialInfarctionMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -90,7 +91,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "myocardial_infarction", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "myocardial_infarction", span_setter: SpanSetterArg = {"ents": True, "myocardial_infarction": True}, ): diff --git a/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py b/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py index 3bd55440f5..09076e14ab 100644 --- a/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py +++ b/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py @@ -1,11 +1,12 @@ """`eds.peptic_ulcer_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -66,7 +67,7 @@ class PepticUlcerDiseaseMatcher(DisorderMatcher): The pipeline object name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -85,7 +86,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "peptic_ulcer_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = 
"peptic_ulcer_disease", span_setter: SpanSetterArg = {"ents": True, "peptic_ulcer_disease": True}, ): diff --git a/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py b/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py index 99c0e9cbd8..3eff5c6bc0 100644 --- a/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py +++ b/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py @@ -1,11 +1,12 @@ """`eds.peripheral_vascular_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -66,7 +67,7 @@ class PeripheralVascularDiseaseMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -86,7 +87,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "peripheral_vascular_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "peripheral_vascular_disease", span_setter: SpanSetterArg = { "ents": True, diff --git a/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py b/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py index a577fd2c22..a0db70d422 100644 --- a/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py +++ b/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py @@ -1,11 +1,12 @@ """`eds.solid_tumor` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.utils.numbers import parse_digit from ..base import DisorderMatcher @@ -71,7 +72,7 @@ class SolidTumorMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -95,7 +96,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "solid_tumor", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, use_tnm: bool = False, use_patterns_metastasis_ct_scan: bool = False, label: str = "solid_tumor", diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py index 2b4f3c7623..8b753a2e9d 100644 --- a/edsnlp/pipes/qualifiers/history/history.py +++ b/edsnlp/pipes/qualifiers/history/history.py @@ -83,9 +83,19 @@ class HistoryQualifier(RuleBasedQualifier): !!! info "Dates" - To take the most of the `eds.dates` component, you may add the ``note_datetime`` - context (cf. [Adding context][using-eds-nlps-helper-functions]). 
It allows the
-    component to compute the duration of absolute dates
+    To make the most of the `eds.dates` component, you may set a value for
+    `doc._.note_datetime`, either directly:
+
+    ```python { .no-check }
+    doc = nlp.make_doc(text)
+    doc._.note_datetime = datetime.datetime(2022, 8, 28)
+    nlp(doc)
+    ```
+
+    or using a converter such as the
+    [`omop` converter][edsnlp.data.converters.OmopDict2DocConverter].
+
+    It allows the component to compute the duration of absolute dates
     (e.g., le 28 août 2022/August 28, 2022). The ``birth_datetime``
     context allows the component to exclude the birthdate from the
     extracted dates.
diff --git a/edsnlp/tune.py b/edsnlp/tune.py
index 1dd6188e39..cff2abccc6 100644
--- a/edsnlp/tune.py
+++ b/edsnlp/tune.py
@@ -34,7 +34,7 @@
 CHECKPOINT = "study.pkl"
 
 
-class HyperparameterConfig(BaseModel):
+class HyperparameterConfig(BaseModel, extra="forbid"):
     """
     A configuration model for hyperparameters used in optimization or tuning
     processes.
     """
@@ -47,9 +47,6 @@ class HyperparameterConfig(BaseModel):
     log: Optional[bool] = None
     choices: Optional[List[Union[str, float, int, bool]]] = None
 
-    class Config:
-        extra = "forbid"
-
 if pydantic.VERSION < "2":
     model_dump = BaseModel.dict
 
diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py
index 23f2fb8ecd..58166a3054 100644
--- a/edsnlp/utils/span_getters.py
+++ b/edsnlp/utils/span_getters.py
@@ -346,7 +346,7 @@ class ContextWindowMeta(abc.ABCMeta):
     pass
 
 
-class ContextWindow(abc.ABC, metaclass=ContextWindowMeta):
+class ContextWindow(Validated, abc.ABC, metaclass=ContextWindowMeta):
     """
     A ContextWindow specifies how much additional context (such as sentences or
     words) should be included relative to an anchor span. For example, one might
     define a
@@ -361,11 +361,14 @@ class ContextWindow(abc.ABC, metaclass=ContextWindowMeta):
 
     Examples
     --------
+    ```python
     from confit import validate_arguments
-    from edsnlp.utils.span_getters import ContextWindow
     from spacy.tokens import Span
 
+    import edsnlp
+    from edsnlp.utils.span_getters import ContextWindow
+
 
     @validate_arguments
     def apply_context(span: Span, ctx: ContextWindow):
@@ -373,13 +376,21 @@ def apply_context(span: Span, ctx: ContextWindow):
         return ctx(span)
 
 
+    nlp = edsnlp.blank("eds")
+    nlp.add_pipe("eds.sentences")
+
+    doc = nlp("A first sentence. A second sentence, longer this time. A third.")
+    span = doc[5:6]  # "second"
+
-    # Will return a span with the 3 words before and after the span
+    # Will return a span with the 3 words before and after the span
     # and words of the current sentence and the next sentence.
-    apply_context(span, "words[-10:10] | sents[0:1]")
+    apply_context(span, "words[-3:3] | sents[0:1]").text
+    # Out: "sentence. A second sentence, longer this time. A third."
 
-    # Will return the span covering at most the -4 and +4 words
+    # Will return the span covering at most the -4 and +4 words
     # around the span and the current sentence of the span.
-    apply_context(span, "words[-5:5] & sent")
+    apply_context(span, "words[-4:4] & sent").text
+    # Out: "A second sentence, longer this"
     ```
 
     !!! 
warning "Indexing" @@ -442,10 +453,6 @@ def validate(cls, obj, config=None): return WordContextWindow(obj, 0) if obj < 0 else WordContextWindow(0, obj) raise ValueError(f"Invalid context: {obj}") - @classmethod - def __get_validators__(cls): - yield cls.validate - class LeafContextWindowMeta(ContextWindowMeta): def __getitem__(cls, item) -> Span: diff --git a/edsnlp/utils/typing.py b/edsnlp/utils/typing.py index 5b017a60e5..d7f0eb9c7a 100644 --- a/edsnlp/utils/typing.py +++ b/edsnlp/utils/typing.py @@ -64,12 +64,9 @@ class AsList(Generic[T], metaclass=MetaAsList): if pydantic.VERSION < "2": def cast(type_, obj): - class Model(pydantic.BaseModel): + class Model(pydantic.BaseModel, arbitrary_types_allowed=True): __root__: type_ - class Config: - arbitrary_types_allowed = True - return Model(__root__=obj).__root__ else: from dataclasses import is_dataclass diff --git a/tests/conftest.py b/tests/conftest.py index 54578d4154..26b3306d9a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,6 +27,18 @@ pytest.importorskip("rich") +def pytest_collection_modifyitems(items): + """Run test_docs* at the end""" + first_tests = [] + last_tests = [] + for item in items: + if item.name.startswith("test_code_blocks"): + last_tests.append(item) + else: + first_tests.append(item) + items[:] = first_tests + last_tests + + @fixture(scope="session", params=["eds", "fr"]) def lang(request): return request.param diff --git a/tests/test_docs.py b/tests/test_docs.py index 5bc4ef1f93..24fe2e0234 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,7 +1,13 @@ +import ast +import inspect +import re import sys +import textwrap import warnings +import catalogue import pytest +from spacy.tokens.underscore import Underscore pytest.importorskip("mkdocs") try: @@ -20,6 +26,8 @@ url_to_code = {} else: url_to_code = dict(extract_docs_code()) + # just to make sure something didn't go wrong + assert len(url_to_code) > 50 def printer(code: str) -> None: @@ -38,10 +46,97 @@ def printer(code: str) -> None: print("\n".join(lines)) +def insert_assert_statements(code): + line_table = [0] + for line in code.splitlines(keepends=True): + line_table.append(line_table[-1] + len(line)) + + tree = ast.parse(code) + replacements = [] + + for match in re.finditer( + r"^\s*#\s*Out\s*: (.*$(?:\n#\s.*$)*)", code, flags=re.MULTILINE + ): + lineno = code[: match.start()].count("\n") + for stmt in tree.body: + if stmt.end_lineno == lineno: + if isinstance(stmt, ast.Expr): + expected = textwrap.dedent(match.group(1)).replace("\n# ", "\n") + begin = line_table[stmt.lineno - 1] + if not (expected.startswith("'") or expected.startswith('"')): + expected = repr(expected) + end = match.end() + stmt_str = ast.unparse(stmt) + if stmt_str.startswith("print("): + stmt_str = stmt_str[len("print") :] + repl = f"""\ +value = {stmt_str} +assert {expected} == str(value) +""" + replacements.append((begin, end, repl)) + if isinstance(stmt, ast.For): + expected = textwrap.dedent(match.group(1)).split("\n# Out: ") + expected = [line.replace("\n# ", "\n") for line in expected] + begin = line_table[stmt.lineno - 1] + end = match.end() + stmt_str = ast.unparse(stmt).replace("print", "assert_print") + repl = f"""\ +printed = [] +{stmt_str} +assert {expected} == printed +""" + replacements.append((begin, end, repl)) + + for begin, end, repl in reversed(replacements): + code = code[:begin] + repl + code[end:] + + return code + + +# TODO: once in a while, it can be interesting to run reset_imports for each code block, +# instead of only once and tests 
should still pass, but it's way slower.
+@pytest.fixture(scope="module")
+def reset_imports():
+    """
+    Reset the imports for the test module.
+    """
+    # 1. Clear registered functions to avoid using cached ones
+    for k, m in list(catalogue.REGISTRY.items()):
+        mod = inspect.getmodule(m)
+        if mod is not None and mod.__name__.startswith("edsnlp"):
+            del catalogue.REGISTRY[k]
+
+    # Let's ensure that we "bump" into every possible warning:
+    # 2. Remove all modules that start with edsnlp, to reimport them
+    for k in list(sys.modules):
+        if k.split(".")[0] == "edsnlp":
+            del sys.modules[k]
+
+    # 3. Delete spacy extensions to avoid errors when re-importing
+    Underscore.span_extensions.clear()
+    Underscore.doc_extensions.clear()
+    Underscore.token_extensions.clear()
+
+
 # Note the use of `str`, makes for pretty output
 @pytest.mark.parametrize("url", sorted(url_to_code.keys()), ids=str)
-def test_code_blocks(url):
-    raw = url_to_code[url]
+def test_code_blocks(url, tmpdir, reset_imports):
+    code = url_to_code[url]
+    code_with_asserts = """
+def assert_print(*args, sep=" ", end="\\n", file=None, flush=False):
+    printed.append((sep.join(map(str, args)) + end).rstrip('\\n'))
+
+""" + insert_assert_statements(code)
+    assert "# Out:" not in code_with_asserts, (
+        f"Unparsed asserts in {url}:\n" + code_with_asserts
+    )
+    # We'll import test_code_blocks from here
+    sys.path.insert(0, str(tmpdir))
+    test_file = tmpdir.join("test_code_blocks.py")
+
+    # Clear all warnings
+    warnings.resetwarnings()
+
     try:
         with warnings.catch_warnings():
             warnings.simplefilter("error")
@@ -49,7 +144,18 @@
             warnings.filterwarnings(
                 message="__package__ != __spec__.parent", action="ignore"
             )
-            exec(raw, {"__MODULE__": "__main__"})
+            # First, forget test_code_blocks
+            sys.modules.pop("test_code_blocks", None)
+
+            # Then, reimport it, to let pytest do its assertion rewriting magic
+            test_file.write_text(code_with_asserts, encoding="utf-8")
+
+            import test_code_blocks  # noqa: F401
+
+            exec(
+                compile(code_with_asserts, test_file, "exec"),
+                {"__MODULE__": "__main__"},
+            )
     except Exception:
-        printer(raw)
+        printer(code_with_asserts)
         raise

From 108950439b4ae27358c1e7d7f9fed8756a17a31d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Tue, 15 Apr 2025 01:59:31 +0200
Subject: [PATCH 07/11] fix: prevent include/exclude patterns from matching strictly inside the anchor
---
 changelog.md                                  |  3 +++
 .../contextual_matcher/contextual_matcher.py  | 24 ++++++++++++++++++++----
 .../pipes/core/contextual_matcher/models.py   |  6 +++--
 .../pipelines/core/test_contextual_matcher.py | 25 +++++++++++++++++++
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/changelog.md b/changelog.md
index 790d2452e7..2b66e6e951 100644
--- a/changelog.md
+++ b/changelog.md
@@ -10,6 +10,9 @@
 - Added a `filter_expr` parameter to scorers to filter the documents to score
 - Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity
 - Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans.
+- Include and exclude patterns in the contextual matcher now dismiss matches that occur inside the anchor pattern (e.g. 
"anti" exclude pattern for anchor pattern "antibiotics" will not match the "anti" part of "antibiotics") + +### Changed - Improve the contextual matcher documentation. ### Fixed diff --git a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py index 016acc9b98..a036b2808c 100644 --- a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py +++ b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py @@ -207,9 +207,17 @@ def filter_one(self, span: Span, pattern) -> Optional[Span]: if ( exclude.regex_matcher is not None - and next(exclude.regex_matcher(snippet), None) is not None + and any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in exclude.regex_matcher(snippet, as_spans=True) + ) or exclude.span_getter is not None - and next(get_spans(snippet, exclude.regex_matcher), None) is not None + and any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in get_spans(snippet, exclude.span_getter) + ) ): to_keep = False break @@ -219,9 +227,17 @@ def filter_one(self, span: Span, pattern) -> Optional[Span]: if ( include.regex_matcher is not None - and next(include.regex_matcher(snippet), None) is None + and not any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in include.regex_matcher(snippet, as_spans=True) + ) or include.span_getter is not None - and next(get_spans(snippet, include.regex_matcher), None) is None + and not any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in get_spans(snippet, include.span_getter) + ) ): to_keep = False break diff --git a/edsnlp/pipes/core/contextual_matcher/models.py b/edsnlp/pipes/core/contextual_matcher/models.py index b643b84b4b..544108ff7c 100644 --- a/edsnlp/pipes/core/contextual_matcher/models.py +++ b/edsnlp/pipes/core/contextual_matcher/models.py @@ -50,7 +50,8 @@ class SingleExcludeModel(BaseModel): """ A dictionary to define exclusion rules. Exclusion rules are given as Regexes, and if a match is found in the surrounding context of an extraction, the extraction is - removed. Each dictionary should have the following keys: + removed. Note that only take a match into account if it is not inside the anchor + span. Parameters ---------- @@ -83,7 +84,8 @@ class SingleIncludeModel(BaseModel): """ A dictionary to define inclusion rules. Inclusion rules are given as Regexes, and if a match isn't found in the surrounding context of an extraction, the extraction - is removed. Each dictionary should have the following keys: + is removed. Note that only take a match into account if it is not inside the anchor + span. Parameters ---------- diff --git a/tests/pipelines/core/test_contextual_matcher.py b/tests/pipelines/core/test_contextual_matcher.py index 4d022031c5..e98da9b3ae 100644 --- a/tests/pipelines/core/test_contextual_matcher.py +++ b/tests/pipelines/core/test_contextual_matcher.py @@ -276,3 +276,28 @@ def test_contextual_matcher_include(blank_nlp): ent = doc.ents[0] assert ent.label_ == "tumor_size" assert ent._.assigned["size"]._.value.cm == 3 + + +# Checks https://github.com/aphp/edsnlp/issues/394 +def test_contextual_matcher_exclude_outside(): + import edsnlp + import edsnlp.pipes as eds + + asa_pattern = r"\basa\b ?:? 
?([1-5]|[A-Z]{1,3})"
+    exclude_asa_ttt = r"5"
+    asa = dict(
+        source="asa",
+        regex=asa_pattern,
+        regex_attr="NORM",
+        exclude=dict(regex=exclude_asa_ttt, window=-5),
+    )
+
+    nlp = edsnlp.blank("eds")
+    nlp.add_pipe(eds.sentences())
+    nlp.add_pipe(eds.contextual_matcher(patterns=[asa], label="asa"))
+
+    doc = nlp("ASA 5")
+    assert str(doc.ents) == "(ASA 5,)"
+
+    doc = nlp("5 ASA 5")
+    assert str(doc.ents) == "()"

From c5730165671694479c1ce43394717aea93b1cf0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Tue, 15 Apr 2025 11:41:04 +0200
Subject: [PATCH 08/11] fix: handle list of patterns in eds.score's value_extract param
---
 changelog.md                          |  2 +-
 edsnlp/pipes/ner/scores/base_score.py | 18 +++++++--------
 tests/pipelines/ner/test_score.py     | 31 +++++++++++++++++++++++-----
 3 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/changelog.md b/changelog.md
index 2b66e6e951..a7a916575b 100644
--- a/changelog.md
+++ b/changelog.md
@@ -20,7 +20,7 @@
 - `edsnlp.package` now correctly detects if a project uses an old-style poetry pyproject or a PEP621 pyproject.toml.
 - PEP621 projects containing nested directories (e.g., "my_project/pipes/foo.py") are now supported.
 - Try several paths to find current pip executable
-- Compatibility with Optuna 4.3.0
+- The parameter "value_extract" of `eds.score` now correctly handles lists of patterns.
 
 ## v0.16.0 (2025-03-26)
 
diff --git a/edsnlp/pipes/ner/scores/base_score.py b/edsnlp/pipes/ner/scores/base_score.py
index 6ddb63f0ab..494985c9f8 100644
--- a/edsnlp/pipes/ner/scores/base_score.py
+++ b/edsnlp/pipes/ner/scores/base_score.py
@@ -7,6 +7,7 @@
 from edsnlp.core import PipelineProtocol, registry
 from edsnlp.pipes.base import SpanSetterArg
 from edsnlp.pipes.core.contextual_matcher import ContextualMatcher
+from edsnlp.utils.typing import AsList
 
 
 class SimpleScoreMatcher(ContextualMatcher):
@@ -55,7 +56,7 @@ def __init__(
         *,
         regex: List[str] = None,
         attr: str = "NORM",
-        value_extract: Union[str, Dict[str, str], List[Dict[str, str]]] = None,
+        value_extract: Union[AsList[Dict[str, str]], str] = None,
         score_normalization: Union[str, Callable[[Union[str, None]], Any]] = None,
         window: int = 7,
         ignore_excluded: bool = False,
@@ -79,14 +80,13 @@ def __init__(
         span_setter = {"ents": True, label: True}
 
         if isinstance(value_extract, str):
-            value_extract = dict(
-                name="value",
-                regex=value_extract,
-                window=window,
-            )
-
-        if isinstance(value_extract, dict):
-            value_extract = [value_extract]
+            value_extract = [
+                dict(
+                    name="value",
+                    regex=value_extract,
+                    window=window,
+                )
+            ]
 
         value_exists = False
         for i, extract in enumerate(value_extract):
diff --git a/tests/pipelines/ner/test_score.py b/tests/pipelines/ner/test_score.py
index 6d0aab2034..2c278db74b 100644
--- a/tests/pipelines/ner/test_score.py
+++ b/tests/pipelines/ner/test_score.py
@@ -1,6 +1,8 @@
 import re
 
-from edsnlp.pipelines.ner.scores import Score
+import edsnlp
+import edsnlp.pipes as eds
+from edsnlp.pipes.ner.scores import Score
 from edsnlp.utils.examples import parse_example
 
 example = """
@@ -67,7 +69,28 @@ def testscore_normalization(raw_score: str):
     doc = testscore(doc)
 
     for entity, ent in zip(entities, doc.ents):
         for modifier in entity.modifiers:
-            assert (
-                getattr(ent._, modifier.key) == modifier.value
-            ), f"{modifier.key} labels don't match."
+            assert getattr(ent._, modifier.key) == modifier.value, (
+                f"{modifier.key} labels don't match." 
+            )
+
+
+def test_multi_value_extract():
+    # dummy example; we have eds.quantities to extract sizes
+    nlp = edsnlp.blank("eds")
+    nlp.add_pipe(
+        eds.score(
+            name="taille",
+            regex=[r"taille"],
+            value_extract=[
+                {"name": "value", "regex": r"(\d+)"},
+                {"name": "unit", "regex": r"(cm|mm)"},
+            ],
+            score_normalization=float,
+            label="taille",
+        )
+    )
+    doc = nlp("taille 12 cm")
+    assert len(doc.ents) == 1
+    ent = doc.ents[0]
+    assert ent._.score_value == 12.0

From 117e7e037f07b15fe6b5d8900cba3ddcba6892cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Thu, 17 Apr 2025 09:43:59 +0200
Subject: [PATCH 09/11] fix: cap some dependencies to avoid source installs and speed up installation
---
 pyproject.toml | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fa115c1fca..e19115af4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,9 @@ dependencies = [
     # thinc doesn't provide binaries for python<3.9 from 8.2.5 so we need to cap it ourself
     "thinc<8.2.5; python_version<'3.9'",
     "thinc>=8.2.5; python_version>='3.9'",
+    # blis>1.2.0 (dependency of thinc) doesn't provide binaries for python<3.10 so we need to cap it ourselves
+    "blis<1.0.0; python_version<'3.9'",
+    "blis<1.2.1; python_version>='3.9' and python_version<'3.10'",
     "confit>=0.7.3",
     "tqdm",
     "umls-downloader>=0.1.1",
@@ -54,6 +57,9 @@ dev-no-ml = [
 
     "scikit-learn",
 
+    # Packaging
+    "poetry",
+
     "edsnlp[docs-no-ml]",
 ]
 docs-no-ml = [
@@ -76,7 +82,10 @@ docs-no-ml = [
 ]
 ml = [
     "rich-logger>=0.3.1",
-    "torch>=1.13.0",
+    # TODO: uv doesn't seem to resolve torch correctly, unless we cap it ourselves
+    "torch>=1.13.0,<2.0.0; python_version<'3.8'",
+    "torch>=1.13.0,<2.5.0; python_version<'3.9'",
+    "torch>=1.13.0; python_version>='3.9'",
     "foldedtensor>=0.4.0",
     "safetensors>=0.3.0; python_version>='3.8'",
     "safetensors>=0.3.0,<0.5.0; python_version<'3.8'",
@@ -337,6 +346,8 @@ requires = [
     "numpy==1.22.2; python_version>='3.8' and python_version<'3.9' and platform_machine=='loongarch64' and platform_python_implementation!='PyPy'",
     "numpy==1.22.2; python_version=='3.8' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'",
     "numpy>=2.0; python_version>='3.9'",
+    "blis<1.0.0; python_version<'3.9'",
+    "blis<1.2.1; python_version>='3.9' and python_version<'3.10'",
 ]
 build-backend = "setuptools.build_meta"
@@ -430,6 +441,9 @@ include = ["edsnlp/*"]
 concurrency = ["multiprocessing", "thread"]
 parallel = true
 
+[tool.uv.pip]
+torch-backend = "auto"
+
 [tool.cibuildwheel]
 skip = [
     "*p36-*",  # Skip Python 3.6

From de706d4ba71b2950da72a49dad27741faafd0aa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Thu, 17 Apr 2025 03:04:54 +0200
Subject: [PATCH 10/11] ci: publish docs preview for pull requests
---
 .github/workflows/tests.yml | 128 ++++++++++++++++++------------------
 .gitignore                  |   2 +
 changelog.md                |   1 +
 edsnlp/package.py           |  20 +++---
 4 files changed, 77 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9df3974b79..5d8a20d8cf 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,9 +7,11 @@ on:
     branches: [master]
 
 env:
-  # UV_INDEX_STRATEGY: "unsafe-first-match"
-  # UV_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"
+  UV_INDEX_STRATEGY: "unsafe-first-match"
+  UV_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"
   PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"
+  UV_SYSTEM_PYTHON: 1
+  BRANCH_NAME: ${{ 
github.head_ref || github.ref_name }} jobs: linting: @@ -22,7 +24,7 @@ jobs: # requites to grab the history of the PR fetch-depth: 0 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: cache: 'pip' @@ -46,19 +48,6 @@ jobs: path: ~/.data/ key: resources - # - name: Cache pip - # uses: actions/cache@v3 - # with: - # path: ~/.cache/pip - # key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip - - - run: echo WEEK=$(date +%V) >>$GITHUB_ENV - shell: bash - - # - uses: hynek/setup-cached-uv@v1 - # with: - # cache-suffix: -tests-${{ matrix.python-version }}-${{ env.WEEK }} - - name: Set up Java uses: actions/setup-java@v2 with: @@ -72,48 +61,28 @@ jobs: cache: 'pip' - name: Install dependencies - run: | - pip install poetry - pip install -e '.[dev]' pytest-xdist pip + run: pip install -e ".[dev]" if: matrix.python-version != '3.9' && matrix.python-version != '3.10' && matrix.python-version != '3.11' && matrix.python-version != '3.12' -# uv venv -# source .venv/bin/activate -# uv pip install -e '.[dev]' pytest-xdist pip - name: Install dependencies - run: | - pip install poetry - pip install -e '.[dev,setup]' pytest-xdist pip + run: pip install -e ".[dev,setup]" if: matrix.python-version == '3.9' -# uv venv -# source .venv/bin/activate -# uv pip install -e '.[dev]' pytest-xdist pip - name: Install dependencies - run: | - pip install poetry - pip install -e '.[dev-no-ml]' pytest-xdist pip # skip ML tests for 3.10 and 3.11 + run: pip install -e ".[dev-no-ml]" if: matrix.python-version == '3.10' || matrix.python-version == '3.11' || matrix.python-version == '3.12' - name: Test with Pytest on Python ${{ matrix.python-version }} env: UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }} - run: | - coverage run -m pytest --ignore tests/test_docs.py # -n auto - # coverage combine - # mv .coverage .coverage.${{ matrix.python-version }} -# source .venv/bin/activate + run: coverage run -m pytest --ignore tests/test_docs.py if: matrix.python-version != '3.9' - name: Test with Pytest on Python ${{ matrix.python-version }} env: UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }} - run: | - coverage run -m pytest # -n auto - # coverage combine - # mv .coverage .coverage.${{ matrix.python-version }} -# source .venv/bin/activate + run: coverage run -m pytest if: matrix.python-version == '3.9' - name: Upload coverage data @@ -137,33 +106,72 @@ jobs: documentation: name: Documentation - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: 3.9 cache: 'pip' - - run: echo WEEK=$(date +%V) >>$GITHUB_ENV - shell: bash + - name: Install dependencies + run: pip install -e ".[docs]" - # - uses: hynek/setup-cached-uv@v1 - # with: - # cache-suffix: -docs-${{ matrix.python-version }}-${{ env.WEEK }} + - name: Set up Git + run: | + git config user.name ${{ github.actor }} + git config user.email ${{ github.actor }}@users.noreply.github.com + echo Current branch: $BRANCH_NAME - - name: Install dependencies + - name: Build documentation run: | - pip install '.[docs]' -# uv venv -# uv pip install '.[docs]' + mike deploy --no-redirect --rebase --update-aliases $BRANCH_NAME latest + mike set-default $BRANCH_NAME + - name: Put content of gh-pages to public folder + run: rm -rf public && mkdir public && git archive gh-pages | tar -x -C ./public/ - - name: Build documentation + - name: Set up Vercel + run: npm install --global vercel@latest + + - name: Pull Vercel environment + run: 
vercel pull --yes --environment=preview --token=${{ secrets.VERCEL_TOKEN }}
+
+      - name: Create new vercel project linked to this branch
+        run: vercel project add edsnlp-$BRANCH_NAME --token=${{ secrets.VERCEL_TOKEN }}
+
+      - name: Link public folder to the (maybe) new vercel project
+        run: vercel link --cwd public --project edsnlp-$BRANCH_NAME --yes --token=${{ secrets.VERCEL_TOKEN }}
+
+      - name: Deploy to Vercel
+        run: vercel deploy public/ --yes --token=${{ secrets.VERCEL_TOKEN }} --archive=tgz --prod > deployment-url.txt
+
+      - name: Post the documentation link
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
+          URL=https://edsnlp-$BRANCH_NAME.vercel.app/
+          COMMENT_BODY="## Docs preview URL\n\n$URL\n\n"
+          HEADER="Authorization: token $GITHUB_TOKEN"
+          PR_COMMENTS_URL="https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments"
+
+          # Fetch existing comments to find if one from this workflow already exists
+          COMMENTS=$(curl -s -H "$HEADER" "$PR_COMMENTS_URL")
+          COMMENT_ID=$(echo "$COMMENTS" | jq -r '.[] | select(.user.login == "github-actions[bot]" and (.body | startswith("## Docs preview URL"))) | .id')
+
+          # Check if we have a comment ID, if so, update it, otherwise create a new one
+          if [[ "$COMMENT_ID" ]]; then
+            # Update existing comment
+            curl -s -X PATCH -H "$HEADER" -H "Content-Type: application/json" -d "{\"body\": \"$COMMENT_BODY\"}" "https://api.github.com/repos/${{ github.repository }}/issues/comments/$COMMENT_ID"
+          else
+            # Post new comment
+            curl -s -X POST -H "$HEADER" -H "Content-Type: application/json" -d "{\"body\": \"$COMMENT_BODY\"}" "$PR_COMMENTS_URL"
+          fi
+          status=$?
+          if [ $status -ne 0 ]; then
+            exit $status
+          fi
 
   simple-installation:
     name: Simple installation
@@ -175,7 +183,7 @@
     steps:
       - uses: actions/checkout@v2
 
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
 
       - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
         shell: bash
 
-      # - uses: hynek/setup-cached-uv@v1
-      #   with:
-      #     cache-suffix: -simple-install-${{ matrix.python-version }}-${{ env.WEEK }}
-
       - name: Install library
         run: |
           pip install ".[ml]" pytest
           pytest tests/pipelines/test_pipelines.py
-# uv venv
-# uv pip install .
diff --git a/.gitignore b/.gitignore
index bf0d160a0e..3432f0519d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ site/
 *.cpp
 *.so
 *.c
+public/
 
 # Unit test / coverage reports
 htmlcov/
@@ -71,3 +72,4 @@ _build/
 docs/reference
 docs/changelog.md
 docs/contributing.md
+.vercel
diff --git a/changelog.md b/changelog.md
index a7a916575b..c4128d13da 100644
--- a/changelog.md
+++ b/changelog.md
@@ -11,6 +11,7 @@
 - Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity
 - Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans.
 - Include and exclude patterns in the contextual matcher now dismiss matches that occur inside the anchor pattern (e.g. "anti" exclude pattern for anchor pattern "antibiotics" will not match the "anti" part of "antibiotics")
+- Pull Requests will now build a publicly accessible preview of the docs
 
 ### Changed
 - Improve the contextual matcher documentation.
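For readers less familiar with the GitHub REST API, the "Post the documentation link" step above is a plain comment upsert: list the pull request's issue comments, update the existing bot comment if one starts with the marker, otherwise create a new one. A rough Python sketch of the same logic follows; the `upsert_pr_comment` helper and the use of `requests` are purely illustrative and not part of this patch (the workflow itself uses `curl` and `jq`):

```python
import os

import requests


def upsert_pr_comment(repo: str, pr_number: int, body: str) -> None:
    # Illustrative equivalent of the workflow's curl/jq logic
    headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
    comments_url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments"

    # Look for an existing bot comment that starts with the marker
    comments = requests.get(comments_url, headers=headers).json()
    existing = next(
        (
            c
            for c in comments
            if c["user"]["login"] == "github-actions[bot]"
            and c["body"].startswith("## Docs preview URL")
        ),
        None,
    )

    if existing is not None:
        # Update the existing comment in place
        url = f"https://api.github.com/repos/{repo}/issues/comments/{existing['id']}"
        resp = requests.patch(url, headers=headers, json={"body": body})
    else:
        # Post a new comment on the pull request
        resp = requests.post(comments_url, headers=headers, json={"body": body})
    resp.raise_for_status()
```

Updating in place rather than always posting keeps the PR thread to a single, always-current preview link, which is why the workflow searches for the marker before posting.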
diff --git a/edsnlp/package.py b/edsnlp/package.py
index 3c0c837d08..60dc4b8b1e 100644
--- a/edsnlp/package.py
+++ b/edsnlp/package.py
@@ -337,11 +337,9 @@ def __init__(
         poetry = pyproject["tool"]["poetry"]
 
         # Extract packages
-        poetry_bin_path = (
-            subprocess.run(["which", "poetry"], stdout=subprocess.PIPE)
-            .stdout.decode()
-            .strip()
-        )
+        poetry_bin_path = shutil.which("poetry")
+        if poetry_bin_path is None:
+            raise RuntimeError("Poetry is not installed or not found in PATH.")
         python_executable = Path(poetry_bin_path).read_text().split("\n")[0][2:]
         result = subprocess.run(
             [
@@ -407,9 +405,9 @@ def __init__(
                 pass
             if "version" in constraint:
                 dep_version = constraint.pop("version")
-                assert not dep_version.startswith(
-                    "^"
-                ), "Packaging models with ^ dependencies is not supported"
+                assert not dep_version.startswith("^"), (
+                    "Packaging models with ^ dependencies is not supported"
+                )
                 dep += (
                     ""
                     if dep_version == "*"
@@ -421,9 +419,9 @@ def __init__(
                 dep += f"; {constraint.pop('markers')}"
             except KeyError:
                 pass
-            assert (
-                not constraint
-            ), f"Unsupported constraints for dependency {dep_name}: {constraint}"
+            assert not constraint, (
+                f"Unsupported constraints for dependency {dep_name}: {constraint}"
+            )
             if dep_name == "python":
                 new_pyproject["project"]["requires-python"] = dep.replace(
                     "python", ""

From a793837bc6b519687e00f16445c49f4c48b9607c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Thu, 17 Apr 2025 09:42:32 +0200
Subject: [PATCH 11/11] fix: catch zero variance error in tuning importance computation
---
 changelog.md   |  1 +
 edsnlp/tune.py | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/changelog.md b/changelog.md
index c4128d13da..8361006481 100644
--- a/changelog.md
+++ b/changelog.md
@@ -22,6 +22,7 @@
 - PEP621 projects containing nested directories (e.g., "my_project/pipes/foo.py") are now supported.
 - Try several paths to find current pip executable
 - The parameter "value_extract" of `eds.score` now correctly handles lists of patterns.
+- The "zero variance" error raised when computing param tuning importance is now caught and converted to a warning
 
 ## v0.16.0 (2025-03-26)

diff --git a/edsnlp/tune.py b/edsnlp/tune.py
index cff2abccc6..d982ed3a39 100644
--- a/edsnlp/tune.py
+++ b/edsnlp/tune.py
@@ -5,6 +5,7 @@
 import os
 import random
 import sys
+import warnings
 from typing import Dict, List, Optional, Tuple, Union
 
 import joblib
@@ -159,11 +160,17 @@ def compute_importances(study, n=10):
     cumulative_importances = collections.defaultdict(float)
 
     for i in range(n):
-        importance_scores = get_param_importances(
-            study,
-            evaluator=FanovaImportanceEvaluator(seed=i),
-            target=lambda t: t.value,
-        )
+        try:
+            importance_scores = get_param_importances(
+                study,
+                evaluator=FanovaImportanceEvaluator(seed=i),
+                target=lambda t: t.value,
+            )
+        except RuntimeError as e:
+            if "zero total variance" in str(e):  # pragma: no cover
+                warnings.warn("Zero total variance: skipping importance computation.")
+                continue
+            raise
 
         for feature, importance in importance_scores.items():
             cumulative_importances[feature] += importance
@@ -357,7 +363,7 @@
             if key_phase_1 not in best_params.keys():
                 f.write(f"  {key_phase_1}: {value_phase_1}\n")
         f.write("\nImportances:\n")
-        for key, value in importances.items():
+        for key, value in importances.items():  # pragma: no cover
             f.write(f"  {key}: {value}\n")
 
     write_final_config(output_dir, config_path, tuned_parameters, best_params)
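A note on this last fix: Optuna's fANOVA importance evaluator raises a `RuntimeError` whose message mentions "zero total variance" when every completed trial ends up with the same objective value, since there is then no variance to attribute to any hyperparameter. The following minimal sketch reproduces the situation and applies the same guard as the `try`/`except` added in `edsnlp/tune.py`; the constant objective is purely illustrative, and a recent Optuna (with scikit-learn installed, as fANOVA requires it) is assumed:

```python
import warnings

import optuna
from optuna.importance import FanovaImportanceEvaluator, get_param_importances

optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study()
# A constant objective: every trial has the same value, hence zero variance
study.optimize(lambda trial: 0.0 * trial.suggest_float("x", 0.0, 1.0), n_trials=5)

try:
    importances = get_param_importances(
        study, evaluator=FanovaImportanceEvaluator(seed=0)
    )
    print(importances)
except RuntimeError as e:
    # Same guard as in compute_importances: downgrade the error to a warning
    if "zero total variance" in str(e):
        warnings.warn("Zero total variance: skipping importance computation.")
    else:
        raise
```

Re-raising unrelated `RuntimeError`s (the final `raise`) matters here: without it, genuine failures inside the importance computation would be silently swallowed along with the benign zero-variance case.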