From b6230df38fba47503546237008afc92c7a3cf4d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Wed, 19 Feb 2025 08:07:58 +0100 Subject: [PATCH] fix: allow deep attributes in Standoff and OMOP doc2dict converters --- changelog.md | 1 + edsnlp/data/converters.py | 24 +++++++++++++++++++----- edsnlp/utils/bindings.py | 19 ++++++++++++------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/changelog.md b/changelog.md index a2612e10b8..4c4f6fd173 100644 --- a/changelog.md +++ b/changelog.md @@ -14,6 +14,7 @@ - Support packaging with poetry 2.0 - Solve pickling issues with multiprocessing when pytorch is installed +- Allow deep attributes like `a.b.c` for `span_attributes` in Standoff and OMOP doc2dict converters # v0.15.0 (2024-12-13) diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py index 2059d3c8c3..4cca047b00 100644 --- a/edsnlp/data/converters.py +++ b/edsnlp/data/converters.py @@ -355,6 +355,14 @@ def __init__( def __call__(self, doc): spans = get_spans(doc, self.span_getter) + span_binding_getters = { + obj_name: BINDING_GETTERS[ + ("_." + ext_name) + if ext_name.split(".")[0] not in SPAN_BUILTIN_ATTRS + else ext_name + ] + for ext_name, obj_name in self.span_attributes.items() + } obj = { FILENAME: doc._.note_id, "doc_id": doc._.note_id, @@ -369,9 +377,12 @@ def __call__(self, doc): } ], "attributes": { - obj_name: getattr(ent._, ext_name) - for ext_name, obj_name in self.span_attributes.items() - if ent._.has(ext_name) + obj_name: value + for obj_name, value in ( + (k, getter(ent)) + for k, getter in span_binding_getters.items() + ) + if value is not None }, "label": ent.label_, } @@ -621,8 +632,11 @@ def __call__(self, doc): "lexical_variant": ent.text, "note_nlp_source_value": ent.label_, **{ - obj_name: getter(ent) - for obj_name, getter in span_binding_getters.items() + obj_name: value + for obj_name, value in ( + (k, getter(ent)) + for k, getter in span_binding_getters.items() + ) }, } for i, ent in enumerate(sorted(dict.fromkeys(spans))) diff --git a/edsnlp/utils/bindings.py b/edsnlp/utils/bindings.py index eec911f016..17933cd5df 100644 --- a/edsnlp/utils/bindings.py +++ b/edsnlp/utils/bindings.py @@ -19,9 +19,14 @@ def _check_path(path: str): "The label must be a path of valid python identifier to be used as a getter" "in the following template: span.[YOUR_LABEL], such as `label_` or `_.negated" ) - if path[0].isalpha() or path[0] == "_": - return "." + path - return path + parts = path.split(".") + new_path = "span" + for part in parts: + if " " in part: + new_path = "getattr(" + new_path + f", {part!r})" + elif len(part) > 0: + new_path += "." + part + return new_path def make_binding_getter(attribute: Union[str, Binding]): @@ -47,7 +52,7 @@ def make_binding_getter(attribute: Union[str, Binding]): exec( f"def getter(span):\n" f" try:\n" - f" return span{path} == value\n" + f" return {path} == value\n" f" except AttributeError:\n" f" return False\n", ctx, @@ -60,7 +65,7 @@ def make_binding_getter(attribute: Union[str, Binding]): exec( f"def getter(span):\n" f" try:\n" - f" return span{path}\n" + f" return {path}\n" f" except AttributeError:\n" f" return None\n", ctx, @@ -88,12 +93,12 @@ def make_binding_setter(binding: Binding): if isinstance(binding, tuple): path, value = binding path = _check_path(path) - fn_string = f"""def setter(span): span{path} = value""" + fn_string = f"""def setter(span): {path} = value""" ctx = {"value": value} exec(fn_string, ctx, ctx) else: path = _check_path(binding) - fn_string = f"""def setter(span, value): span{path} = value""" + fn_string = f"""def setter(span, value): {path} = value""" ctx = {} exec(fn_string, ctx, ctx) return ctx["setter"]