Expose annotation through biolexica.Grounder and improve type hints (#10)
biopragmatics · Feb 2, 2024 · 7561873 · 7561873
1 parent f0d3968
commit 7561873
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 38 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -7,6 +7,7 @@ prune tests/.pytest_cache
 prune docs/build
 prune docs/source/api
 prune lexica
+prune scenarios
 
 recursive-include docs/source *.py
 recursive-include docs/source *.rst

diff --git a/README.md b/README.md
@@ -53,8 +53,11 @@ Load a pre-defined grounder like this:
 import biolexica
 
 grounder = biolexica.load_grounder("phenotype")
+
 >>> grounder.get_best_match("Alzheimer's disease")
 Match(reference=Reference(prefix='doid', identifier='10652'), name="Alzheimer's disease", score=0.7777777777777778)
+
+>>> grounder.annotate("""Clinical trials for reducing beta amyloid levels in Alzheimer's disease have been controversial.""")
 ```
 
 Note: Biolexica constructs extended version of `gilda.Grounder` that has convenience functions and a more

diff --git a/src/biolexica/api.py b/src/biolexica/api.py
@@ -3,7 +3,7 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Iterable, List, Literal, Optional, Union
 from urllib.request import urlretrieve
 
 import bioregistry
@@ -26,6 +26,8 @@
     "iter_terms_by_prefix",
     "load_grounder",
     "get_mesh_category_curies",
+    "Annotation",
+    "Match",
 ]
 
 logger = logging.getLogger(__name__)
@@ -80,6 +82,30 @@ def from_gilda(cls, scored_match: gilda.ScoredMatch):
         )
 
 
+class Annotation(BaseModel):
+    """A grounded entity mention: a :class:`Match` plus character offsets into ``text``."""
+
+    text: str  # the string that ``start`` and ``end`` index into (see ``substr``)
+    start: int  # slice start of the mention within ``text``
+    end: int  # slice stop of the mention within ``text`` (exclusive, as in ``text[start:end]``)
+    match: Match  # the grounding result; ``reference`` and ``name`` delegate to it
+
+    @property
+    def reference(self) -> Reference:
+        """Get the match's reference (shortcut for ``self.match.reference``)."""
+        return self.match.reference
+
+    @property
+    def name(self) -> str:
+        """Get the match's entry name (shortcut for ``self.match.name``)."""
+        return self.match.name
+
+    @property
+    def substr(self) -> str:
+        """Get the substring that was matched, i.e. ``self.text[self.start : self.end]``."""
+        return self.text[self.start : self.end]
+
+
 class Grounder(gilda.Grounder):
     """Wrap a Gilda grounder with additional functionality."""
 
@@ -113,6 +139,15 @@ def get_best_match(
             return None
         return Match.from_gilda(scored_matches[0])
 
+    def annotate(self, text: str, **kwargs: Any) -> List[Annotation]:
+        """Annotate ``text``, passing extra keyword arguments to :func:`gilda.ner.annotate`."""
+        import gilda.ner
+        # Drop gilda's span string: Annotation.text must be the full text that start/end index.
+        return [
+            Annotation(text=text, match=Match.from_gilda(match), start=start, end=end)
+            for _, match, start, end in gilda.ner.annotate(text, grounder=self, **kwargs)
+        ]
+
 
 def load_grounder(grounder: GrounderHint) -> Grounder:
     """Load a gilda grounder, potentially from a remote location."""

diff --git a/src/biolexica/literature/annotate.py b/src/biolexica/literature/annotate.py
@@ -8,19 +8,17 @@
 from collections import Counter
 from typing import List, Optional, Union
 
-import gilda
-import gilda.ner
 from curies import Reference
 from more_itertools import batched
 from pydantic import BaseModel
 from tqdm.auto import tqdm
 
+from biolexica.api import Annotation, GrounderHint, load_grounder
 from biolexica.literature.retrieve import get_pubmed_dataframe
 from biolexica.literature.search import query_pubmed
 
 __all__ = [
     "AnnotatedArticle",
-    "Annotation",
     "annotate_abstracts_from_search",
     "annotate_abstracts_from_pubmeds",
 ]
@@ -29,22 +27,6 @@
 logger = logging.getLogger(__name__)
 
 
-class Annotation(BaseModel):
-    """Data about an annotation."""
-
-    text: str
-    reference: Reference
-    score: float
-    start: int
-    end: int
-    name: str
-
-    @property
-    def substr(self) -> str:
-        """Get the substring that was matched."""
-        return self.text[self.start : self.end]
-
-
 class AnnotatedArticle(BaseModel):
     """A data model representing an annotated article from PubMed."""
 
@@ -60,7 +42,7 @@ def count_references(self) -> t.Counter[t.Tuple[Reference, str]]:
 
 def annotate_abstracts_from_search(
     pubmed_query: str,
-    grounder: gilda.Grounder,
+    grounder: GrounderHint,
     *,
     use_indra_db: bool = True,
     limit: Optional[int] = None,
@@ -78,7 +60,7 @@ def annotate_abstracts_from_search(
 
 def annotate_abstracts_from_pubmeds(
     pubmed_ids: t.Collection[Union[str, int]],
-    grounder: gilda.Grounder,
+    grounder: GrounderHint,
     *,
     use_indra_db: bool = True,
     batch_size: int = 20_000,
@@ -89,6 +71,8 @@ def annotate_abstracts_from_pubmeds(
 
     rv: List[AnnotatedArticle] = []
 
+    grounder = load_grounder(grounder)
+
     outer_it = tqdm(
         batched(pubmed_ids, batch_size),
         total=1 + n_pmids // batch_size,
@@ -119,23 +103,8 @@ def annotate_abstracts_from_pubmeds(
                     pubmed=pmid,
                     title=title,
                     abstract=abstract,
-                    annotations=annotate(abstract, grounder=grounder),
+                    annotations=grounder.annotate(abstract),
                 )
             )
 
     return rv
-
-
-def annotate(text: str, grounder: gilda.Grounder) -> List[Annotation]:
-    """Annotate text using the given Gilda grounder."""
-    return [
-        Annotation(
-            text=text,
-            reference=Reference(prefix=match.term.db, identifier=match.term.id),
-            name=match.term.entry_name,
-            score=match.score,
-            start=start,
-            end=end,
-        )
-        for text, match, start, end in gilda.ner.annotate(text, grounder=grounder)
-    ]