Skip to content

Commit

Permalink
Expose annotation through biolexica.Grounder and improve type hints (#10
Browse files Browse the repository at this point in the history
)
  • Loading branch information
cthoyt committed Feb 2, 2024
1 parent f0d3968 commit 7561873
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 38 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ prune tests/.pytest_cache
prune docs/build
prune docs/source/api
prune lexica
prune scenarios

recursive-include docs/source *.py
recursive-include docs/source *.rst
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,11 @@ Load a pre-defined grounder like this:
import biolexica

grounder = biolexica.load_grounder("phenotype")

>>> grounder.get_best_match("Alzheimer's disease")
Match(reference=Reference(prefix='doid', identifier='10652'), name="Alzheimer's disease", score=0.7777777777777778)

>>> grounder.annotate("""Clinical trials for reducing beta amyloid levels in Alzheimer's disease have been controversial.""")
```

Note: Biolexica constructs an extended version of `gilda.Grounder` that has convenience functions and a more
Expand Down
37 changes: 36 additions & 1 deletion src/biolexica/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Union
from typing import TYPE_CHECKING, Any, Iterable, List, Literal, Optional, Union
from urllib.request import urlretrieve

import bioregistry
Expand All @@ -26,6 +26,8 @@
"iter_terms_by_prefix",
"load_grounder",
"get_mesh_category_curies",
"Annotation",
"Match",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -80,6 +82,30 @@ def from_gilda(cls, scored_match: gilda.ScoredMatch):
)


class Annotation(BaseModel):
    """A grounded entity mention located inside a piece of text.

    The ``reference`` and ``name`` properties are shortcuts that read
    straight through to the wrapped ``match``.
    """

    # Text that the ``start``/``end`` offsets are applied to by ``substr``
    text: str
    # Offset used as the slice start in ``substr``
    start: int
    # Offset used as the slice stop in ``substr``
    end: int
    # Grounding result associated with this mention
    match: Match

    @property
    def reference(self) -> Reference:
        """Return the reference carried by the underlying match."""
        matched = self.match
        return matched.reference

    @property
    def name(self) -> str:
        """Return the entry name carried by the underlying match."""
        matched = self.match
        return matched.name

    @property
    def substr(self) -> str:
        """Return the slice of ``text`` running from ``start`` up to ``end``."""
        begin, stop = self.start, self.end
        return self.text[begin:stop]


class Grounder(gilda.Grounder):
"""Wrap a Gilda grounder with additional functionality."""

Expand Down Expand Up @@ -113,6 +139,15 @@ def get_best_match(
return None
return Match.from_gilda(scored_matches[0])

def annotate(self, text: str, **kwargs: Any) -> List[Annotation]:
    """Annotate the text with entities found by this grounder.

    :param text: The text to search for entity mentions.
    :param kwargs: Extra keyword arguments forwarded to :func:`gilda.ner.annotate`.
    :returns: One annotation per tuple produced by :func:`gilda.ner.annotate`,
        each carrying the full input text together with the reported offsets.
    """
    import gilda.ner

    # The loop variable must not be named ``text``: inside the comprehension it
    # would shadow the parameter, so every annotation would store the first
    # tuple element instead of the text that ``start``/``end`` index into, and
    # ``Annotation.substr`` would slice the wrong string.
    return [
        Annotation(text=text, match=Match.from_gilda(match), start=start, end=end)
        for _entity_text, match, start, end in gilda.ner.annotate(
            text, grounder=self, **kwargs
        )
    ]


def load_grounder(grounder: GrounderHint) -> Grounder:
"""Load a gilda grounder, potentially from a remote location."""
Expand Down
43 changes: 6 additions & 37 deletions src/biolexica/literature/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,17 @@
from collections import Counter
from typing import List, Optional, Union

import gilda
import gilda.ner
from curies import Reference
from more_itertools import batched
from pydantic import BaseModel
from tqdm.auto import tqdm

from biolexica.api import Annotation, GrounderHint, load_grounder
from biolexica.literature.retrieve import get_pubmed_dataframe
from biolexica.literature.search import query_pubmed

__all__ = [
"AnnotatedArticle",
"Annotation",
"annotate_abstracts_from_search",
"annotate_abstracts_from_pubmeds",
]
Expand All @@ -29,22 +27,6 @@
logger = logging.getLogger(__name__)


class Annotation(BaseModel):
    """A scored, grounded entity mention located inside a piece of text."""

    # Text that the ``start``/``end`` offsets are applied to by ``substr``
    text: str
    # Reference for the grounded entity
    reference: Reference
    # Score attached to the grounding
    score: float
    # Offset used as the slice start in ``substr``
    start: int
    # Offset used as the slice stop in ``substr``
    end: int
    # Name of the grounded entity
    name: str

    @property
    def substr(self) -> str:
        """Return the slice of ``text`` running from ``start`` up to ``end``."""
        begin, stop = self.start, self.end
        return self.text[begin:stop]


class AnnotatedArticle(BaseModel):
"""A data model representing an annotated article from PubMed."""

Expand All @@ -60,7 +42,7 @@ def count_references(self) -> t.Counter[t.Tuple[Reference, str]]:

def annotate_abstracts_from_search(
pubmed_query: str,
grounder: gilda.Grounder,
grounder: GrounderHint,
*,
use_indra_db: bool = True,
limit: Optional[int] = None,
Expand All @@ -78,7 +60,7 @@ def annotate_abstracts_from_search(

def annotate_abstracts_from_pubmeds(
pubmed_ids: t.Collection[Union[str, int]],
grounder: gilda.Grounder,
grounder: GrounderHint,
*,
use_indra_db: bool = True,
batch_size: int = 20_000,
Expand All @@ -89,6 +71,8 @@ def annotate_abstracts_from_pubmeds(

rv: List[AnnotatedArticle] = []

grounder = load_grounder(grounder)

outer_it = tqdm(
batched(pubmed_ids, batch_size),
total=1 + n_pmids // batch_size,
Expand Down Expand Up @@ -119,23 +103,8 @@ def annotate_abstracts_from_pubmeds(
pubmed=pmid,
title=title,
abstract=abstract,
annotations=annotate(abstract, grounder=grounder),
annotations=grounder.annotate(abstract),
)
)

return rv


def annotate(text: str, grounder: gilda.Grounder) -> List[Annotation]:
    """Annotate text using the given Gilda grounder.

    :param text: The text to search for entity mentions.
    :param grounder: The Gilda grounder passed on to :func:`gilda.ner.annotate`.
    :returns: One annotation per tuple produced by :func:`gilda.ner.annotate`,
        each carrying the full input text together with the reported offsets.
    """
    # The loop variable must not be named ``text``: inside the comprehension it
    # would shadow the parameter, so every annotation would store the first
    # tuple element instead of the text that ``start``/``end`` index into, and
    # ``Annotation.substr`` would slice the wrong string.
    return [
        Annotation(
            text=text,
            reference=Reference(prefix=match.term.db, identifier=match.term.id),
            name=match.term.entry_name,
            score=match.score,
            start=start,
            end=end,
        )
        for _entity_text, match, start, end in gilda.ner.annotate(text, grounder=grounder)
    ]

0 comments on commit 7561873

Please sign in to comment.