Enable configuring exclusions from indexing (#8)
cthoyt committed Feb 1, 2024
1 parent ce27dde commit d7ee99b
Showing 8 changed files with 121 additions and 60 deletions.
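In short: lexicon inputs are now wrapped in a biolexica.Configuration model with an optional excludes list of CURIEs that are removed after processing, and assemble_terms/assemble_grounder take the configuration as the first positional argument (plus a new optional extra_terms keyword). A minimal usage sketch based on the diffs below; the single doid input and the doid:4 exclusion mirror the phenotype lexicon:

    import biolexica

    config = biolexica.Configuration(
        inputs=[
            biolexica.Input(source="doid", processor="pyobo"),
        ],
        # CURIEs listed here are filtered out after processing,
        # e.g., the top-level DOID disease term.
        excludes=["doid:4"],
    )

    # The configuration is now passed positionally instead of via `inputs=`.
    terms = biolexica.assemble_terms(config)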
40 changes: 21 additions & 19 deletions lexica/anatomy/generate.py
@@ -15,24 +15,26 @@
"ncit",
# "umls", # TODO find appropriate subset
]
BIOLEXICA_CONFIG = [
biolexica.Input(source="uberon", processor="pyobo"),
biolexica.Input(
source="mesh",
# skip A11 since it's cells
ancestors=biolexica.get_mesh_category_curies("A", skip=["A11"]),
processor="pyobo",
),
biolexica.Input(
source="ncit",
ancestors=[
"NCIT:C12219", # Anatomic Structure, System, or Substance
],
processor="pyobo",
),
biolexica.Input(source="bto", processor="pyobo"),
biolexica.Input(source="caro", processor="pyobo"),
]
BIOLEXICA_CONFIG = biolexica.Configuration(
inputs=[
biolexica.Input(source="uberon", processor="pyobo"),
biolexica.Input(
source="mesh",
# skip A11 since it's cells
ancestors=biolexica.get_mesh_category_curies("A", skip=["A11"]),
processor="pyobo",
),
biolexica.Input(
source="ncit",
ancestors=[
"NCIT:C12219", # Anatomic Structure, System, or Substance
],
processor="pyobo",
),
biolexica.Input(source="bto", processor="pyobo"),
biolexica.Input(source="caro", processor="pyobo"),
]
)

SEMRA_CONFIG = semra.Configuration(
name="Anatomy mappings",
@@ -66,7 +68,7 @@
 def _main() -> None:
     mappings = SEMRA_CONFIG.get_mappings()
     biolexica.assemble_terms(
-        inputs=BIOLEXICA_CONFIG,
+        BIOLEXICA_CONFIG,
         mappings=mappings,
         processed_path=TERMS_PATH,
     )
24 changes: 14 additions & 10 deletions lexica/cell/generate.py
@@ -8,15 +8,19 @@
 TERMS_PATH = HERE.joinpath("terms.tsv.gz")
 
 PRIORITY = ["cl", "cellosaurus", "bto", "clo", "efo", "mesh", "ccle", "depmap"]
-BIOLEXICA_CONFIG = [
-    biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells (A11)
-    biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
-    biolexica.Input(source="cellosaurus", processor="pyobo"),
-    biolexica.Input(source="ccle", processor="pyobo"),
-    biolexica.Input(source="bto", processor="pyobo"),
-    biolexica.Input(source="cl", processor="pyobo"),
-    biolexica.Input(source="clo", processor="pyobo"),
-]
+BIOLEXICA_CONFIG = biolexica.Configuration(
+    inputs=[
+        biolexica.Input(
+            source="mesh", processor="pyobo", ancestors=["mesh:D002477"]
+        ),  # cells (A11)
+        biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
+        biolexica.Input(source="cellosaurus", processor="pyobo"),
+        biolexica.Input(source="ccle", processor="pyobo"),
+        biolexica.Input(source="bto", processor="pyobo"),
+        biolexica.Input(source="cl", processor="pyobo"),
+        biolexica.Input(source="clo", processor="pyobo"),
+    ]
+)
 
 SEMRA_CONFIG = semra.Configuration(
     name="Cell and Cell Line Mappings",
@@ -66,7 +70,7 @@
 def _main() -> None:
     mappings = SEMRA_CONFIG.get_mappings()
     biolexica.assemble_terms(
-        inputs=BIOLEXICA_CONFIG,
+        BIOLEXICA_CONFIG,
         mappings=mappings,
         processed_path=TERMS_PATH,
     )
41 changes: 22 additions & 19 deletions lexica/phenotype/generate.py
@@ -15,24 +15,27 @@
"mesh",
"efo",
]
BIOLEXICA_CONFIG = [
biolexica.Input(source="doid", processor="pyobo"),
biolexica.Input(source="mondo", processor="pyobo"),
biolexica.Input(source="hp", processor="pyobo"),
biolexica.Input(source="symp", processor="pyobo"),
biolexica.Input(
source="mesh",
processor="pyobo",
ancestors=[
*biolexica.get_mesh_category_curies("C"),
*biolexica.get_mesh_category_curies("F"),
# TODO should there be others?
],
),
biolexica.Input(source="efo", processor="pyobo"), # TODO find subset of EFO
# biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
# biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
]
BIOLEXICA_CONFIG = biolexica.Configuration(
inputs=[
biolexica.Input(source="doid", processor="pyobo"),
biolexica.Input(source="mondo", processor="pyobo"),
biolexica.Input(source="hp", processor="pyobo"),
biolexica.Input(source="symp", processor="pyobo"),
biolexica.Input(
source="mesh",
processor="pyobo",
ancestors=[
*biolexica.get_mesh_category_curies("C"),
*biolexica.get_mesh_category_curies("F"),
# TODO should there be others?
],
),
biolexica.Input(source="efo", processor="pyobo"), # TODO find subset of EFO
# biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
# biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
],
excludes=["doid:4"],
)

SEMRA_CONFIG = semra.Configuration(
name="Cell and Cell Line Mappings",
@@ -68,7 +71,7 @@
 def _main() -> None:
     mappings = SEMRA_CONFIG.get_mappings()
     biolexica.assemble_terms(
-        inputs=BIOLEXICA_CONFIG,
+        BIOLEXICA_CONFIG,
         mappings=mappings,
         processed_path=TERMS_PATH,
     )
Binary file modified lexica/phenotype/terms.tsv.gz
40 changes: 32 additions & 8 deletions src/biolexica/api.py
@@ -12,13 +12,14 @@
 import pyobo
 from gilda.grounder import load_entries_from_terms_file
 from gilda.process import normalize
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from tqdm.auto import tqdm
 
 if TYPE_CHECKING:
     import semra
 
 __all__ = [
+    "Configuration",
     "Input",
     "assemble_terms",
     "iter_terms_by_prefix",
@@ -42,6 +43,15 @@ class Input(BaseModel):
     ancestors: Union[None, str, List[str]] = None
 
 
+class Configuration(BaseModel):
+    """A configuration for construction of a lexicon."""
+
+    inputs: List[Input]
+    excludes: Optional[List[str]] = Field(
+        default=None, description="A list of CURIEs to exclude after processing is complete"
+    )
+
+
 PREDEFINED = ["cell", "anatomy", "phenotype"]
 URL_FMT = "https://github.com/biopragmatics/biolexica/raw/main/lexica/{key}/terms.tsv.gz"
 
@@ -65,33 +75,40 @@ def load_grounder(grounder: GrounderHint) -> gilda.Grounder:


 def assemble_grounder(
-    inputs: List[Union[Input, List[gilda.Term]]],
+    configuration: Configuration,
     mappings: Optional[List["semra.Mapping"]] = None,
     *,
+    extra_terms: Optional[List["gilda.Term"]] = None,
     include_biosynonyms: bool = True,
 ) -> gilda.Grounder:
     """Assemble terms from multiple resources and load into a grounder."""
     terms = assemble_terms(
-        inputs=inputs, mappings=mappings, include_biosynonyms=include_biosynonyms
+        configuration=configuration,
+        mappings=mappings,
+        include_biosynonyms=include_biosynonyms,
+        extra_terms=extra_terms,
     )
     grounder = gilda.Grounder(list(terms))
     return grounder
 
 
+def _term_curie(term: gilda.Term) -> str:
+    return f"{term.db}:{term.id}"
+
+
 def assemble_terms(
-    inputs: List[Union[Input, List[gilda.Term]]],
+    configuration: Configuration,
     mappings: Optional[List["semra.Mapping"]] = None,
     *,
+    extra_terms: Optional[List["gilda.Term"]] = None,
     include_biosynonyms: bool = True,
     raw_path: Optional[Path] = None,
     processed_path: Optional[Path] = None,
 ) -> List[gilda.Term]:
     """Assemble terms from multiple resources."""
     terms: List[gilda.Term] = []
-    for inp in inputs:
-        if isinstance(inp, list):
-            terms.extend(inp)
-        elif inp.processor in {"pyobo", "bioontologies"}:
+    for inp in configuration.inputs:
+        if inp.processor in {"pyobo", "bioontologies"}:
             terms.extend(
                 iter_terms_by_prefix(inp.source, ancestors=inp.ancestors, processor=inp.processor)
             )
@@ -102,6 +119,9 @@ def assemble_terms(
         else:
             raise ValueError(f"Unknown processor {inp.processor}")
 
+    if extra_terms:
+        terms.extend(extra_terms)
+
     if include_biosynonyms:
         terms.extend(biosynonyms.get_gilda_terms())
 
@@ -114,6 +134,10 @@

     terms = update_terms(terms, mappings)
 
+    if configuration.excludes:
+        _excludes_set = set(configuration.excludes)
+        terms = [term for term in terms if _term_curie(term) not in _excludes_set]
+
     if processed_path is not None:
         logger.info("Writing %d processed terms to %s", len(terms), processed_path)
         gilda.term.dump_terms(terms, processed_path)
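A note on the exclusion step added to assemble_terms above: after the mapping-based term updates, it builds a set from configuration.excludes and drops every term whose db:id CURIE is in it. A toy sketch of that filtering semantics using plain (db, id) tuples in place of gilda.Term objects (illustrative only, not the library API):

    terms = [("doid", "4"), ("doid", "14330")]  # (db, id) pairs standing in for gilda.Term
    excludes = {"doid:4"}  # CURIEs configured for exclusion
    kept = [(db, i) for (db, i) in terms if f"{db}:{i}" not in excludes]
    assert kept == [("doid", "14330")]  # the top-level disease term is filtered out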
8 changes: 7 additions & 1 deletion src/biolexica/literature/annotate.py
@@ -64,13 +64,16 @@ def annotate_abstracts_from_search(
     *,
     use_indra_db: bool = True,
     limit: Optional[int] = None,
+    show_progress: bool = True,
     **kwargs,
 ) -> List[AnnotatedArticle]:
     """Get articles based on the query and do NER annotation using the given Gilda grounder."""
     pubmed_ids = query_pubmed(pubmed_query, **kwargs)
     if limit is not None:
         pubmed_ids = pubmed_ids[:limit]
-    return annotate_abstracts_from_pubmeds(pubmed_ids, grounder=grounder, use_indra_db=use_indra_db)
+    return annotate_abstracts_from_pubmeds(
+        pubmed_ids, grounder=grounder, use_indra_db=use_indra_db, show_progress=show_progress
+    )


 def annotate_abstracts_from_pubmeds(
@@ -79,6 +82,7 @@ def annotate_abstracts_from_pubmeds(
     *,
     use_indra_db: bool = True,
     batch_size: int = 20_000,
+    show_progress: bool = True,
 ) -> List[AnnotatedArticle]:
     """Annotate the given articles using the given Gilda grounder."""
     n_pmids = len(pubmed_ids)
@@ -90,6 +94,7 @@
         total=1 + n_pmids // batch_size,
         unit="batch",
         desc="Annotating articles",
+        disable=not show_progress,
     )
     for i, pubmed_batch in enumerate(outer_it, start=1):
         t = time.time()
@@ -107,6 +112,7 @@
unit="article",
total=n_retrieved,
leave=False,
disable=not show_progress,
):
rv.append(
AnnotatedArticle(
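The new show_progress flag threads through to tqdm's disable parameter so that both the batch-level and per-article progress bars can be silenced, as the tests below do. A sketch of a quiet call, assuming load_grounder accepts one of the predefined lexicon keys as suggested by PREDEFINED and URL_FMT in api.py:

    from biolexica.api import load_grounder
    from biolexica.literature import annotate_abstracts_from_pubmeds

    grounder = load_grounder("phenotype")  # predefined lexicon key (see PREDEFINED)
    articles = annotate_abstracts_from_pubmeds(
        ["38279949"], grounder=grounder, show_progress=False  # no progress bars
    )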
2 changes: 1 addition & 1 deletion src/biolexica/literature/retrieve.py
@@ -26,7 +26,7 @@ def get_pubmed_dataframe(
     except (ValueError, ImportError):
         logger.warning(
             "Could not access INDRA DB; relying on the PubMed API. "
-            "Warning: this is intractably slow and also is missing full text."
+            "Warning: this could be intractably slow depending on the query, and it is also missing full text."
         )
         return _from_api(pubmed_ids)

26 changes: 24 additions & 2 deletions tests/test_lexica.py
@@ -5,7 +5,7 @@

 import biolexica
 from biolexica.api import PREDEFINED
-from biolexica.literature import annotate_abstracts_from_search
+from biolexica.literature import annotate_abstracts_from_pubmeds, annotate_abstracts_from_search
 
 HERE = Path(__file__).parent.resolve()
 ROOT = HERE.parent
@@ -37,10 +37,32 @@ def test_ground_cells(self):
self.assertEqual("cellosaurus", res[0].term.db)
self.assertEqual("0030", res[0].term.id)

def test_exclude_doid_4(self):
"""Test that exclusion during construction of the phenotype index works properly."""
pubmed_id = "38279949"
results = annotate_abstracts_from_pubmeds(
[pubmed_id], grounder=self.phenotype_grounder, show_progress=False
)
articles_with_doid_4 = {
result.pubmed
for result in results
for ref, _name in result.count_references()
if ref.curie == "doid:4"
}
self.assertEqual(
set(),
articles_with_doid_4,
msg="No articles should contain the reference `doid:4` for the top-level disease annotation, "
"since this should be filtered out during construction of the lexical index.",
)

def test_search_alz(self):
"""Test searching and annotating Alzheimer's docs gets a desired annotation."""
results = annotate_abstracts_from_search(
"alzheimers", grounder=self.phenotype_grounder, limit=20
"alzheimers",
grounder=self.phenotype_grounder,
limit=20,
show_progress=False,
)
self.assertTrue(
any(
