Enable configuring exclusions from indexing (#8)
cthoyt committed Feb 1, 2024
1 parent ce27dde commit d7ee99b
Showing 8 changed files with 121 additions and 60 deletions.
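In short: lexicon inputs are now wrapped in a biolexica.Configuration model with an optional excludes list of CURIEs that are removed after processing, and assemble_terms/assemble_grounder take the configuration as the first positional argument (plus a new optional extra_terms keyword). A minimal usage sketch based on the diffs below; the single doid input and the doid:4 exclusion mirror the phenotype lexicon:

    import biolexica

    config = biolexica.Configuration(
        inputs=[
            biolexica.Input(source="doid", processor="pyobo"),
        ],
        # CURIEs listed here are filtered out after processing,
        # e.g., the top-level DOID disease term.
        excludes=["doid:4"],
    )

    # The configuration is now passed positionally instead of via `inputs=`.
    terms = biolexica.assemble_terms(config)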
40 changes: 21 additions & 19 deletions lexica/anatomy/generate.py
@@ -15,24 +15,26 @@
"ncit",
# "umls", # TODO find appropriate subset
]
BIOLEXICA_CONFIG = [
biolexica.Input(source="uberon", processor="pyobo"),
biolexica.Input(
source="mesh",
# skip A11 since it's cells
ancestors=biolexica.get_mesh_category_curies("A", skip=["A11"]),
processor="pyobo",
),
biolexica.Input(
source="ncit",
ancestors=[
"NCIT:C12219", # Anatomic Structure, System, or Substance
],
processor="pyobo",
),
biolexica.Input(source="bto", processor="pyobo"),
biolexica.Input(source="caro", processor="pyobo"),
]
BIOLEXICA_CONFIG = biolexica.Configuration(
inputs=[
biolexica.Input(source="uberon", processor="pyobo"),
biolexica.Input(
source="mesh",
# skip A11 since it's cells
ancestors=biolexica.get_mesh_category_curies("A", skip=["A11"]),
processor="pyobo",
),
biolexica.Input(
source="ncit",
ancestors=[
"NCIT:C12219", # Anatomic Structure, System, or Substance
],
processor="pyobo",
),
biolexica.Input(source="bto", processor="pyobo"),
biolexica.Input(source="caro", processor="pyobo"),
]
)

SEMRA_CONFIG = semra.Configuration(
name="Anatomy mappings",
@@ -66,7 +68,7 @@
 def _main() -> None:
     mappings = SEMRA_CONFIG.get_mappings()
     biolexica.assemble_terms(
-        inputs=BIOLEXICA_CONFIG,
+        BIOLEXICA_CONFIG,
         mappings=mappings,
         processed_path=TERMS_PATH,
     )
24 changes: 14 additions & 10 deletions lexica/cell/generate.py
@@ -8,15 +8,19 @@
 TERMS_PATH = HERE.joinpath("terms.tsv.gz")
 
 PRIORITY = ["cl", "cellosaurus", "bto", "clo", "efo", "mesh", "ccle", "depmap"]
-BIOLEXICA_CONFIG = [
-    biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells (A11)
-    biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
-    biolexica.Input(source="cellosaurus", processor="pyobo"),
-    biolexica.Input(source="ccle", processor="pyobo"),
-    biolexica.Input(source="bto", processor="pyobo"),
-    biolexica.Input(source="cl", processor="pyobo"),
-    biolexica.Input(source="clo", processor="pyobo"),
-]
+BIOLEXICA_CONFIG = biolexica.Configuration(
+    inputs=[
+        biolexica.Input(
+            source="mesh", processor="pyobo", ancestors=["mesh:D002477"]
+        ),  # cells (A11)
+        biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
+        biolexica.Input(source="cellosaurus", processor="pyobo"),
+        biolexica.Input(source="ccle", processor="pyobo"),
+        biolexica.Input(source="bto", processor="pyobo"),
+        biolexica.Input(source="cl", processor="pyobo"),
+        biolexica.Input(source="clo", processor="pyobo"),
+    ]
+)
 
 SEMRA_CONFIG = semra.Configuration(
     name="Cell and Cell Line Mappings",
@@ -66,7 +70,7 @@
 def _main() -> None:
     mappings = SEMRA_CONFIG.get_mappings()
     biolexica.assemble_terms(
-        inputs=BIOLEXICA_CONFIG,
+        BIOLEXICA_CONFIG,
         mappings=mappings,
         processed_path=TERMS_PATH,
     )
41 changes: 22 additions & 19 deletions lexica/phenotype/generate.py
@@ -15,24 +15,27 @@
"mesh",
"efo",
]
BIOLEXICA_CONFIG = [
biolexica.Input(source="doid", processor="pyobo"),
biolexica.Input(source="mondo", processor="pyobo"),
biolexica.Input(source="hp", processor="pyobo"),
biolexica.Input(source="symp", processor="pyobo"),
biolexica.Input(
source="mesh",
processor="pyobo",
ancestors=[
*biolexica.get_mesh_category_curies("C"),
*biolexica.get_mesh_category_curies("F"),
# TODO should there be others?
],
),
biolexica.Input(source="efo", processor="pyobo"), # TODO find subset of EFO
# biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
# biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
]
BIOLEXICA_CONFIG = biolexica.Configuration(
inputs=[
biolexica.Input(source="doid", processor="pyobo"),
biolexica.Input(source="mondo", processor="pyobo"),
biolexica.Input(source="hp", processor="pyobo"),
biolexica.Input(source="symp", processor="pyobo"),
biolexica.Input(
source="mesh",
processor="pyobo",
ancestors=[
*biolexica.get_mesh_category_curies("C"),
*biolexica.get_mesh_category_curies("F"),
# TODO should there be others?
],
),
biolexica.Input(source="efo", processor="pyobo"), # TODO find subset of EFO
# biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
# biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
],
excludes=["doid:4"],
)

SEMRA_CONFIG = semra.Configuration(
name="Cell and Cell Line Mappings",
@@ -68,7 +71,7 @@
 def _main() -> None:
     mappings = SEMRA_CONFIG.get_mappings()
     biolexica.assemble_terms(
-        inputs=BIOLEXICA_CONFIG,
+        BIOLEXICA_CONFIG,
         mappings=mappings,
         processed_path=TERMS_PATH,
     )
Binary file modified lexica/phenotype/terms.tsv.gz
40 changes: 32 additions & 8 deletions src/biolexica/api.py
@@ -12,13 +12,14 @@
 import pyobo
 from gilda.grounder import load_entries_from_terms_file
 from gilda.process import normalize
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from tqdm.auto import tqdm
 
 if TYPE_CHECKING:
     import semra
 
 __all__ = [
+    "Configuration",
     "Input",
     "assemble_terms",
     "iter_terms_by_prefix",
@@ -42,6 +43,15 @@ class Input(BaseModel):
     ancestors: Union[None, str, List[str]] = None
 
 
+class Configuration(BaseModel):
+    """A configuration for construction of a lexicon."""
+
+    inputs: List[Input]
+    excludes: Optional[List[str]] = Field(
+        default=None, description="A list of CURIEs to exclude after processing is complete"
+    )
+
+
 PREDEFINED = ["cell", "anatomy", "phenotype"]
 URL_FMT = "https://github.com/biopragmatics/biolexica/raw/main/lexica/{key}/terms.tsv.gz"
 
@@ -65,33 +75,40 @@ def load_grounder(grounder: GrounderHint) -> gilda.Grounder:


 def assemble_grounder(
-    inputs: List[Union[Input, List[gilda.Term]]],
+    configuration: Configuration,
     mappings: Optional[List["semra.Mapping"]] = None,
     *,
+    extra_terms: Optional[List["gilda.Term"]] = None,
     include_biosynonyms: bool = True,
 ) -> gilda.Grounder:
     """Assemble terms from multiple resources and load into a grounder."""
     terms = assemble_terms(
-        inputs=inputs, mappings=mappings, include_biosynonyms=include_biosynonyms
+        configuration=configuration,
+        mappings=mappings,
+        include_biosynonyms=include_biosynonyms,
+        extra_terms=extra_terms,
     )
     grounder = gilda.Grounder(list(terms))
     return grounder
 
 
+def _term_curie(term: gilda.Term) -> str:
+    return f"{term.db}:{term.id}"
+
+
 def assemble_terms(
-    inputs: List[Union[Input, List[gilda.Term]]],
+    configuration: Configuration,
     mappings: Optional[List["semra.Mapping"]] = None,
     *,
+    extra_terms: Optional[List["gilda.Term"]] = None,
     include_biosynonyms: bool = True,
     raw_path: Optional[Path] = None,
     processed_path: Optional[Path] = None,
 ) -> List[gilda.Term]:
     """Assemble terms from multiple resources."""
     terms: List[gilda.Term] = []
-    for inp in inputs:
-        if isinstance(inp, list):
-            terms.extend(inp)
-        elif inp.processor in {"pyobo", "bioontologies"}:
+    for inp in configuration.inputs:
+        if inp.processor in {"pyobo", "bioontologies"}:
             terms.extend(
                 iter_terms_by_prefix(inp.source, ancestors=inp.ancestors, processor=inp.processor)
             )
@@ -102,6 +119,9 @@ def assemble_terms(
         else:
             raise ValueError(f"Unknown processor {inp.processor}")
 
+    if extra_terms:
+        terms.extend(extra_terms)
+
     if include_biosynonyms:
         terms.extend(biosynonyms.get_gilda_terms())
 
@@ -114,6 +134,10 @@

     terms = update_terms(terms, mappings)
 
+    if configuration.excludes:
+        _excludes_set = set(configuration.excludes)
+        terms = [term for term in terms if _term_curie(term) not in _excludes_set]
+
     if processed_path is not None:
         logger.info("Writing %d processed terms to %s", len(terms), processed_path)
         gilda.term.dump_terms(terms, processed_path)
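A note on the exclusion step added to assemble_terms above: after the mapping-based term updates, it builds a set from configuration.excludes and drops every term whose db:id CURIE is in it. A toy sketch of that filtering semantics using plain (db, id) tuples in place of gilda.Term objects (illustrative only, not the library API):

    terms = [("doid", "4"), ("doid", "14330")]  # (db, id) pairs standing in for gilda.Term
    excludes = {"doid:4"}  # CURIEs configured for exclusion
    kept = [(db, i) for (db, i) in terms if f"{db}:{i}" not in excludes]
    assert kept == [("doid", "14330")]  # the top-level disease term is filtered out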
8 changes: 7 additions & 1 deletion src/biolexica/literature/annotate.py
@@ -64,13 +64,16 @@ def annotate_abstracts_from_search(
     *,
     use_indra_db: bool = True,
     limit: Optional[int] = None,
+    show_progress: bool = True,
     **kwargs,
 ) -> List[AnnotatedArticle]:
     """Get articles based on the query and do NER annotation using the given Gilda grounder."""
     pubmed_ids = query_pubmed(pubmed_query, **kwargs)
     if limit is not None:
         pubmed_ids = pubmed_ids[:limit]
-    return annotate_abstracts_from_pubmeds(pubmed_ids, grounder=grounder, use_indra_db=use_indra_db)
+    return annotate_abstracts_from_pubmeds(
+        pubmed_ids, grounder=grounder, use_indra_db=use_indra_db, show_progress=show_progress
+    )


 def annotate_abstracts_from_pubmeds(
@@ -79,6 +82,7 @@ def annotate_abstracts_from_pubmeds(
     *,
     use_indra_db: bool = True,
     batch_size: int = 20_000,
+    show_progress: bool = True,
 ) -> List[AnnotatedArticle]:
     """Annotate the given articles using the given Gilda grounder."""
     n_pmids = len(pubmed_ids)
@@ -90,6 +94,7 @@
         total=1 + n_pmids // batch_size,
         unit="batch",
         desc="Annotating articles",
+        disable=not show_progress,
     )
     for i, pubmed_batch in enumerate(outer_it, start=1):
         t = time.time()
@@ -107,6 +112,7 @@
unit="article",
total=n_retrieved,
leave=False,
disable=not show_progress,
):
rv.append(
AnnotatedArticle(
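The new show_progress flag threads through to tqdm's disable parameter so that both the batch-level and per-article progress bars can be silenced, as the tests below do. A sketch of a quiet call, assuming load_grounder accepts one of the predefined lexicon keys as suggested by PREDEFINED and URL_FMT in api.py:

    from biolexica.api import load_grounder
    from biolexica.literature import annotate_abstracts_from_pubmeds

    grounder = load_grounder("phenotype")  # predefined lexicon key (see PREDEFINED)
    articles = annotate_abstracts_from_pubmeds(
        ["38279949"], grounder=grounder, show_progress=False  # no progress bars
    )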
2 changes: 1 addition & 1 deletion src/biolexica/literature/retrieve.py
@@ -26,7 +26,7 @@ def get_pubmed_dataframe(
     except (ValueError, ImportError):
         logger.warning(
             "Could not access INDRA DB; relying on the PubMed API. "
-            "Warning: this is intractably slow and also is missing full text."
+            "Warning: this could be intractably slow depending on the query, and it is also missing full text."
         )
         return _from_api(pubmed_ids)

26 changes: 24 additions & 2 deletions tests/test_lexica.py
@@ -5,7 +5,7 @@

 import biolexica
 from biolexica.api import PREDEFINED
-from biolexica.literature import annotate_abstracts_from_search
+from biolexica.literature import annotate_abstracts_from_pubmeds, annotate_abstracts_from_search
 
 HERE = Path(__file__).parent.resolve()
 ROOT = HERE.parent
@@ -37,10 +37,32 @@ def test_ground_cells(self):
self.assertEqual("cellosaurus", res[0].term.db)
self.assertEqual("0030", res[0].term.id)

def test_exclude_doid_4(self):
"""Test that exclusion during construction of the phenotype index works properly."""
pubmed_id = "38279949"
results = annotate_abstracts_from_pubmeds(
[pubmed_id], grounder=self.phenotype_grounder, show_progress=False
)
articles_with_doid_4 = {
result.pubmed
for result in results
for ref, _name in result.count_references()
if ref.curie == "doid:4"
}
self.assertEqual(
set(),
articles_with_doid_4,
msg="No articles should contain the reference `doid:4` for the top-level disease annotation, "
"since this should be filtered out during construction of the lexical index.",
)

def test_search_alz(self):
"""Test searching and annotating Alzheimer's docs gets a desired annotation."""
results = annotate_abstracts_from_search(
"alzheimers", grounder=self.phenotype_grounder, limit=20
"alzheimers",
grounder=self.phenotype_grounder,
limit=20,
show_progress=False,
)
self.assertTrue(
any(
