Merge pull request #246 from DeNeutoy/new-linkers
New linkers
DeNeutoy committed Jul 8, 2020
2 parents e3e9f0f + 60da605 commit 8994934
Showing 7 changed files with 126 additions and 86 deletions.
15 changes: 11 additions & 4 deletions README.md
@@ -127,10 +127,17 @@ for abrv in doc._.abbreviations:
```
### EntityLinker

The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. Currently, the
Unified Medical Language System and the Medical Subject Headings (MESH) are supported.
The linker simply performs a string overlap search on named entities,
comparing them with a knowledge base of 2.7 million concepts using an approximate nearest neighbours search.
The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. The linker simply performs
a string overlap-based search (char-3grams) on named entities, comparing them with the concepts in a knowledge base
using an approximate nearest neighbours search.

Currently (v2.5.0), there are 5 supported linkers:

- `umls`: Links to the [Unified Medical Language System](https://www.nlm.nih.gov/research/umls/index.html), levels 0,1,2 and 9. This has ~3M concepts.
- `mesh`: Links to the [Medical Subject Headings](https://www.nlm.nih.gov/mesh/meshhome.html). This contains a smaller set of higher quality entities, which are used for indexing in PubMed. MeSH contains ~30k entities. NOTE: The MeSH KB is derived directly from MeSH itself, and as such uses different unique identifiers than the other KBs.
- `rxnorm`: Links to the [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/index.html) ontology. RxNorm contains ~100k concepts focused on normalized names for clinical drugs. It is comprised of several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database.
- `go`: Links to the [Gene Ontology](http://geneontology.org/). The Gene Ontology contains ~67k concepts focused on the functions of genes.
- `hpo`: Links to the [Human Phenotype Ontology](https://hpo.jax.org/app/). The Human Phenotype Ontology contains 16k concepts focused on phenotypic abnormalities encountered in human disease.

You may want to play around with some of the parameters below to adapt to your use case (higher precision, higher recall, etc.).
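
For orientation, here is a minimal usage sketch. It assumes the spaCy 2.x pipeline API used by this release and the `kb_ents` span extension (older releases exposed `umls_ents`); the example sentence and the choice of the `mesh` linker are purely illustrative.

```python
import spacy

from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

nlp = spacy.load("en_core_sci_sm")

# Abbreviation detection is optional, but lets the linker resolve short forms.
nlp.add_pipe(AbbreviationDetector(nlp))

# Any of the names above ("umls", "mesh", "rxnorm", "go", "hpo") can be passed here.
linker = EntityLinker(resolve_abbreviations=True, name="mesh")
nlp.add_pipe(linker)

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")

for entity in doc.ents:
    # Each candidate is a (concept identifier, similarity score) pair.
    for kb_id, score in entity._.kb_ents:
        print(entity.text, kb_id, score, linker.kb.cui_to_entity[kb_id].canonical_name)
```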
46 changes: 40 additions & 6 deletions scispacy/candidate_generation.py
@@ -11,7 +11,14 @@
from nmslib.dist import FloatIndex

from scispacy.file_cache import cached_path
from scispacy.linking_utils import KnowledgeBase, UmlsKnowledgeBase, MeshKnowledgeBase
from scispacy.linking_utils import (
KnowledgeBase,
UmlsKnowledgeBase,
Mesh,
GeneOntology,
RxNorm,
HumanPhenotypeOntology,
)


class LinkerPaths(NamedTuple):
@@ -34,10 +41,10 @@ class LinkerPaths(NamedTuple):


UmlsLinkerPaths = LinkerPaths(
ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin",
tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/concept_aliases.json", # noqa
ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/nmslib_index.bin",
tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/concept_aliases.json", # noqa
)

MeshLinkerPaths = LinkerPaths(
@@ -47,15 +54,42 @@ class LinkerPaths(NamedTuple):
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/concept_aliases.json", # noqa
)

GeneOntologyLinkerPaths = LinkerPaths(
ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/nmslib_index.bin",
tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/concept_aliases.json", # noqa
)

HumanPhenotypeOntologyLinkerPaths = LinkerPaths(
ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/nmslib_index.bin",
tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/concept_aliases.json", # noqa
)

RxNormLinkerPaths = LinkerPaths(
ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/nmslib_index.bin",
tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/concept_aliases.json", # noqa
)


DEFAULT_PATHS: Dict[str, LinkerPaths] = {
"umls": UmlsLinkerPaths,
"mesh": MeshLinkerPaths,
"go": GeneOntologyLinkerPaths,
"hpo": HumanPhenotypeOntologyLinkerPaths,
"rxnorm": RxNormLinkerPaths,
}

DEFAULT_KNOWLEDGE_BASES: Dict[str, Type[KnowledgeBase]] = {
"umls": UmlsKnowledgeBase,
"mesh": MeshKnowledgeBase,
"mesh": Mesh,
"go": GeneOntology,
"hpo": HumanPhenotypeOntology,
"rxnorm": RxNorm,
}
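
The same short names also key the lower-level `CandidateGenerator`. A hedged sketch of using it directly, assuming `CandidateGenerator` accepts a `name` argument resolved via `DEFAULT_PATHS` / `DEFAULT_KNOWLEDGE_BASES` above and returns one list of `MentionCandidate`s per mention:

```python
from scispacy.candidate_generation import CandidateGenerator

# Downloads and caches the ANN index, tf-idf artifacts and KB for the named linker.
generator = CandidateGenerator(name="rxnorm")

mentions = ["ibuprofen", "acetaminophen"]
candidate_lists = generator(mentions, 10)  # k = 10 approximate nearest neighbours per mention

for mention, candidates in zip(mentions, candidate_lists):
    for candidate in candidates:
        # concept_id is the KB identifier; similarities holds one score per matched alias.
        print(mention, candidate.concept_id, max(candidate.similarities))
```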


28 changes: 26 additions & 2 deletions scispacy/linking_utils.py
@@ -37,7 +37,7 @@ def __repr__(self):


DEFAULT_UMLS_PATH = (
"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json"
"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2020_aa_cat0129.jsonl"
)
DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv"

@@ -94,9 +94,33 @@ def __init__(
)


class MeshKnowledgeBase(KnowledgeBase):
class Mesh(KnowledgeBase):
def __init__(
self,
file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_2020.jsonl",
):
super().__init__(file_path)


class GeneOntology(KnowledgeBase):
def __init__(
self,
file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_gene_ontology.jsonl",
):
super().__init__(file_path)


class HumanPhenotypeOntology(KnowledgeBase):
def __init__(
self,
file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl", # noqa
):
super().__init__(file_path)


class RxNorm(KnowledgeBase):
def __init__(
self,
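# NOTE: this default appears to duplicate the HumanPhenotypeOntology path above; an RxNorm-specific jsonl is presumably intended.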
file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl", # noqa
):
super().__init__(file_path)
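
Each of these subclasses just points the shared `KnowledgeBase` loader at a different jsonl file. A small sketch of loading one directly, assuming the base class exposes the `cui_to_entity` mapping used by the linker:

```python
from scispacy.linking_utils import GeneOntology

kb = GeneOntology()  # downloads and parses the jsonl file on first use

# Inspect an arbitrary concept record (Entity: concept_id, canonical_name, aliases, ...).
concept_id = next(iter(kb.cui_to_entity))
entity = kb.cui_to_entity[concept_id]
print(entity.concept_id, entity.canonical_name, entity.aliases[:3])
```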
8 changes: 7 additions & 1 deletion scispacy/umls_utils.py
@@ -37,7 +37,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
return None


def read_umls_concepts(meta_path: str, concept_details: Dict):
def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None):
"""
Read the concepts file MRCONSO.RRF from a UMLS release and store it in
concept_details dictionary. Each concept is represented with
@@ -54,6 +54,8 @@ def read_umls_concepts(meta_path: str, concept_details: Dict):
Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept information
source: An optional source identifier, used as a filter to extract only a
specific source from UMLS.
"""
concepts_filename = "MRCONSO.RRF"
headers = read_umls_file_headers(meta_path, concepts_filename)
@@ -65,6 +67,10 @@ def read_umls_concepts(meta_path: str, concept_details: Dict):
if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N":
continue # Keep English non-suppressed concepts only

if source is not None:
if concept["SAB"] != source:
continue

concept_id = concept["CUI"]
if concept_id not in concept_details: # a new concept
# add it to the dictionary with an empty list of aliases and types
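
The new `source` argument makes it possible to carve a single vocabulary out of a full UMLS release before building a linker. A hedged sketch (the META path and the `RXNORM` source abbreviation describe a local UMLS installation and are assumptions, not part of this diff):

```python
from scispacy import umls_utils

meta_path = "/path/to/umls/2020AA/META"  # placeholder path to a local UMLS release
concept_details = {}  # filled in place: concept_id -> {concept_id, aliases, types, ...}

# Keep only MRCONSO.RRF rows whose SAB (source) column equals "RXNORM".
umls_utils.read_umls_concepts(meta_path, concept_details, source="RXNORM")
umls_utils.read_umls_types(meta_path, concept_details)

print(len(concept_details), "concepts from the RXNORM source")
```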
27 changes: 27 additions & 0 deletions scripts/create_linker.py
@@ -0,0 +1,27 @@
import argparse
import os

from scispacy.candidate_generation import create_tfidf_ann_index
from scispacy.linking_utils import KnowledgeBase


def main(kb_path: str, output_path: str):

os.makedirs(output_path, exist_ok=True)
kb = KnowledgeBase(kb_path)
create_tfidf_ann_index(output_path, kb)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--kb_path',
help="Path to the KB file."
)
parser.add_argument(
'--output_path',
help="Path to the output directory."
)

args = parser.parse_args()
main(args.kb_path, args.output_path)
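
A typical invocation of this new script would be `python scripts/create_linker.py --kb_path my_kb.jsonl --output_path my_linker_index/` (both paths are placeholders), where the KB file is a concepts jsonl such as the ones produced by `scripts/export_umls_json.py` below; `create_tfidf_ann_index` then writes the char-3gram tf-idf and nmslib ANN artifacts used by the candidate generator into the output directory.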
22 changes: 15 additions & 7 deletions scripts/export_uml_json.py → scripts/export_umls_json.py
@@ -1,13 +1,13 @@
"""
Convert a umls release to a json file of concepts.
Convert a umls release to a jsonl file of concepts.
"""
import json
import argparse
from scispacy import umls_utils

def main(meta_path, output_path):
def main(meta_path: str, output_path: str, source: str = None):

concept_details = {} # dictionary of concept_id -> {
# 'concept_id': str,
@@ -18,7 +18,7 @@ def main(meta_path, output_path):
# }

print('Reading concepts ... ')
umls_utils.read_umls_concepts(meta_path, concept_details)
umls_utils.read_umls_concepts(meta_path, concept_details, source)

print('Reading types ... ')
umls_utils.read_umls_types(meta_path, concept_details)
@@ -73,12 +73,14 @@ def main(meta_path, output_path):
if 'is_from_preferred_source' in concept:
del concept['is_from_preferred_source']

print('Exporting to the a json file {} ...'.format(output_path))
print('Exporting to the a jsonl file {} ...'.format(output_path))
with open(output_path, 'w') as fout:
json.dump(list(concept_details.values()), fout)

for value in concept_details.values():
fout.write(json.dumps(value) + "\n")
print('DONE.')


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -87,7 +89,13 @@ def main(meta_path, output_path):
)
parser.add_argument(
'--output_path',
help="Path to the output json file"
help="Path to the output jsonl file"
)
parser.add_argument(
'--source',
type=str,
default=None,
help="Whether to filter for only a single UMLS source."
)
args = parser.parse_args()
main(args.meta_path, args.output_path)
main(args.meta_path, args.output_path, args.source)
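
With the new `--source` flag, a single vocabulary can be exported from a UMLS release, e.g. `python scripts/export_umls_json.py --meta_path /path/to/2020AA/META --source GO --output_path gene_ontology.jsonl` (the paths are placeholders, and `GO` is assumed to be the UMLS source abbreviation for the Gene Ontology); the resulting jsonl can then be passed to `scripts/create_linker.py` above.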
66 changes: 0 additions & 66 deletions scripts/train_linker.py

This file was deleted.
