In [1]:
import sys

from os import path
from pathlib import Path

from quickumls.constants import MEDSPACY_DEFAULT_SPAN_GROUP_NAME
import quickumls.spacy_component

import spacy
import nltk

sys.path.insert(0, "..")
import medspacy

from medspacy.util import get_quickumls_demo_dir
from medspacy.util import DEFAULT_PIPE_NAMES
from medspacy.visualization import visualize_ent
from medspacy.section_detection import Sectionizer


In [2]:
# Record the host platform so readers can relate the outputs below to the OS they were produced on.
print(f'Running on platform: {sys.platform}')

Running on platform: darwin


## In the QuickUMLS Default notebook, spaCy entities were used for demonstration. Entities are the default output for many spaCy components as well as medspaCy components, but they have a limitation: their text spans cannot overlap. The way to work around this is to use SpanGroup results, which do allow overlapping spans. The examples below demonstrate how to do this. Note that many non-default arguments for QuickUMLS and other medspaCy components require setting up a pipeline more directly, as opposed to using the helper utility `medspacy.load`, but that function can still be used as a helpful reference.

In [3]:
# Build the pipeline by hand, starting from a blank English pipeline, so that
# non-default QuickUMLS options can be passed explicitly.
nlp = spacy.blank("en")

# Location of the QuickUMLS demo data; printed so the reader can see which
# resources the component is pointed at.
quickumls_file_path = get_quickumls_demo_dir('en')
print('quickumls_file_path:', quickumls_file_path, sep='\n')

quickumls_config = {
    "quickumls_fp": quickumls_file_path,
    "threshold": 0.7,
    # request span-group results; the following cell reads them from doc.spans
    "result_type": "group",
    # do not constrain to the best match for overlapping
    "best_match": False,
}
nlp.add_pipe("medspacy_quickumls", config=quickumls_config)

# Show which components ended up in the pipeline.
print(nlp.pipe_names)

quickumls_file_path:
/Users/u6022257/Documents/medspacy/resources/quickumls/QuickUMLS_SAMPLE_lowercase_POSIX_unqlite
['medspacy_quickumls']


In [4]:
# The demo data contains both of the concepts listed below, so the sample text
# puts them together and lets them overlap on one of the tokens:
#   dipalmitoyl phosphatidylcholine
#   phosphatidylcholine, dipalmitoyl
text = "dipalmitoyl phosphatidylcholine dipalmitoyl"

doc = nlp(text)

# Look the span group up once and reuse it for both the count and the loop.
span_group = doc.spans[MEDSPACY_DEFAULT_SPAN_GROUP_NAME]
print(f'Total spans matched: {len(span_group)}')

for span in span_group:
    print(f'Span text : {span.text}')

    # a span may carry several matches (different CUIs and similarity),
    # so report every one of them followed by a separator line
    for umls_match in span._.umls_matches:
        print(
            f'Label (UMLS CUI) : {span.label_}',
            f'CUI: {umls_match.cui}',
            f'Similarity : {umls_match.similarity}',
            f'Semtypes : {umls_match.semtypes}',
            '********************',
            sep='\n',
        )

Total spans matched: 2
Span text : dipalmitoyl phosphatidylcholine
Label (UMLS CUI) : C0000039
CUI: C0000039
Similarity : 0.78125
Semtypes : {'T121', 'T119'}
********************
Span text : phosphatidylcholine dipalmitoyl
Label (UMLS CUI) : C0000039
CUI: C0000039
Similarity : 0.78125
Semtypes : {'T121', 'T119'}
********************
