In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import spacy
import sys
import cycontext
from cycontext.viz import visualize_dep, visualize_ent
from spacy.tokens.span import Span

In [3]:
# Add the directory two levels up to the import path; presumably this is where the
# `nlp_postprocessor` package imported in the next cell lives (verify for your checkout).
sys.path.append("../..")

In [4]:
from nlp_postprocessor import Postprocessor, PostprocessingRule, PostprocessingPattern
from nlp_postprocessor import postprocessing_functions

In [63]:
# Load the small English pipeline without its statistical NER component; entities in
# this notebook are set manually or by an EntityRuler instead.
# `disable` is given as a list of component names, the form documented for spacy.load.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [6]:
# Build the ConText component with cycontext's "default" rule set; it is applied to a
# doc by calling `context(doc)`.
context = cycontext.ConTextComponent(nlp, rules="default")

Use cases:
- Attributes:
    - If a span is negated, remove it
    - If a cancer entity is modified by family history, change the label to “FAMILY_HX_CANCER”
- Disambiguation:
    - If a span is followed by or preceded by certain words, change its label
    - If a phrase of text is found, delete all entities
    - If a sentence contains a phrase, change the label


**1.** If a span is negated, remove it

In [7]:
# Example sentence with one negated condition ("pneumonia") and one asserted ("PE").
text = "no evidence of pneumonia but there is PE."

In [8]:
# Run the spaCy pipeline (loaded with NER disabled) over the example sentence.
doc = nlp(text)

In [9]:
# Manually mark "pneumonia" (token 3) and "PE" (token 7) as CONDITION entities,
# since the pipeline has no NER component to produce them.
doc.ents = tuple(
    Span(doc, start, end, label="CONDITION")
    for start, end in ((3, 4), (7, 8))
)

In [10]:
# Apply the ConText component to the doc; a later cell reads the resulting
# `ent._.is_negated` attribute on each entity.
context(doc)

no evidence of pneumonia but there is PE.

In [11]:
# Display the doc with cycontext's `visualize_dep` helper (rendered output is not
# captured in this export).
visualize_dep(doc)

In [12]:
# Show each entity next to its negation flag.
for ent in doc.ents:
    print(f"{ent} {ent._.is_negated}")

pneumonia True
PE False


In [13]:
# A pattern whose condition is the library's `is_negated` check; no extra
# condition arguments are supplied.
pattern = PostprocessingPattern(condition=postprocessing_functions.is_negated)

In [14]:
# Rule: when the negation pattern matches an entity, drop that entity.
rule = PostprocessingRule(
    patterns=[pattern],
    action=postprocessing_functions.remove_ent,
)

In [15]:
# Start from an empty postprocessor; rules are registered in the next cell.
postprocessor = Postprocessor()

In [16]:
# `add` takes a list of rules; here only the negation-removal rule is registered.
postprocessor.add([rule])

In [17]:
# Apply the registered rules to the doc; the next cell inspects `doc.ents` to see the effect.
postprocessor(doc)

no evidence of pneumonia but there is PE.

In [37]:
# Only the non-negated entity ("PE") should remain after postprocessing.
doc.ents

(PE,)

## If a cancer entity is modified by family history, change the label to “FAMILY_HX_CANCER”

In [None]:
# TODO: this example is unfinished — only the input text is defined, and the cell
# has never been executed; the entity, pattern and rule still need to be added.
text = "family history of breast cancer."

## If a sentence contains a phrase, change the label

In [23]:
# Here "ca" is a lab value, not cancer; the sentence also contains the word "lab",
# which the rule defined later in this section keys on.
text = "lab results show high level of ca."
doc = nlp(text)

In [19]:
from spacy.pipeline import EntityRuler

In [20]:
# EntityRuler pattern: label the text "ca" as a CANCER entity.
cancer_pattern = dict(label="CANCER", pattern="ca")

In [22]:
# Standalone EntityRuler (not added to the `nlp` pipeline); it is applied to the
# doc directly in the next cell.
ruler = EntityRuler(nlp)
ruler.add_patterns([cancer_pattern])

In [24]:
# Apply the ruler to the doc so that "ca" is added to `doc.ents` as CANCER.
ruler(doc)

lab results show high level of ca.

In [25]:
# Show each entity with its label before postprocessing.
for ent in doc.ents:
    print(f"{ent} {ent.label_}")

ca CANCER


If "ca" occurs in the same sentence as "lab", change the label from "CANCER" to "TEST".

In [50]:
# Condition: the library's `sentence_contains` check, with "lab" supplied as its
# extra argument via `condition_args`.
pattern = PostprocessingPattern(condition=postprocessing_functions.sentence_contains, 
                                condition_args=("lab",))

In [51]:
# Rule: when the pattern matches, relabel the entity as "TEST".
# `patterns` must hold the PostprocessingPattern *instance* (`pattern`) built in the
# previous cell, not the PostprocessingPattern class itself.
rule = PostprocessingRule(patterns=[pattern], action=postprocessing_functions.set_label,
                         action_args=("TEST",))

In [52]:
# Fresh postprocessor so the negation rule from the first example is not carried over.
postprocessor = Postprocessor()
postprocessor.add([rule])

In [53]:
# Apply the relabeling rule to the doc.
postprocessor(doc)

lab results show high level of ca.

In [54]:
# Show each entity with its label after postprocessing; "ca" should now be TEST.
for ent in doc.ents:
    print(f"{ent} {ent.label_}")

ca TEST


## If a span is followed by or preceded by certain words, change its label
"DM" is a common abbreviation for "diabetes mellitus". However, it also appears in the names of drugs containing dextromethorphan, such as "mucinex dm". You can disambiguate between diabetes and this other usage by checking whether "dm" is preceded by "mucinex" within a small window of tokens. In the example below, the rule removes the entity rather than relabeling it.

In [68]:
# "dm" here is part of a drug name; one token ("tablet") separates it from "mucinex".
doc = nlp("mucinex tablet dm")

In [69]:
# New standalone EntityRuler that labels the text "dm" as DIABETES.
diabetes_pattern = {"label": "DIABETES", "pattern": "dm"}
ruler = EntityRuler(nlp)
ruler.add_patterns([diabetes_pattern])

In [70]:
# Apply the ruler to the doc so that "dm" is added to `doc.ents` as DIABETES.
ruler(doc)

mucinex tablet dm

In [71]:
# Show each entity with its label before postprocessing.
# The loop variable is deliberately named `ent`: a later cell reuses it after the loop.
for ent in doc.ents:
    print(ent, ent.label_)

dm DIABETES


In [81]:
# Call the condition function directly on the "dm" entity to check it in isolation.
# The entity is taken from `doc.ents` explicitly instead of relying on a loop variable
# left over from an earlier cell, so this cell does not depend on hidden kernel state.
postprocessing_functions.is_preceded_by(doc.ents[0], ["mucinex", "guaifenesin"], window=2)

True

In [85]:
# Same check wrapped in a pattern. `condition_args` carries the word list and 2;
# presumably these are forwarded after the entity as the target words and window,
# mirroring the direct `is_preceded_by` call in the previous cell — confirm in the library.
pattern = PostprocessingPattern(condition=postprocessing_functions.is_preceded_by,
                               condition_args=(["mucinex", "guaifenesin"], 2)
                               )

In [86]:
# Rule: drop the entity when it is preceded by one of the drug names.
rule = PostprocessingRule(
    patterns=[pattern],
    action=postprocessing_functions.remove_ent,
)

In [88]:
# Fresh postprocessor holding only the "preceded by" rule, applied straight to the doc.
postprocessor = Postprocessor()
postprocessor.add([rule])
postprocessor(doc)

mucinex tablet dm

In [89]:
# The "dm" entity should have been removed, leaving an empty tuple.
print(doc.ents)

()


In [90]:
# Two-token example with the first token manually marked as a CORONAVIRUS entity.
doc = nlp("coronavirus epidemic")
coronavirus_span = Span(doc, 0, 1, label="CORONAVIRUS")
doc.ents = (coronavirus_span,)

In [91]:
# Confirm the manually assigned entity is present.
print(doc.ents)

(coronavirus,)


In [93]:
# Counterpart of `is_preceded_by`: check whether "epidemic" follows the entity,
# using window=1. Note the target is passed as a single string here, whereas the
# `is_preceded_by` example passes a list of words.
postprocessing_functions.is_followed_by(doc.ents[0], "epidemic", window=1)

True