# Overview
* Introduction to pyConText
* Understand and develop Targets
* Understand and develop Modifiers
* Graph and visualize Targets and Modifiers together

In [None]:
import os
import re

import pyConTextNLP
# import pyConTextNLP.pyConText as pyConText
from pyConTextNLP import pyConTextGraph
from pyConTextNLP import itemData
from pyConTextNLP.display.html import mark_document_with_html


import urllib
import pandas as pd

# packages for interaction
from IPython.html.widgets import interact, interactive, fixed
from IPython.display import display, HTML, Image
import ipywidgets

# and also our utilities for this class
from nlp_pneumonia_utils import *
from text_processing import tokenize_sents

print('Imported pneumonia nlp utilities...')

In [None]:
# let us set up an example document to work with
example_document = """
PORTABLE CHEST:  Comparison made to prior film from X:XX a.m. the same day.
     
The ET tube and nasogastric tube remain in good position. Cardiac and
mediastinal contours are stable. No acute changes are seen within the lung
parenchyma; specifically, there is no evidence of new infiltrate (skin folds
do project over the right lung). No consolidation on either side.

IMPRESSION: No evidence of pneumonia."""

example_sentence = """IMPRESSION: No evidence of pneumonia."""

# I. Extracting mentions of pneumonia with pyConText

**pyConText** is an open-source Python library for extracting concepts from clinical text and asserting whether those concepts actually exist. For example, given the sentence below, we would want to extract the span of text **"pneumonia"**:

In [None]:
example_sentence = "There is clear evidence of pneumonia."

We start by defining our **targets**: these are the strings which represent the concepts we want to extract. To handle this example sentence, the only target we need is the word *"pneumonia"*.

A target in pyConText is defined as an array of 4 elements:
1. **Literal**: the exact text of a concept. For example, "pneumonia"
2. **Category**: the semantic category of a concept. For example, "Evidence of Pneumonia"
3. **Regular Expression** (opt): a regular expression pattern used to match this target. If none is provided, then pyConText uses the string from *literal*
4. **Direction**: We'll cover this a little later

So let's now use pyConText to extract "pneumonia":

In [None]:
targets1 = [itemData.contextItem(["pneumonia", "EVIDENCE_OF_PNEUMONIA", "", ""])]
modifiers1 = [] # This is empty for now, we'll come back to it later

Now we create a **markup** item by calling a function `markup_sentence` with our text and the rules we've defined:

In [None]:
def markup_sentence(s, modifiers, targets, prune_inactive=True, verbose=False):
    """Build a ``ConTextMarkup`` for a single sentence.

    Args:
        s: Raw sentence text to mark up.
        modifiers: Modifier rules, marked with ``mode="modifier"``.
        targets: Target rules, marked with ``mode="target"``.
        prune_inactive: When true, ``dropInactiveModifiers`` is called after
            the modifiers have been applied.
        verbose: Accepted for interface compatibility; not used in this
            function.

    Returns:
        The populated ``pyConTextGraph.ConTextMarkup`` object.
    """
    sentence_markup = pyConTextGraph.ConTextMarkup()
    sentence_markup.setRawText(s)
    sentence_markup.cleanText()

    # Targets are marked first, then modifiers.
    for rules, rule_mode in ((targets, "target"), (modifiers, "modifier")):
        sentence_markup.markItems(rules, mode=rule_mode)

    sentence_markup.pruneMarks()
    sentence_markup.dropMarks('Exclusion')

    # Apply modifiers to any targets within the modifiers' scope.
    sentence_markup.applyModifiers()
    sentence_markup.pruneSelfModifyingRelationships()

    if not prune_inactive:
        return sentence_markup
    sentence_markup.dropInactiveModifiers()
    return sentence_markup

In [None]:
markup = markup_sentence(example_sentence, modifiers1, targets1)

The result is a `ConTextMarkup` which contains the sentence and the findings extracted by our rules. If we display this object we can see the results.

In [None]:
type(markup)

In [None]:
markup

A target with the class of **"evidence_of_pneumonia"** was matched. So we've now successfully identified evidence of pneumonia from clinical text.

**Question : Will we find a target match on this sentence below? Will we match "pneumonias"?**

Remember our lesson on regular expressions: rules match *exactly* what they are specified to match.

In [None]:
example_sentence_2 = """Findings consistent with CHF, although underlying bilateral lower lobe pneumonias cannot be excluded."""

In [None]:
# let's see how things look on this sentence
markup_sentence_2 = markup_sentence(example_sentence_2, modifiers1, targets1)
markup_sentence_2

So let's modify our target so that we can match either "pneumonia" or "pneumonias".

In [None]:
targets2 = [
    itemData.contextItem(["", "EVIDENCE_OF_PNEUMONIA", "pneumonias?", ""]),
]
modifiers2 = []

In [None]:
# let's see how things look on this sentence
markup_sentence_2 = markup_sentence(example_sentence_2, modifiers2, targets2)
display(markup_sentence_2.nodes(data = True))

In [None]:
markup_sentence_2

## Processing multi-sentence documents
Now let's go to a new example. When extracting information from clinical text, we typically handle one sentence at a time. In the examples above, we were only looking at single sentences. Let's now extend this to process a document with multiple sentences. To do this, we'll write a function called `markup_context_document` which takes a report text, modifiers, and targets, splits the report up into sentences, and returns a `ConTextDocument` which represents the entire document, not just a single sentence.

In [None]:
def markup_context_document(report_text, modifiers, targets):
    """Mark up every sentence of a report and collect the results.

    The report is split with ``tokenize_sents``; each sentence is passed to
    ``markup_sentence`` and the resulting markup is added, in order, to a new
    ``ConTextDocument``.

    Args:
        report_text: Full text of the report.
        modifiers: Modifier rules forwarded to ``markup_sentence``.
        targets: Target rules forwarded to ``markup_sentence``.

    Returns:
        The ``pyConTextGraph.ConTextDocument`` holding one markup per sentence.
    """
    document = pyConTextGraph.ConTextDocument()

    for sent in tokenize_sents(report_text):
        document.addMarkup(
            markup_sentence(sent, modifiers=modifiers, targets=targets)
        )

    return document

In [None]:
example_document = """
IMPRESSION:
Findings consistent with CHF, although underlying bilateral lower lobe pneumonias cannot be excluded.
There is a discrete infiltration in the left upper and right lower lung zones. 
The infiltrate appears consistent with pneumonia.
"""

In [None]:
clearPyConTextRegularExpressions()

In [None]:
targets3 = [
    itemData.contextItem(["pneumonia", "EVIDENCE_OF_PNEUMONIA", "pneumonias?", ""]),
    itemData.contextItem(["infiltrate", "EVIDENCE_OF_PNEUMONIA", "infiltrate", ""]),
]

modifiers3 = []

In [None]:
context = markup_context_document(example_document, modifiers3, targets3)

In [None]:
type(context)

Let's look at our marked up document. We can view it in a few different ways:
- See the document graph, which will look like the markups above
- View an XML, which is a common structured data format and could be used to save processed documents
- Use HTML to visualize the marked up document

In [None]:
context.getDocumentGraph()

In [None]:
print(context.getXML())

In [None]:
evidence_only_colors = {
    "evidence_of_pneumonia": "orange"
}

context_html = mark_document_with_html(context, colors = evidence_only_colors, default_color="black")
display(HTML(context_html))

# Reading in a Knowledge Base
The targets which we defined above are what we call a **"knowledge base"** - they define the concepts we're interested in and the rules used to extract them.

When it's just a few rules, we can type them directly in our code. However, knowledge bases will often be much, much bigger and we will want to save them separate from our code.

pyConText lets us save a knowledge base as a **tab-separated (tsv)** file. In this exercise, we've saved a file at *KB/pneumonia_targets.tsv*. Let's read it in using Pandas:

In [None]:
pneumonia_targets_file = 'KB/pneumonia_targets.tsv'

# let's see what we're working with by loading this as a Pandas DataFrame and then we can display it
pneumonia_targets_df = pd.read_csv(pneumonia_targets_file, delimiter = '\t')
pneumonia_targets_df

In [None]:
# before we continue, let's clear a mapping of compiled regular expressions which pyConText uses
clearPyConTextRegularExpressions()

In [None]:
# Our first attempt used a single, very simple target, so now let's load
# some additional concepts from the knowledge-base file.
targets2, modifiers2 = [], []

# Clear pyConText's mapping of compiled regular expressions before loading.
clearPyConTextRegularExpressions()

full_targets_path = 'file:///' + os.path.join(
    os.getcwd(),
    pneumonia_targets_file,
)
print('Loading pneumonia targets from : ' + full_targets_path)
targets2 = pyConTextNLP.itemData.instantiateFromCSVtoitemData(
    full_targets_path
)

In [None]:
targets2

Now let's re-process the example report with our new rules:

In [None]:
context = markup_context_document(example_document, modifiers2, targets2)

In [None]:
context.getDocumentGraph()

In [None]:
context_html = mark_document_with_html(context, colors = evidence_only_colors, default_color="black")
display(HTML(context_html))

# Modifiers
We can now identify mentions of pneumonia in text. However, just finding those concepts is often not enough.

As an example, suppose a doctor suspects a patient has pneumonia but is not sure. They would order a CT of the lungs to check for pneumonia. To document that the purpose of the scan is to check for pneumonia, the report might say:


---
<p style="text-align: center;"><strong>Indication:</strong> Pneumonia</p>

---

After examining the scan, the radiologist might determine that the scan does not show pneumonia:

---
<p style="text-align: center;">
    <strong>Indication:</strong> Pneumonia<br>
    <strong>Impression:</strong> No evidence of pneumonia.
</p>

---

If we process this document with our current rules, here's what we extract:

In [None]:
example_document = """
Indication: Pneumonia.
Impression: No evidence of pneumonia.
"""

In [None]:
context = markup_context_document(example_document, modifiers3, targets3)

In [None]:
context.getDocumentGraph()

In [None]:
context_html = mark_document_with_html(context, colors = evidence_only_colors, default_color="black")
display(HTML(context_html))

Even though the word "pneumonia" is mentioned multiple times, the patient does not actually have pneumonia.

To handle this, we need to look at the **context** of a concept. This context will include **semantic modifiers** which add meaning to the phrases of our target concepts. For example, the phrases **"Indication"** and **"No evidence"** both tell us that we shouldn't consider the sentence to represent a positive instance of pneumonia.

In [None]:
from IPython.display import Image

We also need to look at the **directionality** of a modifier. This is specified by the last element in the rule and can be either "forward", "backward", or "bidirectional". This specifies the syntactic property of how a phrase modifies the words around it. For example, the modifiers "no evidence of" and "Indication" above both modify concepts **forward**, but the modifier "is not seen" looks **backwards**:

In [None]:
Image('./images/no_evidence_of.png')

In [None]:
Image('./images/indication.png')

In [None]:
Image('./images/is_not_seen.png')

Just like a target, a modifier rule in pyConText consists of four elements. Instead of "EVIDENCE_OF_PNEUMONIA", our classes are now "INDICATION" and "DEFINITE_NEGATED_EXISTENCE". 


Let's add these modifiers and re-process our document:

In [None]:
clearPyConTextRegularExpressions()

In [None]:
# Each rule is [literal, category, regular expression, direction].
targets4 = [
    # Empty literal: the regular expression matches "pneumonia" or "pneumonias".
    itemData.contextItem(["", "EVIDENCE_OF_PNEUMONIA", "pneumonias?", ""]),
]
modifiers4 = [
    # No regular expression is given, so the literal text is used for matching.
    # "forward" modifiers modify concepts that come after them.
    itemData.contextItem(["indication", "INDICATION", "", "forward"]),
    itemData.contextItem(["no evidence of", "DEFINITE_NEGATED_EXISTENCE", "", "forward"]),
    # "backward" modifier: looks back at concepts that come before it.
    itemData.contextItem(["not seen", "DEFINITE_NEGATED_EXISTENCE", "", "backward"]),
]

In [None]:
context = markup_context_document(example_document, modifiers4, targets4)

Look at the document graph below. Note that both "evidence_of_pneumonia" targets now have a "----MODIFIED BY" note. This means that these concepts are being modified by "Indication" and "No evidence of", which tells us that pneumonia doesn't actually exist in that sentence.

In [None]:
context.getDocumentGraph()

In [None]:
colors = {
    "evidence_of_pneumonia": "orange",
    "definite_negated_existence": "red",
    "indication": "purple"
}

context_html = mark_document_with_html(context, colors=colors, default_color="black")
display(HTML(context_html))

## Read in complete modifiers list

Let's now read in a larger list of modifiers which are included in the pyConText package:

In [None]:
# `urllib.request` is a submodule: a bare `import urllib` does not guarantee
# that it has been loaded, so import it explicitly before calling urlopen.
import urllib.request

modifier_file_path = 'file:///' + os.path.join(os.getcwd(), "KB/pneumonia_modifiers.tsv")

# Load the file directly into a DataFrame with Pandas and take a look at it.
# The `with` block closes the handle once the file has been read.
with urllib.request.urlopen(modifier_file_path, data=None) as modifier_file:
    modifier_df = pd.read_csv(modifier_file, delimiter="\t")

display(modifier_df.head(5))
display(modifier_df.tail(5))

Let's look at all the different types of modifiers:

In [None]:
modifier_df['Type'].value_counts()

In [None]:
modifier_df['Direction'].value_counts()

In [None]:
# let us set up an example document to work with
negative_document = """
PORTABLE CHEST:  Comparison made to prior film from X:XX a.m. the same day.
     
The ET tube and nasogastric tube remain in good position. Cardiac and
mediastinal contours are stable. No acute changes are seen within the lung
parenchyma; specifically, there is no evidence of new infiltrate (skin folds
do project over the right lung). No consolidation on either side.

IMPRESSION: No evidence of pneumonia."""

In [None]:
# And a positive one to compare with
positive_document = """
SINGLE VIEW CHEST, AP: There has been interval opacification within the left
lower lobe consistent with consolidation. The pulmonary vasculature is within
normal limits. The cardiac, mediastinal and hilar contours are unchanged with
unfolding of the aorta and wall calcifications. There may be a left-sided
pleural effusion also.

IMPRESSION: Left lower lobe pneumonia.
"""

In [None]:
# Build file:// URLs for both knowledge-base files under the working directory.
full_targets_path, modifier_file_path = (
    'file:///' + os.path.join(os.getcwd(), kb_file)
    for kb_file in (pneumonia_targets_file, "KB/pneumonia_modifiers.tsv")
)

# Instantiate the pyConText rule items from each file: targets, then modifiers.
targets = itemData.instantiateFromCSVtoitemData(full_targets_path)
modifiers = itemData.instantiateFromCSVtoitemData(modifier_file_path)

In [None]:
# prepare some colors for displaying any markup we might see
# (keys are lower-cased category names; values are color names used in the
# generated HTML -- presumably rendered as CSS colors)
colors = {
    "evidence_of_pneumonia": "orange",
    "definite_negated_existence": "red",
    "probable_negated_existence": "indianred",
    "ambivalent_existence": "orange",
    "probable_existence": "forestgreen",
    "definite_existence": "green",
    "historical": "goldenrod",
    "indication": "pink",
    # "golden" is not a valid CSS named color; "gold" is.
    "acute": "gold",
}

# mark up the negative example document with the full target and modifier rules
context = markup_context_document(negative_document, modifiers, targets)

display(HTML(mark_document_with_html(context, colors=colors, default_color="black")))

# Document Inference
We can now extract mentions of pneumonia and identify contextual modifiers. Next, we want to roll this up to a document level. Based on the concepts found in the text, is the overall document **positive** or **negative** for pneumonia?

Here is the logic we will implement:
- Define a number of modifier categories which mean that a **phrase** does not actually contain pneumonia
- For each evidence of pneumonia found, check if it is modified by these modifiers. If it is, we can ignore it
- If at least one evidence of pneumonia does not have any non-positive modifiers, we'll say that this document is "PNEUMONIA_DOC_YES"
- Otherwise, it will be "PNEUMONIA_DOC_NO"

We'll wrap this logic up in a function and use it to classify our documents. We'll print out some information along the way to show what logic is being implemented.

In [None]:
# Modifier categories which mean that a phrase does not actually contain
# positive evidence of pneumonia.
non_positive_categories = [
    "definite_negated_existence",
    "probable_negated_existence",
    "probable_existence",
    "indication",
]

In [None]:
def classify_pneumonia_document(report_text, modifiers, targets, verbose=0,
                                non_positive=None):
    """Classify a report as positive (1) or negative (0) for pneumonia.

    The report is marked up with ``markup_context_document``. Every marked
    target is then checked against its modifiers: a target counts as positive
    evidence only if none of its modifiers belongs to a non-positive
    category. A target with no modifiers at all therefore counts as positive.

    Args:
        report_text: Full text of the report to classify.
        modifiers: Modifier rules passed to ``markup_context_document``.
        targets: Target rules passed to ``markup_context_document``.
        verbose: When truthy, print the reasoning for each target and the
            final decision.
        non_positive: Optional collection of category names that make a
            target non-positive. Defaults to the module-level
            ``non_positive_categories``.

    Returns:
        1 if at least one target is not modified by a non-positive modifier,
        otherwise 0.
    """
    if non_positive is None:
        non_positive = non_positive_categories

    context = markup_context_document(report_text, modifiers, targets)
    graph = context.getDocumentGraph()

    num_positive = 0
    for target in graph.getMarkedTargets():
        modified_by_non_positive = False
        for modifier in graph.getModifiers(target):
            # Check every category of the modifier, not only the first one,
            # so a multi-category modifier is not missed.
            if any(category in non_positive
                   for category in modifier.getCategory()):
                if verbose:
                    print("--'{0}' modified by '{1}' - not positive evidence".format(
                        target.getPhrase().upper(),
                        modifier.getPhrase().upper()))
                modified_by_non_positive = True

        if not modified_by_non_positive:
            num_positive += 1
            if verbose:
                print("--'{0}' not modified by non-positive modifiers: setting num_positive = {1}".format(
                    target.getPhrase().upper(),
                    num_positive))

    if num_positive > 0:
        if verbose:
            print("Positive Document")
        return 1  # Positive

    if verbose:
        print("Negative Document")
    return 0  # Negative

### Negative document

In [None]:
# This should return '0'
classify_pneumonia_document(negative_document, modifiers, targets, verbose=1)

In [None]:
context = markup_context_document(negative_document, modifiers, targets)
display(HTML(mark_document_with_html(context, colors = colors, default_color="black")))

### Positive document

In [None]:
# This should return '1'
classify_pneumonia_document(positive_document, modifiers, targets, verbose=1)

In [None]:
context = markup_context_document(positive_document, modifiers, targets)
display(HTML(mark_document_with_html(context, colors = colors, default_color="black")))