In [1]:
import argparse
import pickle
import csv
from nltk import word_tokenize, sent_tokenize
import spacy
from spacy.tokens.doc import Doc
from spacy.tokens import Span
import medspacy
from medspacy.context import ConTextComponent, ConTextRule

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialize ConText algorithm for negated entity detection
# Shell escape: downloads/installs the small English spaCy model every time this cell runs.
!python -m spacy download en_core_web_sm
# Imported here rather than in the import cell because the package is only
# guaranteed to exist after the download step above.
import en_core_web_sm
# The "tokenizer" and "ner" names are passed to `disable`: later cells build each Doc
# from pre-tokenized word lists and assign doc.ents from externally extracted mentions.
# NOTE(review): "tokenizer" may not be a pipeline component name in spaCy - confirm it has any effect.
nlp = en_core_web_sm.load(disable=["tokenizer","ner"])
# Alternative loader left commented out by the author:
# spacy.load("en_core_web_sm", disable=["tokenizer","ner"])
# The ConText component is not added to nlp's pipeline here; the filtering cell calls
# `context(doc)` directly after setting doc.ents.
# NOTE(review): max_scope=5 with use_context_window=True presumably limits how many tokens
# around a modifier can be affected - confirm against the installed medspacy version.
context = ConTextComponent(nlp, rules="default", use_context_window=True, max_scope=5)

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
     |████████████████████████████████| 13.6 MB 10.9 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.1.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Add paths to files containing extracted mentions and raw texts here
# In our pipeline, mentions are extracted using a model trained on the i2b2 2010 dataset
# Both values below are placeholders and must be replaced with real paths before running.
# mention_file: pickle file. It is unpickled into `mentions` and indexed as
# mentions[doc_id][sentence_index], giving a list of dicts that carry
# 'start_offset', 'end_offset' and 'pred_type' keys.
mention_file = 'PATH TO MENTION EXTRACTOR OUTPUT'
# text_file: CSV file. Its first row is skipped; column 0 is used as the document id
# and column 1 as the raw text of that document.
text_file = 'PATH TO RAW TEXTS'

In [4]:
# Read in mentions and raw texts
# `texts` maps each document id (CSV column 0) to a list of sentences,
# where every sentence is a list of word tokens produced by NLTK.
texts = {}
# Context manager ensures the CSV handle is closed once reading finishes.
with open(text_file) as text_handle:
    reader = csv.reader(text_handle)
    next(reader, None)  # discard the first row (presumably a header)
    for row in reader:
        # Sentence-split the raw text, then word-tokenize each sentence.
        # A repeated document id overwrites the earlier entry.
        texts[row[0]] = [list(word_tokenize(sent)) for sent in sent_tokenize(row[1])]

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point mention_file at output produced by a trusted mention extractor.
with open(mention_file, 'rb') as mention_handle:
    mentions = pickle.load(mention_handle)

In [5]:
# Apply ConText algorithm to identify and filter negated entities
# `filtered_mentions` mirrors `mentions`: document id -> sentence index -> list of kept mentions.
# Only sentences that had at least one input mention get an entry.
filtered_mentions = {}
for file_ind, file in enumerate(texts):
    print('Processsing file {} ({})'.format(file, file_ind))
    filtered_mentions[file] = {}
    for i, line in enumerate(texts[file]):
        cur_ment = mentions[file][i]
        if not cur_ment:
            continue
        # If mentions are present in sentence, perform negation-based filtering
        # Build the Doc directly from the pre-tokenized words so token indices
        # line up with the mention offsets, then run each pipeline component on it.
        doc = Doc(nlp.vocab, line)
        for _, proc in nlp.pipeline:
            doc = proc(doc)
        # Input 'end_offset' is treated as inclusive, hence the +1 for spaCy's exclusive Span end.
        doc.ents = tuple(
            Span(doc, mention['start_offset'], mention['end_offset']+1, mention['pred_type'])
            for mention in cur_ment
        )
        doc = context(doc)

        # Keep every entity that ConText did not mark as negated.
        # NOTE(review): the 'end_offset' written here is ent.end (exclusive), whereas the
        # input 'end_offset' is inclusive - confirm which convention downstream code expects.
        filtered_mentions[file][i] = [
            {'mention': ent.text, 'start_offset': ent.start, 'end_offset': ent.end, 'pred_type': ent.label_}
            for ent in doc.ents
            if not ent._.is_negated
        ]
    all_mention_count = sum(len(y) for y in mentions[file].values())
    filtered_mention_count = sum(len(y) for y in filtered_mentions[file].values())
    print('{} mentions kept out of {} for file {}'.format(filtered_mention_count, all_mention_count, file))

# Context manager guarantees the pickle is flushed and the handle closed.
with open('PATH TO OUTPUT FILE', 'wb') as output_handle:
    pickle.dump(filtered_mentions, output_handle)

Processsing file 181614 (0)
31 mentions kept out of 32 for file 181614
Processsing file 136342 (1)
17 mentions kept out of 18 for file 136342
Processsing file 199961 (2)
18 mentions kept out of 20 for file 199961
Processsing file 121438 (3)
63 mentions kept out of 63 for file 121438
Processsing file 194393 (4)
5 mentions kept out of 5 for file 194393
Processsing file 138531 (5)
68 mentions kept out of 73 for file 138531
Processsing file 180762 (6)
31 mentions kept out of 33 for file 180762
Processsing file 166330 (7)
34 mentions kept out of 43 for file 166330
Processsing file 170119 (8)
70 mentions kept out of 77 for file 170119
Processsing file 194148 (9)
85 mentions kept out of 104 for file 194148
Processsing file 196032 (10)
4 mentions kept out of 4 for file 196032
Processsing file 119472 (11)
17 mentions kept out of 19 for file 119472
Processsing file 133167 (12)
6 mentions kept out of 7 for file 133167
Processsing file 108023 (13)
78 mentions kept out of 85 for file 108023
Process