This notebook will explore different combinations of the features that were used in submission and save the results/evaluations to compare the different scores.

In [2]:
from bioc import BioCXMLReader

In [3]:
import os, sys
import pickle

In [4]:
sys.path.append('../final_system')

In [5]:
import annotation
import base_feature
import made_utils
import random
from collections import Counter, defaultdict


from nltk import ngrams as nltk_ngrams
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict

In [21]:
DATADIR = '/Users/alec/Data/NLP_Challenge'
# ALLDIR = os.path.join(DATADIR, 'original_data')
TRAINDIR = os.path.join(DATADIR, 'MADE-1.0')
TESTDIR = os.path.join(DATADIR, 'made_test_data')
print(os.path.exists(TRAINDIR))
print(os.path.exists(TESTDIR))

True
True


In [22]:
def normalize_grams(ngram_string):
    """
    Normalizes the values in a string of joined ngrams
    """
    # Substitute numbers
    ngram_string = re.sub('[\d]+|one|two|three|four|five|six|seven|eight|nine|ten', '<NUMBER>', ngram_string)
    return ngram_string



class LexicalFeatureExtractor(base_feature.BaseFeatureExtractor):
    """
    ngram_window - the length of ngrams to include in the vocabulary.
    context_window - the number of ngrams to include before and after the entity.
    """
    def __init__(self, ngram_window=(1, 1), context_window=(2, 2),
                vocab=None, pos_vocab=None, min_vocab_count=5, min_pos_count=5):
        super().__init__()
        self.ngram_window = ngram_window
        if min(ngram_window) < 1 or max(ngram_window) > 3:
            raise NotImplementedError("Ngram Window must be between one and 3")
        self.context_window = context_window
        self.min_vocab_count = min_vocab_count
        self.min_pos_count = min_pos_count

        # Set vocab and POS vocab
        self._unfiltered_vocab = vocab # Contains unigrams-trigrams, no count threshold
        self._unfiltered_pos_vocab = pos_vocab

        self.vocab = self.create_vocab(vocab, min_vocab_count, self.ngram_window) # Only contains ngrams defined by context_window
        #print(self.vocab); exit()
        self.pos_vocab =  self.create_vocab(pos_vocab, min_pos_count, self.ngram_window)
        #self.tokens = [gram for (gram, idx) in self.vocab.items() if len(gram.split()) == 1] # Only unigrams
        self.pos = {} # Will eventually contain mapping for POS tags

        # pyConText tools
        #self.modifiers = itemData.instantiateFromCSVtoitemData("https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/lexical_kb_05042016.tsv")
        #self.targets = itemData.instantiateFromCSVtoitemData("https://raw.githubusercontent.com/abchapman93/MADE_relations/master/feature_extraction/targets.tsv?token=AUOYx9rYHO6A5fiZS3mB9e_3DP83Uws8ks5aownVwA%3D%3D")


        #self.all_features_values = self.create_base_features()



    def create_base_features(self):
        """
        Enumerates possible feature values from the vocab, as well as an OOV value.
        Any features that are binary should only get one index and are encoded as 0.
        """
        # This will be a dictionary that contains all possible values for each feature
        all_features_values = {
            'same_sentence': 0,
            'num_tokens_between': 0,
            'grams_between': ['OOV'] + list(self.vocab),
            'grams_before': ['OOV'] + list(self.vocab),
            'grams_after': ['OOV'] + list(self.vocab),
            'pos_grams_between': ['OOV'] + list(self.pos_vocab),
            #'pos_grams_before': ['OOV'] + list(self.pos_vocab),
            #'pos_grams_after': ['OOV'] + list(self.pos_vocab),
            'first_entity_type': 0,#list(ENTITY_TYPES_MAPPING.values()),
            'second_entity_type': 0,#list(ENTITY_TYPES_MAPPING.values()),

            }
        return all_features_values

    def create_feature_dict(self, relat, doc, entities=True, entities_between=True, surface=True):
        """
        Takes a RelationAnnotation and an AnnotatedDocument.
        Returns the a dictionary containing the defined lexical features.
        """

        lex_features = {}

        if entities:
            lex_features.update(self.get_entity_features(relat, doc))
        if entities_between:
            lex_features.update(self.get_entities_between_features(relat, doc))
        if surface:
            lex_features.update(self.get_surface_features(relat, doc))
        return lex_features
    
    
    def get_entity_features(self, relat, doc):
        features = {}
        
        # The full string of the entities
        anno1, anno2 = relat.get_annotations()
        features['text_in_anno1'] = anno1.text.lower()
        features['text_in_anno2'] = anno2.text.lower()
        features['concat_text'] = anno1.text.lower() + ':' + anno2.text.lower()
        
        # Features for types of the entities
        features['first_entity_type:<{}>'.format(relat.entity_types[0].upper())] = 1
        features['second_entity_type:<{}>'.format(relat.entity_types[1].upper())] = 1
        
        # Feature types for entities, left to right
        sorted_entities = sorted((relat.annotation_1, relat.annotation_2), key=lambda a: a.span[0])
        features['entity_types_concat'] = '<=>'.join(['<{}>'.format(a.type.upper()) for a in sorted_entities])
        return features
    
    
    def get_entities_between_features(self, relat, doc):
       
        features = {}
        # One binary feature for every type of entity between
        entities_between = self.get_entities_between(relat, doc)
        # TODO: Maybe change this to a count
        features.update({
            'entities_between:<{}>'.format(v.type.upper()): 1 for v in entities_between
            })
        features['num_entities_between'] = len(entities_between)

        
        return features
    
    
    
    def get_surface_features(self, relat, doc):        
        
        features = {}
        
        # Same sentence
        features['num_sentences_overlap'] = len(doc.get_sentences_overlap_span(relat.get_span()))
        # Get the number of tokens between
        # NOTE: only unigrams
        
        features['num_tokens_between'] = len(self.get_grams_between(relat, doc, ngram_window=(1, 1)))
        # Get all tokens/POS tags in between
        # Create one feature for each ngram/tag
        features.update({
            'grams_between:<{}>'.format(v): 1 for v in self.get_grams_between(relat, doc)
            })
        features.update({
            'grams_before:<{}>'.format(v): 1 for v in self.get_grams_before(relat, doc)
            })
        features.update({
            'grams_after:<{}>'.format(v): 1 for v in self.get_grams_after(relat, doc)
            })

        features.update({
            'tags_between:<{}>'.format(v): 1 for v in self.get_grams_between(relat, doc, seq='tags')
            })
        features.update({
            'tags_before:<{}>'.format(v): 1 for v in self.get_grams_before(relat, doc, seq='tags')
            })
        features.update({
            'tags_after:<{}>'.format(v): 1 for v in self.get_grams_after(relat, doc, seq='tags')
            })

        # Get features for information about entities/context between
        # Binary feature: Are they in the same sentence?
        features['same_sentence'] = doc.in_same_sentence(relat.get_span())
        return features
        

    def get_grams_between(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the N-grams between the two entities connected in relat.
        Represents it as OOV if it's not in the vocabulary.
        Returns a unique set.
        """

        if seq == 'tokens':
            vocab = self.vocab
        elif seq == 'tags':
            vocab = self.pos_vocab
        else:
            raise ValueError("Must specify seq: {}".format(seq))

        if not ngram_window:
            ngram_window = self.ngram_window

        all_grams = []
        span1, span2 = relat.spans
        # Fixed this: get the start and span of the middle, not of the entire relation
        _, start, end, _ = sorted(span1 +span2)
        tokens_in_span = doc.get_tokens_or_tags_at_span((start, end), seq)
        # NOTE: lower-casing the ngrams, come back to this if you want to encode the casing
        tokens_in_span = [token.lower() for token in tokens_in_span]
        for n in range(ngram_window[0], ngram_window[1] + 1):
            # Now sort the ngrams so that it doesn't matter what order they occur in
            grams = list(nltk_ngrams(tokens_in_span, n))
            grams = self.sort_ngrams(grams)# + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_in_span, n))]
            all_grams.extend(set(grams))
        all_grams = [self.normalize_grams(x) for x in set(all_grams)]
        all_grams = [x if x in vocab else 'OOV' for x in all_grams]
        return set(all_grams)


    def get_grams_before(self, relat,doc, seq='tokens', ngram_window=None):
        """
        Returns the n-grams before the first entity.
        """
        if seq == 'tokens':
            vocab = self.vocab
        elif seq == 'tags':
            vocab = self.pos_vocab
        if not ngram_window:
            ngram_window = self.ngram_window

        all_grams = []
        offset = relat.span[0]
        tokens_before = doc.get_tokens_or_tags_before_or_after(offset, delta=-1,
            n=self.context_window[0], seq=seq, padding=True)
        tokens_before = [token.lower() for token in tokens_before]
        for n in range(ngram_window[0], ngram_window[1] + 1):
            grams = list(nltk_ngrams(tokens_before, n))
            grams = self.sort_ngrams(grams)# + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_in_span, n))]
            all_grams.extend(set(grams))
            #grams = grams + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_before, n))]
        all_grams = [self.normalize_grams(x) for x in set(all_grams)]
        all_grams = [x if x in vocab else 'OOV' for x in all_grams]
        return set(all_grams)

    def get_grams_after(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the n-grams after the final entity.
        """
        if seq == 'tokens':
            vocab = self.vocab
        elif seq == 'tags':
            vocab = self.pos_vocab
        if not ngram_window:
            ngram_window = self.ngram_window

        all_grams = []
        offset = relat.span[1]
        tokens_after = doc.get_tokens_or_tags_before_or_after(offset, delta=1,
                                        n=self.context_window[1], seq=seq)
        tokens_after = [token.lower() for token in tokens_after]
        for n in range(ngram_window[0], ngram_window[1] + 1):
            grams = list(nltk_ngrams(tokens_after, n))
            grams = self.sort_ngrams(grams)# + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_in_span, n))]
            all_grams.extend(set(grams))
            #grams = grams + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_after, n))]
        all_grams = [self.normalize_grams(x) for x in set(all_grams)]
        all_grams = [x if x in vocab else 'OOV' for x in all_grams]
        return set(all_grams)

    def sort_ngrams(self, ngrams):
        return [' '.join(sorted(tup)) for tup in ngrams]

    def normalize_grams(self, ngram_string):
        """
        Normalizes the values in a string of joined ngrams
        """
        # Substitute numbers
        return normalize_grams(ngram_string)

    def get_pos_tags(self):
        pass

    def get_entities_between(self, relat, doc):
        """
        Returns a list of entities that occur between entity1 and entity2
        """
        offset, end = relat.get_span()
        overlapping_entities = []
        # Index the entity in doc by span
        offset_to_entity = {entity.span[0]: entity for entity in doc.get_annotations()
                    if entity.id not in (
                        relat.annotation_1.id, relat.annotation_2.id)
                        }

        while offset < end:
            if offset in offset_to_entity:
                overlapping_entities.append(offset_to_entity[offset])
            offset += 1

        return overlapping_entities


    def get_sent_with_anno(self, anno, doc, entity_type):
        """
        Returns the sentence that contains a given annotation.
        Replaces the text of the annotations with a tag <ENTITY-TYPE>
        """
        tokens = []
        # Step back some window
        offset = anno.start_index

        while offset not in doc._sentences:
            offset -= 1
            if offset < 0:
                break
            if offset in doc._tokens:
                tokens.insert(0, doc._tokens[offset].lower())

        # Now add an entity
        tokens.append(entity_type)

        # Now add all the tokens between them
        offset = anno.start_index

        while offset not in doc._sentences:
            if offset > max(doc._tokens.keys()):
                break
            if offset in doc._tokens:
                tokens.append(doc._tokens[offset].lower())
            offset += 1


        return ' '.join(tokens)


    def __repr__(self):
        return "LexicalFeatureExtractor Ngram Window: {} Vocab: {} terms".format(
                self.ngram_window, len(self.vocab))

In [29]:
# # Let's remove the validation hold-out set
# import glob

# held_out = [os.path.basename(x) for x in glob.glob(os.path.join('..', 'data', 'heldout_xmls', 'corpus', '*'))]
# docs = {fname: doc for (fname, doc) in docs.items() if fname not in held_out}
# len(docs)

In [30]:
def pair_annotations_in_doc(doc, legal_edges=[], max_sent_length=3):
    """
    Takes a single AnnotatedDocument that contains annotations.
    All annotations that have a legal edge between them
    and are have an overlapping sentence length <= max_sent_length,
        ie., they are in either the same sentence or n adjancent sentences,
    are paired to create RelationAnnotations.
    Takes an optional list legal_edges that defines which edges should be allowed.

    Returns a list of new RelationAnnotations with annotation type 'none'.
    """
    if legal_edges == []:
        legal_edges = [('Drug', 'Route'),
                         ('Drug', 'Indication'),
                         ('SSLIF', 'Severity'),
                         ('Drug', 'Dose'),
                         ('Drug', 'Frequency'),
                         ('Drug', 'Duration'),
                         ('Drug', 'ADE'),
                         ('ADE', 'Severity'),
                         ('Indication', 'Severity'),
                         ('SSLIF', 'ADE')]
    true_annotations = doc.get_annotations()
    true_relations = doc.get_relations()
    generated_relations = []
    edges = defaultdict(list)
    edges = set()

    # Map all annotation_1's to annotation_2's
    # in order to identify all positive examples of relations
    # If this is testing data, it may not actually have these
    for relat in true_relations:
        anno1, anno2 = relat.get_annotations()
        edges.add((anno1.id, anno2.id))

    for anno1 in true_annotations:
        for anno2 in true_annotations:

            # Don't pair the same annotation with itself
            if anno1.id == anno2.id:
                continue

            if anno1.span == anno2.span:
                continue

            # Don't generate paris that have already been paried
            if (anno1.id, anno2.id) in edges:
                continue

            # Exclude illegal relations
            if len(legal_edges) and (anno1.type, anno2.type) not in legal_edges:
                continue

            # Check the span between them, make sure it's either 1 or 2
            start1, end1 = anno1.span
            start2, end2 = anno2.span
            sorted_spans = list(sorted([start1, end1, start2, end2]))
            span = (sorted_spans[0], sorted_spans[-1])
            overlapping_sentences = doc.get_sentences_overlap_span(span)
            if len(overlapping_sentences) > max_sent_length:
                continue

            # If they haven't already been paired, pair them
            else:
                generated_relation = annotation.RelationAnnotation.from_null_rel(
                    anno1, anno2, doc.file_name
                )
                edges.add((anno1.id, anno2.id))
                generated_relations.append(generated_relation)
    
    
    
    return list(set(generated_relations + true_relations))

def sample_negative_examples(relations, neg_prop=1.0):
    """
    Takes a list of Relationannotations and
    neg_prop, a float that specifies the proportion of negative
    to positive examples.

    In the future, a more sophisticated method of sampling might be used,
    ie., sampling by the probability of the Annotation types in the nodes.
    """
    pos_relations = []
    neg_relations = []
    for relat in relations:
        if relat.type == 'none':
            neg_relations.append(relat)
        else:
            pos_relations.append(relat)

    pos_size = len(pos_relations)
    neg_sample_size = int(neg_prop * pos_size)

    neg_sample = random.sample(neg_relations, neg_sample_size)
    print("Original Distribution: {} positive relations, {} negative relations".format(
                len(pos_relations),
                len(neg_relations)))
    print("{} positive relations, {} negative relations".format(len(pos_relations),
                len(neg_sample)))
    return pos_relations + neg_sample

In [90]:
reader = made_utils.TextAndBioCParser(TRAINDIR)
docs = reader.read_texts_and_xmls(5) # TODO: Change to -1

0/5


In [91]:
doc = docs['12_123']

In [92]:
relations = []
for i, (fname, doc) in enumerate(docs.items()):
    if i  >= 0:
        print('-{}: {} '.format(i, fname))
        print(len(doc.relations))
    new_relations = pair_annotations_in_doc(doc)
    
    # Add Fake relations for training
    neg_relations = set(new_relations).difference(set(doc.relations))
    # Sample them
    if len(neg_relations) >= 2 * len(doc.relations):
        neg_relations = random.sample(neg_relations, 2 * len(doc.relations))
    else:
        pass
        
    doc.add_relations(neg_relations)
    
    relations.extend(doc.get_relations())
    if i >= 0:
        print(len(doc.get_relations()))
   

-0: 12_123 
32
96
-1: 4_857 
31
59
-2: 17_839 
11
33
-3: 10_988 
16
48
-4: 13_513 
22
47


In [33]:
import pickle
with open('../final_system/data/vocab.pkl', 'rb') as f:
    vocab, pos_vocab = pickle.load(f)

with open('tmp_data/all_training_docs_and_relations.pkl', 'wb') as f:
    pickle.dump((docs, relations), f)

relations = []
for doc in docs.values():
    relations += doc.relations
random.shuffle(relations)
len(relations)

In [93]:
docs.keys()

dict_keys(['12_123', '4_857', '17_839', '10_988', '13_513'])

In [94]:
relat = relations[0]
print(len([r for r in relations if r.type == 'none']))

171


In [95]:
len(relations)

283

In [96]:
# Train-val split
from sklearn.model_selection import train_test_split
train_relats, val_relats = train_test_split(relations,test_size=0.2)

In [97]:
print(len(train_relats))
print(len(val_relats))

226
57


In [98]:
print(len({r.id for r in train_relats}.intersection({r.id for r in val_relats})))

0


In [99]:
r = relations[7]
doc = docs[r.file_name]
print(r.span)
r

(548, 637)


'atenolol':'oral', Drug:Route, type=manner/route

In [100]:
rids = [r.id for r in relations]
print(len(rids))
print(len(set(rids)))

283
283


In [101]:
print(len(relations))
len(set(relations))

283


283

In [102]:
relations = list(set(relations))

In [103]:
rtypes = [r.type for r in relations]
from collections import Counter
c = Counter(rtypes)
c

Counter({'none': 171,
         'do': 22,
         'severity_type': 19,
         'du': 15,
         'manner/route': 12,
         'fr': 13,
         'reason': 29,
         'adverse': 2})

In [104]:
feature_extractor = LexicalFeatureExtractor(context_window=(2, 2),
                            ngram_window=(1, 3), vocab=vocab, pos_vocab=pos_vocab,
                            min_vocab_count=20, min_pos_count=20)
feat_dicts = {feature_set_name: [] for feature_set_name in 
              ('entities', 'entities_between', 'surface', 
               'entities+entities_between', 'entities+entities_between+surface')}
for i, r in enumerate(relations):
    if i % 100 == 0:
        print("{}/{}".format(i, len(relations)))
    doc = docs[r.file_name]
    # Single feature sets
    entities_feat_dict = feature_extractor.create_feature_dict(r, doc, entities=True, entities_between=False, surface=False)
    entities_between_dict = feature_extractor.create_feature_dict(r, doc, entities=False, entities_between=True, surface=False)
    surface_dict = feature_extractor.create_feature_dict(r, doc, entities=False, entities_between=False, surface=True)
    feat_dicts['entities'].append(entities_feat_dict)
    feat_dicts['entities_between'].append(entities_between_dict)
    feat_dicts['surface'].append(surface_dict)
    
    # Now create the combinations
    combo = {}
    combo.update(entities_feat_dict)
    combo.update(entities_between_dict)
    
    feat_dicts['entities+entities_between'].append(combo)
    combo2 = {}
    combo2.update(combo)
    combo2.update(surface_dict)
#     feat_dicts['entities+entities_between'].append(combo)
    feat_dicts['entities+entities_between+surface'].append(combo2)
#     break
    continue
    # Original code
    
    feat_dicts['entities'].append(feature_extractor.create_feature_dict(r, doc, entities=True, entities_between=False, surface=False))
    feat_dicts['entities_between'].append(feature_extractor.create_feature_dict(r, doc, entities=False, entities_between=True, surface=False))
    feat_dicts['surface'].append(feature_extractor.create_feature_dict(r, doc, entities=False, entities_between=False, surface=True))
    
    # Combinations
    feat_dicts['entities+entities_between'].append(feature_extractor.create_feature_dict(r, doc, entities=True, entities_between=True, surface=False))
    feat_dicts['entities+entities_between+surface'].append(feature_extractor.create_feature_dict(r, doc))

0/283




100/283
200/283


In [107]:
with open('tmp_data/feat_dicts', 'wb') as f:
    pickle.dump((feat_dicts, relations), f)

In [29]:
with open('tmp_data/feat_dicts', 'rb') as f:
    feat_dicts = pickle.load(f)

In [108]:
feat_dicts['entities'][7]

{'text_in_anno1': 'doxazosin',
 'text_in_anno2': 'daily',
 'concat_text': 'doxazosin:daily',
 'first_entity_type:<DRUG>': 1,
 'second_entity_type:<FREQUENCY>': 1,
 'entity_types_concat': '<DRUG><=><FREQUENCY>'}

In [109]:
relations[7]

'doxazosin':'DAILY', Drug:Frequency, type=none

In [110]:
feat_dicts['entities_between'][7]

{'entities_between:<DOSE>': 1,
 'entities_between:<ROUTE>': 1,
 'entities_between:<FREQUENCY>': 1,
 'entities_between:<DRUG>': 1,
 'entities_between:<INDICATION>': 1,
 'num_entities_between': 27}

In [111]:
feat_dicts['surface'][7]

{'num_sentences_overlap': 1,
 'num_tokens_between': 28,
 'grams_between:<,>': 1,
 'grams_between:<daily oral tablet>': 1,
 'grams_between:<hours prn>': 1,
 'grams_between:<by ordered>': 1,
 'grams_between:<and>': 1,
 'grams_between:<every oral>': 1,
 'grams_between:<<NUMBER> mg tablet>': 1,
 'grams_between:<, mg tablet>': 1,
 'grams_between:<-- : by>': 1,
 'grams_between:<mg tablet>': 1,
 'grams_between:<<NUMBER> days for>': 1,
 'grams_between:<OOV>': 1,
 'grams_between:<: directions>': 1,
 'grams_between:<tablet>': 1,
 'grams_between:<prn>': 1,
 'grams_between:<: by ordered>': 1,
 'grams_between:<capsule mg>': 1,
 'grams_between:<<NUMBER> mg>': 1,
 'grams_between:<fatty>': 1,
 'grams_between:<->': 1,
 'grams_between:<- -- directions>': 1,
 'grams_between:<every>': 1,
 'grams_between:<, tablet>': 1,
 'grams_between:<daily>': 1,
 'grams_between:<name>': 1,
 'grams_between:<daily oral>': 1,
 'grams_between:<<NUMBER> : capsule>': 1,
 'grams_between:<oral tablet>': 1,
 'grams_between:<-->'

In [112]:
feat_dicts['entities+entities_between+surface'][7]

{'text_in_anno1': 'doxazosin',
 'text_in_anno2': 'daily',
 'concat_text': 'doxazosin:daily',
 'first_entity_type:<DRUG>': 1,
 'second_entity_type:<FREQUENCY>': 1,
 'entity_types_concat': '<DRUG><=><FREQUENCY>',
 'entities_between:<DOSE>': 1,
 'entities_between:<ROUTE>': 1,
 'entities_between:<FREQUENCY>': 1,
 'entities_between:<DRUG>': 1,
 'entities_between:<INDICATION>': 1,
 'num_entities_between': 27,
 'num_sentences_overlap': 1,
 'num_tokens_between': 28,
 'grams_between:<,>': 1,
 'grams_between:<daily oral tablet>': 1,
 'grams_between:<hours prn>': 1,
 'grams_between:<by ordered>': 1,
 'grams_between:<and>': 1,
 'grams_between:<every oral>': 1,
 'grams_between:<<NUMBER> mg tablet>': 1,
 'grams_between:<, mg tablet>': 1,
 'grams_between:<-- : by>': 1,
 'grams_between:<mg tablet>': 1,
 'grams_between:<<NUMBER> days for>': 1,
 'grams_between:<OOV>': 1,
 'grams_between:<: directions>': 1,
 'grams_between:<tablet>': 1,
 'grams_between:<prn>': 1,
 'grams_between:<: by ordered>': 1,
 'gra

In [None]:
# Read in test data
test_reader = made_utils.TextAndBioCParser(VALDIR)
test_docs = test_reader.read_texts_and_xmls(num_docs=-1, include_relations=False)

In [None]:
test_relations = []
for i, (fname, doc) in enumerate(test_docs.items()):
    if i % 10 == 0:
        print('-{}: {} '.format(i, fname))
        print(len(doc.relations))
    new_relations = pair_annotations_in_doc(doc)
    
    # Add Fake relations for training
    neg_relations = set(new_relations).difference(set(doc.relations))
    doc.add_relations(neg_relations)
    if i % 10 == 0:
        print(len(doc.relations))
        
    test_relations += doc.relations
   

with open('tmp_data/val_docs_and_relations.pkl', 'wb') as f:
    pickle.dump((test_docs, test_relations), f)

In [None]:
# feature_extractor = LexicalFeatureExtractor(context_window=(2, 2),
#                             ngram_window=(1, 3), vocab=vocab, pos_vocab=pos_vocab,
#                             min_vocab_count=20, min_pos_count=20)
feat_dicts_test = {feature_set_name: [] for feature_set_name in 
              ('entities', 'entities_between', 'surface', 
               'entities+entities_between', 'entities+entities_between+surface')}
for i, r in enumerate(test_relations):
    if i % 100 == 0:
        print("{}/{}".format(i, len(test_relations)))
    doc = test_docs[r.file_name]
    # Single feature sets
    entities_feat_dict = feature_extractor.create_feature_dict(r, doc, entities=True, entities_between=False, surface=False)
    entities_between_dict = feature_extractor.create_feature_dict(r, doc, entities=False, entities_between=True, surface=False)
    surface_dict = feature_extractor.create_feature_dict(r, doc, entities=False, entities_between=False, surface=True)
    feat_dicts_test['entities'].append(entities_feat_dict)
    feat_dicts_test['entities_between'].append(entities_between_dict)
    feat_dicts_test['surface'].append(surface_dict)
    
    # Now create the combinations
    combo = {}
    combo.update(entities_feat_dict)
    combo.update(entities_between_dict)
    
    feat_dicts_test['entities+entities_between'].append(combo)
    combo2 = {}
    combo2.update(combo)
    combo2.update(surface_dict)
#     feat_dicts['entities+entities_between'].append(combo)
    feat_dicts_test['entities+entities_between+surface'].append(combo2)
#     break
    continue
    

In [43]:
relations[0]

'Magnesium oxide':'a day', Drug:Frequency, type=none

In [39]:
feat_dicts['entities+entities_between+surface'][0]

{'concat_text': 'magnesium oxide:a day',
 'entity_types_concat': '<FREQUENCY><=><DRUG>',
 'first_entity_type:<DRUG>': 1,
 'grams_after:<<NUMBER> mg>': 1,
 'grams_after:<<NUMBER>>': 1,
 'grams_after:<mg>': 1,
 'grams_before:<<NUMBER> mcg>': 1,
 'grams_before:<<NUMBER>>': 1,
 'grams_before:<mcg>': 1,
 'grams_between:<. . <NUMBER>>': 1,
 'grams_between:<. <NUMBER>>': 1,
 'grams_between:<.>': 1,
 'grams_between:<<NUMBER>>': 1,
 'num_entities_between': 0,
 'num_sentences_overlap': 3,
 'num_tokens_between': 2,
 'same_sentence': 0,
 'second_entity_type:<FREQUENCY>': 1,
 'tags_after:<cd nns>': 1,
 'tags_after:<cd>': 1,
 'tags_after:<nns>': 1,
 'tags_before:<cd vbd>': 1,
 'tags_before:<cd>': 1,
 'tags_before:<vbd>': 1,
 'tags_between:<. . cd>': 1,
 'tags_between:<. cd>': 1,
 'tags_between:<.>': 1,
 'tags_between:<cd>': 1,
 'text_in_anno1': 'magnesium oxide',
 'text_in_anno2': 'a day'}

In [40]:
test_relations[0]

'lidocaine':'anesthesia', Drug:Indication, type=none

In [41]:
feat_dicts_test['entities+entities_between+surface'][0]

{'concat_text': 'lidocaine:anesthesia',
 'entity_types_concat': '<DRUG><=><INDICATION>',
 'first_entity_type:<DRUG>': 1,
 'grams_after:<OOV>': 1,
 'grams_before:<% <NUMBER>>': 1,
 'grams_before:<%>': 1,
 'grams_before:<<NUMBER>>': 1,
 'grams_between:<OOV>': 1,
 'grams_between:<injected>': 1,
 'grams_between:<local>': 1,
 'grams_between:<then was>': 1,
 'grams_between:<then>': 1,
 'grams_between:<to>': 1,
 'grams_between:<was>': 1,
 'num_entities_between': 0,
 'num_sentences_overlap': 1,
 'num_tokens_between': 6,
 'same_sentence': 1,
 'second_entity_type:<INDICATION>': 1,
 'tags_after:<OOV>': 1,
 'tags_before:<cd nn>': 1,
 'tags_before:<cd>': 1,
 'tags_before:<nn>': 1,
 'tags_between:<jj to vb>': 1,
 'tags_between:<jj vb>': 1,
 'tags_between:<jj>': 1,
 'tags_between:<rb to vbn>': 1,
 'tags_between:<rb vbd vbn>': 1,
 'tags_between:<rb vbd>': 1,
 'tags_between:<rb vbn>': 1,
 'tags_between:<rb>': 1,
 'tags_between:<to vb vbn>': 1,
 'tags_between:<to vb>': 1,
 'tags_between:<to vbn>': 1,
 '

In [42]:
len(set([r.id for r in relations]))

37666

In [44]:
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse=True, sort=True)
# TODO: Freeze the 1000 features that we use


In [45]:
y_full = [r.type for r in relations]
y_bin = ['any' if y_ != 'none' else y_ for y_ in y_full]
y_dict = {'bin': y_bin,
    'full': y_full}
X_dict = {feature_set_name: {'bin': None, 'full': None} for feature_set_name in feat_dicts.keys()}
X_dict

{'entities': {'bin': None, 'full': None},
 'entities+entities_between': {'bin': None, 'full': None},
 'entities+entities_between+surface': {'bin': None, 'full': None},
 'entities_between': {'bin': None, 'full': None},
 'surface': {'bin': None, 'full': None}}

In [46]:
y_full_test = [r.type for r in test_relations]
y_bin_test = ['any' if y_ != 'none' else y_ for y_ in y_full_test]
y_dict_test = {'bin': y_bin_test,
    'full': y_full_test}
X_dict_test = {feature_set_name: {'bin': None, 'full': None} for feature_set_name in feat_dicts_test.keys()}
X_dict_test

{'entities': {'bin': None, 'full': None},
 'entities+entities_between': {'bin': None, 'full': None},
 'entities+entities_between+surface': {'bin': None, 'full': None},
 'entities_between': {'bin': None, 'full': None},
 'surface': {'bin': None, 'full': None}}

In [47]:
len(feat_dicts['entities+entities_between'])

37676

In [48]:
len(feat_dicts_test['entities+entities_between'])

21053

In [49]:
feature_selectors = {feature_set_name: {'bin': None, 'full': None} for feature_set_name in X_dict_test.keys()}
vectorizers = {feature_set_name: None for feature_set_name in X_dict_test.keys()}

In [50]:


for feature_set_name, features in feat_dicts.items():
    vectorizer = DictVectorizer(sparse=True, sort=True)
    print(feature_set_name)
    k = 1000
    X_vector = vectorizer.fit_transform(features)
    print(X_vector.shape)
    vectorizers[feature_set_name] = vectorizer
    
#     print(X.shape)
    try:
        binary_feature_selector = base_feature.MyFeatureSelector(vectorizer, k=k)
        X_bin = binary_feature_selector.fit_transform(X_vector, y_dict['bin'])
        
        
        full_feature_selector = base_feature.MyFeatureSelector(vectorizer, k=k)
        X_full = full_feature_selector.fit_transform(X_vector, y_dict['full']) 
        
        
    except ValueError as e: # Not enough features
#         vectorizer = DictVectorizer(sparse=True, sort=True)
#         X_vector = vectorizer.fit_transform(features)
        binary_feature_selector = base_feature.MyFeatureSelector(vectorizer, k='all')
        X_bin = binary_feature_selector.fit_transform(X_vector, y_dict['bin'])
        
        full_feature_selector = base_feature.MyFeatureSelector(vectorizer, k='all')
        X_full = full_feature_selector.fit_transform(X_vector, y_dict['full']) 
    
    print(X_bin.shape, X_full.shape)
    X_dict[feature_set_name]['bin'] = X_bin
    X_dict[feature_set_name]['full'] = X_full
    
    feature_selectors[feature_set_name]['bin'] = binary_feature_selector
    feature_selectors[feature_set_name]['full'] = full_feature_selector

entities
(37676, 26577)
(37676, 1000) (37676, 1000)
entities_between
(37676, 11)
(37676, 11) (37676, 11)
surface
(37676, 15931)
(37676, 1000) (37676, 1000)
entities+entities_between
(37676, 26588)
(37676, 1000) (37676, 1000)
entities+entities_between+surface
(37676, 42519)
(37676, 1000) (37676, 1000)


In [51]:
# Now transform the test feature dicts

for feature_set_name, features in feat_dicts_test.items():
    print(feature_set_name)

    binary_feature_selector = feature_selectors[feature_set_name]['bin']
    full_feature_selector = feature_selectors[feature_set_name]['full']
    print(len(binary_feature_selector.vectorizer.get_feature_names()))
#     break
    
    try:
        X_bin = binary_feature_selector.vectorizer.transform(features)
        print(X_bin.shape)
        X_bin = binary_feature_selector.transform(X_bin)
        X_full = full_feature_selector.vectorizer.transform(features)
        X_full = full_feature_selector.transform(X_full) 
    except ValueError as e: # Not enough features
        raise e
       
    
    
    X_dict_test[feature_set_name]['bin'] = X_bin
    X_dict_test[feature_set_name]['full'] = X_full

entities
26577
(21053, 26577)
entities_between
11
(21053, 11)
surface
15931
(21053, 15931)
entities+entities_between
26588
(21053, 26588)
entities+entities_between+surface
42519
(21053, 42519)


with open('tmp_data/X_dicts.pkl', 'wb') as f:
    pickle.dump((X_dict, X_dict_test), f)

with open('tmp_data/X_dict_test.pkl', 'wb') as f:
    pickle.dump(X_dict, f)

In [62]:
with open('tmp_data/X_dicts.pkl', 'rb') as f:
    X_dict, X_dict_test = pickle.load(f)

In [53]:
X_dict

{'entities': {'bin': <37676x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 136731 stored elements in Compressed Sparse Row format>,
  'full': <37676x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 163994 stored elements in Compressed Sparse Row format>},
 'entities+entities_between': {'bin': <37676x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 226551 stored elements in Compressed Sparse Row format>,
  'full': <37676x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 253424 stored elements in Compressed Sparse Row format>},
 'entities+entities_between+surface': {'bin': <37676x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 1661898 stored elements in Compressed Sparse Row format>,
  'full': <37676x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 1835785 stored elements in Compressed Sparse Row format>},
 'entities_between': {'bin': <37676x11 sparse matrix of type '<class 'numpy.float64'>'
  	with 89917 stored el

In [54]:
X_dict_test

{'entities': {'bin': <21053x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 75417 stored elements in Compressed Sparse Row format>,
  'full': <21053x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 89225 stored elements in Compressed Sparse Row format>},
 'entities+entities_between': {'bin': <21053x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 142576 stored elements in Compressed Sparse Row format>,
  'full': <21053x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 156041 stored elements in Compressed Sparse Row format>},
 'entities+entities_between+surface': {'bin': <21053x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 1251121 stored elements in Compressed Sparse Row format>,
  'full': <21053x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 1315580 stored elements in Compressed Sparse Row format>},
 'entities_between': {'bin': <21053x11 sparse matrix of type '<class 'numpy.float64'>'
  	with 67172 stored elem

In [55]:
# Now we can train and evaluate each set
def train_clf(X, y, cross_val=False):
    """
    Trains and validates a model using cross-validation.
    """
    clf = RandomForestClassifier(max_depth = None,
                            max_features = None,
                            min_samples_leaf = 2,
                            min_samples_split = 2,
                            n_estimators = 10,
                            n_jobs = 1)
    if cross_val:
        # Cross-validate to make sure this is going right
        pred = cross_val_predict(clf, X, y)
        print(classification_report(y, pred))
    clf.fit(X, y)
    return clf

In [56]:
clfs = {feature_set_name: {"bin": None, "full": None} for feature_set_name in feat_dicts.keys()}

In [57]:
y_pred_dict = {feature_set_name: [] for feature_set_name in feat_dicts.keys()}

In [63]:
# for feature_set_name in X_dict.keys():
for feature_set_name in ['entities+entities_between+surface',]:
    print(feature_set_name)
    # First, train and get predictions for binary classifier
    print('bin', feature_set_name)
    X_train_bin, y_train_bin = X_dict[feature_set_name]['bin'], y_dict['bin']
    X_test_bin, y_test_bin = X_dict_test[feature_set_name]['bin'], y_dict_test['bin']
    bin_clf = train_clf(X_train_bin, y_train_bin)
    
    clfs[feature_set_name]['bin'] = bin_clf
    
    
    # Now, train and get predictions full classifier
    X_train_full = X_dict[feature_set_name]['full']
    y_train_full = y_dict['full']
    
    X_test_full, y_test_full = X_dict_test[feature_set_name]['full'], y_dict_test['full']
    
    print('full', feature_set_name)
    try:
        full_clf = train_clf(X_train_full, y_train_full)
        clfs[feature_set_name]['full'] = full_clf
    except Exception as e:
        raise e
        print(X_dict[feature_set_name]['full'], y_dict['full'])
        raise e
        
        
    # Now predict on the test set
    # Now get the agreed upon scores
    from collections import Counter
    
    print("Predicting binary test: {}".format(X_test_bin.shape))
    pred_bin = bin_clf.predict(X_test_bin)
    
    print(Counter(pred_bin))
    print("Predicting binary test: {}".format(X_test_full.shape))
    pred_full = full_clf.predict(X_test_full)
    print(Counter(pred_full))
    
    
    y_pred_dict[feature_set_name+'_no_binary'] = pred_full
    y_pred_dict[feature_set_name] = [pred_full[i] if pred_bin[i] != 'none' else 'none' for i in range(len(pred_full))]
print("Finished training and predicting")

entities+entities_between+surface
bin entities+entities_between+surface


KeyboardInterrupt: 

In [65]:
no_filter_pred = clfs['entities+entities_between+surface']['full'].predict(X_dict_test['entities+entities_between+surface']['full'])
print(no_filter_pred)

['reason' 'do' 'none' ..., 'fr' 'none' 'severity_type']


In [133]:
with open('tmp_data/preds.pkl', 'wb') as f:
    pickle.dump(y_pred_dict, f)
with open('tmp_data/clfs.pkl', 'wb') as f:
    pickle.dump(clfs, f)

In [None]:
with open('tmp_data/preds.pkl', 'rb') as f:
    y_pred_dict = pickle.load(f)
with open('tmp_data/clfs.pkl', 'rb') as f:
    clfs = pickle.load(f)

In [7]:
with open('tmp_data/preds.pkl', 'rb') as f:
    y_pred_dict = pickle.load(f)

In [62]:
# Write out bioc annotations
# Remove any duplicates that somehow got in

for doc in test_docs.values():
    doc.relations = []
    existing_annos = set()
    to_add = []
    for i, anno in enumerate(doc.annotations):
        if anno.id not in existing_annos:
            to_add.append(anno)
            existing_annos.add(anno.id)
    doc.annotations = to_add

from collections import defaultdict
relations_already_seen = []

for feature_set_name in y_pred_dict.keys():
# for feature_set_name in ('entities+entities_between+surface',):
    print(feature_set_name)
    for i in range(len(y_pred_dict[feature_set_name])):
        p = y_pred_dict[feature_set_name][i]
    #     print(p); break
        r = test_relations[i]
        r.type = p
        doc = test_docs[r.file_name]
        if r.type != 'none':
            doc.relations.append(r)


    for doc in test_docs.values():
        existing_relats = set()
        to_add = []
        for i, relat in enumerate(doc.relations):
            if relat.id not in existing_relats:
                to_add.append(relat)
                existing_relats.add(relat.id)
        doc.relations = to_add

    OUTDIR = 'tmp_data/output_{}'.format(feature_set_name)
    if not os.path.exists(OUTDIR):
        os.mkdir(OUTDIR)
    for d in test_docs.values():
        d.to_bioc_xml(OUTDIR)
        
print("Done")

entities
entities_between
surface
entities+entities_between
entities+entities_between+surface
Done


## Error Analysis

In [97]:
# Read the documents back in, this time with true relations
# Read in test data
gold_test_reader = made_utils.TextAndBioCParser(VALDIR)
gold_test_docs = gold_test_reader.read_texts_and_xmls(num_docs=-1, include_relations=True)

0/176
100/176


In [98]:
gold_test_relations = []
for doc in gold_test_docs.values():
    gold_test_relations += doc.relations
len(gold_test_relations)

4206

In [102]:
len(test_relations)

21053

In [103]:
# Make sure that our predictions are using the full feature set
for i in range(len(y_pred_dict[feature_set_name])):
    p = y_pred_dict['entities+entities_between+surface'][i]
#     print(p); break
    r = test_relations[i]
    r.type = p

In [104]:
# Create a set of all anno -> anno edges in gold
gold_edges = {file_name: {} for file_name in gold_test_docs.keys()}
for relat in gold_test_relations:
     gold_edges[relat.file_name][(relat.annotation_1.start_index,
                               relat.annotation_2.end_index)] = relat

In [105]:
pred_edges = {file_name: {} for file_name in test_docs.keys()}
for relat in test_relations:
    # Exclude any 'none' relations
    if relat.type == 'none':
        continue
    pred_edges[relat.file_name][(relat.annotation_1.start_index,
                               relat.annotation_2.end_index)] = relat

### Types of errors

- **Relation-Type Errors** - We predicted a relation between the two entities but the wrong relation
- **False Negative** - There is a relation, but we missed it
- **False Positive** - There is a relation between the two, but we missed it

In [107]:
type_errors = []
false_negatives = []
for f_name, anno_edges in gold_edges.items():
    for anno_edge in anno_edges:
        # If there's not a relation between two annotations, it's a false negative
        if anno_edge not in pred_edges[f_name]:
            false_negatives.append(anno_edges[anno_edge])
        else:
            true_relat = anno_edges[anno_edge]
            pred_relat = pred_edges[f_name][anno_edge]
            if true_relat.type != pred_relat.type:
                type_errors.append((true_relat, pred_relat))
#         if anno_edges[anno_edge] 
    
# Now go through and find the false positives
false_positives = []
for f_name, anno_edges in pred_edges.items():
    for anno_edge in anno_edges:
        if anno_edge not in gold_edges[f_name]:
            false_positives.append(anno_edges[anno_edge])

In [112]:
print("Number of Type Errors: {}".format(len(type_errors)))
print("Number of False Negatives: {}".format(len(false_negatives)))
print("Number of False Positives: {}".format(len(false_positives)))

Number of Type Errors: 12
Number of False Negatives: 363
Number of False Positives: 332


In [108]:
# (Truth, Pred)
type_errors

[('chemotherapy':'shingles', Drug:ADE, type=adverse,
  'chemotherapy':'shingles', Drug:Indication, type=reason),
 ('Carafate':'mucositis', Drug:Indication, type=reason,
  'Carafate':'mucositis', Drug:ADE, type=adverse),
 ('chemo':'shingles', Drug:ADE, type=adverse,
  'chemo':'shingles', Drug:Indication, type=reason),
 ('steroids':'decreased bone density', Drug:ADE, type=adverse,
  'steroids':'decreased bone density', Drug:Indication, type=reason),
 ('steroids':'low testosterone level', Drug:ADE, type=adverse,
  'steroids':'low testosterone level', Drug:Indication, type=reason),
 ('Zofran':'vomiting', Drug:Indication, type=reason,
  'Zofran':'vomiting', Drug:ADE, type=adverse),
 ('scopolamine':'nausea', Drug:Indication, type=reason,
  'scopolamine':'nausea', Drug:ADE, type=adverse),
 ('Benadryl':'rash', Drug:Indication, type=reason,
  'Benadryl':'rash', Drug:ADE, type=adverse),
 ('steroid':'dyspnea', Drug:ADE, type=adverse,
  'steroid':'dyspnea', Drug:Indication, type=reason),
 ('steroi

In [124]:
import random
for (truth_relat, pred_relat) in random.sample(type_errors, len(type_errors)):
    doc = gold_test_docs[truth_relat.file_name]
    print(doc.file_name)
    print("Truth: {}".format(truth_relat.type))
    print("Pred: {}".format(pred_relat.type))
    print('-    ' + truth_relat.get_example_string(doc))
    print()
    

6_917
Truth: reason
Pred: adverse
-    o monitored. --- Name --- did have some <REASON><INDICATION>nausea</INDICATION> that was refractory to these <DRUG>scopolamine</DRUG></REASON>  patch and Zofran and was additionally g

10_758
Truth: adverse
Pred: reason
-    c  fever with the previous 5 cycles of <ADVERSE><DRUG>chemo</DRUG>, but did have one episode of <ADE>shingles</ADE></ADVERSE>   and has been treated on valacyclovir.

7_410
Truth: adverse
Pred: reason
-    ompression fractures. I also think the <ADVERSE><DRUG>steroid</DRUG> regimen may have increased her volume and that has resulted in <ADE>blood pressure going up</ADE></ADVERSE>  as well as increased edema. I therefor

3_405
Truth: reason
Pred: adverse
-     and drink, has occasional nausea  and <REASON><INDICATION>vomiting</INDICATION>, which was adequately controlled by <DRUG>Zofran</DRUG></REASON> . The patient has  been fever-free sin

10_197
Truth: adverse
Pred: reason
-    fever with either of the cycles of this <ADVERS

In [None]:
import random
for truth_relat in random.sample(false_negatives, 100):
    doc = gold_test_docs[truth_relat.file_name]
    print(doc.file_name)
    print("Truth: {}".format(truth_relat))
    print('-    ' + truth_relat.get_example_string(doc))
    print()
    

In [None]:
for pred_relat in random.sample(false_positives, 100):
    doc = test_docs[pred_relat.file_name]
    print(doc.file_name)
    print("Pred: {}".format(pred_relat))
    print('-    ' + pred_relat.get_example_string(doc))
    print()
    

In [None]:
test_docs['19_566'].text

In [None]:
false_negatives

In [None]:
false_positives

In [137]:
vectorizer.inverse_transform(X_dict_test['entities+entities_between+surface']['full'])[0]

{'concat_text=acyclovir:2 capsule': 1.0,
 'concat_text=air hunger:some': 1.0,
 'concat_text=antivirals:prophylactic': 1.0,
 'concat_text=aspirin:few cycles': 6.0,
 'concat_text=aspirin:in the a.m': 1.0,
 'concat_text=back pain:80%': 1.0,
 'concat_text=back pain:adverse reaction': 1.0,
 'concat_text=back pain:constipation': 1.0,
 'concat_text=bactrim:10 meq': 1.0}

In [None]:
# Instead of computing metrics this way,
# let's use their script to make sure we get the same number.

# For now, let's start with just the full feature_set
y_pred_dict

In [None]:
# For some reasons there are duplicates in the training data
# which probably messed up the cross-validation.

new_docs = {}


In [62]:
for fname, doc in new_docs.items():
    for anno in doc.get_annotations():
        print(anno.id)
        break
    break

NameError: name 'new_docs' is not defined

In [138]:
for i in range(100):
    print(y_pred_dict['entities+entities_between+surface'][i])
    print(feat_dicts_test['entities+entities_between+surface'][i])
    print(test_relations[i])
    print(test_relations[i].file_name)
    break

adverse
{'text_in_anno1': 'lidocaine', 'text_in_anno2': 'anesthesia', 'concat_text': 'lidocaine:anesthesia', 'first_entity_type:<DRUG>': 1, 'second_entity_type:<INDICATION>': 1, 'entity_types_concat': '<DRUG><=><INDICATION>', 'num_entities_between': 0, 'num_sentences_overlap': 1, 'num_tokens_between': 6, 'grams_between:<OOV>': 1, 'grams_between:<local>': 1, 'grams_between:<then was>': 1, 'grams_between:<injected>': 1, 'grams_between:<then>': 1, 'grams_between:<to>': 1, 'grams_between:<was>': 1, 'grams_before:<%>': 1, 'grams_before:<<NUMBER>>': 1, 'grams_before:<% <NUMBER>>': 1, 'grams_after:<OOV>': 1, 'tags_between:<to vb vbn>': 1, 'tags_between:<vb>': 1, 'tags_between:<vbn>': 1, 'tags_between:<rb vbd vbn>': 1, 'tags_between:<rb>': 1, 'tags_between:<rb vbd>': 1, 'tags_between:<rb vbn>': 1, 'tags_between:<jj to vb>': 1, 'tags_between:<rb to vbn>': 1, 'tags_between:<to vbn>': 1, 'tags_between:<to>': 1, 'tags_between:<to vb>': 1, 'tags_between:<jj>': 1, 'tags_between:<vbd>': 1, 'tags_betw

In [109]:
[r for r in new_docs['10_1'].relations]

['doxorubicin':'infusion', Drug:Route, type=manner/route,
 'valacyclovir':'bilateral shingles', Drug:Indication, type=reason,
 'bloating':'mild', SSLIF:Severity, type=severity_type,
 'neutropenia medication':'neutropenia', Drug:Indication, type=reason,
 'Valtrex':'500 mg', Drug:Dose, type=do,
 'Valtrex':'3 times daily', Drug:Frequency, type=fr,
 'posaconazole':'200 mg per 5 mL', Drug:Dose, type=do,
 'posaconazole':'3 times daily', Drug:Frequency, type=fr,
 'ciprofloxacin':'500 mg', Drug:Dose, type=do,
 'ciprofloxacin':'daily', Drug:Frequency, type=fr,
 'antiemetics':'p.r.n.', Drug:Frequency, type=fr,
 'uric acid':'daily', Drug:Frequency, type=fr,
 'uric acid':'prophylaxis', Drug:Indication, type=reason,
 'uric acid':'tumor \nlysis syndrome', Drug:Indication, type=reason,
 'doxorubicin':'infusion', Drug:Route, type=manner/route,
 'valacyclovir':'bilateral shingles', Drug:Indication, type=reason,
 'bloating':'mild', SSLIF:Severity, type=severity_type,
 'neutropenia medication':'neutropen

In [None]:
[a.id for a in new_docs['10_1'].annotations]

In [107]:
from sklearn.metrics import f1_score, precision_score, recall_score
def compute_metrics(y, pred, average='micro'):
    metrics = {}
    labels = set(y)
    labels.remove('none')
    labels = list(sorted(labels))
    metrics['precision'] = precision_score(y, pred, average=average, labels=labels)
    metrics['recall'] = recall_score(y, pred, average=average, labels=labels)
    metrics['f1'] = f1_score(y, pred, average=average, labels=labels)
    return metrics

In [108]:
# Now compute the metrics
metrics = {feature_set_name: None for feature_set_name in y_pred_dict.keys()}

for feature_set_name, y_pred in y_pred_dict.items():
    metrics[feature_set_name] = compute_metrics(y_dict['full'], y_pred, 'micro')
    

In [109]:
sorted(metrics.items(), key=lambda x:x[1]['f1'], reverse=True)

[('entities+entities_between+surface',
  {'f1': 0.9318378304591165,
   'precision': 0.98508826783395065,
   'recall': 0.88404921217353771}),
 ('entities+entities_between',
  {'f1': 0.9058706875918896,
   'precision': 0.95967255077153413,
   'recall': 0.85778113533347722}),
 ('surface',
  {'f1': 0.6509934013629971,
   'precision': 0.73451951503973523,
   'recall': 0.58452406647960287}),
 ('entities',
  {'f1': 0.52739580229043381,
   'precision': 0.76985540870862157,
   'recall': 0.40107921433196631}),
 ('entities_between',
  {'f1': 0.1727288185373545,
   'precision': 0.30428625891835959,
   'recall': 0.12059140945391755})]