This notebook will explore different combinations of the features that were used in submission and save the results/evaluations to compare the different scores.

In [1]:
from bioc import BioCXMLReader

In [2]:
import os, sys
import pickle

In [3]:
sys.path.append('../final_system')

In [4]:
import annotation
import base_feature
import made_utils
import random
from collections import Counter, defaultdict


from nltk import ngrams as nltk_ngrams
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict

In [5]:
DATADIR = '/Users/alec/Data/NLP_Challenge'
# ALLDIR = os.path.join(DATADIR, 'original_data')
TRAINDIR = os.path.join(DATADIR, 'MADE-1.0')
TESTDIR = os.path.join(DATADIR, 'made_test_data')
print(os.path.exists(TRAINDIR))
print(os.path.exists(TESTDIR))

True
True


In [6]:
def normalize_grams(ngram_string):
    """
    Normalizes the values in a string of joined ngrams
    """
    # Substitute numbers
    ngram_string = re.sub('[\d]+|one|two|three|four|five|six|seven|eight|nine|ten', '<NUMBER>', ngram_string)
    return ngram_string



class LexicalFeatureExtractor(base_feature.BaseFeatureExtractor):
    """
    ngram_window - the length of ngrams to include in the vocabulary.
    context_window - the number of ngrams to include before and after the entity.
    """
    def __init__(self, ngram_window=(1, 1), context_window=(2, 2),
                vocab=None, pos_vocab=None, min_vocab_count=5, min_pos_count=5):
        super().__init__()
        self.ngram_window = ngram_window
        if min(ngram_window) < 1 or max(ngram_window) > 3:
            raise NotImplementedError("Ngram Window must be between one and 3")
        self.context_window = context_window
        self.min_vocab_count = min_vocab_count
        self.min_pos_count = min_pos_count

        # Set vocab and POS vocab
        self._unfiltered_vocab = vocab # Contains unigrams-trigrams, no count threshold
        self._unfiltered_pos_vocab = pos_vocab

        self.vocab = self.create_vocab(vocab, min_vocab_count, self.ngram_window) # Only contains ngrams defined by context_window
        #print(self.vocab); exit()
        self.pos_vocab =  self.create_vocab(pos_vocab, min_pos_count, self.ngram_window)
        #self.tokens = [gram for (gram, idx) in self.vocab.items() if len(gram.split()) == 1] # Only unigrams
        self.pos = {} # Will eventually contain mapping for POS tags

        # pyConText tools
        #self.modifiers = itemData.instantiateFromCSVtoitemData("https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/lexical_kb_05042016.tsv")
        #self.targets = itemData.instantiateFromCSVtoitemData("https://raw.githubusercontent.com/abchapman93/MADE_relations/master/feature_extraction/targets.tsv?token=AUOYx9rYHO6A5fiZS3mB9e_3DP83Uws8ks5aownVwA%3D%3D")


        #self.all_features_values = self.create_base_features()



    def create_base_features(self):
        """
        Enumerates possible feature values from the vocab, as well as an OOV value.
        Any features that are binary should only get one index and are encoded as 0.
        """
        # This will be a dictionary that contains all possible values for each feature
        all_features_values = {
            'same_sentence': 0,
            'num_tokens_between': 0,
            'grams_between': ['OOV'] + list(self.vocab),
            'grams_before': ['OOV'] + list(self.vocab),
            'grams_after': ['OOV'] + list(self.vocab),
            'pos_grams_between': ['OOV'] + list(self.pos_vocab),
            #'pos_grams_before': ['OOV'] + list(self.pos_vocab),
            #'pos_grams_after': ['OOV'] + list(self.pos_vocab),
            'first_entity_type': 0,#list(ENTITY_TYPES_MAPPING.values()),
            'second_entity_type': 0,#list(ENTITY_TYPES_MAPPING.values()),

            }
        return all_features_values

    def create_feature_dict(self, relat, doc, entities=True, entities_between=True, surface=True):
        """
        Takes a RelationAnnotation and an AnnotatedDocument.
        Returns the a dictionary containing the defined lexical features.
        """

        lex_features = {}

        if entities:
            lex_features.update(self.get_entity_features(relat, doc))
        if entities_between:
            lex_features.update(self.get_entities_between_features(relat, doc))
        if surface:
            lex_features.update(self.get_surface_features(relat, doc))
        return lex_features
    
    
    def get_entity_features(self, relat, doc):
        features = {}
        
        # The full string of the entities
        anno1, anno2 = relat.get_annotations()
        features['text_in_anno1'] = anno1.text.lower()
        features['text_in_anno2'] = anno2.text.lower()
        features['concat_text'] = anno1.text.lower() + ':' + anno2.text.lower()
        
        # Features for types of the entities
        features['first_entity_type:<{}>'.format(relat.entity_types[0].upper())] = 1
        features['second_entity_type:<{}>'.format(relat.entity_types[1].upper())] = 1
        
        # Feature types for entities, left to right
        sorted_entities = sorted((relat.annotation_1, relat.annotation_2), key=lambda a: a.span[0])
        features['entity_types_concat'] = '<=>'.join(['<{}>'.format(a.type.upper()) for a in sorted_entities])
        return features
    
    
    def get_entities_between_features(self, relat, doc):
       
        features = {}
        # One binary feature for every type of entity between
        entities_between = self.get_entities_between(relat, doc)
        # TODO: Maybe change this to a count
        features.update({
            'entities_between:<{}>'.format(v.type.upper()): 1 for v in entities_between
            })
        features['num_entities_between'] = len(entities_between)

        
        return features
    
    
    
    def get_surface_features(self, relat, doc):        
        
        features = {}
        
        # Same sentence
        features['num_sentences_overlap'] = len(doc.get_sentences_overlap_span(relat.get_span()))
        # Get the number of tokens between
        # NOTE: only unigrams
        
        features['num_tokens_between'] = len(self.get_grams_between(relat, doc, ngram_window=(1, 1)))
        # Get all tokens/POS tags in between
        # Create one feature for each ngram/tag
        features.update({
            'grams_between:<{}>'.format(v): 1 for v in self.get_grams_between(relat, doc)
            })
        features.update({
            'grams_before:<{}>'.format(v): 1 for v in self.get_grams_before(relat, doc)
            })
        features.update({
            'grams_after:<{}>'.format(v): 1 for v in self.get_grams_after(relat, doc)
            })

        features.update({
            'tags_between:<{}>'.format(v): 1 for v in self.get_grams_between(relat, doc, seq='tags')
            })
        features.update({
            'tags_before:<{}>'.format(v): 1 for v in self.get_grams_before(relat, doc, seq='tags')
            })
        features.update({
            'tags_after:<{}>'.format(v): 1 for v in self.get_grams_after(relat, doc, seq='tags')
            })

        # Get features for information about entities/context between
        # Binary feature: Are they in the same sentence?
        features['same_sentence'] = doc.in_same_sentence(relat.get_span())
        return features
        

    def get_grams_between(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the N-grams between the two entities connected in relat.
        Represents it as OOV if it's not in the vocabulary.
        Returns a unique set.
        """

        if seq == 'tokens':
            vocab = self.vocab
        elif seq == 'tags':
            vocab = self.pos_vocab
        else:
            raise ValueError("Must specify seq: {}".format(seq))

        if not ngram_window:
            ngram_window = self.ngram_window

        all_grams = []
        span1, span2 = relat.spans
        # Fixed this: get the start and span of the middle, not of the entire relation
        _, start, end, _ = sorted(span1 +span2)
        tokens_in_span = doc.get_tokens_or_tags_at_span((start, end), seq)
        # NOTE: lower-casing the ngrams, come back to this if you want to encode the casing
        tokens_in_span = [token.lower() for token in tokens_in_span]
        for n in range(ngram_window[0], ngram_window[1] + 1):
            # Now sort the ngrams so that it doesn't matter what order they occur in
            grams = list(nltk_ngrams(tokens_in_span, n))
            grams = self.sort_ngrams(grams)# + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_in_span, n))]
            all_grams.extend(set(grams))
        all_grams = [self.normalize_grams(x) for x in set(all_grams)]
        all_grams = [x if x in vocab else 'OOV' for x in all_grams]
        return set(all_grams)


    def get_grams_before(self, relat,doc, seq='tokens', ngram_window=None):
        """
        Returns the n-grams before the first entity.
        """
        if seq == 'tokens':
            vocab = self.vocab
        elif seq == 'tags':
            vocab = self.pos_vocab
        if not ngram_window:
            ngram_window = self.ngram_window

        all_grams = []
        offset = relat.span[0]
        tokens_before = doc.get_tokens_or_tags_before_or_after(offset, delta=-1,
            n=self.context_window[0], seq=seq, padding=True)
        tokens_before = [token.lower() for token in tokens_before]
        for n in range(ngram_window[0], ngram_window[1] + 1):
            grams = list(nltk_ngrams(tokens_before, n))
            grams = self.sort_ngrams(grams)# + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_in_span, n))]
            all_grams.extend(set(grams))
            #grams = grams + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_before, n))]
        all_grams = [self.normalize_grams(x) for x in set(all_grams)]
        all_grams = [x if x in vocab else 'OOV' for x in all_grams]
        return set(all_grams)

    def get_grams_after(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the n-grams after the final entity.
        """
        if seq == 'tokens':
            vocab = self.vocab
        elif seq == 'tags':
            vocab = self.pos_vocab
        if not ngram_window:
            ngram_window = self.ngram_window

        all_grams = []
        offset = relat.span[1]
        tokens_after = doc.get_tokens_or_tags_before_or_after(offset, delta=1,
                                        n=self.context_window[1], seq=seq)
        tokens_after = [token.lower() for token in tokens_after]
        for n in range(ngram_window[0], ngram_window[1] + 1):
            grams = list(nltk_ngrams(tokens_after, n))
            grams = self.sort_ngrams(grams)# + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_in_span, n))]
            all_grams.extend(set(grams))
            #grams = grams + [' '.join(sorted(tup)) for tup in list(nltk_ngrams(tokens_after, n))]
        all_grams = [self.normalize_grams(x) for x in set(all_grams)]
        all_grams = [x if x in vocab else 'OOV' for x in all_grams]
        return set(all_grams)

    def sort_ngrams(self, ngrams):
        return [' '.join(sorted(tup)) for tup in ngrams]

    def normalize_grams(self, ngram_string):
        """
        Normalizes the values in a string of joined ngrams
        """
        # Substitute numbers
        return normalize_grams(ngram_string)

    def get_pos_tags(self):
        pass

    def get_entities_between(self, relat, doc):
        """
        Returns a list of entities that occur between entity1 and entity2
        """
        offset, end = relat.get_span()
        overlapping_entities = []
        # Index the entity in doc by span
        offset_to_entity = {entity.span[0]: entity for entity in doc.get_annotations()
                    if entity.id not in (
                        relat.annotation_1.id, relat.annotation_2.id)
                        }

        while offset < end:
            if offset in offset_to_entity:
                overlapping_entities.append(offset_to_entity[offset])
            offset += 1

        return overlapping_entities


    def get_sent_with_anno(self, anno, doc, entity_type):
        """
        Returns the sentence that contains a given annotation.
        Replaces the text of the annotations with a tag <ENTITY-TYPE>
        """
        tokens = []
        # Step back some window
        offset = anno.start_index

        while offset not in doc._sentences:
            offset -= 1
            if offset < 0:
                break
            if offset in doc._tokens:
                tokens.insert(0, doc._tokens[offset].lower())

        # Now add an entity
        tokens.append(entity_type)

        # Now add all the tokens between them
        offset = anno.start_index

        while offset not in doc._sentences:
            if offset > max(doc._tokens.keys()):
                break
            if offset in doc._tokens:
                tokens.append(doc._tokens[offset].lower())
            offset += 1


        return ' '.join(tokens)


    def __repr__(self):
        return "LexicalFeatureExtractor Ngram Window: {} Vocab: {} terms".format(
                self.ngram_window, len(self.vocab))

In [7]:
# # Let's remove the validation hold-out set
# import glob

# held_out = [os.path.basename(x) for x in glob.glob(os.path.join('..', 'data', 'heldout_xmls', 'corpus', '*'))]
# docs = {fname: doc for (fname, doc) in docs.items() if fname not in held_out}
# len(docs)

In [8]:
def pair_annotations_in_doc(doc, legal_edges=[], max_sent_length=3):
    """
    Takes a single AnnotatedDocument that contains annotations.
    All annotations that have a legal edge between them
    and are have an overlapping sentence length <= max_sent_length,
        ie., they are in either the same sentence or n adjancent sentences,
    are paired to create RelationAnnotations.
    Takes an optional list legal_edges that defines which edges should be allowed.

    Returns a list of new RelationAnnotations with annotation type 'none'.
    """
    if legal_edges == []:
        legal_edges = [('Drug', 'Route'),
                         ('Drug', 'Indication'),
                         ('SSLIF', 'Severity'),
                         ('Drug', 'Dose'),
                         ('Drug', 'Frequency'),
                         ('Drug', 'Duration'),
                         ('Drug', 'ADE'),
                         ('ADE', 'Severity'),
                         ('Indication', 'Severity'),
                         ('SSLIF', 'ADE')]
    true_annotations = doc.get_annotations()
    true_relations = doc.get_relations()
    generated_relations = []
    edges = defaultdict(list)
    edges = set()

    # Map all annotation_1's to annotation_2's
    # in order to identify all positive examples of relations
    # If this is testing data, it may not actually have these
    for relat in true_relations:
        anno1, anno2 = relat.get_annotations()
        edges.add((anno1.id, anno2.id))

    for anno1 in true_annotations:
        for anno2 in true_annotations:

            # Don't pair the same annotation with itself
            if anno1.id == anno2.id:
                continue

            if anno1.span == anno2.span:
                continue

            # Don't generate paris that have already been paried
            if (anno1.id, anno2.id) in edges:
                continue

            # Exclude illegal relations
            if len(legal_edges) and (anno1.type, anno2.type) not in legal_edges:
                continue

            # Check the span between them, make sure it's either 1 or 2
            start1, end1 = anno1.span
            start2, end2 = anno2.span
            sorted_spans = list(sorted([start1, end1, start2, end2]))
            span = (sorted_spans[0], sorted_spans[-1])
            overlapping_sentences = doc.get_sentences_overlap_span(span)
            if len(overlapping_sentences) > max_sent_length:
                continue

            # If they haven't already been paired, pair them
            else:
                generated_relation = annotation.RelationAnnotation.from_null_rel(
                    anno1, anno2, doc.file_name
                )
                edges.add((anno1.id, anno2.id))
                generated_relations.append(generated_relation)
    
    
    
    return list(set(generated_relations + true_relations))

def sample_negative_examples(relations, neg_prop=1.0):
    """
    Takes a list of Relationannotations and
    neg_prop, a float that specifies the proportion of negative
    to positive examples.

    In the future, a more sophisticated method of sampling might be used,
    ie., sampling by the probability of the Annotation types in the nodes.
    """
    pos_relations = []
    neg_relations = []
    for relat in relations:
        if relat.type == 'none':
            neg_relations.append(relat)
        else:
            pos_relations.append(relat)

    pos_size = len(pos_relations)
    neg_sample_size = int(neg_prop * pos_size)

    neg_sample = random.sample(neg_relations, neg_sample_size)
    print("Original Distribution: {} positive relations, {} negative relations".format(
                len(pos_relations),
                len(neg_relations)))
    print("{} positive relations, {} negative relations".format(len(pos_relations),
                len(neg_sample)))
    return pos_relations + neg_sample

In [9]:
reader = made_utils.TextAndBioCParser(TRAINDIR)
docs = reader.read_texts_and_xmls(-1) # TODO: Change to -1

0/876
100/876
200/876
300/876
400/876
500/876
600/876
700/876
800/876


In [91]:
doc = docs['12_123']

In [None]:
relations = []
for i, (fname, doc) in enumerate(docs.items()):
    if i  % 10 == 0:
        print('-{}: {} '.format(i, fname))
        print(len(doc.relations))
    new_relations = pair_annotations_in_doc(doc)
    
    # Add Fake relations for training
    neg_relations = set(new_relations).difference(set(doc.relations))
    # Sample them
    if len(neg_relations) >= 2 * len(doc.relations):
        neg_relations = random.sample(neg_relations, 2 * len(doc.relations))
    else:
        pass
        
    doc.add_relations(neg_relations)
    
    relations.extend(doc.get_relations())
    if i  % 10 == 0:
        print(len(doc.get_relations()))
   

-0: 12_123 
96
256
-10: 10_729 
71
213
-20: 3_586 
24
72
-30: 3_120 
9
11
-40: 7_293 
42
126
-50: 16_201 
14
38
-60: 7_666 
16
34
-70: 10_1 
18
54
-80: 6_656 
15
45
-90: 5_496 
9
27
-100: 6_895 
7
13
-110: 6_40 
53
144
-120: 12_943 
22
66
-130: 14_920 
21
63
-140: 1_260 
47
141
-150: 10_691 
3
7
-160: 12_433 
0
0
-170: 13_435 
10
30
-180: 6_323 
14
39
-190: 1_859 
63
189
-200: 4_144 
18
54
-210: 12_74 
3
9
-220: 3_256 
8
17
-230: 14_350 
21
63
-240: 14_966 
12
23
-250: 19_8 
7
21
-260: 1_424 
91
273
-270: 1_415 
21
49
-280: 10_1014 
24
55
-290: 12_671 
0
0
-300: 8_329 
30
90
-310: 7_797 
11
33
-320: 10_248 
9
17
-330: 3_689 
15
33
-340: 17_1036 
30
90
-350: 13_599 
1
2
-360: 19_202 
41
123
-370: 1_1041 
29
87
-380: 6_286 
0
0
-390: 10_960 
6
13
-400: 10_190 
0
0
-410: 10_367 
20
60
-420: 14_411 
1
2
-430: 7_447 
5
5
-440: 1_996 
25
75
-450: 10_162 
19
45
-460: 14_408 
10
30
-470: 6_609 
1
3
-480: 1_720 
44
132
-490: 10_518 
18
44
-500: 5_207 
6
6
-510: 8_218 
13
39
-520: 17_605 
12
36


In [9]:
import pickle
with open('../final_system/data/vocab.pkl', 'rb') as f:
    vocab, pos_vocab = pickle.load(f)

relations = []
for doc in docs.values():
    relations += doc.relations
random.shuffle(relations)
len(relations)

In [10]:
with open('tmp_data/all_training_docs_and_relations.pkl', 'rb') as f:
    docs, relations = pickle.load(f)
len(relations)

67021

In [93]:
docs.keys()

dict_keys(['12_123', '4_857', '17_839', '10_988', '13_513'])

In [94]:
relat = relations[0]
print(len([r for r in relations if r.type == 'none']))

171


In [95]:
len(relations)

283

In [97]:
print(len(train_relats))
print(len(val_relats))

226
57


In [98]:
print(len({r.id for r in train_relats}.intersection({r.id for r in val_relats})))

0


In [99]:
r = relations[7]
doc = docs[r.file_name]
print(r.span)
r

(548, 637)


'atenolol':'oral', Drug:Route, type=manner/route

In [100]:
rids = [r.id for r in relations]
print(len(rids))
print(len(set(rids)))

283
283


In [101]:
print(len(relations))
len(set(relations))

283


283

In [102]:
relations = list(set(relations))

In [11]:
rtypes = [r.type for r in relations]
from collections import Counter
c = Counter(rtypes)
c

Counter({'do': 5177,
         'fr': 4419,
         'manner/route': 2551,
         'reason': 4554,
         'none': 43856,
         'du': 906,
         'severity_type': 3476,
         'adverse': 2082})

In [111]:
def create_feature_dicts(relations, docs):
    feat_dicts = []
    for i, r in enumerate(relations):
        doc = docs[r.file_name]
        if i % 100 == 0:
            print("{}/{}".format(i, len(relations)))
        feat_dict = feature_extractor.create_feature_dict(r, doc, entities=True, entities_between=True, surface=True)
        feat_dicts.append(feat_dict)
        
        
    return feat_dicts

In [19]:
feature_extractor = LexicalFeatureExtractor(context_window=(2, 2),
                            ngram_window=(1, 3), vocab=vocab, pos_vocab=pos_vocab,
                            min_vocab_count=20, min_pos_count=20)
feat_dicts = create_feature_dicts(relations, docs)

print(len(feat_dicts))

0/67021




100/67021
200/67021




300/67021
400/67021
500/67021
600/67021
700/67021
800/67021
900/67021
1000/67021
1100/67021
1200/67021
1300/67021
1400/67021
1500/67021
1600/67021
1700/67021
1800/67021
1900/67021
2000/67021
2100/67021
2200/67021
2300/67021
2400/67021
2500/67021
2600/67021
2700/67021
2800/67021
2900/67021
3000/67021
3100/67021
3200/67021
3300/67021
3400/67021
3500/67021
3600/67021
3700/67021
3800/67021
3900/67021
4000/67021
4100/67021
4200/67021
4300/67021
4400/67021
4500/67021
4600/67021
4700/67021
4800/67021
4900/67021
5000/67021
5100/67021
5200/67021
5300/67021
5400/67021
5500/67021
5600/67021
5700/67021
5800/67021
5900/67021
6000/67021
6100/67021
6200/67021
6300/67021
6400/67021
6500/67021
6600/67021
6700/67021
6800/67021
6900/67021
7000/67021
7100/67021
7200/67021
7300/67021
7400/67021
7500/67021
7600/67021
7700/67021
7800/67021
7900/67021
8000/67021
8100/67021
8200/67021
8300/67021
8400/67021
8500/67021
8600/67021
8700/67021
8800/67021
8900/67021
9000/67021
9100/67021
9200/67021
9300/67021
9400/6

In [24]:
relations[-1].file_name

'3_703'

In [None]:
with open('tmp_data/feat_dicts.pkl', 'rb') as f:
    feat_dicts = pickle.load(f)

In [29]:
with open('tmp_data/feat_dicts', 'rb') as f:
    feat_dicts = pickle.load(f)

In [25]:
feat_dicts[7]

{'text_in_anno1': 'atenolol',
 'text_in_anno2': 'oral',
 'concat_text': 'atenolol:oral',
 'first_entity_type:<DRUG>': 1,
 'second_entity_type:<ROUTE>': 1,
 'entity_types_concat': '<DRUG><=><ROUTE>',
 'entities_between:<DOSE>': 1,
 'num_entities_between': 2,
 'num_sentences_overlap': 1,
 'num_tokens_between': 11,
 'grams_between:<-- : by>': 1,
 'grams_between:<by ordered>': 1,
 'grams_between:<tablet>': 1,
 'grams_between:<mg>': 1,
 'grams_between:<, tablet>': 1,
 'grams_between:<directions>': 1,
 'grams_between:<- -- :>': 1,
 'grams_between:<<NUMBER> mg tablet>': 1,
 'grams_between:<<NUMBER> : tablet>': 1,
 'grams_between:<: by>': 1,
 'grams_between:<- -- -->': 1,
 'grams_between:<, mg tablet>': 1,
 'grams_between:<- - -->': 1,
 'grams_between:<- : directions>': 1,
 'grams_between:<<NUMBER> :>': 1,
 'grams_between:<- directions>': 1,
 'grams_between:<by>': 1,
 'grams_between:<:>': 1,
 'grams_between:<mg tablet>': 1,
 'grams_between:<ordered>': 1,
 'grams_between:<- name>': 1,
 'grams_b

In [26]:
relations[7]

'atenolol':'oral', Drug:Route, type=manner/route

with open('tmp_data/val_docs_and_relations.pkl', 'wb') as f:
    pickle.dump((test_docs, test_relations), f)

In [27]:
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse=True, sort=True)
# TODO: Freeze the 1000 features that we use


In [28]:
y_full = [r.type for r in relations]
y_bin = ['any' if y_ != 'none' else y_ for y_ in y_full]
y_dict = {'bin': y_bin,
    'full': y_full}
X = {'bin': None, 'full': None}
X

{'bin': None, 'full': None}

In [29]:
feature_selectors = {'bin': None, 'full': None}
# vectorizers = {'bin': None, 'full': None}

In [30]:
vectorizer = DictVectorizer(sparse=True, sort=True)
k=1000
binary_feature_selector = base_feature.MyFeatureSelector(vectorizer, k=k)
full_feature_selector = base_feature.MyFeatureSelector(vectorizer, k=k)

# Fit the vectorizer and feature selectors, transform X
X_vector = vectorizer.fit_transform(feat_dicts)
print(X_vector.shape)
try:
    binary_feature_selector = base_feature.MyFeatureSelector(vectorizer, k=k)
    X_bin = binary_feature_selector.fit_transform(X_vector, y_dict['bin'])


    full_feature_selector = base_feature.MyFeatureSelector(vectorizer, k=k)
    X_full = full_feature_selector.fit_transform(X_vector, y_dict['full']) 
except ValueError as e: # Not enough features
#         vectorizer = DictVectorizer(sparse=True, sort=True)
#         X_vector = vectorizer.fit_transform(features)
    binary_feature_selector = base_feature.MyFeatureSelector(vectorizer, k='all')
    X_bin = binary_feature_selector.fit_transform(X_vector, y_dict['bin'])

    full_feature_selector = base_feature.MyFeatureSelector(vectorizer, k='all')
    X_full = full_feature_selector.fit_transform(X_vector, y_dict['full']) 
    
print(X_bin.shape, X_full.shape)


feature_selectors['bin'] = binary_feature_selector
feature_selectors['full'] = full_feature_selector

(67021, 59479)
(67021, 1000) (67021, 1000)


with open('tmp_data/X_dicts.pkl', 'wb') as f:
    pickle.dump((X_dict, X_dict_test), f)

with open('tmp_data/X_dict_test.pkl', 'wb') as f:
    pickle.dump(X_dict, f)

In [154]:
y_full

['none',
 'do',
 'none',
 'severity_type',
 'none',
 'du',
 'manner/route',
 'none',
 'fr',
 'do',
 'none',
 'du',
 'manner/route',
 'do',
 'fr',
 'none',
 'none',
 'none',
 'reason',
 'fr',
 'do',
 'manner/route',
 'du',
 'du',
 'manner/route',
 'fr',
 'do',
 'reason',
 'none',
 'none',
 'do',
 'do',
 'none',
 'do',
 'fr',
 'reason',
 'fr',
 'du',
 'fr',
 'fr',
 'none',
 'manner/route',
 'severity_type',
 'manner/route',
 'none',
 'fr',
 'do',
 'do',
 'none',
 'do',
 'do',
 'none',
 'reason',
 'none',
 'reason',
 'severity_type',
 'do',
 'none',
 'manner/route',
 'none',
 'do',
 'fr',
 'reason',
 'none',
 'do',
 'du',
 'reason',
 'none',
 'none',
 'reason',
 'adverse',
 'none',
 'none',
 'none',
 'none',
 'du',
 'reason',
 'reason',
 'manner/route',
 'none',
 'none',
 'reason',
 'severity_type',
 'none',
 'reason',
 'do',
 'do',
 'none',
 'reason',
 'none',
 'none',
 'du',
 'do',
 'none',
 'du',
 'reason',
 'do',
 'severity_type',
 'fr',
 'du',
 'du',
 'manner/route',
 'none',
 'none'

In [46]:
# Now we can train and evaluate each set
def train_clf(X, y, cross_val=False):
    """
    Trains and validates a model using cross-validation.
    """
    clf = RandomForestClassifier(max_depth = None,
                            max_features = None,
                            min_samples_leaf = 2,
                            min_samples_split = 2,
                            n_estimators = 10,
                            n_jobs = 1)
    print(X.shape)
    if cross_val:
        # Cross-validate to make sure this is going right
        pred = cross_val_predict(clf, X, y, verbose=1)
        print(classification_report(y, pred))
#     clf.fit(X, y)
    else:
        clf.fit(X, y)
    return clf

In [34]:
# Binary
train_clf(X_bin, y_bin, cross_val=True)

# Full
train_clf(X_full, y_full, cross_val=True)

(67021, 1000)
             precision    recall  f1-score   support

        any       0.95      0.94      0.94     23165
       none       0.97      0.97      0.97     43856

avg / total       0.96      0.96      0.96     67021



[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
# Save clfs, vectorizers, feature_selectors
with open('tmp_data/training_data.pkl', 'wb') as f:
    pickle.dump(((X_bin, y_bin), (X_full, y_full)), f)

In [39]:
with open('tmp_data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
    
with open('tmp_data/feature_selectors.pkl', 'wb') as f:
    pickle.dump((binary_feature_selector, full_feature_selector), f)

In [43]:
acc = {}
for i, r in enumerate(relations):
    if r.id in acc:
        print(r.id)
        print(acc[r.id])
        print(r)
        print()
    else:
        acc[r.id] = r

7038
'BCNU':'infusion', Drug:Route, type=manner/route
'chemotherapy':'infusion', Drug:Route, type=none

840
'chemotherapy':'bilateral shingles', Drug:Indication, type=none
'Ativan':'nausea', Drug:Indication, type=reason

1951
'posaconazole':'neutropenia', Drug:Indication, type=none
'ACETAMINOPHEN':'325MG', Drug:Dose, type=do

4546
'white count of about 27,800, which decreased':'mild', SSLIF:Severity, type=none
'Folic acid':'daily', Drug:Frequency, type=fr

2220
'ciprofloxacin':'200 mg per 5 mL', Drug:Dose, type=none
'magnesium oxide':'daily', Drug:Frequency, type=fr

2221
'ciprofloxacin':'3 times daily', Drug:Frequency, type=none
'Remeron':'15 mg', Drug:Dose, type=do

1918
'posaconazole':'3 times daily', Drug:Frequency, type=none
'gammaglobulin':'intravenous', Drug:Route, type=manner/route

2251
'ciprofloxacin':'neutropenia', Drug:Indication, type=none
'decrease \nsensation to LLE':'mild', SSLIF:Severity, type=severity_type

1620
'Valtrex':'200 mg per 5 mL', Drug:Dose, type=none
'OXYCO

In [47]:
# Now let's train on all of the data
# Binary
clf_bin = train_clf(X_bin, y_bin, cross_val=False)

# Full
clf_full = train_clf(X_full, y_full, cross_val=False)

(67021, 1000)
(67021, 1000)


In [48]:
clf_bin

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
# Read in the test data
test_reader = made_utils.TextAndBioCParser(TESTDIR)
test_docs = test_reader.read_texts_and_xmls(-1, include_relations=False) # TODO: Change to -1

0/213
100/213
200/213


In [51]:
with open('tmp_data/test_docs.pkl', 'wb') as f:
    pickle.dump(test_docs, f)

In [57]:
test_relations = []
for i, (fname, doc) in enumerate(test_docs.items()):
    if i  % 10 == 0:
        print('-{}: {} '.format(i, fname))
        print(len(doc.relations))
    doc.relations = []
    possible_relations = pair_annotations_in_doc(doc)
    
        
    doc.add_relations(possible_relations)
    
    test_relations.extend(doc.get_relations())
    if i  % 10 == 0:
        print(len(doc.get_relations()))
   

-0: 18_821 
347
347
-10: 3_922 
8
8
-20: 21_952 
44
44
-30: 18_999 
412
412
-40: 20_888 
12
12
-50: 18_313 
0
0
-60: 14_1072 
206
206
-70: 11_1024 
0
0
-80: 18_454 
324
324
-90: 18_693 
3
3
-100: 20_935 
0
0
-110: 11_587 
119
119
-120: 1_1069 
349
349
-130: 18_1064 
213
213
-140: 13_1085 
13
13
-150: 18_926 
519
519
-160: 11_243 
3
3
-170: 4_1082 
28
28
-180: 18_698 
324
324
-190: 20_775 
36
36
-200: 20_975 
34
34
-210: 11_534 
13
13


In [53]:
with open('tmp_data/test_docs_and_relations.pkl', 'wb') as f:
    pickle.dump((test_relations, test_docs), f)

In [59]:
# Create feature_dicts
test_feat_dicts = create_feature_dicts(test_relations, test_docs)

0/23266




100/23266
200/23266
300/23266
400/23266
500/23266
600/23266
700/23266
800/23266
900/23266
1000/23266
1100/23266
1200/23266
1300/23266
1400/23266
1500/23266
1600/23266
1700/23266
1800/23266
1900/23266
2000/23266
2100/23266
2200/23266
2300/23266
2400/23266
2500/23266
2600/23266
2700/23266
2800/23266
2900/23266
3000/23266
3100/23266
3200/23266
3300/23266
3400/23266
3500/23266
3600/23266
3700/23266
3800/23266
3900/23266
4000/23266
4100/23266
4200/23266
4300/23266
4400/23266
4500/23266
4600/23266
4700/23266
4800/23266
4900/23266
5000/23266
5100/23266
5200/23266
5300/23266
5400/23266
5500/23266
5600/23266
5700/23266
5800/23266
5900/23266
6000/23266
6100/23266
6200/23266
6300/23266
6400/23266
6500/23266
6600/23266
6700/23266
6800/23266
6900/23266
7000/23266
7100/23266
7200/23266
7300/23266
7400/23266
7500/23266
7600/23266
7700/23266
7800/23266
7900/23266
8000/23266
8100/23266
8200/23266
8300/23266
8400/23266
8500/23266
8600/23266
8700/23266
8800/23266
8900/23266
9000/23266
9100/23266
9200/232

In [63]:
r = test_relations[-1]
feat_dict = test_feat_dicts[-1]

In [61]:
r

'DIPHENHYDRAMINE HCL':'DAY OF CHEMOTHERAPY', Drug:Frequency, type=none

In [62]:
feat_dict

{'text_in_anno1': 'diphenhydramine hcl',
 'text_in_anno2': 'day of chemotherapy',
 'concat_text': 'diphenhydramine hcl:day of chemotherapy',
 'first_entity_type:<DRUG>': 1,
 'second_entity_type:<FREQUENCY>': 1,
 'entity_types_concat': '<FREQUENCY><=><DRUG>',
 'entities_between:<DRUG>': 1,
 'entities_between:<DOSE>': 1,
 'entities_between:<FREQUENCY>': 1,
 'entities_between:<ROUTE>': 1,
 'entities_between:<INDICATION>': 1,
 'num_entities_between': 17,
 'num_sentences_overlap': 3,
 'num_tokens_between': 53,
 'grams_between:<to>': 1,
 'grams_between:<: medication>': 1,
 'grams_between:<# mr>': 1,
 'grams_between:<: patient>': 1,
 'grams_between:<- -- patient>': 1,
 'grams_between:<every>': 1,
 'grams_between:<ud>': 1,
 'grams_between:<hospital medicine>': 1,
 'grams_between:<morning>': 1,
 'grams_between:<-<NUMBER>>': 1,
 'grams_between:<<NUMBER> mg po>': 1,
 'grams_between:<-- medical_record_number>': 1,
 'grams_between:<#>': 1,
 'grams_between:<-- :>': 1,
 'grams_between:<,>': 1,
 'gram

In [64]:
X_test = vectorizer.transform(test_feat_dicts)
X_test_bin = binary_feature_selector.transform(X_test)
X_test_full = full_feature_selector.transform(X_test)
print(X_test_bin.shape)
print(X_test_full.shape)



y_pred_bin = clf_bin.predict(X_test_bin)
y_pred_full = clf_full.predict(X_test_full)

(23266, 1000)
(23266, 1000)


In [71]:
y_pred_bin

array(['none', 'none', 'none', ..., 'any', 'none', 'none'], dtype='<U4')

In [72]:
y_pred_full

array(['none', 'none', 'none', ..., 'do', 'none', 'none'], dtype='<U13')

In [73]:
with open('tmp_data/preds.pkl', 'wb') as f:
    pickle.dump((y_pred_bin, y_pred_full), f)

In [74]:
y_pred_test = [y_pred_full[i] if y_pred_bin[i] != 'none' else 'none' for  i in range(len(y_pred_full))]
y_pred_test

['none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'do',
 'manner/route',
 'fr',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'reason',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'reason',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'reason',
 'none',
 'none',
 'none',
 'none',
 'none',
 'fr',
 'none',
 'manner/route',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'manner/route',
 'fr',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none

In [78]:
def to_bioc_xml(doc, outdir):
    outpath = os.path.join(outdir, doc.file_name + '.bioc.xml')
    writer = bioc.BioCXMLWriter()
    writer.collection = bioc.BioCCollection()
    
    collection = writer.collection
    document = bioc.BioCDocument()
    document.id = doc.file_name

    passage = bioc.BioCPassage()
    passage.offset = '0'
    document.add_passage(passage)
    collection.add_document(document)

    # Add annotations that already have bioc annotations
    for anno in doc.get_annotations():
        passage.add_annotation(anno.bioc_anno)

    for relat in doc.get_relations():
        # Create new BioCRelation
        relation = bioc.bioc_relation.BioCRelation()
        relation.id = relat.id
        relation.put_infon('type', relat.type)

        # Reference that nodes that contain the annotations
        node1 = bioc.bioc_node.BioCNode()
        node1.role = 'annotation 1'
        node1.refid = relat.annotation_1.id
        relation.add_node(node1)

        node2 = bioc.bioc_node.BioCNode()
        node2.role = 'annotation 2'
        node2.refid = relat.annotation_2.id
        relation.add_node(node2)

        passage.add_relation(relation)

    writer.write(outpath)

In [80]:
acc ={}
for relation in test_relations:
    if relation.id in acc:
        print(acc[relation.id])
        print(relation)
    else:
        acc[relation.id] = relation

In [84]:
acc['114309114310'].file_name

'18_1076'

In [85]:
# Write out bioc annotations
# Remove any duplicates that somehow got in

for doc in test_docs.values():
    doc.relations = []
    existing_annos = set()
    to_add = []
    for i, anno in enumerate(doc.annotations):
        if anno.id not in existing_annos:
            to_add.append(anno)
            existing_annos.add(anno.id)
    doc.annotations = to_add

from collections import defaultdict
relations_already_seen = []

for i in range(len(y_pred_test)):
    p = y_pred_test[i]
#     print(p); break
    r = test_relations[i]
    r.type = p
    doc = test_docs[r.file_name]
    if r.type != 'none':
        doc.relations.append(r)


for doc in test_docs.values():
    existing_relats = set()
    to_add = []
    for i, relat in enumerate(doc.relations):
        if relat.id not in existing_relats:
            to_add.append(relat)
            existing_relats.add(relat.id)
    doc.relations = to_add

OUTDIR = 'tmp_data/output_{}'.format('test_set_task_one')
if not os.path.exists(OUTDIR):
    os.mkdir(OUTDIR)
for d in test_docs.values():
    to_bioc_xml(d, OUTDIR)
        
print("Done")

Done


In [77]:
import bioc
bioc.BioCXMLWriter

## Task 3

In [123]:
# Now do the same with Kelly's output
# Read in the test data
KELLYDIR = '/Users/alec/Data/NLP_Challenge/task1_test_set_predictions'
task_3_reader = made_utils.TextAndBioCParser(KELLYDIR)
task_3_docs = task_3_reader.read_texts_and_xmls(-1, include_relations=False) # TODO: Change to -1

0/213
100/213
200/213


In [126]:
doc = task_3_docs['1_1069']
doc.annotations[0]

[Lymphoplasmacytoid lymphoma involving bone marrow and spleen], 397:457, type=[SSLIF]

In [134]:
def create_feature_dicts(relations, docs):
    feat_dicts = []
    for i, r in enumerate(relations):
        doc = docs[r.file_name]
        if i % 100 == 0:
            print("{}/{}".format(i, len(relations)))
        feat_dict = feature_extractor.create_feature_dict(r, doc, entities=True, entities_between=True, surface=True)
        feat_dicts.append(feat_dict)
#         print(r, feat_dict)
        
        
        
    return feat_dicts
task_3_feat_dicts = create_feature_dicts(task_3_relations, task_3_docs)

0/18637




100/18637
200/18637
300/18637
400/18637
500/18637
600/18637
700/18637
800/18637
900/18637
1000/18637
1100/18637
1200/18637
1300/18637
1400/18637
1500/18637
1600/18637
1700/18637
1800/18637
1900/18637
2000/18637
2100/18637
2200/18637
2300/18637
2400/18637
2500/18637
2600/18637
2700/18637
2800/18637
2900/18637
3000/18637
3100/18637
3200/18637
3300/18637
3400/18637
3500/18637
3600/18637
3700/18637
3800/18637
3900/18637
4000/18637
4100/18637
4200/18637
4300/18637
4400/18637
4500/18637
4600/18637
4700/18637
4800/18637
4900/18637
5000/18637
5100/18637
5200/18637
5300/18637
5400/18637
5500/18637
5600/18637
5700/18637
5800/18637
5900/18637
6000/18637
6100/18637
6200/18637
6300/18637
6400/18637
6500/18637
6600/18637
6700/18637
6800/18637
6900/18637
7000/18637
7100/18637
7200/18637
7300/18637
7400/18637
7500/18637
7600/18637
7700/18637
7800/18637
7900/18637
8000/18637
8100/18637
8200/18637
8300/18637
8400/18637
8500/18637
8600/18637
8700/18637
8800/18637
8900/18637
9000/18637
9100/18637
9200/186

In [135]:
task_3_relations[0]

'bleomycin':'TID', Drug:Frequency, type=none

In [136]:
task_3_feat_dicts[0]

{'text_in_anno1': 'bleomycin',
 'text_in_anno2': 'tid',
 'concat_text': 'bleomycin:tid',
 'first_entity_type:<DRUG>': 1,
 'second_entity_type:<FREQUENCY>': 1,
 'entity_types_concat': '<DRUG><=><FREQUENCY>',
 'entities_between:<ADE>': 1,
 'entities_between:<DRUG>': 1,
 'entities_between:<DOSE>': 1,
 'entities_between:<ROUTE>': 1,
 'num_entities_between': 4,
 'num_sentences_overlap': 3,
 'num_tokens_between': 35,
 'grams_between:<: diet regular>': 1,
 'grams_between:<- .>': 1,
 'grams_between:<with>': 1,
 'grams_between:<home>': 1,
 'grams_between:<of>': 1,
 'grams_between:<meds>': 1,
 'grams_between:<mg>': 1,
 'grams_between:<- date>': 1,
 'grams_between:<<NUMBER> mg oral>': 1,
 'grams_between:<date>': 1,
 'grams_between:<right upper>': 1,
 'grams_between:<history surgical>': 1,
 'grams_between:<: diet>': 1,
 'grams_between:<on>': 1,
 'grams_between:<past>': 1,
 'grams_between:<no with>': 1,
 'grams_between:<reconciliation>': 1,
 'grams_between:<history>': 1,
 'grams_between:<for>': 1,


In [137]:
X_test = vectorizer.transform(task_3_feat_dicts)
X_test_bin = binary_feature_selector.transform(X_test)
X_test_full = full_feature_selector.transform(X_test)
print(X_test_bin.shape)
print(X_test_full.shape)



y_pred_bin = clf_bin.predict(X_test_bin)
y_pred_full = clf_full.predict(X_test_full)

(18637, 1000)
(18637, 1000)


In [138]:
y_pred_test = [y_pred_full[i] if y_pred_bin[i] != 'none' else 'none' for  i in range(len(y_pred_full))]
y_pred_test

['none',
 'fr',
 'none',
 'none',
 'du',
 'none',
 'none',
 'none',
 'do',
 'manner/route',
 'do',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'adverse',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'manner/route',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'fr',
 'none',
 'none',
 'do',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'manner/route',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'manner/route',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'reason',
 'none',
 'do',
 'none',
 'none',
 'fr',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'none',
 'do',
 'manner/route',
 

In [139]:
# Write out bioc annotations
# Remove any duplicates that somehow got in

for doc in task_3_docs.values():
#     test_doc = test_docs[doc.file_name]
#     doc.annotations = test_doc.annotations
    doc.relations = []
    existing_annos = set()
    to_add = []
    for i, anno in enumerate(doc.annotations):
#         print(anno)
        if anno.id not in existing_annos:
            to_add.append(anno)
            existing_annos.add(anno.id)
    doc.annotations = to_add

from collections import defaultdict
relations_already_seen = []

for i in range(len(y_pred_test)):
    p = y_pred_test[i]
#     print(p); break
    r = task_3_relations[i]
    r.type = p
    doc = task_3_docs[r.file_name]
    if r.type != 'none':
        doc.relations.append(r)


for doc in task_3_docs.values():
    existing_relats = set()
    to_add = []
    for i, relat in enumerate(doc.relations):
        if relat.id not in existing_relats:
            to_add.append(relat)
            existing_relats.add(relat.id)
    doc.relations = to_add
#     doc.annotations = []

OUTDIR = 'tmp_data/output_{}'.format('test_set_task_three')
if not os.path.exists(OUTDIR):
    os.mkdir(OUTDIR)
for d in task_3_docs.values():
    to_bioc_xml(d, OUTDIR)
        
print("Done")

Done


In [140]:
task_3_feat_dicts[-1]

{'text_in_anno1': 'chemotherapy',
 'text_in_anno2': 'tablet                8 mg',
 'concat_text': 'chemotherapy:tablet                8 mg',
 'first_entity_type:<DRUG>': 1,
 'second_entity_type:<DOSE>': 1,
 'entity_types_concat': '<DRUG><=><DOSE>',
 'entities_between:<DRUG>': 1,
 'entities_between:<DOSE>': 1,
 'entities_between:<ROUTE>': 1,
 'entities_between:<FREQUENCY>': 1,
 'entities_between:<INDICATION>': 1,
 'num_entities_between': 28,
 'num_sentences_overlap': 2,
 'num_tokens_between': 54,
 'grams_between:<to>': 1,
 'grams_between:<: medication>': 1,
 'grams_between:<# mr>': 1,
 'grams_between:<: patient>': 1,
 'grams_between:<as hours needed>': 1,
 'grams_between:<- -- patient>': 1,
 'grams_between:<every>': 1,
 'grams_between:<ud>': 1,
 'grams_between:<hospital medicine>': 1,
 'grams_between:<-<NUMBER>>': 1,
 'grams_between:<<NUMBER> mg po>': 1,
 'grams_between:<sustained>': 1,
 'grams_between:<-- medical_record_number>': 1,
 'grams_between:<#>': 1,
 'grams_between:<-- :>': 1,


In [141]:
task_3_relations[-1]

'CHEMOTHERAPY':'TABLET                8 MG', Drug:Dose, type=none

# Old

In [63]:
# for feature_set_name in X_dict.keys():
for feature_set_name in ['entities+entities_between+surface',]:
    print(feature_set_name)
    # First, train and get predictions for binary classifier
    print('bin', feature_set_name)
    X_train_bin, y_train_bin = X_dict[feature_set_name]['bin'], y_dict['bin']
    X_test_bin, y_test_bin = X_dict_test[feature_set_name]['bin'], y_dict_test['bin']
    bin_clf = train_clf(X_train_bin, y_train_bin)
    
    clfs[feature_set_name]['bin'] = bin_clf
    
    
    # Now, train and get predictions full classifier
    X_train_full = X_dict[feature_set_name]['full']
    y_train_full = y_dict['full']
    
    X_test_full, y_test_full = X_dict_test[feature_set_name]['full'], y_dict_test['full']
    
    print('full', feature_set_name)
    try:
        full_clf = train_clf(X_train_full, y_train_full)
        clfs[feature_set_name]['full'] = full_clf
    except Exception as e:
        raise e
        print(X_dict[feature_set_name]['full'], y_dict['full'])
        raise e
        
        
    # Now predict on the test set
    # Now get the agreed upon scores
    from collections import Counter
    
    print("Predicting binary test: {}".format(X_test_bin.shape))
    pred_bin = bin_clf.predict(X_test_bin)
    
    print(Counter(pred_bin))
    print("Predicting binary test: {}".format(X_test_full.shape))
    pred_full = full_clf.predict(X_test_full)
    print(Counter(pred_full))
    
    
    y_pred_dict[feature_set_name+'_no_binary'] = pred_full
    y_pred_dict[feature_set_name] = [pred_full[i] if pred_bin[i] != 'none' else 'none' for i in range(len(pred_full))]
print("Finished training and predicting")

entities+entities_between+surface
bin entities+entities_between+surface


KeyboardInterrupt: 

In [65]:
no_filter_pred = clfs['entities+entities_between+surface']['full'].predict(X_dict_test['entities+entities_between+surface']['full'])
print(no_filter_pred)

['reason' 'do' 'none' ..., 'fr' 'none' 'severity_type']


In [133]:
with open('tmp_data/preds.pkl', 'wb') as f:
    pickle.dump(y_pred_dict, f)
with open('tmp_data/clfs.pkl', 'wb') as f:
    pickle.dump(clfs, f)

In [None]:
with open('tmp_data/preds.pkl', 'rb') as f:
    y_pred_dict = pickle.load(f)
with open('tmp_data/clfs.pkl', 'rb') as f:
    clfs = pickle.load(f)

In [7]:
with open('tmp_data/preds.pkl', 'rb') as f:
    y_pred_dict = pickle.load(f)

In [62]:
# Write out bioc annotations
# Remove any duplicates that somehow got in

for doc in test_docs.values():
    doc.relations = []
    existing_annos = set()
    to_add = []
    for i, anno in enumerate(doc.annotations):
        if anno.id not in existing_annos:
            to_add.append(anno)
            existing_annos.add(anno.id)
    doc.annotations = to_add

from collections import defaultdict
relations_already_seen = []

for feature_set_name in y_pred_dict.keys():
# for feature_set_name in ('entities+entities_between+surface',):
    print(feature_set_name)
    for i in range(len(y_pred_dict[feature_set_name])):
        p = y_pred_dict[feature_set_name][i]
    #     print(p); break
        r = test_relations[i]
        r.type = p
        doc = test_docs[r.file_name]
        if r.type != 'none':
            doc.relations.append(r)


    for doc in test_docs.values():
        existing_relats = set()
        to_add = []
        for i, relat in enumerate(doc.relations):
            if relat.id not in existing_relats:
                to_add.append(relat)
                existing_relats.add(relat.id)
        doc.relations = to_add

    OUTDIR = 'tmp_data/output_{}'.format(feature_set_name)
    if not os.path.exists(OUTDIR):
        os.mkdir(OUTDIR)
    for d in test_docs.values():
        d.to_bioc_xml(OUTDIR)
        
print("Done")

entities
entities_between
surface
entities+entities_between
entities+entities_between+surface
Done


## Error Analysis

In [97]:
# Read the documents back in, this time with true relations
# Read in test data
gold_test_reader = made_utils.TextAndBioCParser(VALDIR)
gold_test_docs = gold_test_reader.read_texts_and_xmls(num_docs=-1, include_relations=True)

0/176
100/176


In [98]:
gold_test_relations = []
for doc in gold_test_docs.values():
    gold_test_relations += doc.relations
len(gold_test_relations)

4206

In [102]:
len(test_relations)

21053

In [103]:
# Make sure that our predictions are using the full feature set
for i in range(len(y_pred_dict[feature_set_name])):
    p = y_pred_dict['entities+entities_between+surface'][i]
#     print(p); break
    r = test_relations[i]
    r.type = p

In [104]:
# Create a set of all anno -> anno edges in gold
gold_edges = {file_name: {} for file_name in gold_test_docs.keys()}
for relat in gold_test_relations:
     gold_edges[relat.file_name][(relat.annotation_1.start_index,
                               relat.annotation_2.end_index)] = relat

In [105]:
pred_edges = {file_name: {} for file_name in test_docs.keys()}
for relat in test_relations:
    # Exclude any 'none' relations
    if relat.type == 'none':
        continue
    pred_edges[relat.file_name][(relat.annotation_1.start_index,
                               relat.annotation_2.end_index)] = relat

### Types of errors

- **Relation-Type Errors** - We predicted a relation between the two entities but the wrong relation
- **False Negative** - There is a relation, but we missed it
- **False Positive** - There is a relation between the two, but we missed it

In [107]:
type_errors = []
false_negatives = []
for f_name, anno_edges in gold_edges.items():
    for anno_edge in anno_edges:
        # If there's not a relation between two annotations, it's a false negative
        if anno_edge not in pred_edges[f_name]:
            false_negatives.append(anno_edges[anno_edge])
        else:
            true_relat = anno_edges[anno_edge]
            pred_relat = pred_edges[f_name][anno_edge]
            if true_relat.type != pred_relat.type:
                type_errors.append((true_relat, pred_relat))
#         if anno_edges[anno_edge] 
    
# Now go through and find the false positives
false_positives = []
for f_name, anno_edges in pred_edges.items():
    for anno_edge in anno_edges:
        if anno_edge not in gold_edges[f_name]:
            false_positives.append(anno_edges[anno_edge])

In [112]:
print("Number of Type Errors: {}".format(len(type_errors)))
print("Number of False Negatives: {}".format(len(false_negatives)))
print("Number of False Positives: {}".format(len(false_positives)))

Number of Type Errors: 12
Number of False Negatives: 363
Number of False Positives: 332


In [108]:
# (Truth, Pred)
type_errors

[('chemotherapy':'shingles', Drug:ADE, type=adverse,
  'chemotherapy':'shingles', Drug:Indication, type=reason),
 ('Carafate':'mucositis', Drug:Indication, type=reason,
  'Carafate':'mucositis', Drug:ADE, type=adverse),
 ('chemo':'shingles', Drug:ADE, type=adverse,
  'chemo':'shingles', Drug:Indication, type=reason),
 ('steroids':'decreased bone density', Drug:ADE, type=adverse,
  'steroids':'decreased bone density', Drug:Indication, type=reason),
 ('steroids':'low testosterone level', Drug:ADE, type=adverse,
  'steroids':'low testosterone level', Drug:Indication, type=reason),
 ('Zofran':'vomiting', Drug:Indication, type=reason,
  'Zofran':'vomiting', Drug:ADE, type=adverse),
 ('scopolamine':'nausea', Drug:Indication, type=reason,
  'scopolamine':'nausea', Drug:ADE, type=adverse),
 ('Benadryl':'rash', Drug:Indication, type=reason,
  'Benadryl':'rash', Drug:ADE, type=adverse),
 ('steroid':'dyspnea', Drug:ADE, type=adverse,
  'steroid':'dyspnea', Drug:Indication, type=reason),
 ('steroi

In [124]:
import random
for (truth_relat, pred_relat) in random.sample(type_errors, len(type_errors)):
    doc = gold_test_docs[truth_relat.file_name]
    print(doc.file_name)
    print("Truth: {}".format(truth_relat.type))
    print("Pred: {}".format(pred_relat.type))
    print('-    ' + truth_relat.get_example_string(doc))
    print()
    

6_917
Truth: reason
Pred: adverse
-    o monitored. --- Name --- did have some <REASON><INDICATION>nausea</INDICATION> that was refractory to these <DRUG>scopolamine</DRUG></REASON>  patch and Zofran and was additionally g

10_758
Truth: adverse
Pred: reason
-    c  fever with the previous 5 cycles of <ADVERSE><DRUG>chemo</DRUG>, but did have one episode of <ADE>shingles</ADE></ADVERSE>   and has been treated on valacyclovir.

7_410
Truth: adverse
Pred: reason
-    ompression fractures. I also think the <ADVERSE><DRUG>steroid</DRUG> regimen may have increased her volume and that has resulted in <ADE>blood pressure going up</ADE></ADVERSE>  as well as increased edema. I therefor

3_405
Truth: reason
Pred: adverse
-     and drink, has occasional nausea  and <REASON><INDICATION>vomiting</INDICATION>, which was adequately controlled by <DRUG>Zofran</DRUG></REASON> . The patient has  been fever-free sin

10_197
Truth: adverse
Pred: reason
-    fever with either of the cycles of this <ADVERS

In [None]:
import random
for truth_relat in random.sample(false_negatives, 100):
    doc = gold_test_docs[truth_relat.file_name]
    print(doc.file_name)
    print("Truth: {}".format(truth_relat))
    print('-    ' + truth_relat.get_example_string(doc))
    print()
    

In [None]:
for pred_relat in random.sample(false_positives, 100):
    doc = test_docs[pred_relat.file_name]
    print(doc.file_name)
    print("Pred: {}".format(pred_relat))
    print('-    ' + pred_relat.get_example_string(doc))
    print()
    

In [None]:
test_docs['19_566'].text

In [None]:
false_negatives

In [None]:
false_positives

In [137]:
vectorizer.inverse_transform(X_dict_test['entities+entities_between+surface']['full'])[0]

{'concat_text=acyclovir:2 capsule': 1.0,
 'concat_text=air hunger:some': 1.0,
 'concat_text=antivirals:prophylactic': 1.0,
 'concat_text=aspirin:few cycles': 6.0,
 'concat_text=aspirin:in the a.m': 1.0,
 'concat_text=back pain:80%': 1.0,
 'concat_text=back pain:adverse reaction': 1.0,
 'concat_text=back pain:constipation': 1.0,
 'concat_text=bactrim:10 meq': 1.0}

In [None]:
# Instead of computing metrics this way,
# let's use their script to make sure we get the same number.

# For now, let's start with just the full feature_set
y_pred_dict

In [None]:
# For some reasons there are duplicates in the training data
# which probably messed up the cross-validation.

new_docs = {}


In [62]:
for fname, doc in new_docs.items():
    for anno in doc.get_annotations():
        print(anno.id)
        break
    break

NameError: name 'new_docs' is not defined

In [138]:
for i in range(100):
    print(y_pred_dict['entities+entities_between+surface'][i])
    print(feat_dicts_test['entities+entities_between+surface'][i])
    print(test_relations[i])
    print(test_relations[i].file_name)
    break

adverse
{'text_in_anno1': 'lidocaine', 'text_in_anno2': 'anesthesia', 'concat_text': 'lidocaine:anesthesia', 'first_entity_type:<DRUG>': 1, 'second_entity_type:<INDICATION>': 1, 'entity_types_concat': '<DRUG><=><INDICATION>', 'num_entities_between': 0, 'num_sentences_overlap': 1, 'num_tokens_between': 6, 'grams_between:<OOV>': 1, 'grams_between:<local>': 1, 'grams_between:<then was>': 1, 'grams_between:<injected>': 1, 'grams_between:<then>': 1, 'grams_between:<to>': 1, 'grams_between:<was>': 1, 'grams_before:<%>': 1, 'grams_before:<<NUMBER>>': 1, 'grams_before:<% <NUMBER>>': 1, 'grams_after:<OOV>': 1, 'tags_between:<to vb vbn>': 1, 'tags_between:<vb>': 1, 'tags_between:<vbn>': 1, 'tags_between:<rb vbd vbn>': 1, 'tags_between:<rb>': 1, 'tags_between:<rb vbd>': 1, 'tags_between:<rb vbn>': 1, 'tags_between:<jj to vb>': 1, 'tags_between:<rb to vbn>': 1, 'tags_between:<to vbn>': 1, 'tags_between:<to>': 1, 'tags_between:<to vb>': 1, 'tags_between:<jj>': 1, 'tags_between:<vbd>': 1, 'tags_betw

In [109]:
[r for r in new_docs['10_1'].relations]

['doxorubicin':'infusion', Drug:Route, type=manner/route,
 'valacyclovir':'bilateral shingles', Drug:Indication, type=reason,
 'bloating':'mild', SSLIF:Severity, type=severity_type,
 'neutropenia medication':'neutropenia', Drug:Indication, type=reason,
 'Valtrex':'500 mg', Drug:Dose, type=do,
 'Valtrex':'3 times daily', Drug:Frequency, type=fr,
 'posaconazole':'200 mg per 5 mL', Drug:Dose, type=do,
 'posaconazole':'3 times daily', Drug:Frequency, type=fr,
 'ciprofloxacin':'500 mg', Drug:Dose, type=do,
 'ciprofloxacin':'daily', Drug:Frequency, type=fr,
 'antiemetics':'p.r.n.', Drug:Frequency, type=fr,
 'uric acid':'daily', Drug:Frequency, type=fr,
 'uric acid':'prophylaxis', Drug:Indication, type=reason,
 'uric acid':'tumor \nlysis syndrome', Drug:Indication, type=reason,
 'doxorubicin':'infusion', Drug:Route, type=manner/route,
 'valacyclovir':'bilateral shingles', Drug:Indication, type=reason,
 'bloating':'mild', SSLIF:Severity, type=severity_type,
 'neutropenia medication':'neutropen

In [None]:
[a.id for a in new_docs['10_1'].annotations]

In [107]:
from sklearn.metrics import f1_score, precision_score, recall_score
def compute_metrics(y, pred, average='micro'):
    metrics = {}
    labels = set(y)
    labels.remove('none')
    labels = list(sorted(labels))
    metrics['precision'] = precision_score(y, pred, average=average, labels=labels)
    metrics['recall'] = recall_score(y, pred, average=average, labels=labels)
    metrics['f1'] = f1_score(y, pred, average=average, labels=labels)
    return metrics

In [108]:
# Now compute the metrics
metrics = {feature_set_name: None for feature_set_name in y_pred_dict.keys()}

for feature_set_name, y_pred in y_pred_dict.items():
    metrics[feature_set_name] = compute_metrics(y_dict['full'], y_pred, 'micro')
    

In [109]:
sorted(metrics.items(), key=lambda x:x[1]['f1'], reverse=True)

[('entities+entities_between+surface',
  {'f1': 0.9318378304591165,
   'precision': 0.98508826783395065,
   'recall': 0.88404921217353771}),
 ('entities+entities_between',
  {'f1': 0.9058706875918896,
   'precision': 0.95967255077153413,
   'recall': 0.85778113533347722}),
 ('surface',
  {'f1': 0.6509934013629971,
   'precision': 0.73451951503973523,
   'recall': 0.58452406647960287}),
 ('entities',
  {'f1': 0.52739580229043381,
   'precision': 0.76985540870862157,
   'recall': 0.40107921433196631}),
 ('entities_between',
  {'f1': 0.1727288185373545,
   'precision': 0.30428625891835959,
   'recall': 0.12059140945391755})]