This notebook explores different combinations of the features that were used in the submission and saves the results/evaluations so that the resulting scores can be compared.

In [None]:
import os, sys
import pickle
import bioc

In [None]:
# Make modules under ../final_system importable from this notebook
# (presumably where annotation, base_feature and made_utils live -- confirm).
sys.path.append('../final_system')

In [None]:
import annotation
import base_feature
import made_utils
import random
from collections import Counter, defaultdict


from nltk import ngrams as nltk_ngrams
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict

In [None]:
# Root folder of the challenge data (machine-specific absolute path).
DATADIR = '/Users/alec/Data/NLP_Challenge'
# ALLDIR = os.path.join(DATADIR, 'original_data')
# Training and test corpora live in sub-folders of DATADIR.
TRAINDIR, TESTDIR = (
    os.path.join(DATADIR, sub_folder)
    for sub_folder in ('MADE-1.0', 'made_test_data')
)
# Sanity check: report whether each folder exists on this machine.
for data_dir in (TRAINDIR, TESTDIR):
    print(os.path.exists(data_dir))

# Prep
First, here are some functions that we'll define. They are defined elsewhere in the package, but we'll include them here for simplicity's sake.

In [None]:
# Compiled once; a raw string avoids the invalid "\d" escape-sequence warning.
_NUMBER_RE = re.compile(r'[\d]+|one|two|three|four|five|six|seven|eight|nine|ten')


def normalize_grams(ngram_string):
    """
    Normalizes the values in a string of joined ngrams.

    Every run of digits and every occurrence of the spelled-out numbers
    one..ten is replaced with the placeholder '<NUMBER>'.

    NOTE: the spelled-out numbers are matched anywhere, including inside
    longer words ('bone' -> 'b<NUMBER>'). This is kept as-is because the
    vocabulary this is compared against was presumably built with the same
    normalization -- confirm before tightening the pattern.

    ngram_string - the string to normalize.
    Returns the normalized string.
    """
    # Substitute numbers
    return _NUMBER_RE.sub('<NUMBER>', ngram_string)



class LexicalFeatureExtractor(base_feature.BaseFeatureExtractor):
    """This class will create a set of features from a Relation object
    and return a dictionary of features that can later be vectorized.

    ngram_window - (min_n, max_n) lengths of the ngrams to extract; both
        values must lie between 1 and 3.
    context_window - (n_before, n_after), passed as `n` when fetching the
        context before the first entity and after the last entity.
    vocab, pos_vocab - unfiltered token / POS-tag vocabularies; they are
        filtered with `create_vocab` (not defined in this class; expected
        from the base class).
    min_vocab_count, min_pos_count - count thresholds handed to
        `create_vocab`.
    """
    def __init__(self, ngram_window=(1, 1), context_window=(2, 2),
                vocab=None, pos_vocab=None, min_vocab_count=5, min_pos_count=5):
        super().__init__()
        if min(ngram_window) < 1 or max(ngram_window) > 3:
            raise NotImplementedError("Ngram Window must be between one and 3")
        self.ngram_window = ngram_window
        self.context_window = context_window
        self.min_vocab_count = min_vocab_count
        self.min_pos_count = min_pos_count

        # Keep the vocabularies exactly as they were passed in
        self._unfiltered_vocab = vocab
        self._unfiltered_pos_vocab = pos_vocab

        # Filtered vocabularies, built from the count thresholds and ngram_window
        self.vocab = self.create_vocab(vocab, min_vocab_count, self.ngram_window)
        self.pos_vocab = self.create_vocab(pos_vocab, min_pos_count, self.ngram_window)
        self.pos = {} # Will eventually contain mapping for POS tags

    def create_base_features(self):
        """
        Enumerates possible feature values from the vocab, as well as an OOV value.
        Any features that are binary should only get one index and are encoded as 0.
        """
        # This will be a dictionary that contains all possible values for each feature
        all_features_values = {
            'same_sentence': 0,
            'num_tokens_between': 0,
            'grams_between': ['OOV'] + list(self.vocab),
            'grams_before': ['OOV'] + list(self.vocab),
            'grams_after': ['OOV'] + list(self.vocab),
            'pos_grams_between': ['OOV'] + list(self.pos_vocab),
            'first_entity_type': 0,
            'second_entity_type': 0,
            }
        return all_features_values

    def create_feature_dict(self, relat, doc, entities=True, entities_between=True, surface=True):
        """
        Takes a RelationAnnotation and an AnnotatedDocument.
        Returns a dictionary containing the defined lexical features.

        entities, entities_between, surface - switches that include or
            leave out the corresponding group of features.
        """
        lex_features = {}
        if entities:
            lex_features.update(self.get_entity_features(relat, doc))
        if entities_between:
            lex_features.update(self.get_entities_between_features(relat, doc))
        if surface:
            lex_features.update(self.get_surface_features(relat, doc))
        return lex_features

    def get_entity_features(self, relat, doc):
        """
        Returns features describing the two entities themselves:
        their lower-cased text, their types, and the types concatenated
        in left-to-right document order.
        """
        features = {}

        # The full string of the entities
        anno1, anno2 = relat.get_annotations()
        text1, text2 = anno1.text.lower(), anno2.text.lower()
        features['text_in_anno1'] = text1
        features['text_in_anno2'] = text2
        features['concat_text'] = text1 + ':' + text2

        # Features for types of the entities
        features['first_entity_type:<{}>'.format(relat.entity_types[0].upper())] = 1
        features['second_entity_type:<{}>'.format(relat.entity_types[1].upper())] = 1

        # Feature types for entities, left to right
        sorted_entities = sorted((relat.annotation_1, relat.annotation_2), key=lambda a: a.span[0])
        features['entity_types_concat'] = '<=>'.join(['<{}>'.format(a.type.upper()) for a in sorted_entities])
        return features

    def get_entities_between_features(self, relat, doc):
        """
        Returns one binary feature for every type of entity found between
        the two related entities, plus the number of such entities.
        """
        entities_between = self.get_entities_between(relat, doc)
        # TODO: Maybe change this to a count
        features = {
            'entities_between:<{}>'.format(v.type.upper()): 1 for v in entities_between
            }
        features['num_entities_between'] = len(entities_between)
        return features

    def get_surface_features(self, relat, doc):
        """
        Returns features built from the text around the relation:
        sentence overlap, ngrams/POS-tag ngrams between, before and after
        the entities, and whether both entities share a sentence.
        """
        features = {}

        # Number of sentences the relation span overlaps
        features['num_sentences_overlap'] = len(doc.get_sentences_overlap_span(relat.get_span()))

        # NOTE: only unigrams. get_grams_between returns a *unique set* in
        # which all out-of-vocabulary unigrams collapse into 'OOV', so this
        # is the number of distinct values, not a raw token count.
        features['num_tokens_between'] = len(self.get_grams_between(relat, doc, ngram_window=(1, 1)))

        # One binary feature for each ngram / tag ngram in each position
        gram_sources = (
            ('grams_between', self.get_grams_between, 'tokens'),
            ('grams_before', self.get_grams_before, 'tokens'),
            ('grams_after', self.get_grams_after, 'tokens'),
            ('tags_between', self.get_grams_between, 'tags'),
            ('tags_before', self.get_grams_before, 'tags'),
            ('tags_after', self.get_grams_after, 'tags'),
        )
        for prefix, getter, seq in gram_sources:
            features.update({
                '{}:<{}>'.format(prefix, v): 1 for v in getter(relat, doc, seq=seq)
                })

        # Binary feature: Are they in the same sentence?
        features['same_sentence'] = doc.in_same_sentence(relat.get_span())
        return features

    def _vocab_for(self, seq):
        """Returns the vocabulary matching seq ('tokens' or 'tags'); raises ValueError otherwise."""
        if seq == 'tokens':
            return self.vocab
        if seq == 'tags':
            return self.pos_vocab
        raise ValueError("Must specify seq: {}".format(seq))

    def _grams_from_tokens(self, tokens, vocab, ngram_window):
        """
        Shared ngram pipeline: lower-case the tokens, build the ngrams for
        every length in ngram_window (falling back to self.ngram_window),
        normalize them and replace anything not in vocab with 'OOV'.
        Returns a unique set.
        """
        if not ngram_window:
            ngram_window = self.ngram_window
        # NOTE: lower-casing the ngrams, come back to this if you want to encode the casing
        tokens = [token.lower() for token in tokens]
        all_grams = set()
        for n in range(ngram_window[0], ngram_window[1] + 1):
            # Sort within each ngram so that it doesn't matter what order the tokens occur in
            all_grams.update(self.sort_ngrams(nltk_ngrams(tokens, n)))
        normalized = {self.normalize_grams(x) for x in all_grams}
        return {x if x in vocab else 'OOV' for x in normalized}

    def get_grams_between(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the N-grams between the two entities connected in relat.
        Represents it as OOV if it's not in the vocabulary.
        Returns a unique set.

        seq - 'tokens' or 'tags'; anything else raises ValueError.
        ngram_window - optional override for self.ngram_window.
        """
        vocab = self._vocab_for(seq)
        span1, span2 = relat.spans
        # Get the start and end of the middle, not of the entire relation:
        # the two inner values of the four sorted offsets.
        _, start, end, _ = sorted(span1 + span2)
        tokens_in_span = doc.get_tokens_or_tags_at_span((start, end), seq)
        return self._grams_from_tokens(tokens_in_span, vocab, ngram_window)

    def get_grams_before(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the n-grams before the first entity.
        Represents it as OOV if it's not in the vocabulary.
        Returns a unique set.

        seq - 'tokens' or 'tags'; anything else raises ValueError.
        ngram_window - optional override for self.ngram_window.
        """
        vocab = self._vocab_for(seq)
        tokens_before = doc.get_tokens_or_tags_before_or_after(relat.span[0], delta=-1,
            n=self.context_window[0], seq=seq, padding=True)
        return self._grams_from_tokens(tokens_before, vocab, ngram_window)

    def get_grams_after(self, relat, doc, seq='tokens', ngram_window=None):
        """
        Returns the n-grams after the final entity.
        Represents it as OOV if it's not in the vocabulary.
        Returns a unique set.

        seq - 'tokens' or 'tags'; anything else raises ValueError.
        ngram_window - optional override for self.ngram_window.
        """
        vocab = self._vocab_for(seq)
        # Unlike get_grams_before, no padding is requested here.
        tokens_after = doc.get_tokens_or_tags_before_or_after(relat.span[1], delta=1,
                                        n=self.context_window[1], seq=seq)
        return self._grams_from_tokens(tokens_after, vocab, ngram_window)

    def sort_ngrams(self, ngrams):
        """
        Joins each ngram tuple into a space-separated string after sorting
        its members, so the original token order inside an ngram is dropped.
        """
        return [' '.join(sorted(tup)) for tup in ngrams]

    def normalize_grams(self, ngram_string):
        """
        Normalizes the values in a string of joined ngrams
        by delegating to the module-level normalize_grams function.
        """
        # Substitute numbers
        return normalize_grams(ngram_string)

    def get_pos_tags(self):
        """Placeholder; not implemented."""
        pass

    def get_entities_between(self, relat, doc):
        """
        Returns a list of entities that occur between entity1 and entity2,
        i.e. entities (other than the two related ones) whose start offset
        lies in [start, end) of the relation span, in offset order.
        """
        offset, end = relat.get_span()
        overlapping_entities = []
        # Index the entity in doc by span start; the related pair is left out.
        # Entities sharing a start offset overwrite each other here.
        related_ids = (relat.annotation_1.id, relat.annotation_2.id)
        offset_to_entity = {entity.span[0]: entity for entity in doc.get_annotations()
                    if entity.id not in related_ids}

        while offset < end:
            if offset in offset_to_entity:
                overlapping_entities.append(offset_to_entity[offset])
            offset += 1

        return overlapping_entities

    def get_sent_with_anno(self, anno, doc, entity_type):
        """
        Returns the lower-cased tokens around a given annotation as one
        space-joined string, with entity_type inserted at the position
        where the annotation starts.

        Walks backwards from anno.start_index until an offset found in
        doc._sentences (presumably a sentence boundary -- confirm) or the
        start of the document, then forwards from anno.start_index until
        the next such offset or the last token.
        """
        tokens = []
        # Step back some window
        offset = anno.start_index
        while offset not in doc._sentences:
            offset -= 1
            if offset < 0:
                break
            if offset in doc._tokens:
                tokens.insert(0, doc._tokens[offset].lower())

        # Now add an entity
        tokens.append(entity_type)

        # Now add all the tokens from the annotation start onwards
        offset = anno.start_index
        # Computed once instead of on every iteration; -1 when there are no tokens
        last_token_offset = max(doc._tokens.keys(), default=-1)
        while offset not in doc._sentences:
            if offset > last_token_offset:
                break
            if offset in doc._tokens:
                tokens.append(doc._tokens[offset].lower())
            offset += 1

        return ' '.join(tokens)

    def __repr__(self):
        return "LexicalFeatureExtractor Ngram Window: {} Vocab: {} terms".format(
                self.ngram_window, len(self.vocab))

In [None]:
def pair_annotations_in_doc(doc, legal_edges=None, max_sent_length=3):
    """
    Takes a single AnnotatedDocument that contains annotations.
    All annotations that have a legal edge between them
    and have an overlapping sentence length <= max_sent_length,
        ie., they are in either the same sentence or n adjacent sentences,
    are paired to create RelationAnnotations.

    legal_edges - optional iterable of (type_1, type_2) pairs that defines
        which edges should be allowed. None or an empty list selects the
        built-in default set of edges.
    max_sent_length - maximum number of sentences the pair may span.

    Returns a de-duplicated list holding the newly generated
    RelationAnnotations (created with `from_null_rel`) together with the
    relations the document already had. The list goes through a set, so
    its order is not defined.
    """
    if not legal_edges:
        legal_edges = (('Drug', 'Route'),
                       ('Drug', 'Indication'),
                       ('SSLIF', 'Severity'),
                       ('Drug', 'Dose'),
                       ('Drug', 'Frequency'),
                       ('Drug', 'Duration'),
                       ('Drug', 'ADE'),
                       ('ADE', 'Severity'),
                       ('Indication', 'Severity'),
                       ('SSLIF', 'ADE'))
    # Set membership keeps the check cheap inside the nested loop below
    legal_edges = frozenset(tuple(edge) for edge in legal_edges)

    true_annotations = doc.get_annotations()
    true_relations = doc.get_relations()
    generated_relations = []

    # Map all annotation_1's to annotation_2's
    # in order to identify all positive examples of relations
    # If this is testing data, it may not actually have these
    edges = set()
    for relat in true_relations:
        anno1, anno2 = relat.get_annotations()
        edges.add((anno1.id, anno2.id))

    for anno1 in true_annotations:
        for anno2 in true_annotations:

            # Don't pair the same annotation with itself
            if anno1.id == anno2.id:
                continue

            # Don't pair two annotations that cover the same span
            if anno1.span == anno2.span:
                continue

            # Don't generate pairs that have already been paired
            if (anno1.id, anno2.id) in edges:
                continue

            # Exclude illegal relations
            if (anno1.type, anno2.type) not in legal_edges:
                continue

            # Check how many sentences the pair spans and skip it
            # when that exceeds max_sent_length
            start1, end1 = anno1.span
            start2, end2 = anno2.span
            span = (min(start1, end1, start2, end2), max(start1, end1, start2, end2))
            overlapping_sentences = doc.get_sentences_overlap_span(span)
            if len(overlapping_sentences) > max_sent_length:
                continue

            # They haven't already been paired, so pair them
            generated_relation = annotation.RelationAnnotation.from_null_rel(
                anno1, anno2, doc.file_name
            )
            edges.add((anno1.id, anno2.id))
            generated_relations.append(generated_relation)

    return list(set(generated_relations + true_relations))

In [None]:
def add_neg_relations(docs, neg_prop=2):
    """
    Adds generated candidate ("negative") relations to every document.

    docs - a dictionary mapping file names to AnnotatedDocuments.
    neg_prop - a number that specifies the proportion of negative
        to positive examples to keep per document. When a document has
        fewer candidates than that, all of them are kept.
        If the documents don't have relations, ie., are test documents,
        neg_prop should be False and it will take all negative relations
        as possible relations.

    Each document is updated in place through `doc.add_relations`.
    Returns a flat list of the relations of all documents.
    """
    relations = []
    for i, (fname, doc) in enumerate(docs.items()):
        show_progress = i % 10 == 0
        if show_progress:
            print('-{}: {} '.format(i, fname))
            print(len(doc.relations))
        new_relations = pair_annotations_in_doc(doc)

        # Add Fake relations for training
        neg_relations = set(new_relations).difference(doc.relations)
        # Sample them
        if neg_prop:
            # random.sample needs an integer size, also for a float neg_prop
            n_wanted = int(neg_prop * len(doc.relations))
            if len(neg_relations) >= n_wanted:
                # random.sample no longer accepts a set (TypeError on Python 3.11+)
                neg_relations = random.sample(list(neg_relations), n_wanted)

        doc.add_relations(neg_relations)

        relations.extend(doc.get_relations())
        if show_progress:
            print(len(doc.get_relations()))
    return relations
   

In [None]:
def create_feature_dicts(relations, docs, extractor=None):
    """
    Iterates through a list of relations.
    Returns a list of feature dicts, one per relation, in the same order.

    relations - relations that each carry a `file_name`.
    docs - dictionary mapping file names to their documents.
    extractor - object with a `create_feature_dict` method. When left as
        None, the notebook-level `feature_extractor` is used, which keeps
        existing two-argument calls working.
    """
    if extractor is None:
        # Looked up at call time so the notebook global can be defined later
        extractor = feature_extractor

    total = len(relations)
    feat_dicts = []
    for i, r in enumerate(relations):
        doc = docs[r.file_name]
        if i % 100 == 0:
            print("{}/{}".format(i, total))
        feat_dicts.append(
            extractor.create_feature_dict(r, doc, entities=True, entities_between=True, surface=True)
        )
    return feat_dicts

# Task 2
Relations with gold-standard annotations

In [None]:
# Read in the training texts and their BioC annotations
reader = made_utils.TextAndBioCParser(TRAINDIR)
docs = reader.read_texts_and_xmls(-1) # -1 presumably means "read every document" -- confirm in made_utils

In [None]:
# Spot-check one parsed document
doc = docs['12_123']
doc

In [None]:
# Load the cached (docs, relations) pair; this replaces the `docs` parsed above.
# pickle must only be used on trusted, locally produced files.
with open('tmp_data/all_training_docs_and_relations.pkl', 'rb') as f:
    docs, relations = pickle.load(f)
len(relations)

In [None]:
# Count how often each relation type occurs in the training relations
rtypes = [r.type for r in relations]
# Counter is already imported at the top of the notebook; re-importing is harmless
from collections import Counter
c = Counter(rtypes)
c

## Feature Extraction

In [None]:
# Load in the vocabulary that will be used for features:
# a (token vocabulary, POS-tag vocabulary) pair
with open('../final_system/data/vocab.pkl', 'rb') as f:
    vocab, pos_vocab = pickle.load(f)

In [None]:
# Extractor using unigrams through trigrams and a context of 2 before / 2 after;
# create_feature_dicts reads this `feature_extractor` global.
feature_extractor = LexicalFeatureExtractor(context_window=(2, 2),
                            ngram_window=(1, 3), vocab=vocab, pos_vocab=pos_vocab,
                            min_vocab_count=20, min_pos_count=20)
# One feature dict per training relation
feat_dicts = create_feature_dicts(relations, docs)

print(len(feat_dicts))

In [None]:
# Load cached feature dicts; this replaces any `feat_dicts` computed in the cell above
with open('tmp_data/feat_dicts.pkl', 'rb') as f:
    feat_dicts = pickle.load(f)

In [None]:
relations[-1]

In [None]:
feat_dicts[-1]

## Transform into vectors for training

In [None]:
# Multi-class labels: the relation type of every training relation
y_full = [relation.type for relation in relations]
# Binary labels: 'none' stays 'none', every real relation type becomes 'any'
y_bin = ['none' if label == 'none' else 'any' for label in y_full]

In [None]:
vectorizer = DictVectorizer(sparse=True, sort=True)
k = 1000

# Fit the vectorizer and transform the feature dicts into one sparse matrix
X_vector = vectorizer.fit_transform(feat_dicts)
print(X_vector.shape)


def _fit_feature_selectors(n_features):
    """
    Fits one feature selector on the binary labels and one on the full
    labels, each keeping n_features features.
    Returns (binary_selector, X_bin, full_selector, X_full).
    """
    bin_selector = base_feature.MyFeatureSelector(vectorizer, k=n_features)
    bin_matrix = bin_selector.fit_transform(X_vector, y_bin)

    full_selector = base_feature.MyFeatureSelector(vectorizer, k=n_features)
    full_matrix = full_selector.fit_transform(X_vector, y_full)
    return bin_selector, bin_matrix, full_selector, full_matrix


# Fit the feature selectors and transform X. If either selector rejects k,
# both are refitted with every feature so the two stay consistent.
try:
    (binary_feature_selector, X_bin,
     full_feature_selector, X_full) = _fit_feature_selectors(k)
except ValueError: # Not enough features if only working with a few docs
    (binary_feature_selector, X_bin,
     full_feature_selector, X_full) = _fit_feature_selectors('all')

print(X_bin.shape, X_full.shape)

In [None]:
# Now we can train and evaluate each set
def train_clf(X, y, cross_val=False):
    """
    Builds a random forest and either cross-validates or fits it.

    With cross_val=True the classifier is cross-validated, a
    classification report is printed, and the classifier is returned
    without having been fitted on the full data.
    With cross_val=False the classifier is fitted on X, y and returned.
    """
    forest_settings = dict(
        max_depth=None,
        max_features=None,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=10,
        n_jobs=1,
    )
    clf = RandomForestClassifier(**forest_settings)
    print(X.shape)

    if not cross_val:
        clf.fit(X, y)
        return clf

    # Cross-validate to make sure this is going right
    cv_predictions = cross_val_predict(clf, X, y, verbose=1)
    print(classification_report(y, cv_predictions))
    return clf

In [None]:
# Cross-validate both models; only the printed reports matter here,
# the returned (unfitted) classifiers are discarded.
# Binary
train_clf(X_bin, y_bin, cross_val=True)

# Full
train_clf(X_full, y_full, cross_val=True)

In [None]:
# Now let's train on all of the data
# Binary: 'any' relation vs 'none'
clf_bin = train_clf(X_bin, y_bin, cross_val=False)

# Full: the individual relation types
clf_full = train_clf(X_full, y_full, cross_val=False)

In [None]:
# Save data, clfs, vectorizers, feature_selectors
# Training matrices and labels, stored as ((X_bin, y_bin), (X_full, y_full))
with open('tmp_data/training_data.pkl', 'wb') as f:
    pickle.dump(((X_bin, y_bin), (X_full, y_full)), f)

In [None]:
# Fitted classifiers, stored as (binary, full)
with open('tmp_data/clfs.pkl', 'wb') as f:
    pickle.dump((clf_bin, clf_full), f)

In [None]:
# Fitted vectorizer
with open('tmp_data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
    
# Fitted feature selectors, stored as (binary, full)
with open('tmp_data/feature_selectors.pkl', 'wb') as f:
    pickle.dump((binary_feature_selector, full_feature_selector), f)

## Evaluate on Testing Data

In [None]:
# Read in the test data, without relations (include_relations=False)
test_reader = made_utils.TextAndBioCParser(TESTDIR)
test_docs = test_reader.read_texts_and_xmls(-1, include_relations=False) # -1 presumably means "read every document" -- confirm in made_utils

In [None]:
# Load in clfs, vectorizer and feature selectors saved by the training cells.
# pickle must only be used on trusted, locally produced files.
with open('tmp_data/clfs.pkl', 'rb') as f:
    clf_bin, clf_full = pickle.load(f)
    
with open('tmp_data/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
    
with open('tmp_data/feature_selectors.pkl', 'rb') as f:
    binary_feature_selector, full_feature_selector = pickle.load(f)

In [None]:
# Cache the parsed test documents
with open('tmp_data/test_docs.pkl', 'wb') as f:
    pickle.dump(test_docs, f)

In [None]:
# neg_prop=False: no sampling, every generated candidate pair is kept
test_relations = add_neg_relations(test_docs, False)

In [None]:
# NOTE: stored as (relations, docs), the reverse of the (docs, relations)
# order read from all_training_docs_and_relations.pkl
with open('tmp_data/test_docs_and_relations.pkl', 'wb') as f:
    pickle.dump((test_relations, test_docs), f)

In [None]:
# Create feature_dicts, one per candidate test relation
# (uses the notebook-level `feature_extractor`)
test_feat_dicts = create_feature_dicts(test_relations, test_docs)

In [None]:
# Keep the last test relation and its feature dict for inspection below
r = test_relations[-1]
feat_dict = test_feat_dicts[-1]

In [None]:
r

In [None]:
feat_dict

In [None]:
# Vectorize with the already-fitted vectorizer, then apply each model's
# own feature selection
X_test = vectorizer.transform(test_feat_dicts)
X_test_bin = binary_feature_selector.transform(X_test)
X_test_full = full_feature_selector.transform(X_test)
print(X_test_bin.shape)
print(X_test_full.shape)



# Predict with both models; they are combined in a later cell
y_pred_bin = clf_bin.predict(X_test_bin)
y_pred_full = clf_full.predict(X_test_full)

In [None]:
y_pred_bin

In [None]:
y_pred_full

In [None]:
# Cache the raw predictions, stored as (binary, full)
with open('tmp_data/preds.pkl', 'wb') as f:
    pickle.dump((y_pred_bin, y_pred_full), f)

In [None]:
def to_bioc_xml(doc, outdir):
    """
    Serializes an AnnotatedDocument as <outdir>/<file_name>.bioc.xml
    so it can be scored with the bioc_evaluation.py file.

    The output holds one document with a single passage (offset '0')
    that carries the document's annotations and relations.
    """
    def build_node(role, refid):
        # A node ties a relation to one annotation id under the given role
        node = bioc.bioc_node.BioCNode()
        node.role = role
        node.refid = refid
        return node

    outpath = os.path.join(outdir, doc.file_name + '.bioc.xml')
    writer = bioc.BioCXMLWriter()
    writer.collection = bioc.BioCCollection()
    collection = writer.collection

    document = bioc.BioCDocument()
    document.id = doc.file_name

    passage = bioc.BioCPassage()
    passage.offset = '0'
    document.add_passage(passage)
    collection.add_document(document)

    # The annotations already carry their bioc counterparts
    for anno in doc.get_annotations():
        passage.add_annotation(anno.bioc_anno)

    # Relations have to be rebuilt as BioCRelation objects
    for relat in doc.get_relations():
        relation = bioc.bioc_relation.BioCRelation()
        relation.id = relat.id
        relation.put_infon('type', relat.type)
        relation.add_node(build_node('annotation 1', relat.annotation_1.id))
        relation.add_node(build_node('annotation 2', relat.annotation_2.id))
        passage.add_relation(relation)

    writer.write(outpath)

In [None]:
# Write out bioc annotations
# Remove any duplicates that somehow got in
def _first_by_id(items):
    """Returns items in their original order, keeping only the first one seen for each `id`."""
    seen_ids = set()
    unique = []
    for item in items:
        if item.id not in seen_ids:
            seen_ids.add(item.id)
            unique.append(item)
    return unique


def write_bioc_results(pred, relations, docs, outdir):
    """
    Adds predicted relation types to relations
    And filters out relations with a 'none' prediction label.
    Removes any duplicates and writes out to outdir.

    pred - predicted labels, aligned one-to-one with relations.
    relations - the candidate relations; their `type` is overwritten
        with the predicted label.
    docs - dictionary mapping file names to documents. Every document's
        relations are reset and refilled from the predictions, and its
        annotations are de-duplicated by id.
    outdir - output directory; created (including parents) when missing.

    Raises ValueError when pred and relations differ in length.
    """
    if len(pred) != len(relations):
        raise ValueError("Got {} predictions for {} relations".format(
            len(pred), len(relations)))

    # Start every document from a clean slate
    for doc in docs.values():
        doc.relations = []
        doc.annotations = _first_by_id(doc.annotations)

    # Use the arguments that were passed in, not notebook globals, so the
    # relations land on the documents that are actually written out.
    for label, relat in zip(pred, relations):
        relat.type = label
        if relat.type != 'none':
            docs[relat.file_name].relations.append(relat)

    for doc in docs.values():
        doc.relations = _first_by_id(doc.relations)

    # makedirs also copes with a missing parent directory
    os.makedirs(outdir, exist_ok=True)
    for doc in docs.values():
        to_bioc_xml(doc, outdir)

    print("Done")

In [None]:
# Combine both models: keep the multi-class label only where the binary
# model predicted a relation, otherwise force 'none'
y_pred_test = [y_pred_full[i] if y_pred_bin[i] != 'none' else 'none' for  i in range(len(y_pred_full))]
y_pred_test
# NOTE(review): the folder is labelled 'task_one' although this section is
# headed "Task 2" -- confirm the intended name
outdir = 'tmp_data/output_{}'.format('test_set_task_one')
write_bioc_results(y_pred_test, test_relations, test_docs, outdir)

## Task 3

In [None]:
# Now do the same with Kelly's output
# Read in the predicted task-1 annotations for the test set (machine-specific path)
KELLYDIR = '/Users/alec/Data/NLP_Challenge/task1_test_set_predictions'
task_3_reader = made_utils.TextAndBioCParser(KELLYDIR)
task_3_docs = task_3_reader.read_texts_and_xmls(-1, include_relations=False) # -1 presumably means "read every document" -- confirm in made_utils

In [None]:
# Spot-check the first annotation of one document
doc = task_3_docs['1_1069']
doc.annotations[0]

In [None]:
# neg_prop=False: no sampling, every generated candidate pair is kept
task_3_relations = add_neg_relations(task_3_docs, False)

In [None]:
# One feature dict per candidate relation (uses the notebook-level `feature_extractor`)
task_3_feat_dicts = create_feature_dicts(task_3_relations, task_3_docs)

In [None]:
# Same pipeline as before; note this overwrites X_test, X_test_bin,
# X_test_full, y_pred_bin and y_pred_full from the earlier test-set cells
X_test = vectorizer.transform(task_3_feat_dicts)
X_test_bin = binary_feature_selector.transform(X_test)
X_test_full = full_feature_selector.transform(X_test)
print(X_test_bin.shape)
print(X_test_full.shape)



y_pred_bin = clf_bin.predict(X_test_bin)
y_pred_full = clf_full.predict(X_test_full)

In [None]:
# Keep the multi-class label only where the binary model predicted a
# relation, otherwise force 'none'
y_pred_test = [y_pred_full[i] if y_pred_bin[i] != 'none' else 'none' for  i in range(len(y_pred_full))]
y_pred_test

In [None]:
# Write the Task 3 predictions as .bioc.xml files
outdir = 'tmp_data/output_{}'.format('test_set_task_three')
write_bioc_results(y_pred_test, task_3_relations, task_3_docs, outdir)