In [3]:
import xml.etree.ElementTree as ET
from nltk import sent_tokenize
import json
from collections import Counter
from tqdm import tqdm

In [36]:
import xml.etree.ElementTree as ET
from nltk import sent_tokenize

class Annotation():
    def __init__(self, annotation_xml_object):
        self.id = annotation_xml_object.attrib['id']
        infons = {a.attrib['key']: a.text for a in annotation_xml_object.findall('infon')}
        self.file, self.type = infons['file'], infons['type']
        self.location = {i[0]: int(i[1]) for i in annotation_xml_object.find('location').attrib.items()}
        self.text = annotation_xml_object.find('text').text
        self.encoded_text = None
    
    def __str__(self):
        return 'ID: {}\nLocation: {}\nText: {}'.format(self.id, self.location, self.text)
    
    def as_dict(self):
        return {'id': self.id, 'type': self.type, 'location': self.location, 'text': self.text}

class Relation():    
    def __init__(self, rel_xml_object):
        self.id = rel_xml_object.attrib['id']
        infons = {a.attrib['key']: a.text for a in rel_xml_object.findall('infon')}
        self.file, self.type = infons['file'], infons['type']
        try:
            self.relation_type = infons['relation type']
        except KeyError:
            self.relation_type = None
        self.nodes = {n.attrib['role'].lower(): n.attrib['refid'] for n in rel_xml_object.findall('node')}
    
    def __str__(self):
        return 'Relation {}\nType {}\nFrom {} to {}'.format(self.id, self.relation_type, *self.nodes.values())
    
    def as_dict(self):
        return {'id': self.id, 'type': self.type, 'nodes': self.nodes, 'relation_type': self.relation_type}

class Document():
    def __init__(self, document_xml_object, ignore_text=True):
        self.id = document_xml_object[0].text
        
        self.text = document_xml_object[1][1].text if not ignore_text else ''
        self.sentences = sent_tokenize(self.text)
        if self.id == '14731280':  # hack to fix a bad sentence split
            self.sentences = self.sentences[:3] + [self.sentences[3] + self.sentences[4]] + self.sentences[5:]
        
        self.annotations = {a.id: a for a in [Annotation(a) for a in document_xml_object[1].findall('annotation')]}
        self.__generate_sentence_based_locations_for_annotations()
        
        self.relations = {r.id: r for r in [Relation(r) for r in document_xml_object[1].findall('relation')]}
        self.__create_entity_tokens()
        self.__extract_annotation_ids_for_genes()
        self.__extract_gene_relations()
        
    def __str__(self):
        return 'ID: {}\nAnnotations: {}\nRelations: {}'.format(
            self._id, len(self.annotations), len(self.relations))
    
    def __extract_annotation_ids_for_genes(self):
        self.gene_ids = set([a.id for a in self.annotations.values() if a.type == 'Gene'])
    
    def __extract_gene_relations(self):
        self.gene_relations = {r.id: r for r in self.relations.values() if
            all([_id in self.gene_ids for _id in r.nodes.values()])}
    
    def __generate_sentence_based_locations_for_annotations(self):
        for ann in self.annotations.values():
            char_offset = ann.location['offset']
            for i, sent in enumerate(self.sentences):
                if char_offset - len(sent) < 0:
                    ann.location = {'sentence_idx': i, 'char_idx': char_offset, 'length': ann.location['length']}
                    break
                else:
                    char_offset -= len(sent) + 1  # subtract one for the space between 2 sentences
    
    def __create_entity_tokens(self):
        for ann in self.annotations.values():
            ann.text = ann.text.replace(' ', '_')
            sent_id, start, length = ann.location['sentence_idx'], ann.location['char_idx'], ann.location['length']
            sent = self.sentences[sent_id]
            self.sentences[sent_id] = sent[:start] + sent[start:start+length].replace(' ', '_') + sent[start+length:]
    
    def __rel2sentence(self, rels):
        sentences = []
        for rel in rels:
            src = self.annotations[rel.nodes['cause']]
            target = self.annotations[rel.nodes['theme']]
            src_sentence_id, target_sentence_id = [it.location['sentence_idx'] for it in [src, target]]
            
            try:
                assert src_sentence_id == target_sentence_id
            except AssertionError:
                pass
#                 print(self.id)
#                 print(src_sentence_id, target_sentence_id)
#                 print((src.id, src.text), (target.id, target.text))
#                 print(self.sentences[src_sentence_id])
#                 print()
#                 print(self.sentences[target_sentence_id])
#                 print('---')
            sentences.append({
                'id': '{}-{}'.format(self.id, src_sentence_id),
                'sentence': self.sentences[src_sentence_id],
                'src': src.as_dict(), 'target': target.as_dict()
            })
        return sentences
    
    def get_sentences_with_gene_relation(self):
        return self.__rel2sentence(self.gene_relations.values())
    
    def get_sentences_without_gene_relation(self):
        non_gene_rels = [rel for rel_id, rel in self.relations.items() if rel_id not in self.gene_relations]
        return self.__rel2sentence(non_gene_rels)
    
    def _test_annotation_location_types(self):
        for ann in self.annotations.values():
            full_text_based = self.text[ann.location['offset']:ann.location['offset']+ann.location['length']]
            start, length = ann.location['char_idx'], ann.location['length']
            sent_based = self.sentences[ann.location['sentence_idx']][start:start+length]
            if not full_text_based == sent_based:
                print('-----')
                print('DocID: ' + self.id)
                print('AnnotationID: ' + ann.id)
                print('Should:\n' + ann.text)
                print('Full text:\n' + full_text_based)
                print('Sent based:\n' + sent_based)
                print('----')
    
class CorpusParser():
    def __init__(self, path):
        self.tree = ET.parse(path)
        self.root = self.tree.getroot()
        self.documents = [Document(d, ignore_text=False) for d in self.root if d.tag == 'document']
    
    def get_sentences_with_annotations(self, genes_only=False):
        sentences = []
        for d in self.documents:
            for sent_idx, sent in enumerate(d.sentences):
                sent_dict = {'sentence': sent}
                sent_dict['annotations'] = [ann.as_dict() for ann in d.annotations.values()
                                            if ann.location['sentence_idx'] == sent_idx 
                                            and (not genes_only or ann.type == 'Gene')]
                ann_ids = set([ann['id'] for ann in sent_dict['annotations']])
                sent_dict['relations'] = [rel.as_dict() for rel in d.gene_relations.values()
                                          if rel.nodes['cause'] in ann_ids]
                sentences.append(sent_dict)    
        return sentences
    
    def get_all_sentences_with_gene_relation(self):
        sentences = []
        for d in self.documents:
            sents = d.get_sentences_with_gene_relation()
            if sents is not None:
                sentences += sents
        return sentences

    def get_all_sentences_without_gene_relation(self):
        sentences = []
        for d in self.documents:
            sents = d.get_sentences_without_gene_relation()
            if sents is not None:
                sentences += sents
        return sentences

    def get_all_sentences(self):
        return sum([d.sentences for d in self.documents], [])
    
    def get_all_relations(self):
        return sum([list(d.relations.values()) for d in self.documents], [])
    
    def get_all_gene_relations(self):
        return sum([list(d.gene_relations.values()) for d in self.documents], [])

In [37]:
corpus = CorpusParser('genereg_bioc.xml')

sents = corpus.get_all_sentences_with_gene_relation()
distinct_sents = set([s['id'] for s in sents])
len(distinct_sents)

580

In [38]:
sentences = corpus.get_sentences_with_annotations(genes_only=True)
print(sentences[:2])
pairs = []
relas = []
for s in sentences:
    anns, rels = s['annotations'], s['relations']
    for i in range(len(anns)-1):
        for j in range(i+1, len(anns)):
            pairs.append((anns[i]['id'], anns[j]['id']))
    for r in rels:
        relas.append(r)

print(len(pairs))
pairs = set(pairs)
print(len(pairs))

pos_pairs, neg_pairs = [], []
seen = set()
from collections import Counter
c = Counter()
for r in relas:
    src = r['nodes']['cause']
    target = r['nodes']['theme']
    c.update({r['relation_type']: 1})
    if (src, target) in pairs:
        pos_pairs.append((src, target))
    elif (target, src) in pairs:
        pos_pairs.append((target, src))
        
print(c)
neg_pairs = pairs - set(pos_pairs)

print(len(set(pos_pairs)))
print('pos_pairs', len(pos_pairs))
print('neg_pairs', len(neg_pairs))

    
# print('num_of_rels_in_all_sents', sum([len(s['relations']) for s in sentences]))
# print('gene rels_w_sents', len(corpus.get_all_sentences_with_gene_relation()))
# print('gene_rels', len(corpus.get_all_gene_relations()))
# print('all_rels', len(corpus.get_all_relations()))

# len(corpus.get_all_sentences())

# j = json.dumps([s for s in sentences if len(s['relations'])>1][:10], indent=2)
# j = j.split('\n')
# for l in j:
#     print(l)

[{'sentence': 'Mechanisms of acid resistance in enterohemorrhagic Escherichia coli.', 'annotations': [], 'relations': []}, {'sentence': 'Enterohemorrhagic strains of Escherichia coli must pass through the acidic gastric barrier to cause gastrointestinal disease.', 'annotations': [], 'relations': []}]
2767
2767
Counter({'PositiveRegulationOfGeneExpression': 465, 'RegulationOfGeneExpression': 417, 'NegativeRegulationOfGeneExpression': 282})
1154
pos_pairs 1164
neg_pairs 1613


In [127]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from bllipparser import RerankingParser

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # might be a bad solution

print('Load Genereg corpus...')
corpus = CorpusParser('genereg_bioc.xml')
print('...done')

Load Genereg corpus...
...done


In [114]:
print('Load RerankingParser...')
rrp = RerankingParser.from_unified_model_dir('../../ppi-benchmark/Parsing/Models/McClosky-2009/biomodel')
print('...done')
print('Load lemmatizer...')
lemmatizer = WordNetLemmatizer()
print('...done')

Load RerankingParser...


RuntimeError: Parser is already loaded and can only be loaded once.

In [115]:
def match_tokens_to_original_sentence(sentence, tokens):
    '''Annotate tokens with their starting position in the original sentence'''
    cur_char_idx, cur_token_idx = 0, 0
    new_tokens = []
    while cur_token_idx < len(tokens):
        cur_token_text = tokens[cur_token_idx]['text']
        if sentence[cur_char_idx:].startswith(cur_token_text):
            # consume our token
            new_token = {k:v for k,v in tokens[cur_token_idx].items()}
            new_token['start'] = cur_char_idx
            new_tokens.append(new_token)
            cur_char_idx += len(cur_token_text)
            cur_token_idx += 1
        elif sentence[cur_char_idx:cur_char_idx+1] == ' ':
            # we have a whitespace that got lost during tokenization
            cur_char_idx += 1
        elif cur_token_text in ['-LRB-', '-RRB-']:  # more special chars will be needed
            # we just forget about these chars (assumption being that they are never part of an entity)
            cur_token_idx += 1
            cur_char_idx += 1

    return new_tokens      

In [116]:
def tag_tokens_with_annotations(tokens_with_pos, annotations):
    '''
    Given annotations, tag and modify tokens based on their position in the original sentence.
    tokens_with_pos should be a dict of the form {'start': start pos in sentence, 'text': token's text}
    '''

    def get_tokens_touched_by_annotation(ann_start, ann_length, tokens_with_pos):
        token_candidates = []
        for idx, t in enumerate(tokens_with_pos):
            positions_of_token = set(range(t['start'], t['start']+len(t['text'])))
            positions_of_entity = set(range(ann_start, ann_start+ann_length))
            if len(positions_of_token & positions_of_entity) > 0:
                token_candidates.append((idx, t))
        return token_candidates

    for ann in annotations:
        ann_start, ann_length = ann['location']['char_idx'], ann['location']['length']
        token_candidates = get_tokens_touched_by_annotation(ann_start, ann_length, tokens_with_pos)
        token_idxs = token_candidates[0][0], token_candidates[-1][0]  # indizes of tokens touched

        # merge if multiple tokens are touched
        merged_token_text = ''.join([t['text'] for _, t in token_candidates])
        merged_token = {
            'start': token_candidates[0][1]['start'],
            'text': merged_token_text,
            'pos': token_candidates[0][1]['pos'],  # heuristic: take the first one
            'entity': ann['text'],
            'entity_id': ann['id']
        }

        # handle special cases where an entity does not align with the token
        prefix_token, suffix_token = None, None

        prefix_diff = ann_start - merged_token['start']
        if prefix_diff > 0:
            # cut off prefix and put in new token
            prefix_token = {'start': merged_token['start'],
                            'text': merged_token['text'][:prefix_diff],
                            'pos': merged_token['pos']}
            merged_token['text'] = merged_token['text'][prefix_diff:]
            merged_token['start'] = merged_token['start'] + prefix_diff

        suffix_diff = (merged_token['start']+len(merged_token['text'])) - (ann_start+ann_length)
        if suffix_diff > 0 :
            # cut off suffix and put in new token
            suffix_token = {'start': ann_start+ann_length+1,
                            'text': merged_token['text'][-suffix_diff:],
                            'pos': merged_token['pos']}
            merged_token['text'] = merged_token['text'][:-suffix_diff]
        new_tokens = [t for t in [prefix_token, merged_token, suffix_token] if t is not None]
        
        # build modified token list
        tokens_with_pos = tokens_with_pos[:token_idxs[0]] + new_tokens + tokens_with_pos[token_idxs[1]+1:]

    return tokens_with_pos

In [129]:
sents_with_annotations = corpus.get_sentences_with_annotations()
assert len(sents_with_annotations) == len(corpus.get_all_sentences())

In [None]:
def to_token_dict(token_tuple):
    return [{'text': text, 'pos': pos} for text, pos in token_tuple]

# tag sentences
for item in tqdm(sents_with_annotations):
    item = to_token_dict(rrp.tag(item['sentence']))

with open('tagged_tokens.txt', 'w') as outf:
    outf.wrtie(json.dumps(sents_with_annotations, indent=2))
    
sents_with_annotations[:3]


  0%|          | 0/3191 [00:00<?, ?it/s][A
  0%|          | 1/3191 [00:00<09:35,  5.54it/s][A
  0%|          | 2/3191 [00:00<09:28,  5.61it/s][A
  0%|          | 3/3191 [00:01<20:18,  2.62it/s][A
  0%|          | 5/3191 [00:01<16:04,  3.30it/s][A
  0%|          | 6/3191 [00:01<14:38,  3.63it/s][A
  0%|          | 7/3191 [00:01<13:23,  3.96it/s][A
  0%|          | 8/3191 [00:02<26:19,  2.01it/s][A
  0%|          | 9/3191 [00:03<24:13,  2.19it/s][A
  0%|          | 10/3191 [00:04<34:42,  1.53it/s][A
  0%|          | 11/3191 [00:04<26:03,  2.03it/s][A
  0%|          | 12/3191 [00:04<20:33,  2.58it/s][A
  0%|          | 13/3191 [00:04<17:28,  3.03it/s][A
  0%|          | 14/3191 [00:05<15:57,  3.32it/s][A
  0%|          | 15/3191 [00:05<22:59,  2.30it/s][A
  1%|          | 16/3191 [00:06<22:55,  2.31it/s][A
  1%|          | 17/3191 [00:06<22:31,  2.35it/s][A
  1%|          | 18/3191 [00:06<20:18,  2.60it/s][A
  1%|          | 19/3191 [00:07<15:59,  3.30it/s][A
  1%|    

In [138]:
for tokens, annotations in tqdm(zip(tokens, sents_with_annotations)):
    tokens = match_tokens_to_original_sentence(annotations['sentence'], tokens)

matched[:3]

[([{'pos': 'NNS', 'start': 0, 'text': 'Mechanisms'},
   {'pos': 'IN', 'start': 11, 'text': 'of'},
   {'pos': 'NN', 'start': 14, 'text': 'acid'},
   {'pos': 'NN', 'start': 19, 'text': 'resistance'},
   {'pos': 'IN', 'start': 30, 'text': 'in'},
   {'pos': 'JJ', 'start': 33, 'text': 'enterohemorrhagic'},
   {'pos': 'FW', 'start': 51, 'text': 'Escherichia'},
   {'pos': 'FW', 'start': 63, 'text': 'coli'},
   {'pos': '.', 'start': 67, 'text': '.'}],
  {'annotations': [],
   'relations': [],
   'sentence': 'Mechanisms of acid resistance in enterohemorrhagic Escherichia coli.'}),
 ([{'pos': 'JJ', 'start': 0, 'text': 'Enterohemorrhagic'},
   {'pos': 'NNS', 'start': 18, 'text': 'strains'},
   {'pos': 'IN', 'start': 26, 'text': 'of'},
   {'pos': 'FW', 'start': 29, 'text': 'Escherichia'},
   {'pos': 'FW', 'start': 41, 'text': 'coli'},
   {'pos': 'MD', 'start': 46, 'text': 'must'},
   {'pos': 'VB', 'start': 51, 'text': 'pass'},
   {'pos': 'IN', 'start': 56, 'text': 'through'},
   {'pos': 'DT', 'sta