## Analyze XML file

In [71]:
import xml.etree.ElementTree as ET
# Parse the annotated corpus and collect every <pair> element in document order.
tree = ET.parse('meta.xml')
root = tree.getroot()

xml_pairs_el = []

for docs in root.iter('document'):
    for sent in docs.iter('sentence'):
        # Entity id -> entity text, restricted to the entities of this sentence.
        entities = dict((ent.get('id'), ent.get('text')) for ent in sent.findall('entity'))
        for pair in sent.findall('pair'):
            # Enrich the pair element in place with the sentence text and the
            # names of both entities, so later cells can compare pairs directly.
            pair.set('sentence', sent.get('text').strip())
            pair.set('e1_name', entities[pair.get('e1')])
            pair.set('e2_name', entities[pair.get('e2')])
            xml_pairs_el.append(pair)

xml_pair_ids = [pair_el.get('id') for pair_el in xml_pairs_el]
print('pairs: ', len(xml_pairs_el))
print('pair_ids: ', len(xml_pair_ids))

pairs:  1979
pair_ids:  1979


In [72]:
# Partition the pairs by their `interaction` attribute: exactly 'True' is
# positive; every other value (including a missing attribute) is negative.
xml_pos_pairs, xml_neg_pairs = [], []
for pair_el in xml_pairs_el:
    if pair_el.get('interaction') == 'True':
        xml_pos_pairs.append(pair_el)
    else:
        xml_neg_pairs.append(pair_el)

for label, group in (('all', xml_pair_ids), ('pos', xml_pos_pairs), ('neg', xml_neg_pairs)):
    print(label, len(group))

all 1979
pos 1461
neg 518


## Cross reference with CSV data source

### Compare with both.csv

In [73]:
import csv
# Load the "both" CSV as a list of row dicts keyed by the header names.
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks inside quoted fields reach the parser untranslated.
with open('tf_extraction_paper_thomas/TF.GO_enriched2.PMID-jSRE_v2_PD.both.sorted.csv', newline='') as f:
    csv_data = list(csv.DictReader(f))

In [74]:
# Lookup table: pair id -> the `feedback_PD` annotation of that CSV row.
csv_pairs = {}
for row in csv_data:
    csv_pairs[row['pair_id']] = row['feedback_PD']
len(csv_pairs)

3516

In [75]:
# XML pair elements whose id also occurs as a `pair_id` in the CSV (XML order).
xml_pairs_in_csv = []
for pair_el in xml_pairs_el:
    if pair_el.get('id') in csv_pairs:
        xml_pairs_in_csv.append(pair_el)
len(xml_pairs_in_csv)

1972

1972 of the already-tagged pairs in the XML are also present in the CSV.

In [76]:
# Note: despite the name, this holds pair *ids* (strings), not pair elements.
# These are the ids from the XML that have no row in the CSV, in XML order.
xml_pairs_NOT_in_csv = [
    pair_id
    for pair_id in xml_pair_ids
    if pair_id not in csv_pairs
]
xml_pairs_NOT_in_csv

['PMID.dPMID7533060.s8.p3',
 'PMID.dPMID8100127.s3.p6',
 'PMID.dPMID7774103.s6.p3',
 'PMID.dPMID19551868.s1.p2',
 'PMID.dPMID19526525.s11.p1',
 'PMID.dPMID9681824.s0.p9',
 'PMID.dPMID1325459.s3.p6']

These 7 pair IDs are missing in the _both_ version but are present in _any_.
### Analyze the available XML data with the additional info from Supplement2.csv

In [77]:
def simple_classes(data):
    """Add a binary ``simple_class`` label ('True'/'False') to each row in place.

    Mapping, based on the row's ``class`` field:
      * 'TP' -> 'True'
      * 'FP' -> 'False'
      * 'NN' -> 'True' only when ``details`` is exactly
        'cooperation/competition in transcription', otherwise 'False'

    Rows with any other ``class`` value are left without a ``simple_class`` key.

    Returns the same ``data`` object that was passed in.
    """
    positive_nn_detail = 'cooperation/competition in transcription'
    for row in data:
        label = row['class']
        if label == 'NN':
            # Only NN rows are decided by their free-text details.
            row['simple_class'] = 'True' if row['details'] == positive_nn_detail else 'False'
        elif label == 'TP' or label == 'FP':
            row['simple_class'] = 'True' if label == 'TP' else 'False'
    return data

# Load Supplement2 and attach the binary `simple_class` label to every row.
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks inside quoted fields reach the parser untranslated.
with open('tf_extraction_paper_thomas/Supplement2.csv', 'r', newline='') as f:
    suppl2 = simple_classes(list(csv.DictReader(f)))

# First pass: match XML pairs to the supplement by exact sentence text only.
suppl2_sents = {row['sentence'] for row in suppl2}
xml_pairs_in_suppl2 = [p for p in xml_pairs_el if p.get('sentence') in suppl2_sents]
xml_pairs_not_in_suppl2 = [p for p in xml_pairs_el if p.get('sentence') not in suppl2_sents]
print('found ', len(xml_pairs_in_suppl2), ' of ', len(xml_pairs_el))
# Display the sentences that have no exact counterpart in the supplement.
[p.get('sentence') for p in xml_pairs_not_in_suppl2]

found  1977  of  1979


['In contrast, ablation of VDR expression enhances FoxO3a phosphorylation, as does knockdown of Sirt1, consistent with the coupling of FoxO acetylation and phosphorylation.',
 'Cha, a basic helix-loop-helix transcription factor involved in the regulation of upstream stimulatory factor activity.']

Almost all sentences are present in the supplements file!

In [89]:
# Stricter comparison: a pair only counts as found when both its sentence text
# and its True/False label agree with some supplement row.
# NOTE: this rebinds suppl2_sents / xml_pairs_in_suppl2 / xml_pairs_not_in_suppl2,
# which until here held the results of the sentence-only comparison.
suppl2_sents = {(row['sentence'], row['simple_class']) for row in suppl2}
xml_pairs_in_suppl2, xml_pairs_not_in_suppl2 = [], []
for pair_el in xml_pairs_el:
    key = (pair_el.get('sentence'), pair_el.get('interaction'))
    if key in suppl2_sents:
        xml_pairs_in_suppl2.append(pair_el)
    else:
        xml_pairs_not_in_suppl2.append(pair_el)
print('found ', len(xml_pairs_in_suppl2), ' of ', len(xml_pairs_el))

found  1951  of  1979


In [98]:
# Reverse direction: which Supplement2 rows have a (sentence, label) combination
# that also occurs among the XML pairs?
xml_sents = {(pair_el.get('sentence'), pair_el.get('interaction')) for pair_el in xml_pairs_el}
csv_pairs_in_xml, csv_pairs_not_in_xml = [], []
for row in suppl2:
    if (row['sentence'], row['simple_class']) in xml_sents:
        csv_pairs_in_xml.append(row)
    else:
        csv_pairs_not_in_xml.append(row)
print('found ', len(csv_pairs_in_xml), ' of ', len(suppl2))

found  2003  of  2500


In [92]:
import Levenshtein
# Inspect the first five pairs whose (sentence, label) had no match: for each,
# find the first supplement row with the identical sentence and print both
# annotations side by side. A commented-out fuzzy variant (Levenshtein
# distance < 100) used to sit here; only the exact comparison is in use.
print(len(xml_pairs_not_in_suppl2))
found = 0
for el in xml_pairs_not_in_suppl2[:5]:
    xml_sent = el.get('sentence')
    hit = next((s for s in suppl2 if s['sentence'] == xml_sent), None)
    if hit is None:
        continue
    print(hit['sentence'], '\n' + xml_sent)
    print(el.get('e1_name'), el.get('e2_name'))
    print(hit['gene1'], hit['gene2'])
    print(hit['simple_class'], el.get('interaction'))
    print('-----')
    found += 1
found

28
p73 Interacts with c-Myc to regulate Y-box-binding protein-1 expression. 
p73 Interacts with c-Myc to regulate Y-box-binding protein-1 expression.
Y-box-binding protein-1 p73
Y-box-binding protein-1 p73
True False
-----
Our data suggest that p73 stimulates the transcription of the YB-1 promoter by enhancing recruitment of the c-Myc-Max complex to the E-box 
Our data suggest that p73 stimulates the transcription of the YB-1 promoter by enhancing recruitment of the c-Myc-Max complex to the E-box
YB-1 p73
YB-1 p73
True False
-----
Taken together, these findings suggest that Gli, and probably also Gli2, are good candidates for transcriptional activators of the HNF-3beta floor plate enhancer, and the binding site for Gli proteins is a key element for response to Shh signalling. 
Taken together, these findings suggest that Gli, and probably also Gli2, are good candidates for transcriptional activators of the HNF-3beta floor plate enhancer, and the binding site for Gli proteins is a key el

5

The classification differs between the supplement and ULF. Maybe ULF can be used to annotate the ~1000 ambiguous instances...

### Annotating ambiguous Supplement2 sentences with ULF

In [87]:
def simple_classes(data):
    """Label every row in ``data`` with a 'True'/'False' ``simple_class`` string.

    Same mapping as the ``simple_classes`` defined earlier in this notebook;
    this redefinition shadows that one.

    'TP' rows become 'True' and 'FP' rows 'False'. 'NN' rows become 'True' only
    if their ``details`` equal 'cooperation/competition in transcription', else
    'False'. Rows of any other class are not given a ``simple_class`` key.

    The rows are modified in place and ``data`` itself is returned.
    """
    direct = {'TP': 'True', 'FP': 'False'}
    for entry in data:
        kind = entry['class']
        if kind in direct:
            entry['simple_class'] = direct[kind]
        elif kind == 'NN':
            is_cooperation = entry['details'] == 'cooperation/competition in transcription'
            entry['simple_class'] = 'True' if is_cooperation else 'False'
    return data

# Load the ambiguous subset of Supplement2 and attach `simple_class` labels.
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks inside quoted fields reach the parser untranslated.
with open('tf_extraction_paper_thomas/Supplement2.ambigous.csv', 'r', newline='') as f:
    ambigous = simple_classes(list(csv.DictReader(f)))

# Every (sentence, entity 1 name, entity 2 name) combination present in the XML.
xml_triples = {(p.get('sentence'), p.get('e1_name'), p.get('e2_name')) for p in xml_pairs_el}

# Count how many ambiguous rows have a counterpart in the XML, accepting the
# two genes in either order. Each row is counted at most once: two separate
# `if`s would count a row twice whenever both orders match (for example when
# gene1 == gene2), which would inflate the "found" figure.
counter = 0
for i in ambigous:
    triple = (i['sentence'], i['gene1'], i['gene2'])
    swapped = (triple[0], triple[2], triple[1])
    if triple in xml_triples or swapped in xml_triples:
        counter += 1
print(counter, 'of', len(ambigous), 'found')

820 of 996 found


### Analyze the coding schema of both.csv

In [81]:
from collections import Counter
# Tally the CSV feedback codes separately for pairs the XML marks as
# interacting ('True') and as non-interacting ('False'). Pairs with any other
# `interaction` value end up in neither tally.
true_c, false_c = Counter(), Counter()
for pair_el in xml_pairs_in_csv:
    verdict = pair_el.get('interaction')
    if verdict == 'True':
        true_c[csv_pairs[pair_el.get('id')]] += 1
    elif verdict == 'False':
        false_c[csv_pairs[pair_el.get('id')]] += 1
true_c.most_common()

[('-->', 401),
 ('<--', 376),
 ('', 232),
 ('c', 178),
 ('e', 160),
 ('<->', 59),
 ('p', 42),
 ('i', 3),
 ('b', 2),
 ('c ', 1)]

In [82]:
# Feedback codes seen on pairs the XML marks as non-interacting, most frequent first.
false_c.most_common()

[('x', 273),
 ('', 172),
 ('w', 37),
 ('n', 20),
 ('[4149-->4904, 4609-->4904]', 2),
 ('[2735-->2735]', 2),
 ('wrong_Tf, Hos is a cell_line in that context', 2),
 ('[51176-->51176]', 1),
 ('[3202-->5241]', 1),
 ('[4791-->4791, 5970-->4791]', 1),
 ('[7490-->2297]', 1),
 ('[3169-->7080, 3170-->7080]', 1),
 ('b', 1),
 ('[5914-->5915, 6256-->5915, 7421-->5915]', 1),
 ('[1385-->2353, 1386-->2353, 1390-->2353]', 1),
 ('[4149-->7157, 4609-->7157]', 1),
 ('[3642-->3642]', 1)]

In [83]:
# The distinct feedback codes observed on positive XML pairs, most frequent first.
true_strings = [code for code, _count in true_c.most_common()]
true_strings

['-->', '<--', '', 'c', 'e', '<->', 'p', 'i', 'b', 'c ']

In [84]:
def simple_class_for_bothCSV(data, positive_labels=('-->', '<--', '', 'c', 'e', '<->', 'p', 'i', 'b', 'c ')):
    """Add a binary ``simple_class`` label ('True'/'False') to each row in place.

    A row is labelled 'True' when its ``feedback_PD`` value is one of
    ``positive_labels`` and 'False' otherwise.

    Parameters
    ----------
    data : iterable of dict
        Rows that each carry a ``feedback_PD`` key; every row is mutated.
    positive_labels : iterable of str, optional
        Feedback codes treated as positive. The default is the list of codes
        this notebook observed on positive XML pairs, so existing calls behave
        as before; pass another collection (e.g. ``true_strings``) to change it.

    Returns
    -------
    The same ``data`` object that was passed in.

    NOTE(review): in the tallies shown in this notebook the codes '' and 'b'
    occur on both positive and negative XML pairs, yet the default treats them
    as positive. Confirm that this is intended.
    """
    positive = frozenset(positive_labels)  # constant-time membership test
    for i in data:
        i['simple_class'] = 'True' if i['feedback_PD'] in positive else 'False'
    return data

# Label every row of the "both" CSV and report the resulting class balance.
# simple_csv_data is the same list object as csv_data (rows are labelled in place).
simple_csv_data = simple_class_for_bothCSV(csv_data)
print('all: ', len(simple_csv_data))
n_pos = sum(1 for row in simple_csv_data if row['simple_class'] == 'True')
print('pos: ', n_pos)
print('neg: ', len(simple_csv_data) - n_pos)

all:  3516
pos:  3139
neg:  377


# Conclusion

* `meta.xml` consists of almost 2000 perfectly annotated pairs
    * an improvement over the 1400 hand-annotated subset of `Supplement2.csv`
* only ~1400 of these pairs are found in Supplement2 when comparing via the sentence