In [83]:
import csv, re, json
from collections import Counter

fp = 'Supplement2.csv'
# newline='' hands newline handling to the csv module, as its documentation
# requires; otherwise line breaks inside quoted fields may be misread.
with open(fp, newline='') as f:
    reader = csv.DictReader(f)
    # One dict per data row, keyed by the CSV header.
    d = list(reader)


## Label calculation for each sentence

In [15]:
def count(d, key):
    """Tally the values stored under ``key`` across the rows in ``d``.

    Returns a list of ``(value, frequency)`` tuples, most frequent first.
    """
    tally = Counter(row[key] for row in d)
    return tally.most_common()

From the paper
* 1535 positive
* 643 false positives
* 322 somewhat positive

In [16]:
# Frequency of each value in the 'class' column, most common first.
count(d, 'class')

[('TP', 1535), ('FP', 631), ('NN', 334)]

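Some NN rows are counted as negatives: the paper only counts 'cooperation/competition in transcription' (322 rows) as somewhat positive. The remaining 334 − 322 = 12 NN rows (presumably the 12 'indirect interaction' rows listed below) then belong to the false positives, giving 631 + 12 = 643.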

In [17]:
# Frequency of each value in the 'details' column, most common first.
count(d, 'details')

[('database', 875),
 ('general or irrelevant information', 512),
 ('binding to regulatory DNA-region/potential direct influence on expression',
  352),
 ('cooperation/competition in transcription', 322),
 ('potential direct influence on expression', 291),
 ('incorrectly recognized gene name or identifier', 91),
 ('negative statement', 28),
 ('binding to regulatory DNA-region', 17),
 ('indirect interaction', 12)]

Now the numbers add up. Create 0/1 classified dataset.

## Example instance calculation
It is only known which genes triggered the classification, not where they are in the sentence. If a sentence contains several mentions of the same gene, it is not possible to guess the correct one.

First, a translation dictionary of some known naming differences is defined. Then, gene_find() employs heuristics to circumvent some systematic errors (capitalization etc.)

In [31]:
# Known spelling differences: annotated gene name (key) -> the form that is
# searched for in the sentence text instead (value).
_GENE_ALIASES = {
    'Peroxisome proliferator-activated receptor-gamma/NFkB': 'Peroxisome proliferator-activated receptor-gamma',
    'NEUROG3/E47': 'NEUROG3',
    'Oct4/Gata-6': 'Oct4-Gata-6',
    'Beta2/NeuoD1': 'Beta2/NeuroD1',
    'TCF/Smad4': 'Smad4',
    'Smad3/c-myc?': 'Smad3',
    'PAX3/FKHR': 'PAX3-FKHR',
    'MLL1': 'MLL',
    'YY1/N1IC': 'N1IC'
}


def gene_find(gene, sent):
    """Locate every occurrence of ``gene`` in ``sent``.

    Three strategies are tried in order; the first one that yields at least
    one hit wins:

    1. exact, case-sensitive match of ``gene``;
    2. case-insensitive match of ``gene``;
    3. exact match of the known alias of ``gene``, if one is defined.

    Args:
        gene: Gene name as given in the annotation.
        sent: Sentence text to search.

    Returns:
        A list of ``re.Match`` objects whose spans are offsets into ``sent``
        itself. The list is empty when nothing matched or ``gene`` is empty.
    """
    if not gene:
        # An empty pattern would "match" at every position of the sentence.
        return []

    pattern = re.escape(gene)
    hits = list(re.finditer(pattern, sent))
    if not hits:
        # Search the original sentence rather than a lowercased copy, so the
        # spans remain valid offsets into ``sent`` even for characters whose
        # lowercase form has a different length (e.g. 'İ').
        hits = list(re.finditer(pattern, sent, flags=re.IGNORECASE))
    if not hits:
        alias = _GENE_ALIASES.get(gene)
        if alias is not None:
            hits = list(re.finditer(re.escape(alias), sent))
    return hits

In [32]:
# Bucket every row by how often each of its two genes occurs in the sentence:
#   'perfect'  - both genes occur exactly once, so their positions are unambiguous
#   'no'       - at least one gene was not found at all
#   'multiple' - both genes were found and at least one occurs more than once
unambiguity = {'perfect': [], 'multiple': [], 'no': []}
for row in d:
    gene1s = gene_find(row['gene1'], row['sentence'])
    gene2s = gene_find(row['gene2'], row['sentence'])

    if len(gene1s) == 1 and len(gene2s) == 1:
        unambiguity['perfect'].append(row)
    elif not gene1s or not gene2s:
        # Attach the match lists to a *copy* for debugging. Adding a 'matches'
        # key to the shared row would make csv.DictWriter exports that take
        # their fieldnames from d[0].keys() reject the row as having an
        # unexpected field.
        unambiguity['no'].append(dict(row, matches={'gene1': gene1s, 'gene2': gene2s}))
    else:
        unambiguity['multiple'].append(row)

for k, v in unambiguity.items():
    print('{}: {}'.format(k, len(v)))

perfect: 1504
multiple: 996
no: 0


1504 sentences are unambiguous (each of the two genes occurs exactly once). For the rest, a resolution has to be found. Export the unambiguous ones.

In [47]:
# Export only the unambiguous rows, i.e. those in which each gene occurs
# exactly once in the sentence. The suffix carries a leading dot like the
# '.ambigous' and '.enriched' exports; newline='' is what the csv module
# requires for files it writes to.
with open(fp + '.unambigous', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=d[0].keys())
    writer.writeheader()
    writer.writerows(unambiguity['perfect'])

In [8]:
# Export the rows in which at least one gene occurs more than once.
# newline='' is what the csv module requires for files it writes to
# (otherwise platforms that translate '\n' emit an extra blank line per row).
with open(fp + '.ambigous', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=d[0].keys())
    writer.writeheader()
    writer.writerows(unambiguity['multiple'])

Add 4 columns to the csv with information about the char position of the genes in the sentence

In [90]:
import xml.etree.ElementTree as ET

def build_ulf_xml_dict(ulf_xml_path):
    """Index the entity pairs of a ULF XML file by sentence and entity texts.

    Args:
        ulf_xml_path: Path of the XML file to parse.

    Returns:
        A dict mapping ``(stripped sentence text, e1 text, e2 text)`` to
        ``(e1 charOffset, e2 charOffset)``. The offsets are the raw attribute
        values (``None`` when the attribute is absent). A later pair with the
        same key overwrites an earlier one.

    Side effects:
        Prints the stripped sentence text for every pair whose first entity
        has no ``charOffset`` attribute.
    """
    tree_root = ET.parse(ulf_xml_path).getroot()
    offsets_by_triple = {}
    for document in tree_root.iter('document'):
        for sentence in document.iter('sentence'):
            # entity id -> (surface text, raw charOffset attribute)
            entity_info = {}
            for entity in sentence.findall('entity'):
                entity_info[entity.get('id')] = (entity.get('text'), entity.get('charOffset'))

            for pair in sentence.findall('pair'):
                first_text, first_offset = entity_info[pair.get('e1')]
                second_text, second_offset = entity_info[pair.get('e2')]
                sentence_text = sentence.get('text').strip()
                offsets_by_triple[(sentence_text, first_text, second_text)] = (first_offset, second_offset)
                if first_offset is None:
                    print(sentence_text)
    return offsets_by_triple


def _parse_char_offset(char_offset):
    """Turn an inclusive ``'start-end'`` offset string into ``(start, end_exclusive)``."""
    start, end = char_offset.split('-')
    return int(start), int(end) + 1


def enrich(rows, ulf_xml_path='../feedback_ulf.annotated.parsed.xml'):
    """Add the character positions of both genes to every row, in place.

    Each row gains the keys ``gene1_char_start``, ``gene1_char_end``,
    ``gene2_char_start`` and ``gene2_char_end`` (end exclusive). When both
    genes occur exactly once in the sentence the positions come from
    ``gene_find``. Otherwise the triple ``(sentence, gene1, gene2)`` is looked
    up in the ULF XML index. Rows that cannot be resolved either way keep
    ``None`` in all four fields.

    Args:
        rows: List of row dicts with at least ``sentence``, ``gene1`` and
            ``gene2`` keys. The dicts are modified in place.
        ulf_xml_path: Path of the ULF XML file used for the fallback lookup.

    Returns:
        The same ``rows`` list that was passed in.

    Side effects:
        Prints any ULF-derived span whose text differs from the gene name,
        and finally the number of rows that could not be resolved.
    """
    ulf = build_ulf_xml_dict(ulf_xml_path)
    unresolved = 0
    for row in rows:
        row['gene1_char_start'] = row['gene1_char_end'] = None
        row['gene2_char_start'] = row['gene2_char_end'] = None
        sentence = row['sentence']
        gene1s = gene_find(row['gene1'], sentence)
        gene2s = gene_find(row['gene2'], sentence)

        if len(gene1s) == 1 and len(gene2s) == 1:
            span1, span2 = gene1s[0].span(), gene2s[0].span()
        else:
            # Fall back to the ULF XML index, keyed by (sentence, gene1, gene2).
            # NOTE(review): the index keys use the *stripped* sentence text while
            # the lookup uses row['sentence'] as-is - confirm the CSV sentences
            # never carry leading/trailing whitespace.
            try:
                offset1, offset2 = ulf[(sentence, row['gene1'], row['gene2'])]
                span1 = _parse_char_offset(offset1)
                span2 = _parse_char_offset(offset2)
            except (KeyError, AttributeError, ValueError):
                # KeyError: triple not in the index. AttributeError: charOffset
                # attribute missing (None). ValueError: offset not of the form
                # 'int-int'. In all cases the row stays unresolved.
                unresolved += 1
                continue
            # Sanity check - nothing should be printed.
            for (start, end), gene in ((span1, row['gene1']), (span2, row['gene2'])):
                if sentence[start:end] != gene:
                    print(sentence[start:end], '//', gene)

        # Assign only once both spans are known, so a row is never half-filled.
        row['gene1_char_start'], row['gene1_char_end'] = span1
        row['gene2_char_start'], row['gene2_char_end'] = span2
    print(unresolved)
    return rows

In [91]:
# enrich() adds the four position columns to the rows of `d` in place and
# returns that same list; the number it prints is the count of rows left
# without positions.
enriched = enrich(d)
# newline='' is what the csv module requires for files it writes to.
with open(fp + '.enriched', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=enriched[0].keys())
    writer.writeheader()
    writer.writerows(enriched)

259
