# BC5CDR


In [1]:
import gzip
import io
import os
import re
import tarfile
import urllib.request
import zipfile

# Directory (relative to the current working directory) that receives the
# downloaded archive and its extracted contents.
DATA_ROOT = "datasets/"

# makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: there is no
# window between the check and the creation, missing parent directories are
# created as well, and re-running the cell is a no-op.
os.makedirs(DATA_ROOT, exist_ok=True)

def download(url, fpath):
    """Fetch ``url`` and write the response body to ``fpath``.

    The request is sent with a browser-like ``User-agent`` header. The header
    lives on an opener that is private to this call, so urllib's process-wide
    default opener is not replaced as a side effect of downloading.

    Args:
        url: Address of the resource to fetch.
        fpath: Destination file path; an existing file is overwritten.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        OSError: If ``fpath`` cannot be opened for writing.
    """
    import shutil  # local import: only this function needs it

    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    # The destination is opened only after the request succeeded, so a failed
    # request does not leave an empty file behind.
    with opener.open(url) as response, open(fpath, "wb") as out:
        shutil.copyfileobj(response, out)
    
# CDR corpus archive hosted on biocreative.org.
url = "http://www.biocreative.org/media/store/files/2016/CDR_Data.zip"

# The archive is stored under DATA_ROOT, named after the last URL component.
# os.path.join avoids the doubled separator that DATA_ROOT's trailing slash
# would otherwise produce.
outfpath = os.path.join(DATA_ROOT, url.strip().split('/')[-1])

# Only hit the network when no usable archive is present. is_zipfile() is
# False both for a missing file and for one that is not a valid zip archive
# (e.g. left behind by an interrupted download), so those cases are re-fetched.
if not zipfile.is_zipfile(outfpath):
    download(url, outfpath)

# unzip files
with zipfile.ZipFile(outfpath, 'r') as zip_ref:
    zip_ref.extractall(DATA_ROOT)
 

In [18]:
import bioc

class Annotation:
    """One entity mention.

    Plain value holder; the constructor stores its arguments unchanged.

    Attributes:
        offsets: Position of the mention (``parse_bioc_annotations`` passes a
            ``(start, end)`` tuple).
        text: Surface form of the mention.
        type_: Entity type label.
        kb_id: Knowledge-base identifier attached to the mention.
    """

    def __init__(self, offsets, text, type_, kb_id):
        # What the mention is ...
        self.type_, self.kb_id = type_, kb_id
        # ... and where/how it appears.
        self.text, self.offsets = text, offsets

class Document:
    """One parsed document.

    Plain value holder; the constructor stores its arguments unchanged.

    Attributes:
        text: Full document text.
        ents: Entity annotations belonging to the document.
        relations: Relations belonging to the document.
    """

    def __init__(self, text, ents, relations):
        self.relations = relations
        self.ents = ents
        self.text = text
        
def parse_bioc_annotations(xdoc, kb_id_key=None):
    """Convert one BioC document into a ``Document``.

    The passage texts are joined with a single space to form the document
    text, every annotation of every passage becomes an ``Annotation``, and the
    infons of each document-level relation are collected.

    Args:
        xdoc: Document object exposing ``passages`` (each with ``text`` and
            ``annotations``) and ``relations``.
        kb_id_key: Name of the annotation infon that holds the knowledge-base
            id. When it is falsy, or when an annotation does not carry that
            infon, the annotation's ``kb_id`` is the sentinel ``-1``.

    Returns:
        A ``Document`` whose ``ents`` are sorted by ascending offsets and whose
        ``relations`` is a list with one infons mapping per relation.

    Raises:
        KeyError: If an annotation has no ``'type'`` infon.
        IndexError: If an annotation has no locations.
    """
    # NOTE(review): annotation offsets are used unchanged, which presumes they
    # index into this space-joined text -- confirm for corpora whose passages
    # are not separated by exactly one character.
    text = ' '.join(section.text for section in xdoc.passages)

    ents = []
    for section in xdoc.passages:
        for span in section.annotations:
            # Only the first location is considered; any further locations of
            # the same annotation are ignored.
            first = span.locations[0]
            offsets = (first.offset, first.offset + first.length)
            # .get() keeps an annotation without the requested infon from
            # aborting the whole parse; it is reported like "no key requested".
            kb_id = span.infons.get(kb_id_key, -1) if kb_id_key else -1
            ents.append(Annotation(offsets, span.text, span.infons['type'], kb_id))

    return Document(text,
                    sorted(ents, key=lambda ent: ent.offsets),
                    [rel.infons for rel in xdoc.relations])
            
def load_corpus(fname):
    """Read a BioC XML file into a list of ``Document`` objects.

    Each document yielded by ``bioc.BioCXMLDocumentReader`` is converted with
    ``parse_bioc_annotations``, using the ``'MESH'`` infon as the
    knowledge-base id.

    Args:
        fname: Path of the BioC XML file, handed to the reader as-is.

    Returns:
        The converted documents, in the order the reader yields them.
    """
    documents = []
    for xdoc in bioc.BioCXMLDocumentReader(fname):
        documents.append(parse_bioc_annotations(xdoc, 'MESH'))
    return documents

# Load the three corpus splits; the file names differ only in the split label.
train, dev, test = (
    load_corpus(f"{DATA_ROOT}/CDR_Data/CDR.Corpus.v010516/CDR_{split}Set.BioC.xml")
    for split in ("Training", "Development", "Test")
)

# Number of documents per split.
print(len(train), len(dev), len(test))


500 500 500


## Prompts

In [49]:
def list_mentions_for_an_entity_type(x, entity_type):
    """
    Build a "list every mention of this entity type" prompt for one document.

    The returned string is the instruction, the quoted document text, the
    separator ``|||`` and then the target: the texts of all entities in
    ``x.ents`` whose ``type_`` equals ``entity_type``, joined by ", " in their
    stored order (repeats are kept), or "None" when that join is empty.

    answer_choices: N/A
    answers_in_prompt: yes
    original task: yes
    """
    template = (
        "Create a comma-separated list of all {} names mentioned in the following PubMed abstract. "
        "If there are no {} mentions, print None.\n"
        '"{}"\n|||{}'
    )

    mentions = [ent.text for ent in x.ents if ent.type_ == entity_type]
    # An empty join (no matching mention) falls back to the literal "None".
    target = ", ".join(mentions) or "None"
    return template.format(entity_type, entity_type, x.text, target)

# Show the prompt for each entity type on the first training document.
# `prompt` is left holding the last one built (the Disease prompt).
for entity_type in ("Chemical", "Disease"):
    prompt = list_mentions_for_an_entity_type(train[0], entity_type)
    print(prompt)


Create a comma-separated list of all Chemical names mentioned in the following PubMed abstract. If there are no Chemical mentions, print None.
"Naloxone reverses the antihypertensive effect of clonidine. In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation involve activation of opiate recep

In [55]:
import collections

def create_multiple_choice(x):
    """
    Unfinished builder for multiple-choice prompts.

    At present it groups the lower-cased mention texts of ``x.ents`` by
    ``kb_id`` (the grouping is not used afterwards), prints ``x.relations``
    and returns nothing.

    Prompt sketches:

    Can disease x be induced by chemical y? Yes/No
    Can chemical y induce disease x? Yes/No


    Chemical y can induce which of the following diseases?

    -
    - None of the above

    """
    # build mapping of MESH ids to surface forms
    surface_forms = {}
    for ent in x.ents:
        surface_forms.setdefault(ent.kb_id, set()).add(ent.text.lower())

    print(x.relations)



# Try the function on the first training document; it prints that document's relations.
# NOTE(review): the recorded output below also shows the id -> surface-form mapping,
# which the current function body does not print -- re-run the cell to refresh it.
create_multiple_choice(train[0])


{'D009270': {'naloxone'}, 'D003000': {'clonidine'}, 'D006973': {'hypertensive'}, '-1': {'nalozone', '[3h]-naloxone', '[3h]-dihydroergocryptine'}, 'D007022': {'hypotensive'}, 'D008750': {'alpha-methyldopa'}}
[{'relation': 'CID', 'Chemical': 'D008750', 'Disease': 'D007022'}]


## Summary stats of prompt dataset

In [37]:
from transformers import BertTokenizer
# Tokenizer loaded under the pretrained name "bert-base-uncased"; used below to
# measure the length of a prompt in tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [38]:
# Tokenize `prompt`, i.e. whichever prompt string was assigned last in the
# Prompts section (instruction, quoted abstract and the target after "|||").
# NOTE(review): this cell's execution count (In [38]) is lower than the prompt
# cell's (In [49]), so the recorded result may stem from an earlier prompt --
# re-run the notebook top to bottom to confirm.
toks = tokenizer.tokenize(prompt)


In [40]:
# Number of tokens in the tokenized prompt (shown as the cell's result).
len(toks)

426