This notebook loads all MeasEval competition texts and annotations and converts them to a GATE-readable format.

In [6]:
#imports
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json

# Root folders of the trial and train data splits, relative to the notebook's working directory.
TRIALPATH = "data/trial"
TRAINPATH = "data/train"

# Set up spaCy: ask for the GPU (spacy.prefer_gpu) and load the large English pipeline.
# `nlp` is the module-level pipeline that the exerpt class calls to parse each document.
import spacy
import en_core_web_lg
spacy.prefer_gpu()
nlp = en_core_web_lg.load()

In [8]:
class exerpt:
    """
    Class exerpt
    Description: a simple class to contain data for the measeval competition

    self.name : the measeval given name associated to the document
    self.txt : the raw text of the document, as one string
    self.ann : the brat annotations of the document(deprecated)
    self.tsv : a pandas dataframe containing all the tab separated value data
    self.grobid : grobid quantities json output for quantity detection
    self.doc : The spacy doc generated from processing on this particular document
    self.context : True once getContext() has added its columns to self.tsv
    self.posTag : True once getPosTag() has added its columns to self.tsv
    """

    def __init__(self, name, txt, ann, tsv, grobid):
        """Store the raw inputs and parse `txt` with the module-level spaCy pipeline `nlp`."""
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        self.doc = nlp(self.txt)

    def getContext(self, count = 3):
        """
        Add "beforeContext" and "afterContext" columns to self.tsv.

        For every annotation row, the text running from a boundary before
        startOffset up to startOffset, and from endOffset up to a boundary
        after it, is stored. The boundaries come from getIndexSeperated, which
        is defined outside this class.
        NOTE(review): presumably it returns the character index `count`
        separators away from the given offset - confirm against its definition.

        The method runs at most once per object; later calls do nothing.

        count : passed through to getIndexSeperated for both directions
        """
        if self.context:
            return

        beforeContext = []
        afterContext = []
        for start, end in zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values):
            # self.txt is the whole document string, so offsets index into it directly.
            a = getIndexSeperated(self.txt, end, count = count)
            b = getIndexSeperated(self.txt, start, count = count, forward = False)
            beforeContext.append(self.txt[b:start])
            afterContext.append(self.txt[end:a])

        # Both inserts target position 3, so "afterContext" ends up left of "beforeContext".
        self.tsv.insert(3,"beforeContext", beforeContext)
        self.tsv.insert(3,"afterContext", afterContext)
        # Only flag success once the columns really exist, so a failed run can be retried.
        self.context = True

    def getPosTag(self):
        """
        Add "beforeTag" and "afterTag" columns holding NLTK part-of-speech tags.

        Each context string produced by getContext() is tokenized with
        word_tokenize and tagged with nltk.pos_tag. Nothing happens when
        getContext() has not run yet, or when the tags were already added.
        """
        if not self.context or self.posTag:
            return

        bTag = []
        aTag = []
        for before, after in zip(self.tsv["beforeContext"].values,self.tsv["afterContext"].values):
            bTag.append(nltk.pos_tag(word_tokenize(before)))
            aTag.append(nltk.pos_tag(word_tokenize(after)))

        self.tsv.insert(3,"beforeTag", bTag)
        self.tsv.insert(3,"afterTag", aTag)
        # Set last for the same reason as in getContext.
        self.posTag = True

            

In [9]:
def readTXTByLine(filepath):
    """
    Return the full content of a UTF-8 encoded text file as one string.

    Despite the name, the text is not split into lines.

    filepath : path of the file to read
    """
    # The with-block closes the handle even when read() raises.
    with open(filepath, "r", encoding = "utf-8") as ftemp:
        return ftemp.read()

def loadExerpts(root, textDir, annDir = None, requireTsv = False):
    """
    Build an exerpt for every ".txt" file found in root/textDir.

    root : folder of one data split; it must contain "tsv" and "grobid" subfolders
    textDir : name of the subfolder holding the raw ".txt" documents
    annDir : name of the subfolder holding brat ".ann" files; when None the
             annotation text of every exerpt is the placeholder string "none"
    requireTsv : when True, documents without a matching ".tsv" file are skipped
                 instead of raising when the file is opened

    Returns a dict mapping the document name (file name without ".txt") to its exerpt.
    """
    loaded = {}
    # List the tsv folder once, not once per candidate document.
    tsvFiles = set(os.listdir(os.path.join(root, "tsv"))) if requireTsv else None

    for fn in os.listdir(os.path.join(root, textDir)):
        if not fn.endswith('.txt'):
            continue
        name = fn[:-4]
        if requireTsv and name + ".tsv" not in tsvFiles:
            continue

        txt = readTXTByLine(os.path.join(root, textDir, name + ".txt"))
        if annDir is None:
            ann = "none"
        else:
            ann = readTXTByLine(os.path.join(root, annDir, name + ".ann"))

        # sep is given by keyword: recent pandas releases no longer accept it positionally.
        tsv = pd.read_csv(os.path.join(root, "tsv", name + ".tsv"), sep = "\t", header = 0)

        # NOTE(review): opened with the platform default encoding, as before - confirm
        # whether the grobid output should be read as UTF-8 explicitly.
        with open(os.path.join(root, "grobid", name + ".grobid")) as grobidFile:
            grobid = json.load(grobidFile)

        loaded[name] = exerpt(name, txt, ann, tsv, grobid)

    return loaded


data = {}

#load all trial data
data.update(loadExerpts(TRIALPATH, "txt", annDir = "ann"))

#load all train data (only documents that have a tsv annotation file)
data.update(loadExerpts(TRAINPATH, "text", requireTsv = True))

The following cell converts the spaCy output and the human annotations of every document into a GATE-readable file, using the text/x-json-twitter format.

In [10]:

def toTwitterJson(excerpt):
    """
    Convert one exerpt into a dict laid out for the text/x-json-twitter format.

    The result has the document text under "full_text" and, under "entities",
    one list per annotation type. Every entry carries an "indices" pair of
    [start, end] offsets:
      - "Token"      : spaCy tokens with their tag ("category"), dep, id and head
      - <ent label>  : spaCy named entities, grouped by entity label
      - "sentence"   : spaCy sentence spans
      - "MEval-<annotType>" : rows of excerpt.tsv with annotSet, annotId, text and other

    excerpt : object exposing `.doc` (spaCy doc) and `.tsv` (annotation dataframe)
    """
    spacyJson = excerpt.doc.to_json()
    entities = {}

    for tok in spacyJson["tokens"]:
        entities.setdefault("Token", []).append({
            "indices": [tok["start"], tok["end"]],
            "category": tok["tag"],
            "dep": tok["dep"],
            "id": tok["id"],
            "head": tok["head"],
        })

    for ent in spacyJson["ents"]:
        entities.setdefault(ent["label"], []).append({
            "indices": [ent["start"], ent["end"]],
        })

    for sent in spacyJson["sents"]:
        entities.setdefault("sentence", []).append({
            "indices": [sent["start"], sent["end"]],
        })

    # NOTE(review): cells that are empty in the tsv are read by pandas as NaN, which
    # json.dump writes as the bare token NaN - confirm the GATE importer accepts that.
    for index, row in excerpt.tsv.iterrows():
        entities.setdefault("MEval-"+row["annotType"], []).append({
            "indices": [row["startOffset"], row["endOffset"]],
            "annotSet": row["annotSet"],
            "annotId": row["annotId"],
            "text": row["text"],
            "other": row["other"],
        })

    return {"full_text": spacyJson["text"], "entities": entities}


# Make sure the output folder exists so a fresh checkout can run this cell.
os.makedirs("jsondocs", exist_ok = True)

for doc in data.values():
    twitjson = toTwitterJson(doc)
    # The with-block flushes and closes each file as soon as it is written.
    with open(f'jsondocs/{doc.name}.json',"w") as outFile:
        json.dump(twitjson, outFile)

In [11]:
# Show the names of all loaded documents (the dict keys are the file names without ".txt").
data.keys()

dict_keys(['S0012821X12004384-1302', 'S0012821X12004384-1405', 'S0012821X12004384-1415', 'S0012821X12004384-1594', 'S0012821X12004384-1599', 'S0012821X13002185-1061', 'S0012821X13002185-1200', 'S0012821X13002185-1217', 'S0012821X13002185-1231', 'S0012821X13002185-835', 'S0012821X13007309-1482', 'S0012821X13007309-1509', 'S0012821X13007309-1605', 'S0012821X13007309-1989', 'S0016236113008041-3031', 'S0016236113008041-3112', 'S0016236113008041-3153', 'S0016236113008041-3171', 'S0016236113008041-3186', 'S0016236113008041-3207', 'S0016236113008041-3269', 'S0016236113008041-3290', 'S0016236113008041-890', 'S0016236113008041-913', 'S0016236113008041-967', 'S0019103511004994-1382', 'S0019103511004994-1511', 'S0019103511004994-1565', 'S0019103512002801-1342', 'S0019103512002801-1496', 'S0019103512002801-1608', 'S0019103512002801-1824', 'S0019103512002801-1849', 'S0019103512002801-1927', 'S0019103512003533-3299', 'S0019103512003533-3348', 'S0019103512003533-4685', 'S0019103512003533-4971', 'S001

In [13]:
# Inspect the spaCy JSON serialisation of a single document.
data['S2211124713006475-1205'].doc.to_json()

{'text': 'We next addressed whether hESC-derived trigeminal neurons can engraft in the adult mouse CNS and project toward their physiological target. The trigeminal nuclei in the brainstem receive afferent innervation from the trigeminal sensory ganglion that is relayed to the contralateral thalamus. The pons was selected as site for transplantation, because it is surgically accessible and located within proximity of the trigeminal brain stem nuclei that receive afferent input from the trigeminal ganglia. Hence, GFP+ human trigeminal neuron clusters were injected into adult NOD/SCID mice via stereotactic surgery (see Experimental Procedures). Histological analysis 4 weeks after transplantation showed survival of GFP+ human cell graft in the ventral pons (Figure S5E). Although GFP+ cell bodies remained tightly clustered at injection site, GFP+ fibers showed extensive projections into the host brain (n = 6) including the endogenous trigeminal nuclei (Figure S5F). Expression of BRN3A conf

In [14]:
# Inspect the named entities spaCy found in the same document.
data['S2211124713006475-1205'].doc.ents

(CNS, pons, GFP+, Histological, 4 weeks, GFP+, S5E, GFP+, GFP+, S5H)