This notebook contains code for loading in all competition texts and annotations, and converting these to gate readable format

In [31]:
#imports
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json
import code
from code.exerpt import exerpt
TRIALPATH = "data/trial"
TRAINPATH = "data/train"
from code.common import LATEST_tsv

#setup SpaCy
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

from benepar.spacy_plugin import BeneparComponent
nlp.add_pipe(BeneparComponent('benepar_en2_large'))
spacy.prefer_gpu()


False

Rule-based Pipeline component for retrieving Quantities. 

Rules: 
    1. any CD followed by a unit that has a nouns POS
    2. any cardinal, money, ordinal, percent, date, time or quantity followed by a unit that has a noun POS
    3. any token that is LIKE_NUM followed by a unit that has a noun POS



In [32]:
#add pipeline component 
from spacy.matcher import Matcher
from spacy.tokens import Doc
Doc.set_extension("unit", default = "def", force = True)


def customMatcher(nlp):
    """
    Description: matcher giving the most recall so far
    """
    matchList = open("gazetteers/combined_measurements.lst","r",encoding="utf-8").read().split("\n")
    matcher = Matcher(nlp.vocab)
    pattern = []
    for word in matchList: 
        pattern.append([{"TAG": {"REGEX": "^[CD]"}},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        pattern.append([{"ENT_TYPE": {"IN": ["CARDINAL", "MONEY", "ORDINAL", "PERCENT", "DATE", "TIME", "QUANTITY"]},
                        "TAG":{"REGEX": "^[DT]"},"op": "!"},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        pattern.append([{"LIKE_NUM": True},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        #pattern.append([{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        
    matcher.add("Unit", None, *pattern)
        
    return matcher

def gazetteer(doc):
    matcher = customMatcher(nlp)
    matches = matcher(doc)
    doc._.unit = []
    for match_id, start, end in matches:
        tempSpan = doc[start:end]
        doc._.unit.append({'start': tempSpan.start_char, 'end': tempSpan.end_char, 'label': 'UNIT', 'text' : doc[start:end]})
    return doc
        
        
        
#nlp.add_pipe(gazetteer, last=True)
    

In [33]:
#pipeline component H0

Doc.set_extension("h0Number", default = "def", force = True)
Doc.set_extension("h0Unit", default = "def", force = True)
Doc.set_extension("h0MeasuredEntity", default = "def", force = True)
Doc.set_extension("h0Measurements", default = "def", force = True)

#ents-ORIG = ["CARDINAL", "MONEY", "PERCENT", "DATE", "TIME", "QUANTITY"]
ents = ["CARDINAL", "MONEY", "PERCENT", "DATE", "TIME", "QUANTITY"]
ENTITIES = ""
for x in ents:
    ENTITIES += x
    
def numberMatcher(nlp):
    """
    Description: matcher giving the most recall so far
    """
    matcher = Matcher(nlp.vocab)
    pattern = []
    pattern.append([{"LIKE_NUM": True}])
    pattern.append([{"ENT_TYPE": {"IN": ents}}])
    matcher.add("h0Number", None, *pattern)
        
    return matcher


def h0(doc):
    matcher = numberMatcher(nlp)
    matches = matcher(doc)
    doc._.h0Number = []
    doc._.h0Unit = []
    doc._.h0MeasuredEntity = []
    doc._.h0Measurements = []
    for match_id, start, end in matches:
        
        tempSpan = doc[start:end]
        tempTok = doc[start]
        tempNum = {
            'start': tempSpan.start_char, 
            'end': tempSpan.end_char, 
            'label': 'h0Number', 
            'text' : tempTok.text,
            'span' : tempSpan,
            's' : start,
            'e' : end
        }
        
        doc._.h0Number.append(tempNum)
        
        tempHead = tempTok.head
        spanHead = doc[tempHead.i:tempHead.i+1]
        tempUnit = {
            'start': spanHead.start_char, 
            'end': spanHead.end_char, 
            'label': 'h0Unit', 
            'text' : tempHead.text,
            'span' : spanHead,
            's' : tempHead.i,
            'e' : tempHead.i+1
        }
        
        doc._.h0Unit.append(tempUnit)
        
        tempHeadHead = None
        spanHeadHead = None
        if tempHead.dep_ == "pobj":
            tempHeadHead = tempTok.head.head.head
            spanHeadHead = doc[tempHeadHead.i:tempHeadHead.i+1]
        else:
            tempHeadHead = tempTok.head.head
            spanHeadHead = doc[tempHeadHead.i:tempHeadHead.i+1]
            
        
        tempME = {
            'start': spanHeadHead.start_char, 
            'end': spanHeadHead.end_char, 
            'label': 'h0MeasuredEntity', 
            'text' : tempHeadHead.text,
            'span' : spanHeadHead,
            's' : tempHeadHead.i,
            'e' : tempHeadHead.i+1
        }
        
        doc._.h0MeasuredEntity.append(tempME)
        
        doc._.h0Measurements.append({
            "Number" : tempNum,
            "Unit" : tempUnit,
            "MeasuredEntity": tempME
        })
        
        
    return doc


nlp.add_pipe(h0, last=True)

In [34]:
#
#TEST ONE
#

def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}
#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )
        break

In [35]:
import time
t1 = time.time()
def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}

#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )

#load all train data
for fn in [x for x in os.listdir(os.path.join(TRAINPATH,"text")) if x[:-4]+".tsv" in os.listdir(os.path.join(TRAINPATH,"tsv"))]:
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRAINPATH, "text", fn[:-4] + ".txt")),
                "none",
                pd.read_csv(os.path.join(TRAINPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRAINPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )
            
t2 = time.time()
print(t2-t1, "Seconds elapsed")
print((t2-t1)/60, "Minutes elapsed")

344.07167291641235 Seconds elapsed
5.734527881940206 Minutes elapsed


In [36]:
other = []
for e in data.values():
    for x in e.measurements.values():
        try:
            print(x["MeasuredProperty"]["text"], x["MeasuredEntity"]["text"],x["Quantity"]["text"],)
        except KeyError:
            continue



%low salinity dinoflagellate cysts Samples 0% to 80%
peridinoid cysts DA1 on average 5%
southern boundary northern rain belt 40°N
before CIE onset from 103 yrs
before CIE onset above 2618 m
present today on Antarctica ice volume between 60% and 100%
emplacement of an ice volume Oi1 ∼400 ka
began decline ∼2 Ma
NaOH wet alkaline digestion 0.2 M
m/Δm∼3500 mass spectrometer 5%
m/Δm∼3500 mass spectrometer 95%
30ε diatom silicon isotope fractionation factor ∼−1‰
differences in the abundance diatoms 40%
correlation δ30Si values of size fractions between 2 and 20 μm r2=0.92
apparent 30ε δ30Si values of size fractions between 2 and 20 μm ∼–1‰
O ammonite Watinoceras devonense F
O W. devonense F
O Mytiloides puebloensis F
positive shift VPDB δ13Corg 2–3‰
VPDB δ13Corg upper Hartland Shale ∼−27‰
below the CTB upper Hartland Shale 4.3 m
gradual fall δ13Corg ∼−27‰
thick package of black organic-rich calcareous shales, termed the “Niveau Thomel” ∼20 m
came from Vergons some of the samples n=4
187Os/18

one P simulation time 12 h
time step of Δt one P = 12 h sinusoidal period 864 s
averaged power extracted one cycle 22%
averaged power extracted one cycle 48.4 MW to 59.0 MW
full period simulation time one
output farm 2%
load factors wind farms in the UK 5–13% per year
onset PEO and pigmentary retinopathy before the age of 20 years
protein concentration cerebrospinal fluid (CSF) greater than 0.1 g/L
maximum yield for the year nutrient sufficiency recommendation system 90–95%
period of time soil test values 4–8 years
nominal resolution LEO 1525 field emission scanning electron microscope 1.5 nm
heating rate microscopy samples 5 K/min
rapid cooling to ambient temperature at a rate microscopy samples about 30 K/min
negative band solid state CD spectrum of GMP 260 nm
negative band spectrum of complex 1 215–225 nm
charged cells 4.0 V
rate cells 12 mA g− 1
rates cells 1/20 (12 mA g− 1)–30C (7260 mA g− 1)
RED drops packets 25%
DropTail packets 7.6%
decreases Rb from 0.25 Mbps to 0.01 Mbps
diag

In [37]:
other

[]

In [38]:
from code.helpers import *


In [39]:
#testing 
doc = data["S0019103512002801-1927"].doc
s1=doc[3:7]
s2=doc[3:4]
if not intersectSpanSpan(s1,s2):
    print("error1")
s1=doc[3:7]
s2=doc[3:7]
if not intersectSpanSpan(s1,s2):
    print("error2")
s1=doc[3:7]
s2=doc[5:7]
if not intersectSpanSpan(s1,s2):
    print("error3")
s2=doc[3:7]
s1=doc[3:4]
if not intersectSpanSpan(s1,s2):
    print("error4")
s2=doc[3:7]
s1=doc[5:7]
if not intersectSpanSpan(s1,s2):
    print("error5")

In [40]:
#get false positives and true positives
Doc.set_extension("h0NumberTps", default = "def", force = True)
Doc.set_extension("h0UnitTps", default = "def", force = True)
Doc.set_extension("h0MeasuredEntityTps", default = "def", force = True)
Doc.set_extension("h0NumberFps", default = "def", force = True)
Doc.set_extension("h0UnitFps", default = "def", force = True)
Doc.set_extension("h0MeasuredEntityFps", default = "def", force = True)
Doc.set_extension("h0MeasurementTps", default = "def", force = True)

for e in data.values():
    doc = e.doc
    doc._.h0NumberTps = []
    doc._.h0UnitTps = []
    doc._.h0MeasuredEntityTps = []
    doc._.h0NumberFps = []
    doc._.h0UnitFps = []
    doc._.h0MeasuredEntityFps = []
    doc._.h0MeasurementTps = []

    
    for meas in doc._.h0Measurements:
        num = meas["Number"]
        unit = meas["Unit"]
        me = meas["MeasuredEntity"]
        
        for m in e.measurements.values():
            try:
                if(intersectSpan(num["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"]) and intersectSpan(unit["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"])):
                    doc._.h0NumberTps.append(num)
                    doc._.h0UnitTps.append(unit)
                    if(intersectSpan(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"]) or
                      intersectSpanNum(me["start"],me["end"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])):
                        doc._.h0MeasuredEntityTps.append(num)
                        doc._.h0MeasurementTps.append(meas)
                        if(e.name == "S0016236113008041-3112" and num["text"] ==10):
                            print("Quantity:",m["Quantity"]["text"], "MeasuredEntity",m["MeasuredEntity"]["text"])
                            print(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                            print(me["start"],me["end"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                    else:
                        r=dict(meas)
                        del r["MeasuredEntity"]
                        doc._.h0MeasurementTps.append(r)
                        
                        if(e.name == "S0016236113008041-3112" and num["text"] == '10'):
                            print("Quantity:",m["Quantity"]["text"], "MeasuredEntity",m["MeasuredEntity"]["text"])
                            print(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                            print(me["start"],me["end"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                        
            except KeyError:
                continue#print("No quantity")

Quantity: 10% MeasuredEntity acid
acid 751 755
775 779 751 755


In [11]:
data["S0016236113008041-3112"].doc._.h0MeasurementTps

[{'Number': {'start': 766,
   'end': 768,
   'label': 'h0Number',
   'text': '10',
   'span': 10,
   's': 140,
   'e': 141},
  'Unit': {'start': 768,
   'end': 769,
   'label': 'h0Unit',
   'text': '%',
   'span': %,
   's': 141,
   'e': 142}}]

In [16]:
d = list(data.values())[1]

In [17]:

for sent in d.doc.sents:
    print(sent._.parse_string)

(S (NP (JJ Dinoflagellate) (NNS cysts)) (VP (VBP have) (VP (VBN been) (VP (VBN used) (ADVP (RB extensively)) (PP (IN for) (S (VP (VBG reconstructing) (NP (NP (NNS paleoenvironments)) (PP (IN in) (NP (DT the) (NNP Paleogene))))))) (PRN (-LRB- -LRB-) (S (VP (VB see) (NP (NP (NN overview)) (PP (IN in) (NP (NP (NNP Sluijs) (CC et) (NNP al) (NNP .)) (, ,) (NP (CD 2005))))))) (-RRB- -RRB-)) (, ,) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP are) (ADJP (RB particularly) (JJ sensitive) (PP (TO to) (NP (NP (NNS changes)) (PP (IN in) (NP (NP (NN salinity)) (, ,) (NP (NN temperature)) (, ,) (CC and) (NP (NN nutrient) (NNS levels))))))))))))) (-LRB- -LRB-))
(NP (NNP Powell) (NN et) (NNP al) (. .))
(NP (, ,) (NP (CD 1992)) (: ;) (NP (NP (NNP Pross) (CC and) (NNP Brinkhuis)) (NP (CD 2005))) (: ;) (NP (NP (. Sluijs) (CC et) (NNP al) (NNP .)) (, ,) (NP (CD 2005))) (-RRB- -RRB-) (. .))
(S (NP (PRP We)) (VP (VBP calculate) (`` “) (NP (NN %) (JJ low) (NN salinity) (JJ dinoflagellate) (NNS cysts)) ('' ”) (PR

The following for loop converts the current spaCy and human annotation format into a gate readable one using text/x-json-twitter format

In [28]:
goldCount = {
    "Quantity": 0,
    "MeasuredEntity" : 0,
    "MeasuredProperty" : 0,
    "Qualifier" : 0       
}
h0Count = {
    "Number":0,
    "Unit":0,
    "MeasuredEntity":0,
    "total":0
}


counts={
    "goldCount" : goldCount,
    "h0Count":h0Count
}

for e in data.values():
    for index, row in e.tsv.iterrows():
            counts["goldCount"][row["annotType"]] += 1
    counts["h0Count"]["Number"] += len(e.doc._.h0NumberTps)
    counts["h0Count"]["Unit"] += len(e.doc._.h0UnitTps)
    counts["h0Count"]["MeasuredEntity"] += len(e.doc._.h0MeasuredEntityTps)
    counts["h0Count"]["total"] += len(e.doc._.h0Measurements)
    
    
counts["QuantityPrecision"] = counts["h0Count"]["Number"]/counts["h0Count"]["total"]
counts["QuantityRecall"] = counts["h0Count"]["Number"]/counts["goldCount"]["Quantity"]
counts["QuantityF1"] = 2*(counts["QuantityRecall"]*counts["QuantityPrecision"])/(counts["QuantityRecall"]+counts["QuantityPrecision"])
counts["MEPrecision"] = counts["h0Count"]["MeasuredEntity"]/counts["h0Count"]["total"]
counts["MERecall"] = counts["h0Count"]["MeasuredEntity"]/counts["goldCount"]["Quantity"]
counts["MEF1"] = 2*(counts["MERecall"]*counts["MEPrecision"])/(counts["MERecall"]+counts["MEPrecision"])

     

In [29]:
#json.dump(counts, open(f"performance{ENTITIES}.json","w",encoding= "utf-8"),indent = 3)
counts

{'goldCount': {'Quantity': 1087,
  'MeasuredEntity': 1056,
  'MeasuredProperty': 686,
  'Qualifier': 293},
 'h0Count': {'Number': 1076,
  'Unit': 1076,
  'MeasuredEntity': 201,
  'total': 2913},
 'QuantityPrecision': 0.36937864744249915,
 'QuantityRecall': 0.9898804047838087,
 'QuantityF1': 0.5379999999999999,
 'MEPrecision': 0.0690010298661174,
 'MERecall': 0.18491260349586017,
 'MEF1': 0.10049999999999999}

In [28]:
import importlib
importlib.reload(code.output)   
from code.output import getAscii

os.system("rm ascii/noannot/*")
os.system("rm ascii/nome/*")
os.system("rm ascii/normal/*")




for e in data.values():
    if (len(e.doc._.h0MeasuredEntityTps) < len(e.doc._.h0NumberTps)):
        file = open(f"ascii/nome/{e.name}.txt","w",encoding="utf-8")
        getAscii(e, file)
        file.close()
    elif(len(e.doc._.h0MeasurementTps) > 0):
        file = open(f"ascii/normal/{e.name}.txt","w",encoding="utf-8")
        getAscii(e, file)
        file.close()
    else:
        file = open(f"ascii/noannot/{e.name}.txt","w",encoding="utf-8")
        getAscii(e, file)
        file.close()
        

AttributeError: module 'code' has no attribute 'output'

In [None]:
"""
One Long document in json format
"""


import math
twitjson = {"full_text": "","entities":{}}
offset = 0

for doc in data.values():

    testjson  = doc.doc.to_json()


    
    twitjson["full_text"] = twitjson["full_text"] +  testjson["text"]


    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["indices"] = [offset + tok["start"],offset + tok["end"]] 
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        try:
            twitjson["entities"]["Token"].append(tempToken)
        except KeyError:
            twitjson["entities"]["Token"] = [tempToken] 

#     for ent in testjson["ents"]:
#         tempEnt = {}
#         tempEnt["indices"] = [offset + ent["start"],offset + ent["end"]] 
#         try:
#             twitjson["entities"][ent["label"]].append(tempEnt)
#         except KeyError:
#             twitjson["entities"][ent["label"]] = [tempEnt]
            
#     for unit in doc.doc._.unit:
#         tempUnit = {}
#         tempUnit["indices"] = [int(unit["start"]),int(unit["end"])]
#         tempUnit["text"]= unit["text"].text
#         try:
#             twitjson["entities"]["unit"].append(tempUnit)
#         except KeyError:
#             twitjson["entities"]["unit"] = [tempUnit]

            
    for sent in testjson["sents"]:
        tempSent = {}
        tempSent["indices"] = [offset + sent["start"],offset + sent["end"]] 
        try:
            twitjson["entities"]["sentence"].append(tempSent)
        except KeyError:
            twitjson["entities"]["sentence"] = [tempSent] 
            
    for index, row in doc.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["indices"] = [offset + row["startOffset"],offset + row["endOffset"]] 
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        try:
            twitjson["entities"]["MEval-"+row["annotType"]].append(tempAnnot)
        except KeyError:
            twitjson["entities"]["MEval-"+row["annotType"]] = [tempAnnot] 
            
#      doc._.h0Number = []
#     doc._.h0Unit = []
#     doc._.h0MeasuredEntity = []
            
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Number"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Number"] = [temp]
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Unit"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Unit"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntity"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntity"] = [temp]
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0NumberTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0NumberTP"] = [temp]
            
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0UnitTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0UnitTP"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntityTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntityTP"] = [temp]
            
    twitjson["full_text"] = twitjson["full_text"] +  "\n\n"
    offset += len(twitjson["full_text"])
    
    if offset > 1000 and offset < 4000:
        json.dump(twitjson, open(f'jsondoctest/sample.json',"w"), indent=3)


json.dump(twitjson, open(f'jsondoctest/alldocs.json',"w"), indent=3)

In [38]:
def createFeature(key, value, file):
    file.write(f"""<Feature>
  <Name className="java.lang.String">{key}</Name>
  <Value className="java.lang.String">{value}</Value>
</Feature>\n""")

def createAnnotation(ID, tpe, start, end, features, file):
    file.write(f"<Annotation Id=\"{ID}\" Type=\"{tpe}\" StartNode=\"{start}\" EndNode=\"{end}\">\n")
    for key in features.keys():
        createFeature(key,features[key],file)
    file.write("</Annotation>\n")
    
def createNode(token,doc,offset,file,prevEnd):
    txt = token.text
    txt = txt.replace("'","&apos;")
    txt = txt.replace("\"","&quot;")
    txt = txt.replace("&","&amp;")
    txt = txt.replace("<","&lt;")
    txt = txt.replace(">","&gt;")
    
    start = doc[token.i:token.i+1].start_char+offset
    end = offset+doc[token.i:token.i+1].end_char
    
    if(start == prevEnd):
        file.write("{}<Node id=\"{}\"/>".format(txt,end))
        
    elif(start > prevEnd):
        file.write(" <Node id=\"{}\"/>{}<Node id=\"{}\"/>".format(start,txt,end))
    else:
        print("case Unhandled")
    
    return end
    

In [39]:
"""
One Long document in xml format
"""
import os 
file = open("gatexmlforalldocs.xml", "w", encoding = "utf-8")
txtFile = open("textFileForGatexml.txt", "w", encoding = "utf-8")

file.write("""<?xml version='1.0' encoding='utf-8'?>
<GateDocument version="3">
<GateDocumentFeatures>""")
createFeature("gate.SourceURL",os.path.join(os.getcwd(), "textFileForGatexml.txt"),file)
createFeature("MimeType","text/plain",file)
createFeature("docNewLineType","",file)
file.write("\n</GateDocumentFeatures>\n\n")
file.write("<TextWithNodes>")


offset = 0
annotId = 0
annotz = []
for e in data.values():
    testjson  = e.doc.to_json()
    prevEnd = -1
    
    for sent in e.doc.sents:
        for token in sent:
            prevEnd = createNode(token,e.doc,offset,file,prevEnd)


    
    txtFile.write(testjson["text"] + "\n\n")
    


    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        
        annotz.append([annotId, "Token", offset + tok["start"], offset + tok["end"], tempToken, file])
        #createAnnotation(annotId, "Token", offset + tok["start"], offset + tok["end"], tempToken, file)
        annotId += 1
            
            
    for sent in testjson["sents"]:
        tempSent = {}
        annotz.append([annotId, "sentence", offset + sent["start"], offset + sent["end"], tempSent, file])
        #createAnnotation(annotId, "sentence", offset + sent["start"], offset + sent["end"], tempSent, file)
        annotId += 1 
        
            
    for index, row in e.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        annotz.append([annotId, "MEval-"+row["annotType"] , offset + row["startOffset"], offset + row["endOffset"], tempAnnot, file])    
        #createAnnotation(annotId, "MEval-"+row["annotType"] , offset + row["startOffset"], offset + row["endOffset"], tempAnnot, file)
        annotId += 1
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0Number", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0Number", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0Unit", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0Unit", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0MeasuredEntity", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0MeasuredEntity", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0NumberTP", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0NumberTP", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
        
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0UnitTP", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0UnitTP", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["text"] = num["text"]
        
        annotz.append([annotId, "h0MeasuredEntityTP", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0MeasuredEntityTP", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    offset += len(testjson["text"])
    break
    
    
file.write("\n</TextWithNodes>\n\n")    
    
file.write("<AnnotationSet Name=\"Bens annots\">\n")

for x in annotz:
    createAnnotation(*x)
    
file.write("</AnnotationSet>")
file.write("</GateDocument>")    
file.close()

txtFile.close()



In [42]:
"""
Document by document in json format
"""


import math
for doc in data.values():
    print(doc.name)

    testjson  = doc.doc.to_json()


    twitjson = {"text": testjson["text"],"entities":{}}

    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["indices"] = [tok["start"],tok["end"]] 
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        
            
        try:
            twitjson["entities"]["Token"].append(tempToken)
        except KeyError:
            twitjson["entities"]["Token"] = [tempToken] 

    for ent in testjson["ents"]:
        tempEnt = {}
        tempEnt["indices"] = [ent["start"],ent["end"]] 
        try:
            twitjson["entities"][ent["label"]].append(tempEnt)
        except KeyError:
            twitjson["entities"][ent["label"]] = [tempEnt]
            
            
    for sent in doc.doc.sents:
        for tok in sent: 
            tempEnt = {}
            if tok.dep_ == "root":
                tempEnt["args"] = ["",tok.text]
            else:
                tempEnt["args"] = [tok.head.text,tok.text]
                
            tempEnt["kind"] = tok.dep_
            
            mn = min(doc.doc[tok.head.i:tok.head.i+1].start_char,doc.doc[tok.i:tok.i+1].start_char)
            mx = max(doc.doc[tok.head.i:tok.head.i+1].end_char,doc.doc[tok.i:tok.i+1].end_char)
            
            tempEnt["indices"] = [mn,mx]
        
            try:
                twitjson["entities"]["NickDependency"].append(tempEnt)
            except KeyError:
                twitjson["entities"]["NickDependency"] = [tempEnt]
            
            
#     for unit in doc.doc._.unit:
#         tempUnit = {}
#         tempUnit["indices"] = [int(unit["start"]),int(unit["end"])]
#         tempUnit["text"]= unit["text"].text
#         try:
#             twitjson["entities"]["unit"].append(tempUnit)
#         except KeyError:
#             twitjson["entities"]["unit"] = [tempUnit]

            
    for sent in testjson["sents"]:
        tempSent = {}
        tempSent["indices"] = [sent["start"],sent["end"]] 
        try:
            twitjson["entities"]["sentence"].append(tempSent)
        except KeyError:
            twitjson["entities"]["sentence"] = [tempSent] 
            
    for index, row in doc.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["indices"] = [row["startOffset"],row["endOffset"]] 
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        try:
            twitjson["entities"]["MEval-"+row["annotType"]].append(tempAnnot)
        except KeyError:
            twitjson["entities"]["MEval-"+row["annotType"]] = [tempAnnot] 
            
#      doc._.h0Number = []
#     doc._.h0Unit = []
#     doc._.h0MeasuredEntity = []
            
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Number"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Number"] = [temp]
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Unit"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Unit"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntity"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntity"] = [temp]
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0NumberTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0NumberTP"] = [temp]
            
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0UnitTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0UnitTP"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntityTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntityTP"] = [temp]
    


    json.dump(twitjson, open(f'jsondocs/{doc.name}.json',"w"), indent=3)
    
    


S0012821X12004384-1302


AttributeError: [E046] Can't retrieve unregistered extension attribute 'h0NumberTps'. Did you forget to call the `set_extension` method?