This notebook contains code for loading in all competition texts and annotations, and converting these to gate readable format

In [64]:
#imports
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json
import code
from code.exerpt import exerpt
TRIALPATH = "data/trial"
TRAINPATH = "data/train"
LATEST_tsv = None

#setup SpaCy
import spacy
import en_core_web_lg
spacy.prefer_gpu()
nlp = en_core_web_lg.load()

from benepar.spacy_plugin import BeneparComponent
nlp.add_pipe(BeneparComponent('benepar_en2'))





Rule-based Pipeline component for retrieving Quantities. 

Rules: 
    1. any CD followed by a unit that has a nouns POS
    2. any cardinal, money, ordinal, percent, date, time or quantity followed by a unit that has a noun POS
    3. any token that is LIKE_NUM followed by a unit that has a noun POS



In [65]:
#add pipeline component 
from spacy.matcher import Matcher
from spacy.tokens import Doc
Doc.set_extension("unit", default = "def", force = True)


def customMatcher(nlp):
    """
    Description: matcher giving the most recall so far
    """
    matchList = open("gazetteers/combined_measurements.lst","r",encoding="utf-8").read().split("\n")
    matcher = Matcher(nlp.vocab)
    pattern = []
    for word in matchList: 
        pattern.append([{"TAG": {"REGEX": "^[CD]"}},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        pattern.append([{"ENT_TYPE": {"IN": ["CARDINAL", "MONEY", "ORDINAL", "PERCENT", "DATE", "TIME", "QUANTITY"]},
                        "TAG":{"REGEX": "^[DT]"},"op": "!"},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        pattern.append([{"LIKE_NUM": True},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        #pattern.append([{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        
    matcher.add("Unit", None, *pattern)
        
    return matcher

def gazetteer(doc):
    matcher = customMatcher(nlp)
    matches = matcher(doc)
    doc._.unit = []
    for match_id, start, end in matches:
        tempSpan = doc[start:end]
        doc._.unit.append({'start': tempSpan.start_char, 'end': tempSpan.end_char, 'label': 'UNIT', 'text' : doc[start:end]})
    return doc
        
        
        
#nlp.add_pipe(gazetteer, last=True)
    

In [66]:

Doc.set_extension("meAnnots", default = "def", force = True)
from code.helpers import findOffset, intersectSpan, getSentences
import math

def annotationCreation(doc):
    global LATEST_tsv
    doc._.meAnnots = {}
    count = 0
    lookup = {tok.idx : tok.i for tok in doc}
    annotminmax={}
    for index, row in LATEST_tsv.iterrows():
        if(row["annotType"] == "Quantity"):
            count+=1
            if count > 1:
                #set sentence values for each
                 doc._.meAnnots[f"Annotation{count-1}"]["sentences"] = getSentences(annotminmax[f"offset{count-1}min"],annotminmax[f"offset{count-1}max"],doc)
        
        #get min and max offsets
        try:
            annotminmax[f"offset{count}max"] = max(annotminmax[f"offset{count}max"],row["endOffset"])
        except KeyError:
             annotminmax[f"offset{count}max"] = row["endOffset"]
                
        try:
            annotminmax[f"offset{count}min"] = min(annotminmax[f"offset{count}min"],row["startOffset"])
        except KeyError:
             annotminmax[f"offset{count}min"] = row["startOffset"]
            
        
        #doc._.meAnnots[f"Annotation{count}"]
        
        #check for creating a new dict
        try:
            if(type(doc._.meAnnots[f"Annotation{count}"]) == type(dict)):
                continue
        except KeyError:
            doc._.meAnnots[f"Annotation{count}"] = {}
        
        tempSpan = None
        
        try:
            tempSpan = doc[lookup[findOffset(row["startOffset"],doc.text)]:lookup[findOffset(row["endOffset"],doc.text)]]
        except KeyError:
            print("FindOffset method has created a key error ")
            print("origrange: (",row["startOffset"],",",row["endOffset"],")")
            print("range: (",findOffset(row["startOffset"],doc.text),",",findOffset(row["endOffset"],doc.text),")")
            print(lookup)
            
        doc._.meAnnots[f"Annotation{count}"][row["annotType"]] = tempSpan
      
    doc._.meAnnots[f"Annotation{count}"]["sentences"] = getSentences(annotminmax[f"offset{count}min"],annotminmax[f"offset{count}max"],doc)
        
    return doc     
nlp.add_pipe(annotationCreation, last=True)



In [67]:
#pipeline component H0

Doc.set_extension("h0Number", default = "def", force = True)
Doc.set_extension("h0Unit", default = "def", force = True)
Doc.set_extension("h0MeasuredEntity", default = "def", force = True)
Doc.set_extension("h0Measurements", default = "def", force = True)

#ents-ORIG = ["CARDINAL", "MONEY", "PERCENT", "DATE", "TIME", "QUANTITY"]
ents = ["CARDINAL", "MONEY", "PERCENT", "DATE", "TIME", "QUANTITY"]
ENTITIES = ""
for x in ents:
    ENTITIES += x
    
def numberMatcher(nlp):
    """
    Description: matcher giving the most recall so far
    """
    matcher = Matcher(nlp.vocab)
    pattern = []
    pattern.append([{"LIKE_NUM": True}])
    pattern.append([{"ENT_TYPE": {"IN": ents}}])
    matcher.add("h0Number", None, *pattern)
        
    return matcher


def h0(doc):
    matcher = numberMatcher(nlp)
    matches = matcher(doc)
    doc._.h0Number = []
    doc._.h0Unit = []
    doc._.h0MeasuredEntity = []
    doc._.h0Measurements = []
    for match_id, start, end in matches:
        
        tempSpan = doc[start:end]
        tempTok = doc[start]
        tempNum = {
            'start': tempSpan.start_char, 
            'end': tempSpan.end_char, 
            'label': 'h0Number', 
            'text' : tempTok.text,
            'span' : tempSpan,
            's' : start,
            'e' : end
        }
        
        doc._.h0Number.append(tempNum)
        
        tempHead = tempTok.head
        spanHead = doc[tempHead.i:tempHead.i+1]
        tempUnit = {
            'start': spanHead.start_char, 
            'end': spanHead.end_char, 
            'label': 'h0Unit', 
            'text' : tempHead.text,
            'span' : spanHead,
            's' : tempHead.i,
            'e' : tempHead.i+1
        }
        
        doc._.h0Unit.append(tempUnit)
        
        tempHeadHead = None
        spanHeadHead = None
        if tempHead.dep_ == "pobj":
            tempHeadHead = tempTok.head.head.head
            spanHeadHead = doc[tempHeadHead.i:tempHeadHead.i+1]
        else:
            tempHeadHead = tempTok.head.head
            spanHeadHead = doc[tempHeadHead.i:tempHeadHead.i+1]
            
        
        tempME = {
            'start': spanHeadHead.start_char, 
            'end': spanHeadHead.end_char, 
            'label': 'h0MeasuredEntity', 
            'text' : tempHeadHead.text,
            'span' : spanHeadHead,
            's' : tempHeadHead.i,
            'e' : tempHeadHead.i+1
        }
        
        doc._.h0MeasuredEntity.append(tempME)
        
        doc._.h0Measurements.append({
            "Number" : tempNum,
            "Unit" : tempUnit,
            "MeasuredEntity": tempME
        })
        
        
    return doc


nlp.add_pipe(h0, last=True)

In [68]:

class exerpt:
    """
    Class exerpt
    Description: a simple class to contain data for the measeval competition

    self.name : the measeval given name associated to the document
    self.txt : the raw text of the document
    self.ann : the brat annotations of the document(deprecated)
    self.tsv : a pandas dataframe containing all the tab seperated value data
    self.grobid : grobid quantities json output for quantity detection
    self.doc : The spacy doc generated from processing on this particular document
    """
    
    def __init__(self, name, txt, ann, tsv, grobid, nlp):
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        global LATEST_tsv
        LATEST_tsv = self.tsv
        self.doc = nlp(self.txt)
        self.hashFromFrame()
        
    def getContext(self, count = 3):
        if self.context == False: 
            beforeContext = []
            afterContext = []
            self.context = True
            for start,end in zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values ):#self.tsv.iterrows():
                a = getIndexSeperated(exerpt.txt[0],end,count = count)
                b = getIndexSeperated(exerpt.txt[0],start,count = count,forward = False)
                beforeContext.append(self.txt[0][b:start])
                afterContext.append(self.txt[0][end:a])
            self.tsv.insert(3,"beforeContext", beforeContext)
            self.tsv.insert(3,"afterContext", afterContext)
            
    def hashFromFrame(self):
        self.measurements = {}
        count = 0
        for index, row in LATEST_tsv.iterrows():
            if(row["annotType"] == "Quantity"):
                count+=1
                
        
            #check for creating a new dict
            try:
                if(type(self.measurements[f"Annotation{count}"]) == type(dict)):
                    continue
            except KeyError:
                self.measurements[f"Annotation{count}"] = {}

            
            self.measurements[f"Annotation{count}"][row["annotType"]] = {
                "startOffset" : row["startOffset"],
                "endOffset" : row["endOffset"],
                "annotSet" : row["annotSet"],
                "annotType" : row["annotType"],
                "annotId" : row["annotId"],
                "text" : row["text"],
                "other" : row["other"],
            }

            
    def getPosTag(self):
        if self.context and self.posTag == False: 
            bTag = []
            aTag = []
            self.posTag = True
            for before, after in zip(self.tsv["beforeContext"].values,self.tsv["afterContext"].values):
                bTag.append(nltk.pos_tag(word_tokenize(before)))
                aTag.append(nltk.pos_tag(word_tokenize(after)))
            self.tsv.insert(3,"beforeTag", bTag)
            self.tsv.insert(3,"afterTag", aTag)

In [69]:
#
#TEST ONE
#

def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}
#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )
        break

In [70]:
import time
t1 = time.time()
def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}

#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )

#load all train data
for fn in [x for x in os.listdir(os.path.join(TRAINPATH,"text")) if x[:-4]+".tsv" in os.listdir(os.path.join(TRAINPATH,"tsv"))]:
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRAINPATH, "text", fn[:-4] + ".txt")),
                "none",
                pd.read_csv(os.path.join(TRAINPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRAINPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )
            
t2 = time.time()
print(t2-t1, "Seconds elapsed")
print((t2-t1)/60, "Minutes elapsed")

FindOffset method has created a key error 
origrange: ( 147 , 161 )
range: ( 147 , 162 )
{0: 0, 5: 1, 8: 2, 14: 3, 17: 4, 25: 5, 29: 6, 34: 7, 35: 8, 41: 9, 43: 10, 48: 11, 50: 12, 54: 13, 56: 14, 57: 15, 59: 16, 62: 17, 69: 18, 73: 19, 80: 20, 83: 21, 86: 22, 97: 23, 101: 24, 110: 25, 116: 26, 119: 27, 121: 28, 130: 29, 133: 30, 139: 31, 146: 32, 151: 33, 155: 34, 162: 35, 163: 36, 167: 37, 169: 38, 176: 39, 178: 40, 179: 41, 181: 42, 188: 43, 196: 44, 203: 45, 208: 46, 213: 47, 216: 48, 227: 49, 230: 50, 232: 51, 240: 52, 250: 53, 259: 54, 262: 55, 269: 56, 273: 57, 280: 58, 288: 59, 289: 60, 296: 61, 299: 62, 301: 63, 302: 64, 304: 65, 308: 66, 309: 67, 311: 68, 319: 69, 323: 70, 325: 71, 329: 72, 334: 73, 346: 74, 351: 75, 354: 76, 359: 77, 364: 78, 368: 79, 382: 80, 386: 81, 392: 82, 395: 83, 398: 84, 405: 85, 415: 86, 417: 87, 426: 88, 432: 89, 446: 90, 449: 91, 453: 92, 456: 93, 468: 94, 475: 95, 482: 96, 485: 97, 488: 98, 495: 99, 505: 100, 508: 101, 512: 102, 522: 103, 525: 10

FindOffset method has created a key error 
origrange: ( 127 , 129 )
range: ( 127 , 129 )
{0: 0, 12: 1, 20: 2, 31: 3, 36: 4, 43: 5, 44: 6, 48: 7, 50: 8, 52: 9, 60: 10, 62: 11, 66: 12, 77: 13, 83: 14, 86: 15, 97: 16, 106: 17, 113: 18, 117: 19, 123: 20, 127: 21}
FindOffset method has created a key error 
origrange: ( 954 , 959 )
range: ( 954 , 961 )
{0: 0, 4: 1, 9: 2, 13: 3, 15: 4, 22: 5, 25: 6, 37: 7, 38: 8, 44: 9, 51: 10, 59: 11, 64: 12, 75: 13, 77: 14, 85: 15, 92: 16, 100: 17, 105: 18, 108: 19, 112: 20, 113: 21, 121: 22, 124: 23, 126: 24, 127: 25, 129: 26, 133: 27, 134: 28, 136: 29, 140: 30, 144: 31, 148: 32, 156: 33, 169: 34, 174: 35, 177: 36, 181: 37, 182: 38, 184: 39, 189: 40, 200: 41, 207: 42, 208: 43, 215: 44, 218: 45, 220: 46, 221: 47, 223: 48, 227: 49, 229: 50, 236: 51, 239: 52, 241: 53, 242: 54, 244: 55, 248: 56, 249: 57, 251: 58, 254: 59, 258: 60, 269: 61, 274: 62, 276: 63, 288: 64, 298: 65, 301: 66, 305: 67, 306: 68, 315: 69, 322: 70, 327: 71, 331: 72, 336: 73, 337: 74, 348: 

FindOffset method has created a key error 
origrange: ( 149 , 155 )
range: ( 149 , 155 )
{0: 0, 8: 1, 17: 2, 19: 3, 23: 4, 25: 5, 31: 6, 34: 7, 38: 8, 41: 9, 47: 10, 48: 11, 54: 12, 55: 13, 57: 14, 61: 15, 65: 16, 73: 17, 81: 18, 84: 19, 86: 20, 92: 21, 95: 22, 99: 23, 109: 24, 115: 25, 118: 26, 122: 27, 127: 28, 129: 29, 134: 30, 146: 31, 149: 32, 154: 33}
FindOffset method has created a key error 
origrange: ( 282 , 290 )
range: ( 282 , 291 )
{0: 0, 3: 1, 14: 2, 18: 3, 28: 4, 40: 5, 46: 6, 49: 7, 53: 8, 56: 9, 61: 10, 63: 11, 67: 12, 73: 13, 78: 14, 89: 15, 94: 16, 102: 17, 104: 18, 113: 19, 117: 20, 127: 21, 130: 22, 136: 23, 144: 24, 150: 25, 155: 26, 158: 27, 160: 28, 166: 29, 171: 30, 173: 31, 176: 32, 186: 33, 193: 34, 198: 35, 202: 36, 204: 37, 215: 38, 220: 39, 223: 40, 227: 41, 232: 42, 240: 43, 242: 44, 245: 45, 251: 46, 254: 47, 257: 48, 269: 49, 274: 50, 276: 51, 281: 52, 286: 53, 291: 54, 294: 55, 296: 56, 298: 57, 302: 58, 305: 59, 309: 60, 312: 61, 321: 62, 324: 63, 332

FindOffset method has created a key error 
origrange: ( 118 , 122 )
range: ( 118 , 122 )
{0: 0, 4: 1, 8: 2, 13: 3, 21: 4, 31: 5, 36: 6, 40: 7, 43: 8, 57: 9, 62: 10, 65: 11, 70: 12, 81: 13, 85: 14, 95: 15, 98: 16, 106: 17, 107: 18, 109: 19, 110: 20, 112: 21, 122: 22, 123: 23, 125: 24, 130: 25, 136: 26, 144: 27, 148: 28, 152: 29, 155: 30, 169: 31, 176: 32, 183: 33, 188: 34, 189: 35, 191: 36, 197: 37, 207: 38, 210: 39, 216: 40, 222: 41, 236: 42, 239: 43, 252: 44, 258: 45, 263: 46, 268: 47, 271: 48, 272: 49, 282: 50, 284: 51, 285: 52, 287: 53, 288: 54, 290: 55, 293: 56, 297: 57, 307: 58, 309: 59, 314: 60, 328: 61, 331: 62, 341: 63, 344: 64, 352: 65, 361: 66, 365: 67, 372: 68, 380: 69, 387: 70, 389: 71, 393: 72, 402: 73, 411: 74, 425: 75, 434: 76, 438: 77, 448: 78, 451: 79}
FindOffset method has created a key error 
origrange: ( 112 , 117 )
range: ( 112 , 117 )
{0: 0, 4: 1, 8: 2, 13: 3, 21: 4, 31: 5, 36: 6, 40: 7, 43: 8, 57: 9, 62: 10, 65: 11, 70: 12, 81: 13, 85: 14, 95: 15, 98: 16, 106: 17

FindOffset method has created a key error 
origrange: ( 97 , 123 )
range: ( 97 , 123 )
{0: 0, 3: 1, 12: 2, 16: 3, 23: 4, 27: 5, 31: 6, 40: 7, 48: 8, 50: 9, 63: 10, 68: 11, 79: 12, 87: 13, 88: 14, 90: 15, 93: 16, 97: 17, 103: 18, 108: 19, 110: 20, 117: 21, 124: 22, 126: 23, 131: 24, 134: 25, 139: 26, 151: 27, 154: 28, 156: 29, 163: 30, 172: 31, 176: 32, 180: 33, 190: 34, 193: 35, 201: 36, 203: 37, 208: 38, 220: 39, 221: 40, 227: 41, 231: 42, 237: 43, 244: 44, 247: 45, 255: 46, 256: 47, 258: 48, 264: 49, 268: 50, 281: 51, 282: 52, 292: 53, 304: 54, 310: 55, 320: 56, 323: 57, 333: 58, 338: 59, 344: 60, 349: 61, 351: 62, 359: 63, 368: 64, 375: 65, 384: 66, 389: 67, 397: 68, 400: 69, 409: 70, 417: 71}
FindOffset method has created a key error 
origrange: ( 1 , 11 )
range: ( 1 , 12 )
{0: 0, 5: 1, 12: 2, 22: 3, 25: 4, 29: 5, 31: 6, 34: 7, 40: 8, 45: 9, 49: 10, 53: 11, 55: 12, 58: 13, 68: 14, 74: 15, 88: 16, 91: 17, 98: 18, 107: 19, 114: 20, 124: 21, 127: 22, 131: 23, 135: 24, 138: 25, 141: 26

FindOffset method has created a key error 
origrange: ( 1053 , 1060 )
range: ( 1053 , 1060 )
{0: 0, 14: 1, 15: 2, 18: 3, 20: 4, 24: 5, 34: 6, 39: 7, 44: 8, 45: 9, 53: 10, 54: 11, 62: 12, 64: 13, 65: 14, 67: 15, 121: 16, 123: 17, 124: 18, 129: 19, 130: 20, 133: 21, 135: 22, 143: 23, 145: 24, 155: 25, 157: 26, 162: 27, 167: 28, 174: 29, 178: 30, 188: 31, 189: 32, 192: 33, 194: 34, 198: 35, 200: 36, 204: 37, 206: 38, 211: 39, 215: 40, 226: 41, 228: 42, 232: 43, 236: 44, 246: 45, 251: 46, 256: 47, 259: 48, 267: 49, 269: 50, 273: 51, 277: 52, 281: 53, 286: 54, 296: 55, 299: 56, 306: 57, 314: 58, 326: 59, 331: 60, 340: 61, 349: 62, 355: 63, 356: 64, 364: 65, 370: 66, 372: 67, 378: 68, 383: 69, 390: 70, 393: 71, 395: 72, 403: 73, 407: 74, 414: 75, 417: 76, 423: 77, 430: 78, 434: 79, 437: 80, 438: 81, 445: 82, 453: 83, 459: 84, 462: 85, 468: 86, 474: 87, 476: 88, 488: 89, 490: 90, 494: 91, 506: 92, 510: 93, 520: 94, 523: 95, 528: 96, 535: 97, 543: 98, 548: 99, 551: 100, 553: 101, 565: 102, 568

FindOffset method has created a key error 
origrange: ( 80 , 95 )
range: ( 80 , 96 )
{0: 0, 7: 1, 13: 2, 14: 3, 22: 4, 26: 5, 30: 6, 41: 7, 44: 8, 50: 9, 66: 10, 69: 11, 75: 12, 85: 13, 96: 14, 105: 15, 107: 16, 113: 17, 116: 18, 120: 19, 129: 20, 135: 21, 139: 22, 146: 23, 160: 24, 170: 25, 179: 26, 183: 27, 186: 28, 190: 29, 193: 30, 197: 31, 208: 32, 213: 33, 218: 34, 235: 35, 238: 36, 242: 37, 249: 38, 260: 39, 264: 40, 269: 41, 279: 42, 287: 43, 289: 44, 296: 45}
141.246164560318 Seconds elapsed
2.3541027426719667 Minutes elapsed


In [71]:
other = []
for e in data.values():
    for x in e.measurements.values():
        try:
            print(x["MeasuredProperty"]["text"], x["MeasuredEntity"]["text"],x["Quantity"]["text"],)
        except KeyError:
            continue



%low salinity dinoflagellate cysts Samples 0% to 80%
peridinoid cysts DA1 on average 5%
southern boundary northern rain belt 40°N
before CIE onset from 103 yrs
before CIE onset above 2618 m
present today on Antarctica ice volume between 60% and 100%
emplacement of an ice volume Oi1 ∼400 ka
began decline ∼2 Ma
NaOH wet alkaline digestion 0.2 M
m/Δm∼3500 mass spectrometer 5%
m/Δm∼3500 mass spectrometer 95%
30ε diatom silicon isotope fractionation factor ∼−1‰
differences in the abundance diatoms 40%
correlation δ30Si values of size fractions between 2 and 20 μm r2=0.92
apparent 30ε δ30Si values of size fractions between 2 and 20 μm ∼–1‰
O ammonite Watinoceras devonense F
O W. devonense F
O Mytiloides puebloensis F
positive shift VPDB δ13Corg 2–3‰
VPDB δ13Corg upper Hartland Shale ∼−27‰
below the CTB upper Hartland Shale 4.3 m
gradual fall δ13Corg ∼−27‰
thick package of black organic-rich calcareous shales, termed the “Niveau Thomel” ∼20 m
came from Vergons some of the samples n=4
187Os/18

coexpresses CXCR4 and SOX17 resulting DE population 80%
visually inspected Wells 12 hr
gene targeting rates SOX2 greater than 70%
found to carry the GFP-Neo cassette in the SOX2 locus clones 72%
percentage of GFP-positive (GFP+) cells hSOX2-23 over more than 20 passages.
expressed SOX2 protein GFP+ cells 100%
survived hRPE monolayers 4 weeks
visual-field setting Spectralis 30-degrees
distance between scan 60 μm
visual field device 20 × 20 degrees
corneal curvature settings Spectralis’ 4.2 mm
visual field device 30 degrees
magnification Left images 10,500×
magnification right micrographs 25,000×
distance scale bars 2 μm
distance scale bars 0.2 μm


In [72]:
for e in data.values():
    print(e.name)
    for x in e.doc._.meAnnots.values():
        if e.name in ['0019103511004994-1511']:
            print(x)
        try:
            if(x["Quantity"][len(x["Quantity"])-1].text in [")",".",",",":","/",";","-"]):
                temp = x["Quantity"][len(x["Quantity"])-2]
                print(temp.text.ljust(15), temp.dep_.ljust(15), temp.head.text.ljust(15))
            else: 
                temp = x["Quantity"][len(x["Quantity"])-1]
                print(temp.text.ljust(15), temp.dep_.ljust(15), temp.head.text.ljust(15))
        except TypeError: 
            continue
    

S0012821X12004384-1302
Five            nummod          samples        
m               conj            2614.73        
m               conj            2614.73        
two             nummod          samples        
2614.71         conj            2619.60        
S0012821X12004384-1405
20              nummod          specimens      
%               pobj            from           
S0012821X12004384-1415
m               pobj            to             
%               compound        cysts          
S0012821X12004384-1594
m               pobj            above          
N               attr            is             
S0012821X12004384-1599
yrs             pobj            from           
m               pobj            above          
S0012821X13002185-1061
Ma              appos           Oi1            
%               quantmod        100            
ka              pobj            over           
year            compound        trend          
Ma              dobj            began         

C               pobj            at             
S0032386113009889-2123
MPa             pobj            of             
MPa             pobj            of             
MPa             pobj            of             
S0038071711004354-2573
cm              pobj            within         
six             nummod          sites          
cm              pobj            within         
S0038071712001010-1044
mm2             appos           pores          
days            pobj            for            
S0038071712001010-918
0.001           npadvmod        was            
%               appos           colonisation   
7.01            appos           LSD            
%               appos           mosseae        
%               appos           species        
%               attr            be             
%               appos           %              
%               npadvmod        paired         
%               appos           performance    
S0038071712001010-944
%               pobj   

In [73]:
from code.helpers import *


In [74]:
#testing 
doc = data["S0019103512002801-1927"].doc
s1=doc[3:7]
s2=doc[3:4]
if not intersectSpanSpan(s1,s2):
    print("error1")
s1=doc[3:7]
s2=doc[3:7]
if not intersectSpanSpan(s1,s2):
    print("error2")
s1=doc[3:7]
s2=doc[5:7]
if not intersectSpanSpan(s1,s2):
    print("error3")
s2=doc[3:7]
s1=doc[3:4]
if not intersectSpanSpan(s1,s2):
    print("error4")
s2=doc[3:7]
s1=doc[5:7]
if not intersectSpanSpan(s1,s2):
    print("error5")

In [75]:
#get false positives and true positives
Doc.set_extension("h0NumberTps", default = "def", force = True)
Doc.set_extension("h0UnitTps", default = "def", force = True)
Doc.set_extension("h0MeasuredEntityTps", default = "def", force = True)
Doc.set_extension("h0NumberFps", default = "def", force = True)
Doc.set_extension("h0UnitFps", default = "def", force = True)
Doc.set_extension("h0MeasuredEntityFps", default = "def", force = True)
Doc.set_extension("h0MeasurementTps", default = "def", force = True)

import copy 

for e in data.values():
    doc = e.doc
    doc._.h0NumberTps = []
    doc._.h0UnitTps = []
    doc._.h0MeasuredEntityTps = []
    doc._.h0NumberFps = []
    doc._.h0UnitFps = []
    doc._.h0MeasuredEntityFps = []
    doc._.h0MeasurementTps = []

    tempMeas = list(e.measurements.values())
    for meas in doc._.h0Measurements:
        num = meas["Number"]
        unit = meas["Unit"]
        me = meas["MeasuredEntity"]
        
        
        count = 0
        for m in tempMeas:
            try:
                if(intersectSpan(num["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"]) and intersectSpan(unit["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"])):
                    doc._.h0NumberTps.append(num)
                    doc._.h0UnitTps.append(unit)
                    if(intersectSpan(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"]) or
                      intersectSpanNum(me["start"],me["end"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])):
                        doc._.h0MeasuredEntityTps.append(num)
                        doc._.h0MeasurementTps.append(meas)
                        if(e.name == "S0016236113008041-3112" and num["text"] ==10):
                            print("Quantity:",m["Quantity"]["text"], "MeasuredEntity",m["MeasuredEntity"]["text"])
                            print(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                            print(me["start"],me["end"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                    else:
                        r=dict(meas)
                        del r["MeasuredEntity"]
                        doc._.h0MeasurementTps.append(r)
                        
                        if(e.name == "S0016236113008041-3112" and num["text"] == '10'):
                            print("Quantity:",m["Quantity"]["text"], "MeasuredEntity",m["MeasuredEntity"]["text"])
                            print(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                            print(me["start"],me["end"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])
                    tempMeas.pop(count)
            except KeyError:
                pass#print("No quantity")
            count+=1
                

Quantity: 10% MeasuredEntity acid
acid 751 755
775 779 751 755


In [76]:
data["S0016236113008041-3112"].doc._.h0MeasurementTps

[{'Number': {'start': 766,
   'end': 768,
   'label': 'h0Number',
   'text': '10',
   'span': 10,
   's': 140,
   'e': 141},
  'Unit': {'start': 768,
   'end': 769,
   'label': 'h0Unit',
   'text': '%',
   'span': %,
   's': 141,
   'e': 142}}]

The following for loop converts the current spaCy and human annotation format into a gate readable one using text/x-json-twitter format

In [77]:
goldCount = {
    "Quantity": 0,
    "MeasuredEntity" : 0,
    "MeasuredProperty" : 0,
    "Qualifier" : 0       
}
h0Count = {
    "Number":0,
    "Unit":0,
    "MeasuredEntity":0,
    "total":0
}


counts={
    "goldCount" : goldCount,
    "h0Count":h0Count
}

for e in data.values():
    for index, row in e.tsv.iterrows():
            counts["goldCount"][row["annotType"]] += 1
    counts["h0Count"]["Number"] += len(e.doc._.h0NumberTps)
    counts["h0Count"]["Unit"] += len(e.doc._.h0UnitTps)
    counts["h0Count"]["MeasuredEntity"] += len(e.doc._.h0MeasuredEntityTps)
    counts["h0Count"]["total"] += len(e.doc._.h0Measurements)
    
    
counts["QuantityPrecision"] = counts["h0Count"]["Number"]/counts["h0Count"]["total"]
counts["QuantityRecall"] = counts["h0Count"]["Number"]/counts["goldCount"]["Quantity"]
counts["QuantityF1"] = 2*(counts["QuantityRecall"]*counts["QuantityPrecision"])/(counts["QuantityRecall"]+counts["QuantityPrecision"])
counts["MEPrecision"] = counts["h0Count"]["MeasuredEntity"]/counts["h0Count"]["total"]
counts["MERecall"] = counts["h0Count"]["MeasuredEntity"]/counts["goldCount"]["Quantity"]
counts["MEF1"] = 2*(counts["MERecall"]*counts["MEPrecision"])/(counts["MERecall"]+counts["MEPrecision"])

     

In [78]:
#json.dump(counts, open(f"performance{ENTITIES}.json","w",encoding= "utf-8"),indent = 3)
counts

{'goldCount': {'Quantity': 1087,
  'MeasuredEntity': 1056,
  'MeasuredProperty': 686,
  'Qualifier': 293},
 'h0Count': {'Number': 722, 'Unit': 722, 'MeasuredEntity': 148, 'total': 2913},
 'QuantityPrecision': 0.24785444558874012,
 'QuantityRecall': 0.6642134314627415,
 'QuantityF1': 0.361,
 'MEPrecision': 0.05080672845863371,
 'MERecall': 0.13615455381784727,
 'MEF1': 0.074}

In [118]:
import importlib
importlib.reload(code.output)   
from code.output import getAsciiQ

os.system("rm ascii/noannot/*")
os.system("rm ascii/nome/*")
os.system("rm ascii/normal/*")
os.system("rm ascii/notallquant/*")




for e in data.values():
    if (len(e.doc._.h0MeasuredEntityTps) < len(e.doc._.h0NumberTps)):
        file = open(f"ascii/nome/{e.name}.txt","w",encoding="utf-8")
        getAsciiQ(e, file, True)
        file.close()
    elif(len(e.doc._.h0MeasurementTps) > 0):
        file = open(f"ascii/normal/{e.name}.txt","w",encoding="utf-8")
        getAsciiQ(e, file, True)
        file.close()
    elif(len(e.doc._.h0NumberTps) < len(e.measurements)):
        file = open(f"ascii/notallquant/{e.name}.txt","w",encoding="utf-8")
        getAsciiQ(e, file, True)
        file.close()
    else:
        file = open(f"ascii/noannot/{e.name}.txt","w",encoding="utf-8")
        getAsciiQ(e, file, True)
        file.close()
        

In [119]:
import pprint
pp = pprint.PrettyPrinter(indent=5, width=4, depth=30)

In [120]:
pp.pprint([1, 2, 3, 4, 5, [6,7,8, [9,10 ,11]]])

[    1,
     2,
     3,
     4,
     5,
     [    6,
          7,
          8,
          [    9,
               10,
               11]]]


In [124]:
import json
for e in data.values():
    for sent in e.doc.sents:
        import json
        json.dump(sent._.parse_string, open("pp.const","w",encoding="utf-8"),indent=3)
        pp.pprint(sent._.parse_string)
        break
        
    
        


('(S '
 '(NP '
 '(NP '
 '(NP '
 '(NN '
 'Correspondence) '
 '(NNP '
 'analysis)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NNP '
 'CA)) '
 '(-RRB- '
 '-RRB-))) '
 '(CC '
 'and) '
 '(NP '
 '(JJ '
 'statistical) '
 '(NN '
 'diversity) '
 '(NNP '
 'analysis))) '
 '(VP '
 '(VBD '
 'were) '
 '(VP '
 '(VBN '
 'carried) '
 '(PRT '
 '(RP '
 'out)) '
 '(PP '
 '(IN '
 'on) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(JJ '
 'palynological) '
 '(NN '
 'dataset)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NP '
 '(JJ '
 'total) '
 '(NNS '
 'counts)) '
 '(PP '
 '(IN '
 'per) '
 '(NP '
 '(NN '
 'gram)))) '
 '(-RRB- '
 '-RRB-)))) '
 '(S '
 '(VP '
 '(VP '
 '(TO '
 'to) '
 '(VP '
 '(VB '
 'confirm) '
 '(NP '
 '(NP '
 '(NN '
 'assemblage) '
 '(NNS '
 'designations)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NNP '
 'Figs) '
 '(NNP '
 '.) '
 '(CD '
 '4) '
 '(CC '
 'and) '
 '(CD '
 '5)) '
 '(-RRB- '
 '-RRB-))))) '
 '(, '
 ',) '
 '(VP '
 '(TO '
 'to) '
 '(VP '
 '(VB '
 'identify) '
 '(NP '
 '(NP '
 '(NP '
 '(DT

 'EPA)) '
 '(-RRB- '
 '-RRB-)) '
 '(NN '
 'Method) '
 '(CD '
 '29))))))) '
 '(: '
 ':))')
('(NP '
 '(NP '
 '(NP '
 '(NNS '
 'Results)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(NNP '
 'ICP) '
 '(: '
 '-) '
 '(NNP '
 'MS) '
 '(NN '
 'flue) '
 '(NN '
 'gas) '
 '(NN '
 'analysis)))) '
 '(VP '
 '(VBN '
 'provided) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NNP '
 'Fig)))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(NNP '
 'Fig) '
 '(NNP '
 '.) '
 '(CD '
 '6)) '
 '(VP '
 '(VBZ '
 'shows) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(NNS '
 'concentrations)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(JJ '
 'minor) '
 '(NNS '
 'elements))) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(JJ '
 'solid) '
 '(NN '
 'sorbent)) '
 '(, '
 ',) '
 '(SBAR '
 '(WHADVP '
 '(WRB '
 'where)) '
 '(S '
 '(NP '
 '(VBG '
 'increasing) '
 '(NN '
 'bed) '
 '(NN '
 'inventory)) '
 '(VP '
 '(VBD '
 'resulted) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NP '
 '(VBG '
 'increasing) '
 '(NNS '
 'values)) '
 '(VP '
 '(VBN '
 'observed) '
 '(PP 

 '(NN '
 'latitude) '
 '(NNS '
 'temperatures)) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NP '
 '(NNP '
 'Saturn) '
 '(POS '
 '’s)) '
 '(JJ '
 'upper) '
 '(NN '
 'atmosphere)))) '
 '(VP '
 '(VBN '
 'published) '
 '(PP '
 '(IN '
 'until) '
 '(NP '
 '(RB '
 'recently))))) '
 '(VP '
 '(VBD '
 'had) '
 '(NP '
 '(NP '
 '(NP '
 '(NP '
 '(NNS '
 'values)) '
 '(PP '
 '(IN '
 'below) '
 '(NP '
 '(FW '
 '∼460) '
 '(NN '
 'K)))) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NP '
 '(NNP '
 'Melin) '
 '(CC '
 'et) '
 '(NNP '
 'al) '
 '(NNP '
 '.)) '
 '(, '
 ',) '
 '(NP '
 '(CD '
 '2007))) '
 '(: '
 ';) '
 '(NP '
 '(NP '
 '(NNP '
 'Vervack) '
 '(CC '
 'and) '
 '(NNP '
 'Moses)) '
 '(, '
 ',) '
 '(NP '
 '(CD '
 '2012))) '
 '(-RRB- '
 '-RRB-))) '
 '(, '
 ',) '
 '(CC '
 'but) '
 '(NP '
 '(NNP '
 'Melin) '
 '(CC '
 'et) '
 '(NNP '
 'al)))) '
 '(. '
 '.))')
('(S '
 '(PP '
 '(IN '
 'Despite) '
 '(NP '
 '(NP '
 '(JJ '
 'direct) '
 '(JJ '
 'solar) '
 '(NNP '
 'EUV) '
 '(NN '
 'heating)) '
 '(PP '
 '(IN '
 'of) '
 

 'anhydrous) '
 '(NN '
 'MgF2)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NP '
 '(NNP '
 'Aldrich)) '
 '(, '
 ',) '
 '(NP '
 '(CD '
 '99.9) '
 '(NN '
 '%))) '
 '(-RRB- '
 '-RRB-)))))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(DT '
 'All) '
 '(NNS '
 'products)) '
 '(VP '
 '(VBD '
 'were) '
 '(ADVP '
 '(RB '
 'initially)) '
 '(VP '
 '(VBN '
 'characterised) '
 '(PP '
 '(IN '
 'by) '
 '(NP '
 '(NNP '
 'PXD))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(NNS '
 'Data)) '
 '(VP '
 '(VBD '
 'were) '
 '(VP '
 '(VP '
 '(VBN '
 'drawn) '
 '(PP '
 '(IN '
 'from) '
 '(NP '
 '(DT '
 'the) '
 '(NNP '
 'Whitehall) '
 '(NNP '
 'II) '
 '(NN '
 'study))) '
 '(PP '
 '(IN '
 'with) '
 '(NP '
 '(NP '
 '(JJ '
 'baseline) '
 '(NN '
 'examination)) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(CD '
 '1991)))))) '
 '(: '
 ';) '
 '(NP '
 '(NP '
 '(JJ '
 'follow) '
 '(JJ '
 '-) '
 '(JJ '
 'up) '
 '(NNS '
 'screenings)) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(CD '
 '1997) '
 '(, '
 ',) '
 '(CD '
 '2003) '
 '(, '
 ',) '
 '(CC '
 'and)

 '(NN '
 'incidence)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(JJ '
 'later) '
 '(: '
 '-) '
 '(JJ '
 'life) '
 '(NN '
 'depression)))) '
 '(, '
 ',) '
 '(PP '
 '(ADVP '
 '(RB '
 'primarily)) '
 '(IN '
 'via) '
 '(NP '
 '(NP '
 '(VBN '
 'reduced) '
 '(NNS '
 'rates)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(JJ '
 'manifest) '
 '(JJ '
 'vascular) '
 '(NN '
 'disease)))))))))) '
 '(. '
 '.))')
('(NP '
 '(NP '
 '(NP '
 '(NN '
 'Association)) '
 '(PP '
 '(PP '
 '(IN '
 'Between) '
 '(NP '
 '(NNP '
 'Framingham) '
 '(NN '
 'Risk) '
 '(NNS '
 'Scores))) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(PP '
 '(IN '
 'per) '
 '(NP '
 '(ADJP '
 '(CD '
 '10) '
 '(NN '
 '%)) '
 '(NN '
 'Increase))) '
 '(-RRB- '
 '-RRB-)))) '
 '(CC '
 'and) '
 '(NP '
 '(NP '
 '(NP '
 '(JJ '
 'Subsequent) '
 '(NN '
 'Onset)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(JJ '
 'Depressive) '
 '(NNS '
 'Symptoms)))) '
 '(PP '
 '(IN '
 'Before) '
 '(CC '
 'and) '
 '(IN '
 'After) '
 '(NP '
 '(NN '
 'Age) '
 '(CD '
 '65)))))')
('(S '
 '(NP '
 '(NP 

 '.))')
('(S '
 '(NP '
 '(NP '
 '(DT '
 'The) '
 '(VBN '
 'observed) '
 '(NN '
 'peak)) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(JJ '
 'fractional) '
 '(JJ '
 'integrated) '
 '(NN '
 'differential) '
 '(NN '
 'brightness)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(NN '
 'BS1))) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(DT '
 'the) '
 '(NN '
 'H) '
 '(NN '
 'filter)))))) '
 '(VP '
 '(VP '
 '(VBD '
 'was) '
 '(VP '
 '(VBN '
 'observed) '
 '(S '
 '(VP '
 '(TO '
 'to) '
 '(VP '
 '(VB '
 'be) '
 '(NP '
 '(CD '
 '0.64) '
 '(NN '
 '%)) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(DT '
 'the) '
 '(NNP '
 'discovery) '
 '(NN '
 'image))) '
 '(PP '
 '(IN '
 'on) '
 '(NP '
 '(CD '
 '26) '
 '(NNP '
 'October) '
 '(CD '
 '2011)))))))) '
 '(, '
 ',) '
 '(CC '
 'and) '
 '(VP '
 '(VBD '
 'declined) '
 '(PP '
 '(TO '
 'to) '
 '(NP '
 '(CD '
 '0.02) '
 '(NN '
 '%))) '
 '(PP '
 '(IN '
 'by) '
 '(NP '
 '(NNP '
 'December) '
 '(CD '
 '16))))) '
 '(. '
 '.))')
('(S '
 '(VP '
 '(VB '
 'Notice) '
 '(ADVP '

 '(NP '
 '(CD '
 '1)) '
 '(-RRB- '
 '-RRB-))) '
 '(PP '
 '(IN '
 'from) '
 '(NP '
 '(NNP '
 'PXD) '
 '(CC '
 'and) '
 '(NNP '
 'PND))) '
 '(IN '
 'at) '
 '(NP '
 '(CD '
 '298)) '
 '(. '
 'K.))')
('(S '
 '(LST '
 '(SYM '
 '►)) '
 '(NP '
 '(PRP '
 'We)) '
 '(VP '
 '(VBP '
 'examine) '
 '(NP '
 '(NP '
 '(DT '
 'a) '
 '(ADJP '
 '(JJ '
 'high) '
 '(NN '
 'resolution)) '
 '(JJ '
 'multi) '
 '(: '
 '-) '
 '(JJ '
 'proxy) '
 '(JJ '
 'physical) '
 '(NNS '
 'properties)) '
 '(PP '
 '(IN '
 'from) '
 '(NP '
 '(CD '
 'two) '
 '(JJ '
 'marine) '
 '(NNS '
 'cores))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(NP '
 '(NN '
 'Analysis)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(DT '
 'the) '
 '(NN '
 'diffraction) '
 '(NNS '
 'data)))) '
 '(VP '
 '(VBD '
 'was) '
 '(VP '
 '(VBN '
 'conducted) '
 '(PP '
 '(IN '
 'by) '
 '(S '
 '(VP '
 '(VBG '
 'measuring) '
 '(NP '
 '(NN '
 'peak) '
 '(NN '
 'intensity)) '
 '(PP '
 '(IN '
 'as) '
 '(NP '
 '(NN '
 'peak) '
 '(NN '
 'area))) '
 '(S '
 '(VP '
 '(VBG '
 'using) '
 '(N

 '(NN '
 'hash) '
 '(NN '
 'function) '
 '(NNP '
 'H)))) '
 '(VP '
 '(VBZ '
 'is) '
 '(VP '
 '(VBN '
 'given) '
 '(PP '
 '(IN '
 'as) '
 '(NP '
 '(NP '
 '(VBN '
 'Hashed) '
 '(NN '
 'Slice) '
 '(NN '
 'Precision)) '
 '(, '
 ',) '
 '(NP '
 '(NP '
 '(NNP '
 'HSP)) '
 '(: '
 '=) '
 '(NP '
 '(UH '
 'UH) '
 '(: '
 '/) '
 '(NP '
 '(PRP '
 'US)))))))) '
 '(. '
 '.))')
('(S '
 '(S '
 '(VP '
 '(TO '
 'To) '
 '(VP '
 '(VB '
 'assess) '
 '(SBAR '
 '(IN '
 'if) '
 '(S '
 '(NP '
 '(DT '
 'a) '
 '(NN '
 'program)) '
 '(VP '
 '(VBZ '
 'includes) '
 '(NP '
 '(DT '
 'a) '
 '(JJ '
 'large) '
 '(JJ '
 'coherent) '
 '(NN '
 'cluster)))))))) '
 '(, '
 ',) '
 '(VP '
 '(VBZ '
 'requires) '
 '(S '
 '(VP '
 '(VBG '
 'making) '
 '(NP '
 '(NP '
 '(DT '
 'a) '
 '(NN '
 'judgement)) '
 '(VP '
 '(VBG '
 'concerning) '
 '(SBAR '
 '(WHNP '
 '(WDT '
 'what) '
 '(NN '
 'threshold)) '
 '(S '
 '(VP '
 '(VBZ '
 'constitutes) '
 '(NP '
 '(JJ '
 'large)))))))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(NN '
 'Table) '
 '(CD '
 '4

 '(TO '
 'to) '
 '(VP '
 '(VB '
 'provide) '
 '(NP '
 '(NN '
 'node) '
 '(NN '
 'control)))))) '
 '(CC '
 'and) '
 '(S '
 '(NP '
 '(DT '
 'some) '
 '(NNS '
 'others)) '
 '(VP '
 '(TO '
 'to) '
 '(VP '
 '(VB '
 'run) '
 '(NP '
 '(DT '
 'the) '
 '(NN '
 'application)))))))))))) '
 '(. '
 '.))')
('(S '
 '(SBAR '
 '(IN '
 'As) '
 '(S '
 '(VP '
 '(VBN '
 'expected)))) '
 '(NP '
 '(PRP$ '
 'our) '
 '(NN '
 'design)) '
 '(VP '
 '(VBD '
 'did) '
 '(RB '
 'not) '
 '(VP '
 '(VB '
 'deadlock) '
 '(SBAR '
 '(IN '
 'whereas) '
 '(S '
 '(NP '
 '(DT '
 'a) '
 '(JJ '
 'conventional) '
 '(NN '
 'unit)) '
 '(VP '
 '(VBN '
 'deadlocked) '
 '(NP '
 '(NP '
 '(QP '
 '(RB '
 'roughly) '
 '(CD '
 '2)) '
 '(NN '
 '%)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(NNS '
 'times)) '
 '(SBAR '
 '(WHNP '
 '(IN '
 'that)) '
 '(S '
 '(NP '
 '(DT '
 'a) '
 '(NN '
 'glitch)) '
 '(VP '
 '(VBZ '
 'appears)))))))))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(NNP '
 '•SOC) '
 '(NNS '
 'stocks)) '
 '(VP '
 '(VP '

 '(NP '
 '(NP '
 '(NN '
 'x)) '
 '(: '
 '=) '
 '(NP '
 '(CD '
 '0.072) '
 '(NN '
 'm)))) '
 '(PP '
 '(IN '
 'for) '
 '(NP '
 '(DT '
 'the) '
 '(ADJP '
 '(CD '
 '1.3) '
 '(NN '
 'mm)) '
 '(NN '
 'sand) '
 '(: '
 '-) '
 '(JJ '
 'rough) '
 '(NN '
 'beach))) '
 '(: '
 ';) '
 '(NP '
 '(NP '
 '(NNS '
 'results)) '
 '(PP '
 '(IN '
 'from) '
 '(NP '
 '(NP '
 '(NP '
 '(CD '
 '50) '
 '(JJ '
 'individual) '
 '(NNS '
 'events)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(JJ '
 'grey)) '
 '(-RRB- '
 '-RRB-))) '
 '(CC '
 'and) '
 '(NP '
 '(NP '
 '(NN '
 'ensemble)) '
 '(: '
 '-) '
 '(NP '
 '(NP '
 '(JJ '
 'averaged) '
 '(NN '
 'result)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(ADJP '
 '(JJ '
 'black)) '
 '(-RRB- '
 '-RRB-))))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(NN '
 'Flow) '
 '(NNS '
 'depths)) '
 '(VP '
 '(VBD '
 'were) '
 '(VP '
 '(VBN '
 'measured) '
 '(S '
 '(VP '
 '(VBG '
 'using) '
 '(NP '
 '(NP '
 '(NN '
 'Laser) '
 '(: '
 '-) '
 '(JJ '
 'induced) '
 '(NN '
 'fluorescence)) '
 '(PRN '
 '(-LRB- 

 '(-RRB- '
 '-RRB-)) '
 '(NN '
 'film)) '
 '(VP '
 '(VBN '
 'grown) '
 '(PP '
 '(IN '
 'by) '
 '(NP '
 '(NP '
 '(JJ '
 'Atmospheric) '
 '(JJ '
 'Atomic) '
 '(NN '
 'Layer) '
 '(NN '
 'Deposition)) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NNP '
 'AALD)) '
 '(-RRB- '
 '-RRB-)))) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NP '
 '(DT '
 'a) '
 '(NN '
 'matter)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(NNS '
 'minutes))))))) '
 '(VP '
 '(VBD '
 'was) '
 '(VP '
 '(VBN '
 'studied) '
 '(PP '
 '(IN '
 'as) '
 '(NP '
 '(NP '
 '(DT '
 'a) '
 '(NN '
 'hole) '
 '(: '
 '-) '
 '(JJ '
 'blocking) '
 '(NN '
 'layer)) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(ADJP '
 '(NN '
 'poly(3-hexylthiophene-2,5-diyl):[6,6]-phenyl) '
 '(: '
 '-) '
 '(JJ '
 'C61-buyric) '
 '(NN '
 'acid) '
 '(NN '
 'methyl) '
 '(NN '
 'ester) '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NP '
 '(NN '
 'P3HT)) '
 '(: '
 ':) '
 '(NP '
 '(NNP '
 'PCBM)) '
 '(-RRB- '
 '-RRB-)) '
 '(NN '
 'based)) '
 '(VBN '
 'inverted) '
 '(JJ '
 'solar) '
 '(NNS '
 'cells

('(S '
 '(NP '
 '(DT '
 'The) '
 '(JJ '
 'following) '
 '(NN '
 'significance) '
 '(NN '
 'diagram)) '
 '(VP '
 '(VBZ '
 'displays) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(JJ '
 'average) '
 '(NN '
 'rank)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(DT '
 'the) '
 '(NNS '
 'classifiers))) '
 '(PP '
 '(IN '
 'at) '
 '(NP '
 '(DT '
 'an) '
 '(ADJP '
 '(ADJP '
 '(CD '
 '85) '
 '(NN '
 '%)) '
 '(JJ '
 'good)) '
 '(, '
 ',) '
 '(ADJP '
 '(CD '
 '15) '
 '(NN '
 '%)) '
 '(JJ '
 'bad) '
 '(NN '
 'class) '
 '(NN '
 'split))))) '
 '(: '
 ':))')
('(NP '
 '(NP '
 '(NNP '
 'AR) '
 '(NN '
 'comparison)) '
 '(PP '
 '(IN '
 'at) '
 '(NP '
 '(NP '
 '(DT '
 'an) '
 '(ADJP '
 '(CD '
 '85/15) '
 '(NN '
 '%)) '
 '(NN '
 'split)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(ADJP '
 '(JJ '
 'good) '
 '(CC '
 '/) '
 '(JJ '
 'bad)) '
 '(NNS '
 'observations))))) '
 '(. '
 '.))')
('(NP '
 '(NP '
 '(DT '
 'The) '
 '(NN '
 'generator) '
 '(NN '
 'efficiency)) '
 '(, '
 ',) '
 '(VP '
 '(VBN '
 'defined) '
 '(PP '
 '(IN '
 'in) '


 'multiphase) '
 '(NN '
 'fluid) '
 '(NN '
 'flow) '
 '(NNS '
 'simulations)))))) '
 '(, '
 ',) '
 '(NP '
 '(PRP '
 'we)) '
 '(VP '
 '(VBP '
 'have) '
 '(VP '
 '(VBN '
 'estimated) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(NN '
 'impact)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(DT '
 'the) '
 '(NN '
 'reservoir) '
 '(NN '
 'temperature))) '
 '(PP '
 '(IN '
 'on) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(NNP '
 '4D) '
 '(JJ '
 'seismic) '
 '(NNS '
 'data)) '
 '(PP '
 '(IN '
 'from) '
 '(NP '
 '(NNP '
 'Ketzin)))))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(DT '
 'An) '
 '(JJ '
 'additional) '
 '(NN '
 'uncertainty)) '
 '(VP '
 '(VBZ '
 'arises) '
 '(PP '
 '(IN '
 'from) '
 '(NP '
 '(NP '
 '(NN '
 'saturation) '
 '(NN '
 'profiling)) '
 '(, '
 ',) '
 '(SBAR '
 '(WHNP '
 '(WDT '
 'which)) '
 '(S '
 '(VP '
 '(VBZ '
 'discerns) '
 '(PP '
 '(PP '
 '(IN '
 'between) '
 '(NP '
 '(NP '
 '(NP '
 '(NNS '
 'pores)) '
 '(VP '
 '(VBN '
 'filled) '
 '(PP '
 '(IN '
 'with) '
 '(NP '
 '(NN '
 'brine))))) '
 '(CC '
 'a

 '(DT '
 'Another) '
 '(JJ '
 'important) '
 '(NN '
 'finding)) '
 '(VP '
 '(VBZ '
 'is) '
 '(SBAR '
 '(IN '
 'that) '
 '(, '
 ',) '
 '(S '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(NP '
 '(NN '
 'spite)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(NP '
 '(DT '
 'the) '
 '(JJ '
 'immune) '
 '(NNS '
 'responses)) '
 '(VP '
 '(VBN '
 'mounted) '
 '(PP '
 '(IN '
 'by) '
 '(NP '
 '(DT '
 'the) '
 '(NN '
 'host) '
 '(NN '
 'brain)))))))) '
 '(, '
 ',) '
 '(NP '
 '(NP '
 '(DT '
 'a) '
 '(JJ '
 'substantial) '
 '(NN '
 'number)) '
 '(PP '
 '(IN '
 'of) '
 '(NP '
 '(: '
 'TH+) '
 '(NNS '
 'cells)))) '
 '(VP '
 '(VBD '
 'survived) '
 '(PP '
 '(IN '
 'in) '
 '(NP '
 '(DT '
 'the) '
 '(NNS '
 'allografts))))))) '
 '(. '
 '.))')
('(S '
 '(NP '
 '(PRN '
 '(-LRB- '
 '-LRB-) '
 '(NN '
 'C) '
 '(-RRB- '
 '-RRB-)) '
 '(NN '
 'C) '
 '(: '
 '-) '
 '(NN '
 'peptide) '
 '(CC '
 'and) '
 '(CD '
 'PDX1) '
 '(NN '
 'expression)) '
 '(VP '
 '(VBD '
 'was) '
 '(VP '
 '(VBN '
 'confirmed) '
 '(PP '
 '(IN '
 'by) '
 '(NP '
 '(

SyntaxError: invalid syntax (<ipython-input-122-d2aa967f9aed>, line 2)

In [None]:
"""
One Long document in json format
"""


import math
twitjson = {"full_text": "","entities":{}}
offset = 0

for doc in data.values():

    testjson  = doc.doc.to_json()


    
    twitjson["full_text"] = twitjson["full_text"] +  testjson["text"]


    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["indices"] = [offset + tok["start"],offset + tok["end"]] 
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        try:
            twitjson["entities"]["Token"].append(tempToken)
        except KeyError:
            twitjson["entities"]["Token"] = [tempToken] 

#     for ent in testjson["ents"]:
#         tempEnt = {}
#         tempEnt["indices"] = [offset + ent["start"],offset + ent["end"]] 
#         try:
#             twitjson["entities"][ent["label"]].append(tempEnt)
#         except KeyError:
#             twitjson["entities"][ent["label"]] = [tempEnt]
            
#     for unit in doc.doc._.unit:
#         tempUnit = {}
#         tempUnit["indices"] = [int(unit["start"]),int(unit["end"])]
#         tempUnit["text"]= unit["text"].text
#         try:
#             twitjson["entities"]["unit"].append(tempUnit)
#         except KeyError:
#             twitjson["entities"]["unit"] = [tempUnit]

            
    for sent in testjson["sents"]:
        tempSent = {}
        tempSent["indices"] = [offset + sent["start"],offset + sent["end"]] 
        try:
            twitjson["entities"]["sentence"].append(tempSent)
        except KeyError:
            twitjson["entities"]["sentence"] = [tempSent] 
            
    for index, row in doc.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["indices"] = [offset + row["startOffset"],offset + row["endOffset"]] 
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        try:
            twitjson["entities"]["MEval-"+row["annotType"]].append(tempAnnot)
        except KeyError:
            twitjson["entities"]["MEval-"+row["annotType"]] = [tempAnnot] 
            
#      doc._.h0Number = []
#     doc._.h0Unit = []
#     doc._.h0MeasuredEntity = []
            
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Number"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Number"] = [temp]
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Unit"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Unit"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntity"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntity"] = [temp]
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0NumberTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0NumberTP"] = [temp]
            
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0UnitTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0UnitTP"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["indices"] = [offset + int(num["start"]),offset + int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntityTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntityTP"] = [temp]
            
    twitjson["full_text"] = twitjson["full_text"] +  "\n\n"
    offset += len(twitjson["full_text"])
    
    if offset > 1000 and offset < 4000:
        json.dump(twitjson, open(f'jsondoctest/sample.json',"w"), indent=3)


json.dump(twitjson, open(f'jsondoctest/alldocs.json',"w"), indent=3)

In [38]:
def createFeature(key, value, file):
    file.write(f"""<Feature>
  <Name className="java.lang.String">{key}</Name>
  <Value className="java.lang.String">{value}</Value>
</Feature>\n""")

def createAnnotation(ID, tpe, start, end, features, file):
    file.write(f"<Annotation Id=\"{ID}\" Type=\"{tpe}\" StartNode=\"{start}\" EndNode=\"{end}\">\n")
    for key in features.keys():
        createFeature(key,features[key],file)
    file.write("</Annotation>\n")
    
def createNode(token,doc,offset,file,prevEnd):
    txt = token.text
    txt = txt.replace("'","&apos;")
    txt = txt.replace("\"","&quot;")
    txt = txt.replace("&","&amp;")
    txt = txt.replace("<","&lt;")
    txt = txt.replace(">","&gt;")
    
    start = doc[token.i:token.i+1].start_char+offset
    end = offset+doc[token.i:token.i+1].end_char
    
    if(start == prevEnd):
        file.write("{}<Node id=\"{}\"/>".format(txt,end))
        
    elif(start > prevEnd):
        file.write(" <Node id=\"{}\"/>{}<Node id=\"{}\"/>".format(start,txt,end))
    else:
        print("case Unhandled")
    
    return end
    

In [39]:
"""
One Long document in xml format
"""
import os 
file = open("gatexmlforalldocs.xml", "w", encoding = "utf-8")
txtFile = open("textFileForGatexml.txt", "w", encoding = "utf-8")

file.write("""<?xml version='1.0' encoding='utf-8'?>
<GateDocument version="3">
<GateDocumentFeatures>""")
createFeature("gate.SourceURL",os.path.join(os.getcwd(), "textFileForGatexml.txt"),file)
createFeature("MimeType","text/plain",file)
createFeature("docNewLineType","",file)
file.write("\n</GateDocumentFeatures>\n\n")
file.write("<TextWithNodes>")


offset = 0
annotId = 0
annotz = []
for e in data.values():
    testjson  = e.doc.to_json()
    prevEnd = -1
    
    for sent in e.doc.sents:
        for token in sent:
            prevEnd = createNode(token,e.doc,offset,file,prevEnd)


    
    txtFile.write(testjson["text"] + "\n\n")
    


    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        
        annotz.append([annotId, "Token", offset + tok["start"], offset + tok["end"], tempToken, file])
        #createAnnotation(annotId, "Token", offset + tok["start"], offset + tok["end"], tempToken, file)
        annotId += 1
            
            
    for sent in testjson["sents"]:
        tempSent = {}
        annotz.append([annotId, "sentence", offset + sent["start"], offset + sent["end"], tempSent, file])
        #createAnnotation(annotId, "sentence", offset + sent["start"], offset + sent["end"], tempSent, file)
        annotId += 1 
        
            
    for index, row in e.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        annotz.append([annotId, "MEval-"+row["annotType"] , offset + row["startOffset"], offset + row["endOffset"], tempAnnot, file])    
        #createAnnotation(annotId, "MEval-"+row["annotType"] , offset + row["startOffset"], offset + row["endOffset"], tempAnnot, file)
        annotId += 1
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0Number", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0Number", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0Unit", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0Unit", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0MeasuredEntity", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0MeasuredEntity", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0NumberTP", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0NumberTP", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
        
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["text"]= num["text"]
        
        annotz.append([annotId, "h0UnitTP", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0UnitTP", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["text"] = num["text"]
        
        annotz.append([annotId, "h0MeasuredEntityTP", offset + num["start"], offset + num["end"], temp, file])
        #createAnnotation(annotId, "h0MeasuredEntityTP", offset + num["start"], offset + num["end"], temp, file)
        annotId += 1 
            
    offset += len(testjson["text"])
    break
    
    
file.write("\n</TextWithNodes>\n\n")    
    
file.write("<AnnotationSet Name=\"Bens annots\">\n")

for x in annotz:
    createAnnotation(*x)
    
file.write("</AnnotationSet>")
file.write("</GateDocument>")    
file.close()

txtFile.close()



In [63]:
"""
Document by document in json format
"""


import math
for doc in data.values():
    print(doc.name)

    testjson  = doc.doc.to_json()


    twitjson = {"text": testjson["text"],"entities":{}}

    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["indices"] = [tok["start"],tok["end"]] 
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        
            
        try:
            twitjson["entities"]["Token"].append(tempToken)
        except KeyError:
            twitjson["entities"]["Token"] = [tempToken] 

    for ent in testjson["ents"]:
        tempEnt = {}
        tempEnt["indices"] = [ent["start"],ent["end"]] 
        try:
            twitjson["entities"][ent["label"]].append(tempEnt)
        except KeyError:
            twitjson["entities"][ent["label"]] = [tempEnt]
            
            
    for sent in doc.doc.sents:
        for tok in sent: 
            tempEnt = {}
            if tok.dep_ == "root":
                tempEnt["args"] = ["",tok.text]
            else:
                tempEnt["args"] = [tok.head.text,tok.text]
                
            tempEnt["kind"] = tok.dep_
            
            mn = min(doc.doc[tok.head.i:tok.head.i+1].start_char,doc.doc[tok.i:tok.i+1].start_char)
            mx = max(doc.doc[tok.head.i:tok.head.i+1].end_char,doc.doc[tok.i:tok.i+1].end_char)
            
            tempEnt["indices"] = [mn,mx]
        
            try:
                twitjson["entities"]["NickDependency"].append(tempEnt)
            except KeyError:
                twitjson["entities"]["NickDependency"] = [tempEnt]
            
            
#     for unit in doc.doc._.unit:
#         tempUnit = {}
#         tempUnit["indices"] = [int(unit["start"]),int(unit["end"])]
#         tempUnit["text"]= unit["text"].text
#         try:
#             twitjson["entities"]["unit"].append(tempUnit)
#         except KeyError:
#             twitjson["entities"]["unit"] = [tempUnit]

            
    for sent in testjson["sents"]:
        tempSent = {}
        tempSent["indices"] = [sent["start"],sent["end"]] 
        try:
            twitjson["entities"]["sentence"].append(tempSent)
        except KeyError:
            twitjson["entities"]["sentence"] = [tempSent] 
            
    for index, row in doc.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["indices"] = [row["startOffset"],row["endOffset"]] 
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        try:
            twitjson["entities"]["MEval-"+row["annotType"]].append(tempAnnot)
        except KeyError:
            twitjson["entities"]["MEval-"+row["annotType"]] = [tempAnnot] 
            
#      doc._.h0Number = []
#     doc._.h0Unit = []
#     doc._.h0MeasuredEntity = []
            
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Number"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Number"] = [temp]
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Unit"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Unit"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntity"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntity"] = [temp]
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0NumberTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0NumberTP"] = [temp]
            
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0UnitTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0UnitTP"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntityTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntityTP"] = [temp]
    


    json.dump(twitjson, open(f'jsondocs/{doc.name}.json',"w"), indent=3)
    
    


S0012821X12004384-1302
S0012821X12004384-1405
S0012821X12004384-1415
S0012821X12004384-1594
S0012821X12004384-1599
S0012821X13002185-1061
S0012821X13002185-1200
S0012821X13002185-1217
S0012821X13002185-1231
S0012821X13002185-835
S0012821X13007309-1482
S0012821X13007309-1509
S0012821X13007309-1605
S0012821X13007309-1989
S0016236113008041-3031
S0016236113008041-3112
S0016236113008041-3153
S0016236113008041-3171
S0016236113008041-3186
S0016236113008041-3207
S0016236113008041-3269
S0016236113008041-3290
S0016236113008041-890
S0016236113008041-913
S0016236113008041-967
S0019103511004994-1382
S0019103511004994-1511
S0019103511004994-1565
S0019103512002801-1342
S0019103512002801-1496
S0019103512002801-1608
S0019103512002801-1824
S0019103512002801-1849
S0019103512002801-1927
S0019103512003533-3299
S0019103512003533-3348
S0019103512003533-4685
S0019103512003533-4971
S0019103512003533-5031
S0019103512003533-5072
S0019103512003533-5251
S0019103512003533-5598
S0019103512003995-1807
S00191035120039