This notebook contains code for loading in all competition texts and annotations, and converting these to gate readable format

In [2]:
#imports
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json

TRIALPATH = "data/trial"
TRAINPATH = "data/train"
LATEST_tsv = None

#setup SpaCy
import spacy
import en_core_web_lg
spacy.prefer_gpu()
nlp = en_core_web_lg.load()

Rule-based Pipeline component for retrieving Quantities. 

Rules: 
    1. any CD followed by a unit that has a nouns POS
    2. any cardinal, money, ordinal, percent, date, time or quantity followed by a unit that has a noun POS
    3. any token that is LIKE_NUM followed by a unit that has a noun POS



In [4]:
#add pipeline component 
from spacy.matcher import Matcher
from spacy.tokens import Doc
Doc.set_extension("unit", default = "def", force = True)


def customMatcher(nlp):
    """
    Description: matcher giving the most recall so far
    """
    matchList = open("gazetteers/combined_measurements.lst","r",encoding="utf-8").read().split("\n")
    matcher = Matcher(nlp.vocab)
    pattern = []
    for word in matchList: 
        pattern.append([{"TAG": {"REGEX": "^[CD]"}},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        pattern.append([{"ENT_TYPE": {"IN": ["CARDINAL", "MONEY", "ORDINAL", "PERCENT", "DATE", "TIME", "QUANTITY"]},
                        "TAG":{"REGEX": "^[DT]"},"op": "!"},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        pattern.append([{"LIKE_NUM": True},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        #pattern.append([{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        
    matcher.add("Unit", None, *pattern)
        
    return matcher

def gazetteer(doc):
    matcher = customMatcher(nlp)
    matches = matcher(doc)
    doc._.unit = []
    for match_id, start, end in matches:
        tempSpan = doc[start:end]
        doc._.unit.append({'start': tempSpan.start_char, 'end': tempSpan.end_char, 'label': 'UNIT', 'text' : doc[start:end]})
    return doc
        
        
        
#nlp.add_pipe(gazetteer, last=True)
    

In [5]:
#pipeline component H0

Doc.set_extension("h0Number", default = "def", force = True)
Doc.set_extension("h0Unit", default = "def", force = True)
Doc.set_extension("h0MeasuredEntity", default = "def", force = True)
Doc.set_extension("h0Measurements", default = "def", force = True)

def numberMatcher(nlp):
    """
    Description: matcher giving the most recall so far
    """
    matcher = Matcher(nlp.vocab)
    pattern = []
    pattern.append([{"LIKE_NUM": True}])
    matcher.add("h0Number", None, *pattern)
        
    return matcher


def h0(doc):
    matcher = numberMatcher(nlp)
    matches = matcher(doc)
    doc._.h0Number = []
    doc._.h0Unit = []
    doc._.h0MeasuredEntity = []
    doc._.h0Measurements = []
    for match_id, start, end in matches:
        
        tempSpan = doc[start:end]
        tempTok = doc[start]
        tempNum = {
            'start': tempSpan.start_char, 
            'end': tempSpan.end_char, 
            'label': 'h0Number', 
            'text' : tempTok.text,
            'span' : tempSpan,
            's' : start,
            'e' : end
        }
        
        doc._.h0Number.append(tempNum)
        
        tempHead = tempTok.head
        spanHead = doc[tempHead.i:tempHead.i+1]
        tempUnit = {
            'start': spanHead.start_char, 
            'end': spanHead.end_char, 
            'label': 'h0Unit', 
            'text' : tempHead.text,
            'span' : spanHead,
            's' : tempHead.i,
            'e' : tempHead.i+1
        }
        
        doc._.h0Unit.append(tempUnit)
        
        tempHeadHead = tempTok.head.head
        spanHeadHead = doc[tempHeadHead.i:tempHeadHead.i+1]
        tempME = {
            'start': spanHeadHead.start_char, 
            'end': spanHeadHead.end_char, 
            'label': 'h0MeasuredEntity', 
            'text' : tempHeadHead.text,
            'span' : spanHeadHead,
            's' : tempHeadHead.i,
            'e' : tempHeadHead.i+1
        }
        
        doc._.h0MeasuredEntity.append(tempME)
        
        doc._.h0Measurements.append({
            "Number" : tempNum,
            "Unit" : tempUnit,
            "MeasuredEntity": tempME
        })
        
        
    return doc


nlp.add_pipe(h0, last=True)

In [6]:
from code.exerpt import exerpt

In [7]:
#
#TEST ONE
#

def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}
#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )
        break

In [8]:
import time
t1 = time.time()
def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}

#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )

#load all train data
for fn in [x for x in os.listdir(os.path.join(TRAINPATH,"text")) if x[:-4]+".tsv" in os.listdir(os.path.join(TRAINPATH,"tsv"))]:
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRAINPATH, "text", fn[:-4] + ".txt")),
                "none",
                pd.read_csv(os.path.join(TRAINPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRAINPATH, "grobid", fn[:-4] + ".grobid"))),
                nlp
            )
            
t2 = time.time()
print(t2-t1, "Seconds elapsed")
print((t2-t1)/60, "Minutes elapsed")

FindOffset method has created a key error 
origrange: ( 147 , 161 )
range: ( 147 , 162 )
{0: 0, 5: 1, 8: 2, 14: 3, 17: 4, 25: 5, 29: 6, 34: 7, 35: 8, 41: 9, 43: 10, 48: 11, 50: 12, 54: 13, 56: 14, 57: 15, 59: 16, 62: 17, 69: 18, 73: 19, 80: 20, 83: 21, 86: 22, 97: 23, 101: 24, 110: 25, 116: 26, 119: 27, 121: 28, 130: 29, 133: 30, 139: 31, 146: 32, 151: 33, 155: 34, 162: 35, 163: 36, 167: 37, 169: 38, 176: 39, 178: 40, 179: 41, 181: 42, 188: 43, 196: 44, 203: 45, 208: 46, 213: 47, 216: 48, 227: 49, 230: 50, 232: 51, 240: 52, 250: 53, 259: 54, 262: 55, 269: 56, 273: 57, 280: 58, 288: 59, 289: 60, 296: 61, 299: 62, 301: 63, 302: 64, 304: 65, 308: 66, 309: 67, 311: 68, 319: 69, 323: 70, 325: 71, 329: 72, 334: 73, 346: 74, 351: 75, 354: 76, 359: 77, 364: 78, 368: 79, 382: 80, 386: 81, 392: 82, 395: 83, 398: 84, 405: 85, 415: 86, 417: 87, 426: 88, 432: 89, 446: 90, 449: 91, 453: 92, 456: 93, 468: 94, 475: 95, 482: 96, 485: 97, 488: 98, 495: 99, 505: 100, 508: 101, 512: 102, 522: 103, 525: 10

FindOffset method has created a key error 
origrange: ( 1711 , 1715 )
range: ( 1711 , 1715 )
{0: 0, 4: 1, 13: 2, 18: 3, 26: 4, 30: 5, 33: 6, 37: 7, 42: 8, 50: 9, 62: 10, 72: 11, 74: 12, 76: 13, 80: 14, 82: 15, 91: 16, 92: 17, 97: 18, 103: 19, 106: 20, 114: 21, 121: 22, 123: 23, 124: 24, 125: 25, 126: 26, 128: 27, 133: 28, 143: 29, 146: 30, 150: 31, 158: 32, 161: 33, 172: 34, 175: 35, 177: 36, 185: 37, 193: 38, 203: 39, 212: 40, 213: 41, 215: 42, 217: 43, 228: 44, 236: 45, 239: 46, 251: 47, 260: 48, 261: 49, 267: 50, 269: 51, 270: 52, 272: 53, 273: 54, 274: 55, 276: 56, 283: 57, 290: 58, 295: 59, 306: 60, 309: 61, 321: 62, 324: 63, 329: 64, 331: 65, 341: 66, 350: 67, 353: 68, 357: 69, 358: 70, 364: 71, 372: 72, 382: 73, 384: 74, 394: 75, 397: 76, 402: 77, 404: 78, 411: 79, 419: 80, 428: 81, 432: 82, 440: 83, 444: 84, 447: 85, 451: 86, 456: 87, 461: 88, 469: 89, 470: 90, 476: 91, 478: 92, 480: 93, 486: 94, 495: 95, 504: 96, 517: 97, 518: 98, 529: 99, 532: 100, 534: 101, 535: 102, 537: 10

FindOffset method has created a key error 
origrange: ( 538 , 551 )
range: ( 538 , 552 )
{0: 0, 9: 1, 12: 2, 16: 3, 27: 4, 30: 5, 37: 6, 40: 7, 44: 8, 49: 9, 52: 10, 56: 11, 58: 12, 66: 13, 74: 14, 77: 15, 79: 16, 86: 17, 88: 18, 96: 19, 99: 20, 109: 21, 114: 22, 116: 23, 125: 24, 129: 25, 132: 26, 142: 27, 145: 28, 155: 29, 157: 30, 167: 31, 176: 32, 179: 33, 186: 34, 189: 35, 196: 36, 206: 37, 209: 38, 212: 39, 221: 40, 232: 41, 240: 42, 242: 43, 249: 44, 251: 45, 262: 46, 271: 47, 274: 48, 279: 49, 284: 50, 291: 51, 295: 52, 303: 53, 308: 54, 317: 55, 324: 56, 335: 57, 341: 58, 345: 59, 349: 60, 356: 61, 359: 62, 366: 63, 367: 64, 368: 65, 369: 66, 371: 67, 382: 68, 391: 69, 394: 70, 396: 71, 403: 72, 407: 73, 418: 74, 428: 75, 430: 76, 435: 77, 437: 78, 446: 79, 457: 80, 465: 81, 470: 82, 472: 83, 475: 84, 477: 85, 479: 86, 480: 87, 483: 88, 485: 89, 488: 90, 492: 91, 500: 92, 510: 93, 512: 94, 516: 95, 519: 96, 529: 97, 532: 98, 537: 99, 547: 100, 552: 101, 555: 102, 562: 103, 567

FindOffset method has created a key error 
origrange: ( 148 , 154 )
range: ( 148 , 155 )
{0: 0, 4: 1, 14: 2, 18: 3, 22: 4, 24: 5, 26: 6, 31: 7, 34: 8, 39: 9, 51: 10, 52: 11, 55: 12, 57: 13, 58: 14, 59: 15, 61: 16, 66: 17, 70: 18, 78: 19, 81: 20, 84: 21, 87: 22, 100: 23, 104: 24, 108: 25, 113: 26, 116: 27, 123: 28, 125: 29, 128: 30, 133: 31, 141: 32, 143: 33, 146: 34, 153: 35, 155: 36, 159: 37, 165: 38, 171: 39, 174: 40, 180: 41, 185: 42, 189: 43, 196: 44, 203: 45, 211: 46, 215: 47, 221: 48, 224: 49, 228: 50, 229: 51, 230: 52, 240: 53, 244: 54, 249: 55, 252: 56, 256: 57, 257: 58, 258: 59, 267: 60, 269: 61, 274: 62, 279: 63, 283: 64, 288: 65, 291: 66, 302: 67, 311: 68, 315: 69, 320: 70, 325: 71, 328: 72, 338: 73, 340: 74, 348: 75, 350: 76, 353: 77, 355: 78, 365: 79, 374: 80, 379: 81, 385: 82, 388: 83, 395: 84, 398: 85, 408: 86, 412: 87, 415: 88, 423: 89, 427: 90, 433: 91, 439: 92, 442: 93, 450: 94, 457: 95, 460: 96, 463: 97, 469: 98, 474: 99, 478: 100, 482: 101, 485: 102, 489: 103, 490: 

FindOffset method has created a key error 
origrange: ( 268 , 273 )
range: ( 268 , 273 )
{0: 0, 9: 1, 19: 2, 29: 3, 39: 4, 43: 5, 49: 6, 52: 7, 65: 8, 69: 9, 82: 10, 89: 11, 93: 12, 102: 13, 105: 14, 111: 15, 119: 16, 123: 17, 131: 18, 135: 19, 139: 20, 149: 21, 151: 22, 154: 23, 156: 24, 158: 25, 167: 26, 171: 27, 172: 28, 179: 29, 182: 30, 186: 31, 196: 32, 205: 33, 208: 34, 216: 35, 221: 36, 222: 37, 228: 38, 238: 39, 242: 40, 246: 41, 250: 42, 253: 43, 257: 44, 261: 45, 265: 46, 268: 47, 275: 48, 277: 49, 282: 50, 287: 51, 291: 52, 299: 53, 303: 54, 309: 55, 318: 56, 321: 57, 326: 58, 328: 59, 330: 60, 336: 61, 339: 62, 343: 63, 348: 64, 356: 65, 361: 66, 364: 67, 369: 68, 374: 69, 375: 70, 381: 71, 389: 72, 391: 73, 395: 74, 400: 75, 406: 76, 410: 77, 418: 78, 424: 79, 427: 80, 433: 81, 440: 82, 448: 83, 452: 84, 464: 85, 469: 86, 471: 87, 480: 88, 490: 89, 494: 90, 505: 91, 510: 92, 522: 93}
FindOffset method has created a key error 
origrange: ( 573 , 584 )
range: ( 573 , 585 )


FindOffset method has created a key error 
origrange: ( 599 , 602 )
range: ( 599 , 602 )
{0: 0, 6: 1, 9: 2, 16: 3, 24: 4, 26: 5, 28: 6, 33: 7, 36: 8, 45: 9, 52: 10, 56: 11, 63: 12, 65: 13, 72: 14, 81: 15, 90: 16, 99: 17, 106: 18, 109: 19, 113: 20, 114: 21, 123: 22, 133: 23, 135: 24, 138: 25, 149: 26, 153: 27, 154: 28, 157: 29, 165: 30, 172: 31, 176: 32, 178: 33, 181: 34, 186: 35, 189: 36, 191: 37, 194: 38, 201: 39, 205: 40, 206: 41, 213: 42, 219: 43, 221: 44, 225: 45, 233: 46, 241: 47, 250: 48, 260: 49, 264: 50, 276: 51, 281: 52, 287: 53, 294: 54, 298: 55, 303: 56, 305: 57, 309: 58, 312: 59, 316: 60, 321: 61, 323: 62, 327: 63, 329: 64, 331: 65, 335: 66, 343: 67, 351: 68, 360: 69, 370: 70, 374: 71, 387: 72, 392: 73, 398: 74, 405: 75, 413: 76, 417: 77, 422: 78, 424: 79, 428: 80, 431: 81, 435: 82, 440: 83, 442: 84, 446: 85, 448: 86, 450: 87, 456: 88, 461: 89, 464: 90, 470: 91, 482: 92, 485: 93, 493: 94, 502: 95, 512: 96, 516: 97, 528: 98, 533: 99, 534: 100, 538: 101, 540: 102, 542: 103, 5

In [9]:
from code.helpers import *



EVALUATION -- START


In [10]:
#get false positives and true positives
Doc.set_extension("h0NumberTps", default = "def", force = True)
Doc.set_extension("h0UnitTps", default = "def", force = True)
Doc.set_extension("h0MeasuredEntityTps", default = "def", force = True)
Doc.set_extension("h0NumberFps", default = "def", force = True)
Doc.set_extension("h0UnitFps", default = "def", force = True)
Doc.set_extension("h0MeasuredEntityFps", default = "def", force = True)
Doc.set_extension("h0MeasurementTps", default = "def", force = True)

for e in data.values():
    doc = e.doc
    doc._.h0NumberTps = []
    doc._.h0UnitTps = []
    doc._.h0MeasuredEntityTps = []
    doc._.h0NumberFps = []
    doc._.h0UnitFps = []
    doc._.h0MeasuredEntityFps = []

    
    for meas in doc._.h0Measurements:
        num = meas["Number"]
        unit = meas["Unit"]
        me = meas["MeasuredEntity"]
        
        for m in e.measurements.values():
            try:
                if(intersectSpan(num["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"]) and intersectSpan(unit["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"])):
                    doc._.h0NumberTps.append(num)
                    doc._.h0UnitTps.append(unit)
                    if(intersectSpan(me["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])):
                        doc._.h0MeasuredEntityTps.append(num)
                        doc._.h0MeasurementTps.appemd(meas)
                    else:
                        r=dict(d)
                        del r["MeasuredEntity"]
                        doc._.h0MeasurementTps.append(r)
                        
            except KeyError:
                continue#print("No quantity")

No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity
No quantity


In [None]:
goldCount = {
    "Quantity": 0,
    "MeasuredEntity" : 0,
    "MeasuredProperty" : 0,
    "Qualifier" : 0       
}
h0Count = {
    "Number":0,
    "Unit":0,
    "MeasuredEntity":0,
    "total":0
}


counts={
    "goldCount" : goldCount,
    "h0Count":h0Count
}

for e in data.values():
    for index, row in e.tsv.iterrows():
            counts["goldCount"][row["annotType"]] += 1
    counts["h0Count"]["Number"] += len(e.doc._.h0NumberTps)
    counts["h0Count"]["Unit"] += len(e.doc._.h0UnitTps)
    counts["h0Count"]["MeasuredEntity"] += len(e.doc._.h0MeasuredEntityTps)
    counts["h0Count"]["total"] += len(e.doc._.h0Measurements)
    
    
counts["QuantityPrecision"] = counts["h0Count"]["Number"]/counts["h0Count"]["total"]
counts["QuantityRecall"] = counts["h0Count"]["Number"]/counts["goldCount"]["Quantity"]
counts["QuantityF1"] = 2*(counts["QuantityRecall"]*counts["QuantityPrecision"])/(counts["QuantityRecall"]+counts["QuantityPrecision"])
counts["MEPrecision"] = counts["h0Count"]["MeasuredEntity"]/counts["h0Count"]["total"]
counts["MERecall"] = counts["h0Count"]["MeasuredEntity"]/counts["goldCount"]["Quantity"]
counts["MEF1"] = 2*(counts["MERecall"]*counts["MEPrecision"])/(counts["MERecall"]+counts["MEPrecision"])

     

In [None]:
counts


EVALUATION -- DONE


In [22]:
from code.output import ascii

for e in data.values():
    file = open(f"ascii/{e.name}.txt","w",encoding="utf-8")
    getAscii(e, file)
    file.close()

skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
skipping
s

In [10]:
# #get false positives and true positives
# Doc.set_extension("h0NumberTps", default = "def", force = True)
# Doc.set_extension("h0UnitTps", default = "def", force = True)
# Doc.set_extension("h0MeasuredEntityTps", default = "def", force = True)
# Doc.set_extension("h0NumberFps", default = "def", force = True)
# Doc.set_extension("h0UnitFps", default = "def", force = True)
# Doc.set_extension("h0MeasuredEntityFps", default = "def", force = True)


# for e in data.values():
#     doc = e.doc
#     doc._.h0NumberTps = []
#     doc._.h0UnitTps = []
#     doc._.h0MeasuredEntityTps = []
#     doc._.h0NumberFps = []
#     doc._.h0UnitFps = []
#     doc._.h0MeasuredEntityFps = []

    
#     for num in doc._.h0Number:
#         for m in e.measurements.values():
#             try:
#                 if(intersectSpan(num["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"])):
#                     print(e.name)
#                     doc._.h0NumberTps.append(num)
#                 else:
#                     doc._.h0NumberFps.append(num)
#             except KeyError:
#                 print("No quantity")
            
#     for num in doc._.h0Unit:
#         for m in e.measurements.values():
#             try:
#                 if(intersectSpan(num["span"],m["Quantity"]["startOffset"],m["Quantity"]["endOffset"])):
#                     doc._.h0UnitTps.append(num)
#                 else:
#                     doc._.h0UnitFps.append(num)
#             except KeyError:
#                 print("No quantity")

            
#     for num in doc._.h0MeasuredEntity:
#         for m in e.measurements.values():
#             try:
#                 if(intersectSpan(num["span"],m["MeasuredEntity"]["startOffset"],m["MeasuredEntity"]["endOffset"])):
#                     doc._.h0MeasuredEntityTps.append(num)
#                 else:
#                     doc._.h0MeasuredEntityFps.append(num)
#             except KeyError:
#                 print("No MeasuredEntity")

The following for loop converts the current spaCy and human annotation format into a gate readable one using text/x-json-twitter format

In [228]:
import math
for doc in data.values():

    testjson  = doc.doc.to_json()


    twitjson = {"full_text": testjson["text"],"entities":{}}


    for tok in testjson["tokens"]:
        tempToken = {}
        tempToken["indices"] = [tok["start"],tok["end"]] 
        tempToken["category"] = tok["tag"]
        tempToken["kind"] = tok["dep"]
        tempToken["id"] = tok["id"]
        tempToken["head"] = tok["head"]
        try:
            twitjson["entities"]["Token"].append(tempToken)
        except KeyError:
            twitjson["entities"]["Token"] = [tempToken] 

    for ent in testjson["ents"]:
        tempEnt = {}
        tempEnt["indices"] = [ent["start"],ent["end"]] 
        try:
            twitjson["entities"][ent["label"]].append(tempEnt)
        except KeyError:
            twitjson["entities"][ent["label"]] = [tempEnt]
            
#     for unit in doc.doc._.unit:
#         tempUnit = {}
#         tempUnit["indices"] = [int(unit["start"]),int(unit["end"])]
#         tempUnit["text"]= unit["text"].text
#         try:
#             twitjson["entities"]["unit"].append(tempUnit)
#         except KeyError:
#             twitjson["entities"]["unit"] = [tempUnit]

            
    for sent in testjson["sents"]:
        tempSent = {}
        tempSent["indices"] = [sent["start"],sent["end"]] 
        try:
            twitjson["entities"]["sentence"].append(tempSent)
        except KeyError:
            twitjson["entities"]["sentence"] = [tempSent] 
            
    for index, row in doc.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["indices"] = [row["startOffset"],row["endOffset"]] 
        tempAnnot["annotSet"] = row["annotSet"]
        tempAnnot["annotId"] = row["annotId"]
        tempAnnot["text"] = row["text"]
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        try:
            twitjson["entities"]["MEval-"+row["annotType"]].append(tempAnnot)
        except KeyError:
            twitjson["entities"]["MEval-"+row["annotType"]] = [tempAnnot] 
            
#      doc._.h0Number = []
#     doc._.h0Unit = []
#     doc._.h0MeasuredEntity = []
            
            
    for num in doc.doc._.h0Number:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Number"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Number"] = [temp]
            
    for num in doc.doc._.h0Unit:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0Unit"].append(temp)
        except KeyError:
            twitjson["entities"]["h0Unit"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntity:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntity"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntity"] = [temp]
            
    #True Positives        
    for num in doc.doc._.h0NumberTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0NumberTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0NumberTP"] = [temp]
            
    for num in doc.doc._.h0UnitTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0UnitTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0UnitTP"] = [temp]
            
    for num in doc.doc._.h0MeasuredEntityTps:
        temp= {}
        temp["indices"] = [int(num["start"]),int(num["end"])]
        temp["text"]= num["text"]
        try:
            twitjson["entities"]["h0MeasuredEntityTP"].append(temp)
        except KeyError:
            twitjson["entities"]["h0MeasuredEntityTP"] = [temp]
    


    json.dump(twitjson, open(f'jsondocs/{doc.name}.json',"w"), indent=3)

{'start': 597, 'end': 604, 'label': 'h0Number', 'text': '2617.35', 'span': 2617.35, 's': 116, 'e': 117}
{'start': 606, 'end': 613, 'label': 'h0Number', 'text': '2617.44', 'span': 2617.44, 's': 118, 'e': 119}
{'start': 615, 'end': 622, 'label': 'h0Number', 'text': '2614.73', 'span': 2614.73, 's': 120, 'e': 121}
{'start': 628, 'end': 635, 'label': 'h0Number', 'text': '2614.71', 'span': 2614.71, 's': 123, 'e': 124}
{'start': 778, 'end': 785, 'label': 'h0Number', 'text': '2614.73', 'span': 2614.73, 's': 159, 'e': 160}
{'start': 791, 'end': 798, 'label': 'h0Number', 'text': '2614.71', 'span': 2614.71, 's': 162, 'e': 163}
{'start': 1505, 'end': 1512, 'label': 'h0Number', 'text': '2614.71', 'span': 2614.71, 's': 305, 'e': 306}
{'start': 775, 'end': 776, 'label': 'h0Number', 'text': '0', 'span': 0, 's': 144, 'e': 145}
{'start': 781, 'end': 783, 'label': 'h0Number', 'text': '80', 'span': 80, 's': 147, 'e': 148}
{'start': 44, 'end': 48, 'label': 'h0Number', 'text': '2632', 'span': 2632, 's': 8, 

In [241]:
counts

{'goldCount': {'Quantity': 1087,
  'MeasuredEntity': 1056,
  'MeasuredProperty': 686,
  'Qualifier': 293},
 'h0Count': {'Number': 739, 'Unit': 739, 'MeasuredEntity': 56, 'total': 1768},
 'QuantityPrecision': 0.41798642533936653,
 'QuantityRecall': 0.6798528058877645,
 'QuantityF1': 0.5176882661996497,
 'MEPrecision': 0.03167420814479638,
 'MERecall': 0.051517939282428704,
 'MEF1': 0.03922942206654991}

In [232]:
for e in data.values():
    if (len(e.doc._.h0MeasuredEntityTps) > 0):
        print(e.name)

S0012821X12004384-1415
S0012821X13002185-1061
S0012821X13002185-1217
S0012821X13007309-1605
S0012821X13007309-1989
S0019103512003533-4685
S0019103512003995-1910
S0021979713004438-1969
S0021979713004438-2148
S0006322312001096-1177
S0006322312001096-1190
S0006322312001096-1194
S0006322312001096-1202
S0006322312001096-1221
S0012821X12004384-1148
S0012821X12004384-1232
S0012821X12004384-1249
S0012821X12004384-952
S0016236113008041-2924
S0016236113008041-3012
S0019103512003995-2737
S0021979713004438-1401
S0022399913003358-931
S0022399913003358-943
S0038071712001010-918
S016412121300188X-4617
S0167610512002292-3187
S030881461301604X-1001
S030881461301604X-1002
S0378383911001669-1088
S0921818113002245-1571
S0960148113004989-3277
S0960148113005727-1494
S175058361300203X-1483
S175058361300203X-1542
S1750583613004192-714
S1873506113001116-710
S2211124712002884-1110
S2211124712002884-682
S2213671113001306-1398
S2213671113001306-907
S2213671113001306-908
