This notebook contains code for diplaying spacy dependency format int a way that we can Identify measured entities, etc...

In [3]:
#imports
from __future__ import unicode_literals
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json

TRIALPATH = "data/trial"
TRAINPATH = "data/train"
#setup SpaCy
import spacy
import en_core_web_lg
spacy.prefer_gpu()
nlp = en_core_web_lg.load()

Rule-based Pipeline component for retrieving Quantities. 

Rules: 
    1. any DT followed by a unit that has a nouns POS will be called a unit



In [4]:
#add pipeline component 
from spacy.matcher import Matcher
from spacy.tokens import Doc
Doc.set_extension("unit", default = "def", force = True)

def customMatcher(nlp):
    matchList = open("measure_units.lst").read().split("\n")
    matcher = Matcher(nlp.vocab)
    pattern = []
    for word in matchList: 
        pattern.append([{"TAG": "DT"},{"LOWER": word.lower(),"TAG": {"REGEX": "^[NN|NNP|NNPS|NNS]"}}])
        
    matcher.add("Unit", None, *pattern)
        
    return matcher
    

def gazetteer(doc):
    matcher = customMatcher(nlp)
    matches = matcher(doc)
    doc._.unit = []
    for match_id, start, end in matches:
        tempSpan = doc[start:end]
        doc._.unit.append({'start': tempSpan.start_char, 'end': tempSpan.end_char, 'label': 'UNIT', 'text' : doc[start:end]})
    return doc
        
        
        
nlp.add_pipe(gazetteer, last=True)
    

In [24]:
class exerpt:
    """
    Class exerpt
    Description: a simple class to contain data for the measeval competition

    self.name : the measeval given name associated to the document
    self.txt : the raw text of the document
    self.ann : the brat annotations of the document(deprecated)
    self.tsv : a pandas dataframe containing all the tab seperated value data
    self.grobid : grobid quantities json output for quantity detection
    self.doc : The spacy doc generated from processing on this particular document
    """
    
    def __init__(self, name, txt, ann, tsv, grobid):
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        self.doc = nlp(self.txt)
        
    def getContext(self, count = 3):
        if self.context == False: 
            beforeContext = []
            afterContext = []
            self.context = True
            for start,end in zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values ):#self.tsv.iterrows():
                a = getIndexSeperated(exerpt.txt[0],end,count = count)
                b = getIndexSeperated(exerpt.txt[0],start,count = count,forward = False)
                beforeContext.append(self.txt[0][b:start])
                afterContext.append(self.txt[0][end:a])
            self.tsv.insert(3,"beforeContext", beforeContext)
            self.tsv.insert(3,"afterContext", afterContext)
            
    def getPosTag(self):
        if self.context and self.posTag == False: 
            bTag = []
            aTag = []
            self.posTag = True
            for before, after in zip(self.tsv["beforeContext"].values,self.tsv["afterContext"].values):
                bTag.append(nltk.pos_tag(word_tokenize(before)))
                aTag.append(nltk.pos_tag(word_tokenize(after)))
            self.tsv.insert(3,"beforeTag", bTag)
            self.tsv.insert(3,"afterTag", aTag)
            

In [25]:
def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}

#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid")))
            )

#load all train data
for fn in [x for x in os.listdir(os.path.join(TRAINPATH,"text")) if x[:-4]+".tsv" in os.listdir(os.path.join(TRAINPATH,"tsv"))]:
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRAINPATH, "text", fn[:-4] + ".txt")),
                "none",
                pd.read_csv(os.path.join(TRAINPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRAINPATH, "grobid", fn[:-4] + ".grobid"))))

In [26]:
data.keys()

dict_keys(['S0012821X12004384-1302', 'S0012821X12004384-1405', 'S0012821X12004384-1415', 'S0012821X12004384-1594', 'S0012821X12004384-1599', 'S0012821X13002185-1061', 'S0012821X13002185-1200', 'S0012821X13002185-1217', 'S0012821X13002185-1231', 'S0012821X13002185-835', 'S0012821X13007309-1482', 'S0012821X13007309-1509', 'S0012821X13007309-1605', 'S0012821X13007309-1989', 'S0016236113008041-3031', 'S0016236113008041-3112', 'S0016236113008041-3153', 'S0016236113008041-3171', 'S0016236113008041-3186', 'S0016236113008041-3207', 'S0016236113008041-3269', 'S0016236113008041-3290', 'S0016236113008041-890', 'S0016236113008041-913', 'S0016236113008041-967', 'S0019103511004994-1382', 'S0019103511004994-1511', 'S0019103511004994-1565', 'S0019103512002801-1342', 'S0019103512002801-1496', 'S0019103512002801-1608', 'S0019103512002801-1824', 'S0019103512002801-1849', 'S0019103512002801-1927', 'S0019103512003533-3299', 'S0019103512003533-3348', 'S0019103512003533-4685', 'S0019103512003533-4971', 'S001

In [27]:
from spacy.lang.en.tag_map import TAG_MAP


from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON



tempDoc = data['S2213671113001306-907']

for index, row in tempDoc.tsv.iterrows():
    for sent in tempDoc.doc.sents:
        for token in sent:  
            print(type(token.pos_))
            if(token.idx >= row["startOffset"] and token.idx < row["endOffset"]):
                TAG_MAP[str(token.pos_)+"-"+row["annotType"]] = {POS: PUNCT}
                token.pos_ = str(token.pos_)+"-"+row["annotType"]
                print(token.pos_)
            elif(token.idx > row["endOffset"]):
                break
        if(sent.start_char > row["endOffset"]):
            break

ImportError: attempted relative import with no known parent package

In [12]:
for sent in tempDoc.doc.sents:
    for token in sent:
        print(token,token.idx)

( 0
A 1
) 2
Fetal 4
hRPE 10
stained 15
for 23
pan 27
- 30
cytokeratin 31
( 43
scale 44
bar 50
, 53
50 55
μm 58
) 60
; 61
inset 63
shows 69
section 75
overview 83
stained 92
with 100
hematoxylin 105
/ 116
eosin 117
( 123
scale 124
bar 130
, 133
200 135
μm 139
) 141
. 142


The following for loop converts the current spaCy and human annotation format into a gate readable one using text/x-json-twitter format

In [None]:
import math
for doc in data.values():
    for index, row in doc.tsv.iterrows():
        tempAnnot = {}
        tempAnnot["indices"] = [row["startOffset"],row["endOffset"]] 
        
        
        
        
        
        
        if(type(row["other"]) == str):
            tempAnnot["other"] = row["other"]
        else:
            tempAnnot["other"] = "nothing"
            
        try:
            twitjson["entities"]["MEval-"+row["annotType"]].append(tempAnnot)
        except KeyError:
            twitjson["entities"]["MEval-"+row["annotType"]] = [tempAnnot] 

    

In [29]:
#format [(docID, tokenNum)
POSITIONAL_ii = {"apple": [(1,200),(2,150),(330,77),(330,203),(500,5)], "orange" : [(5,44)]}
#disctinct terms: 2
#nonpositional postings: 5
#positional postings: 6
NONPOSITIONAL_ii = {"apple": [1,2,330,500], "orange" : [5]}

In [30]:
ii["apple"]

[1, 2, 330, 500]

In [None]:
"I love New-York" (1,3)