This notebook loads all competition texts and annotations and converts them to a GATE-readable format.

In [1]:
#imports
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json

# Root folders of the trial and training data. The loading cells read the
# sub-folders txt/ann/tsv/grobid below TRIALPATH and text/tsv/grobid below TRAINPATH.
TRIALPATH = "data/trial"
TRAINPATH = "data/train"
# Annotation table of the document currently being processed. It is set by
# exerpt.__init__ right before the pipeline runs and read by annotationCreation.
LATEST_tsv = None

#setup SpaCy
import spacy
import en_core_web_lg
# ask spaCy to use the GPU when one is available
spacy.prefer_gpu()
# NOTE(review): the recorded output of this cell is a FileNotFoundError for the
# model's vocab/lexemes.bin, so the en_core_web_lg install looks incomplete - verify.
nlp = en_core_web_lg.load()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\software_install\\lib\\site-packages\\en_core_web_lg\\en_core_web_lg-2.3.1\\vocab\\lexemes.bin'

In [None]:
#add pipeline component
from spacy.tokens import Doc
import math
# Register the custom attribute doc._.meAnnots. "def" is only a placeholder default;
# annotationCreation replaces it with a dict for every processed document.
# force = True overwrites an existing registration, so the cell can be re-run.
Doc.set_extension("meAnnots", default = "def", force = True)
# Earlier per-type extensions, currently disabled:
# Doc.set_extension("me-measuredEntity", default = "def", force = True)
# Doc.set_extension("me-quantityM", default = "def", force = True)
# Doc.set_extension("me-quantityC", default = "def", force = True)
# Doc.set_extension("me-measuredProperty", default = "def", force = True)
# Doc.set_extension("me-qualifier", default = "def", force = True)

def findOffset(offset, text):
    '''
    Corrects annotations whose offsets are not correct.

    Looks at the characters around ``offset`` and nudges the offset so that it
    points at the first character of a word:

    * character at ``offset`` is a space                         -> ``offset + 1``
    * ``offset`` is on a one-character word (space or start of
      text before it, space after it)                            -> ``offset``
    * ``offset`` is on the last character of a longer word
      (space after it)                                           -> ``offset + 2``
    * anything else (character and its successor are not spaces) -> ``offset``

    offset : character index into ``text``
    text   : the raw document text
    returns the (possibly corrected) character index; offsets that fall
    outside the text are returned unchanged
    '''
    try:
        if text[offset] == " ":
            #sitting on the gap before a word, step onto the word
            return offset + 1
        if text[offset + 1] != " ":
            #inside a word, no change
            return offset
        #from here on: current char is not a space, the next one is
        if offset == 0 or text[offset - 1] == " ":
            #case where the word is one char; offset 0 is checked explicitly
            #because text[-1] would wrap around to the end of the text
            return offset
        #last char of a longer word, skip ahead 2 to the next word
        return offset + 2
    except IndexError:
        #offset (or the character after it) lies outside the text
        return offset

def getSentences(omin,omax,doc):
    '''
    Returns the sentences of ``doc`` that overlap the character range
    ``omin``..``omax``.

    A sentence is kept when it ends after ``omin`` and starts before ``omax``;
    the sentences are returned in document order.
    '''
    return [
        sentence
        for sentence in doc.sents
        if sentence.end_char > omin and sentence.start_char < omax
    ]
            

def annotationCreation(doc):
    '''
    Pipeline component that copies the rows of LATEST_tsv onto the doc.

    The rows are grouped: every row whose annotType is "Quantity" opens a new
    group "Annotation<n>" (n counts from 1), all following rows belong to that
    group. Rows that come before the first Quantity row end up in "Annotation0".
    For every group doc._.meAnnots["Annotation<n>"] holds

    * one entry per annotType, containing the token slice of the doc between
      the row's (corrected) start and end offset, or None when an offset does
      not coincide with the start of a token. A later row with the same
      annotType inside a group overwrites the earlier one.
    * "sentences": the sentences overlapping the smallest/largest offset seen
      in the group (see getSentences).

    When LATEST_tsv is None or has no rows, doc._.meAnnots is left as an empty
    dict.

    returns the same doc
    '''
    doc._.meAnnots = {}
    #nothing to attach: no annotation table loaded, or it has no rows
    if LATEST_tsv is None or LATEST_tsv.empty:
        return doc

    count = 0
    #character offset of a token start -> token index
    lookup = {tok.idx : tok.i for tok in doc}
    annotminmax = {}

    def setSentences(n):
        #set sentence values for annotation group n
        doc._.meAnnots[f"Annotation{n}"]["sentences"] = getSentences(
            annotminmax[f"offset{n}min"], annotminmax[f"offset{n}max"], doc)

    for index, row in LATEST_tsv.iterrows():
        if row["annotType"] == "Quantity":
            count += 1
            #the previous group is complete now; the membership test also covers
            #rows that preceded the first Quantity ("Annotation0")
            if f"Annotation{count-1}" in doc._.meAnnots:
                setSentences(count - 1)

        #get min and max offsets of the current group
        maxKey = f"offset{count}max"
        minKey = f"offset{count}min"
        annotminmax[maxKey] = max(annotminmax.get(maxKey, row["endOffset"]), row["endOffset"])
        annotminmax[minKey] = min(annotminmax.get(minKey, row["startOffset"]), row["startOffset"])

        #fetch the dict of the current group, creating it on first use
        annotation = doc._.meAnnots.setdefault(f"Annotation{count}", {})

        tempSpan = None
        try:
            tempSpan = doc[lookup[findOffset(row["startOffset"],doc.text)]:lookup[findOffset(row["endOffset"],doc.text)]]
        except KeyError:
            #the corrected offset is not the first character of any token
            print("FindOffset method has created a key error")

        annotation[row["annotType"]] = tempSpan

    #the last group is never followed by another Quantity row, close it here
    setSentences(count)

    return doc
                  
                  
                  
                  
                  
# Append annotationCreation as the last step of the pipeline, so it runs on every nlp(text) call.
# NOTE(review): re-running this cell on the same nlp object presumably fails because the
# component name is already taken - confirm, or restart the kernel before re-running.
nlp.add_pipe(annotationCreation, last=True)

Rule-based Pipeline component for retrieving Quantities. 

Rules: 
    1. any token tagged CD (cardinal number) followed by a unit that has a noun POS
    2. any cardinal, money, ordinal, percent, date, time or quantity entity followed by a unit that has a noun POS
    3. any token that is LIKE_NUM followed by a unit that has a noun POS



In [None]:
#add pipeline component 
from spacy.matcher import Matcher
from spacy.tokens import Doc
# Register the custom attribute doc._.unit. "def" is only a placeholder default;
# the gazetteer component replaces it with a list of match dicts.
Doc.set_extension("unit", default = "def", force = True)

def customMatcher(nlp):
    '''
    Builds the Matcher that finds "number + unit" token pairs.

    The unit words are read from gazetteers/combined_measurements.lst (one
    entry per line, blank lines are ignored). For every unit word three
    two-token patterns are registered under the key "Unit"; in all of them the
    second token is the unit word (compared in lower case) with a noun tag
    (NN, NNS, NNP, NNPS):

    1. first token is tagged CD
    2. first token spec is the entity-type/tag filter below, negated with "!"
    3. first token is LIKE_NUM

    nlp : the loaded spaCy pipeline, only its vocab is used
    returns the configured Matcher
    '''
    #the with-block closes the gazetteer file even when reading fails
    with open("gazetteers/combined_measurements.lst","r",encoding="utf-8") as gazetteerFile:
        matchList = [line.strip() for line in gazetteerFile.read().split("\n")]

    def unitToken(word):
        #"^NN" covers NN, NNS, NNP and NNPS. The former "^[NN|NNP|NNPS|NNS]" was a
        #character class and accepted every tag starting with N, P, S or "|"
        return {"LOWER": word.lower(), "TAG": {"REGEX": "^NN"}}

    matcher = Matcher(nlp.vocab)
    pattern = []
    for word in matchList:
        if not word:
            #blank line in the gazetteer (e.g. the trailing newline)
            continue
        #"^CD" instead of the character class "^[CD]", which also matched
        #determiners (DT) and conjunctions (CC)
        pattern.append([{"TAG": {"REGEX": "^CD"}}, unitToken(word)])
        #NOTE(review): "op": "!" negates this whole token spec and "^[DT]" is a
        #character class (tags starting with D or T) - the intent is not clear
        #from the notebook, so this spec is left exactly as it was; confirm
        pattern.append([{"ENT_TYPE": {"IN": ["CARDINAL", "MONEY", "ORDINAL", "PERCENT", "DATE", "TIME", "QUANTITY"]},
                        "TAG":{"REGEX": "^[DT]"},"op": "!"}, unitToken(word)])
        pattern.append([{"LIKE_NUM": True}, unitToken(word)])

    matcher.add("Unit", None, *pattern)

    return matcher
    

def gazetteer(doc):
    '''
    Pipeline component that stores all "number + unit" matches on the doc.

    doc._.unit is set to a list with one dict per match:
    'start'/'end' (character offsets of the match), 'label' (always 'UNIT')
    and 'text' (the matched span itself, not a string).

    The Matcher is built by customMatcher(nlp) on the first call only and then
    kept on the function object; building it means reading the gazetteer file
    and registering three patterns per entry, which does not change between
    documents.

    returns the same doc
    '''
    matcher = getattr(gazetteer, "_matcher", None)
    if matcher is None:
        matcher = customMatcher(nlp)
        gazetteer._matcher = matcher

    doc._.unit = []
    for match_id, start, end in matcher(doc):
        tempSpan = doc[start:end]
        doc._.unit.append({'start': tempSpan.start_char, 'end': tempSpan.end_char, 'label': 'UNIT', 'text' : tempSpan})
    return doc
        
        
        
# Append the gazetteer component as the last step of the pipeline, so it runs on every nlp(text) call.
nlp.add_pipe(gazetteer, last=True)
    

In [None]:
class exerpt:
    """
    Class exerpt
    Description: a simple class to contain data for the measeval competition

    self.name : the measeval given name associated to the document
    self.txt : the raw text of the document (one string)
    self.ann : the brat annotations of the document (deprecated)
    self.tsv : a pandas dataframe containing all the tab separated value data
    self.grobid : grobid quantities json output for quantity detection
    self.context : True once getContext() has added the context columns to self.tsv
    self.posTag : True once getPosTag() has added the tag columns to self.tsv
    self.doc : the spacy doc generated from processing this particular document
    """

    def __init__(self, name, txt, ann, tsv, grobid):
        """
        Stores the given data and runs the spaCy pipeline on the text.

        Side effect: the module-level LATEST_tsv is pointed at this document's
        annotation table, because the annotationCreation pipeline component
        reads the table from there.
        """
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        #must happen before nlp() is called, see docstring
        global LATEST_tsv
        LATEST_tsv = self.tsv
        self.doc = nlp(self.txt)

    def getContext(self, count = 3):
        """
        Adds the columns "beforeContext" and "afterContext" to self.tsv.

        For every annotation row the text between the index returned by
        getIndexSeperated(..., forward = False) and the start offset, and the
        text between the end offset and the index returned by
        getIndexSeperated(...), is stored. Does nothing when the columns were
        already added.

        count : passed through to getIndexSeperated
        """
        if self.context:
            return

        beforeContext = []
        afterContext = []
        for start,end in zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values):
            #NOTE(review): getIndexSeperated is not defined in the visible part of
            #the notebook; it is assumed to return a character index into the text
            a = getIndexSeperated(self.txt,end,count = count)
            b = getIndexSeperated(self.txt,start,count = count,forward = False)
            #self.txt is the whole document as one string; the former
            #"exerpt.txt[0]" / "self.txt[0]" addressed a non-existing class
            #attribute and the first character of the text respectively
            beforeContext.append(self.txt[b:start])
            afterContext.append(self.txt[end:a])
        self.tsv.insert(3,"beforeContext", beforeContext)
        self.tsv.insert(3,"afterContext", afterContext)
        #only flag success once the columns really exist
        self.context = True

    def getPosTag(self):
        """
        Adds the columns "beforeTag" and "afterTag" to self.tsv, holding the
        nltk POS tags of the tokenized before/after context strings.

        Requires getContext() to have run; does nothing otherwise, or when the
        tag columns were already added.
        """
        if not self.context or self.posTag:
            return

        bTag = []
        aTag = []
        for before, after in zip(self.tsv["beforeContext"].values,self.tsv["afterContext"].values):
            bTag.append(nltk.pos_tag(word_tokenize(before)))
            aTag.append(nltk.pos_tag(word_tokenize(after)))
        self.tsv.insert(3,"beforeTag", bTag)
        self.tsv.insert(3,"afterTag", aTag)
        self.posTag = True
            

In [None]:
#
#TEST ONE
#

def readTXTByLine(filepath):
    '''
    Returns the complete content of the UTF-8 text file at ``filepath`` as one
    string. Despite the name the text is not split into lines.
    '''
    #the with-block closes the file even when reading raises
    with open(filepath, "r", encoding = "utf-8") as ftemp:
        return ftemp.read()

data = {}
#load only the first trial document as a quick test of the pipeline
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
    if not fn.endswith('.txt'):
        #keep looking; stopping here would leave data empty whenever the first
        #directory entry is not a text file
        continue
    stem = fn[:-4]
    #the with-block closes the grobid file again
    with open(os.path.join(TRIALPATH, "grobid", stem + ".grobid"), encoding = "utf-8") as grobidFile:
        grobid = json.load(grobidFile)
    data[stem] = exerpt(
        stem,
        readTXTByLine(os.path.join(TRIALPATH, "txt", stem + ".txt")),
        readTXTByLine(os.path.join(TRIALPATH, "ann", stem + ".ann")),
        #sep is passed by keyword, newer pandas versions reject it as a positional argument
        pd.read_csv(os.path.join(TRIALPATH, "tsv", stem + ".tsv"), sep = "\t", header = 0 ),
        grobid
    )
    break

In [None]:
import time
t1 = time.time()
def readTXTByLine(filepath):
    '''
    Returns the complete content of the UTF-8 text file at ``filepath`` as one
    string. Despite the name the text is not split into lines.
    '''
    #the with-block closes the file even when reading raises
    with open(filepath, "r", encoding = "utf-8") as ftemp:
        return ftemp.read()

data = {}

def _loadExerpt(root, textDir, stem, annDir = None):
    '''
    Builds the exerpt for document ``stem`` from the files below ``root``.

    root    : TRIALPATH or TRAINPATH
    textDir : name of the sub-folder holding the .txt files
    stem    : file name of the document without extension
    annDir  : sub-folder holding the .ann files, or None when the data set has
              none (the string "none" is stored as annotation in that case)
    '''
    if annDir is None:
        ann = "none"
    else:
        ann = readTXTByLine(os.path.join(root, annDir, stem + ".ann"))
    #the with-block closes the grobid file again
    with open(os.path.join(root, "grobid", stem + ".grobid"), encoding = "utf-8") as grobidFile:
        grobid = json.load(grobidFile)
    return exerpt(
        stem,
        readTXTByLine(os.path.join(root, textDir, stem + ".txt")),
        ann,
        #sep is passed by keyword, newer pandas versions reject it as a positional argument
        pd.read_csv(os.path.join(root, "tsv", stem + ".tsv"), sep = "\t", header = 0 ),
        grobid
    )

#load all trial data
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
    if fn.endswith('.txt'):
        data[fn[:-4]] = _loadExerpt(TRIALPATH, "txt", fn[:-4], annDir = "ann")

#load all train data, skipping texts that have no annotation table;
#the tsv folder is listed once instead of once per text file
trainTsvNames = set(os.listdir(os.path.join(TRAINPATH,"tsv")))
for fn in os.listdir(os.path.join(TRAINPATH,"text")):
    if fn.endswith('.txt') and fn[:-4]+".tsv" in trainTsvNames:
        data[fn[:-4]] = _loadExerpt(TRAINPATH, "text", fn[:-4])

t2 = time.time()
print(t2-t1, "Seconds elapsed")
print((t2-t1)/60, "Minutes elapsed")

In [None]:
#total number of unit matches over all loaded documents
count = 0
for doc in data.values():
    count += sum(1 for unit in doc.doc._.unit)
print(count)

In [None]:
import networkx as nx
def _graphInputsUsable(sentences, quantity, measuredProperty):
    '''Returns True when none of the inputs is None and exactly one sentence was passed; prints the reason otherwise.'''
    nonePassed = False

    if sentences is None:
        print("sentences passed to getGraphPaths() as None")
        nonePassed = True

    if quantity is None:
        print("quantity passed to getGraphPaths() as None")
        nonePassed = True

    if measuredProperty is None:
        print("measuredProperty passed to getGraphPaths() as None")
        nonePassed = True

    if nonePassed:
        return False

    if len(sentences) != 1:
        #cant handle more than one sentence so break
        print("more than one sentence")
        return False

    return True


def _dependencyGraph(sentences):
    '''Returns the undirected graph of head/child edges; every node is the tuple (text, token index, dependency label).'''
    edges = [((token.text,token.i,token.dep_),
              (child.text,child.i,child.dep_))
             for sent in sentences
             for token in sent
             for child in token.children]
    return nx.Graph(edges)


def _shortestTokenPath(graph, src, trg):
    '''Returns the shortest path between two tokens as a list of nodes, or None when a token is not in the graph or no path exists.'''
    source = (src.text, src.i, src.dep_)
    target = (trg.text, trg.i, trg.dep_)
    try:
        return nx.shortest_path(graph, source=source, target=target)
    except (nx.NodeNotFound, nx.NetworkXNoPath):
        return None


def getGraphPathsShortest(sentences, quantity, measuredProperty, reverse):
    '''
    Finds one dependency path between the quantity and the measured property.

    The shortest path is computed for every (quantity token, property token)
    pair; of those the shortest one is chosen (the longest one when
    ``reverse`` is True). Paths of equal length replace each other, the last
    one computed wins.

    sentences        : list holding exactly one sentence
    quantity         : tokens of the quantity
    measuredProperty : tokens of the measured property
    reverse          : False -> shortest of the paths, True -> longest

    returns None for unusable input (something is None, or not exactly one
    sentence), the chosen path without its first node otherwise, or an empty
    dict when no pair is connected
    '''
    if not _graphInputsUsable(sentences, quantity, measuredProperty):
        return None

    graph = _dependencyGraph(sentences)
    #path length -> path
    paths = {}
    for src in quantity:
        for trg in measuredProperty:
            shortestPath = _shortestTokenPath(graph, src, trg)
            if shortestPath is not None:
                paths[len(shortestPath)] = shortestPath

    if not paths:
        return paths
    chosen = max(paths) if reverse else min(paths)
    return paths[chosen][1:]


def getGraphPaths(sentences, quantity, measuredProperty, reverse=False, pos=False):
    '''
    Collects the dependency paths between the quantity and the measured
    property, one per distinct path type.

    The shortest path is computed for every (quantity token, property token)
    pair. Paths are keyed by the POS of the quantity token plus the dependency
    label of the second entry of the path; a later path with the same key
    replaces the earlier one. Pairs that are not connected, and paths that
    have no second entry, are skipped.

    sentences        : list holding exactly one sentence
    quantity         : tokens of the quantity
    measuredProperty : tokens of the measured property
    reverse          : not used, kept so existing calls keep working
    pos              : True -> every path is prefixed with the node
                       (0, 0, <POS of the quantity token>)

    returns None for unusable input (something is None, or not exactly one
    sentence), the list of kept paths otherwise
    '''
    if not _graphInputsUsable(sentences, quantity, measuredProperty):
        return None

    graph = _dependencyGraph(sentences)
    unique = {}
    for src in quantity:
        for trg in measuredProperty:
            temp = _shortestTokenPath(graph, src, trg)
            if temp is None:
                continue
            if pos:
                temp.insert(0,(0,0,str(src.pos_)))
            if len(temp) < 2:
                #source and target are the same token, there is no second entry to key on
                continue
            unique[src.pos_+temp[1][2]] = temp

    return list(unique.values())

In [None]:
#collect the dependency paths of every annotation that has sentences, a
#Quantity and a MeasuredProperty - once with and once without the POS prefix
count = 0
accumPOS = []
accumNOPOS = []
prop = "MeasuredProperty"
tp = "Shortest"
#required annotation entry -> message printed when it is missing
requiredEntries = {
    "sentences": "no Sentence found",
    "Quantity": "no Quantity found",
    "MeasuredProperty": "no MeasuredProperty found",
}
for e in data.values():
    doc = e.doc
    for annot in doc._.meAnnots.values():
        missing = [entry for entry in requiredEntries if entry not in annot]
        for entry in missing:
            print(requiredEntries[entry])
        get = not missing

        longest = tp != "Shortest"

        if not get:
            continue

        tempSent = annot["sentences"]
        tempQuantity = annot["Quantity"]
        tempMeasuredProperty = annot["MeasuredProperty"]

        count += 1
        temp = getGraphPaths(tempSent, tempQuantity, tempMeasuredProperty, longest, True)
        temp1 = getGraphPaths(tempSent, tempQuantity, tempMeasuredProperty, longest, False)
        if temp is not None:
            source = (e.name, tempQuantity.text, tempMeasuredProperty.text)
            accumPOS.append((temp, source))
            accumNOPOS.append((temp1, (e.name, tempQuantity.text, tempMeasuredProperty.text)))
            

In [66]:

def getFreq(accum):
    '''
    Counts how often every path pattern occurs.

    accum : list of (paths, source) entries; every path is a sequence of nodes
            and the third element of every node is its label

    The pattern of a path is the comma separated list of its node labels.
    Nodes whose label cannot be joined with a string are printed and left out.

    returns a dict pattern -> [number of occurrences, list of sources],
    ordered from the largest value to the smallest
    '''
    d = {}
    for entry in accum:
        for path in entry[0]:
            pieces = []
            for node in path:
                try:
                    pieces.append(node[2] + ",")
                except TypeError:
                    print(node)
            #drop the comma that trails the last label
            pattern = "".join(pieces)[:-1]
            if pattern in d:
                d[pattern][0] += 1
                d[pattern][1].append(entry[1])
            else:
                d[pattern] = [1, [entry[1]]]
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
                
# pattern frequencies of the paths with the POS prefix ...
freqPos = getFreq(accumPOS)
# ... and of the plain paths
freq = getFreq(accumNOPOS)
        

In [67]:
freqPos #NUM,nummod,pobj,prep,pobj

{'NUM,nummod,pobj,prep,pobj': [27,
  [('S0016236113008041-913', '1000 ppm', 'SO2 concentrations'),
   ('S0019103511004994-1511', '77.45°', 'points at an angle'),
   ('S0019103512003533-3348', '2°', 'latitude'),
   ('S0019103512003533-5072', 'around 1300 m s−1', 'peak velocities'),
   ('S0019103512003995-1807', '300–800 nm.', 'wavelengths'),
   ('S0006322312001096-1177', '73%,', 'response'),
   ('S0006322312001096-1253', '65 years.', 'before the age'),
   ('S0019103511004994-1399', '3.95 Saturn radii RS', 'distance'),
   ('S0032386113009889-2123', 'about 70 MPa,', 'applied uniaxial stress'),
   ('S0032386113009889-2123', '210 MPa', 'applied hydrostatic stress'),
   ('S0921818113002245-1571', '8 m', 'depth'),
   ('S0921818113002245-1571', 'approximately 11.8 m', 'depth'),
   ('S0921818113002245-1571', 'between 11.8 m and 11.5 m', 'depth'),
   ('S0921818113002245-1752', '5.16 m.', 'depth'),
   ('S0925443913003037-654', '2.5 years', 'age'),
   ('S0927024813002420-1032', '80 vol%', 'C60'),


In [68]:
freq

{'pobj,prep,pobj': [38,
  [('S0012821X13002185-1231', '∼–1‰', 'apparent 30ε'),
   ('S0016236113008041-3153', '<2 ppm', 'concentrations'),
   ('S0016236113008041-3269', '0,', 'concentrations'),
   ('S0016236113008041-3290', '0.63', 'concentrations'),
   ('S0016236113008041-913', '1000 ppm', 'SO2 concentrations'),
   ('S0019103511004994-1511', '77.45°', 'points at an angle'),
   ('S0019103512002801-1342', '8.74Rs', 'distance'),
   ('S0019103512003533-3348', '2°', 'latitude'),
   ('S0019103512003533-5072', 'around 1300 m s−1', 'peak velocities'),
   ('S0019103512003995-1807', '300–800 nm.', 'wavelengths'),
   ('S0019103513005058-3094', '∼2 m', 'depth'),
   ('S0019103513005058-4210', '2 m', 'depth'),
   ('S0006322312001096-1177', '73%,', 'response'),
   ('S0006322312001096-1253', '65 years.', 'before the age'),
   ('S0006322312001096-1278', '65,', 'age'),
   ('S0019103511004994-1399', '3.95 Saturn radii RS', 'distance'),
   ('S0019103513005058-4158', '0.51–0.85 m.', '1/e mixing depth'),
  

In [69]:
#write the pattern frequencies (with POS prefix) to disk; the with-block
#closes the file, so the content is completely written when the cell finishes
with open("FrequenciesWithPOS.json","w", encoding = 'utf-8') as outFile:
    json.dump(freqPos, outFile, indent = 3)

In [70]:
#write the pattern frequencies (without POS prefix) to disk; the with-block
#closes the file, so the content is completely written when the cell finishes
with open("FrequenciesWithoutPOS.json","w", encoding = 'utf-8') as outFile:
    json.dump(freq, outFile, indent = 3)

In [98]:
# Writes the frequency dict `d`, ordered from the largest value to the smallest,
# to "<prop>_<tp>_shortest_path.json".
# NOTE(review): no module-level `d` is defined in the visible part of the notebook
# (the only `d` is local to getFreq), so this cell fails on a fresh kernel unless `d`
# comes from a cell that is not shown - confirm which dict was meant.
# NOTE(review): the file object passed to json.dump is never closed explicitly.
json.dump({k: v for k, v in sorted(d.items(), key=lambda item: item[1],reverse=True)},
           open(f"{prop}_{tp}_shortest_path.json","w", encoding = 'utf-8'),
           indent = 1)

In [73]:
import networkx as nx
#collect the head/child edges of the first sentence of one example document;
#every node is the tuple (text, token index, dependency label)
edges = []
for sent in data['S0012821X13007309-1605'].doc.sents:
    edges.extend(
        ((token.text, token.i, token.dep_), (child.text, child.i, child.dep_))
        for token in sent
        for child in token.children
    )
    #only the first sentence is looked at
    break

print(edges)


[(('Basin', 2, 'nsubj'), ('The', 0, 'det')), (('Basin', 2, 'nsubj'), ('Vocontian', 1, 'compound')), (('was', 3, 'ROOT'), ('Basin', 2, 'nsubj')), (('was', 3, 'ROOT'), ('part', 4, 'attr')), (('part', 4, 'attr'), ('of', 5, 'prep')), (('of', 5, 'prep'), ('gulf', 8, 'pobj')), (('gulf', 8, 'pobj'), ('the', 6, 'det')), (('gulf', 8, 'pobj'), ('western', 7, 'amod')), (('gulf', 8, 'pobj'), ('in', 9, 'prep')), (('in', 9, 'prep'), ('region', 13, 'pobj')), (('Alpine', 12, 'compound'), ('European', 11, 'compound')), (('region', 13, 'pobj'), ('the', 10, 'det')), (('region', 13, 'pobj'), ('Alpine', 12, 'compound')), (('region', 13, 'pobj'), ('of', 14, 'prep')), (('of', 14, 'prep'), ('N', 21, 'pobj')), (('Ocean', 18, 'nmod'), ('NW', 16, 'compound')), (('Ocean', 18, 'nmod'), ('Tethys', 17, 'compound')), (('°', 20, 'compound'), ('Ocean', 18, 'nmod')), (('°', 20, 'compound'), ('∼30', 19, 'dep')), (('N', 21, 'pobj'), ('the', 15, 'det')), (('N', 21, 'pobj'), ('°', 20, 'compound'))]


In [76]:
#undirected graph over the collected edges
graph = nx.Graph(edges)
src, trg = ('Basin', 2, 'nsubj'), ('The', 0, 'det')
#first the length of the shortest path, then the path itself
for query in (nx.shortest_path_length, nx.shortest_path):
    print(query(graph, source=src, target=trg))

1
[('Basin', 2, 'nsubj'), ('The', 0, 'det')]


The following for loop converts the current spaCy and human annotation format into a GATE-readable one, using the text/x-json-twitter format.

In [52]:
import math
#the export below writes into this folder, create it when it is missing
os.makedirs("jsondocs", exist_ok=True)

for doc in data.values():
    #doc is an exerpt, doc.doc the spaCy doc that belongs to it
    testjson  = doc.doc.to_json()

    #annotation type -> list of annotations; every annotation carries its
    #character range as "indices"
    entities = {}
    twitjson = {"full_text": testjson["text"],"entities": entities}

    #tokens with tag, dependency label and head
    for tok in testjson["tokens"]:
        entities.setdefault("Token", []).append({
            "indices": [tok["start"],tok["end"]],
            "category": tok["tag"],
            "kind": tok["dep"],
            "id": tok["id"],
            "head": tok["head"],
        })

    #named entities, one annotation type per entity label
    for ent in testjson["ents"]:
        entities.setdefault(ent["label"], []).append({"indices": [ent["start"],ent["end"]]})

    #matches of the gazetteer component
    for unit in doc.doc._.unit:
        entities.setdefault("unit", []).append({
            "indices": [int(unit["start"]),int(unit["end"])],
            "text": unit["text"].text,
        })

    #sentence boundaries
    for sent in testjson["sents"]:
        entities.setdefault("sentence", []).append({"indices": [sent["start"],sent["end"]]})

    #human annotations, one annotation type "MEval-<annotType>" per annotType
    for index, row in doc.tsv.iterrows():
        tempAnnot = {
            "indices": [row["startOffset"],row["endOffset"]],
            "annotSet": row["annotSet"],
            "annotId": row["annotId"],
            "text": row["text"],
            #rows whose "other" cell is not a string get a placeholder
            "other": row["other"] if type(row["other"]) == str else "nothing",
        }
        entities.setdefault("MEval-"+row["annotType"], []).append(tempAnnot)

    #the with-block closes the file, so it is completely written before the next document
    with open(f'jsondocs/{doc.name}.json',"w") as outFile:
        json.dump(twitjson, outFile, indent=3)