In [51]:
from __future__ import unicode_literals, print_function
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json
import spacy
import plac
# Ask spaCy to run on the GPU when one is available.
spacy.prefer_gpu()
# Shared spaCy pipeline; the exerpt class uses it to parse every excerpt's text.
nlp = spacy.load("en_core_web_sm")

# Root directories of the two data splits read by the loading loops below.
TRIALPATH = "data/trial"
TRAINPATH = "data/train"

class exerpt:
    """One text excerpt, its annotation data and its spaCy parse.

    Attributes:
        name: identifier of the excerpt (the loaders pass the file stem).
        txt: raw text of the excerpt as a single string.
        ann: content of the matching annotation file, or a placeholder.
        tsv: pandas DataFrame of annotations; ``getContext`` reads its
            ``startOffset`` and ``endOffset`` columns.
        grobid: parsed grobid JSON, or a placeholder.
        context: True once ``getContext`` has added its columns to ``tsv``.
        posTag: True once ``getPosTag`` has added its columns to ``tsv``.
        doc: spaCy document produced by the module-level ``nlp`` pipeline.
    """

    def __init__(self, name, txt, ann, tsv, grobid):
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        # Guards so that each set of columns is inserted into `tsv` only once.
        self.context = False
        self.posTag = False
        self.doc = nlp(self.txt)

    def getContext(self, count = 3):
        """Add ``beforeContext`` / ``afterContext`` columns to ``self.tsv``.

        For every annotation row, the text immediately before ``startOffset``
        and immediately after ``endOffset`` is sliced out of ``self.txt``.
        The slice boundaries come from ``getIndexSeperated``, which is defined
        outside this class (presumably it walks ``count`` separators away from
        the given index -- TODO confirm against its definition).

        Calling the method again after it succeeded is a no-op.

        Args:
            count: forwarded to ``getIndexSeperated`` as ``count``.
        """
        if self.context:
            return

        beforeContext = []
        afterContext = []
        offsets = zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values)
        for start, end in offsets:
            # Work on this instance's full text. The previous code read
            # `exerpt.txt[0]`, which is neither an attribute of the class nor
            # (for a string) more than the first character.
            a = getIndexSeperated(self.txt, end, count = count)
            b = getIndexSeperated(self.txt, start, count = count, forward = False)
            beforeContext.append(self.txt[b:start])
            afterContext.append(self.txt[end:a])

        self.tsv.insert(3, "beforeContext", beforeContext)
        self.tsv.insert(3, "afterContext", afterContext)
        # Only mark as done once the columns exist, so a failed run can be retried.
        self.context = True

    def getPosTag(self):
        """Add ``beforeTag`` / ``afterTag`` columns with NLTK POS tags.

        Each context string is tokenized with ``word_tokenize`` and tagged
        with ``nltk.pos_tag``. The method does nothing unless ``getContext``
        has already run, and it does nothing on repeated calls.
        """
        if not self.context or self.posTag:
            return

        bTag = []
        aTag = []
        contexts = zip(self.tsv["beforeContext"].values, self.tsv["afterContext"].values)
        for before, after in contexts:
            bTag.append(nltk.pos_tag(word_tokenize(before)))
            aTag.append(nltk.pos_tag(word_tokenize(after)))

        self.tsv.insert(3, "beforeTag", bTag)
        self.tsv.insert(3, "afterTag", aTag)
        self.posTag = True
            


def readTXTByLine(filepath):
    """Return the entire content of a UTF-8 text file as one string.

    Despite the name, the file is not split into lines; the text is returned
    exactly as read.

    Args:
        filepath: path of the file to read.

    Returns:
        The decoded file content.
    """
    # The context manager closes the handle even if reading or decoding fails.
    with open(filepath, "r", encoding = "utf-8") as ftemp:
        return ftemp.read()

# Maps excerpt name (file stem) -> exerpt instance, for both data splits.
data = {}

# Trial split: each text file is paired with the .ann, .tsv and .grobid files
# that share its stem (assumes all three exist for every text -- a missing
# file raises here).
for fn in os.listdir(os.path.join(TRIALPATH, "txt")):
    if not fn.endswith(".txt"):
        continue
    stem = fn[:-4]
    # Previously the grobid path was built from an undefined name `l`;
    # it has to follow the current file's stem like the other inputs.
    with open(os.path.join(TRIALPATH, "grobid", stem + ".grobid"), encoding = "utf-8") as grobid_file:
        grobid = json.load(grobid_file)
    data[stem] = exerpt(
        stem,
        readTXTByLine(os.path.join(TRIALPATH, "txt", stem + ".txt")),
        readTXTByLine(os.path.join(TRIALPATH, "ann", stem + ".ann")),
        # `sep` is passed by keyword: newer pandas no longer accepts it positionally.
        pd.read_csv(os.path.join(TRIALPATH, "tsv", stem + ".tsv"), sep = "\t", header = 0),
        grobid,
    )

# Train split: only texts that have a matching .tsv are loaded; there is no
# .ann or .grobid input, so the placeholder string "none" is stored instead.
train_tsv_names = set(os.listdir(os.path.join(TRAINPATH, "tsv")))  # listed once, not per file
for fn in os.listdir(os.path.join(TRAINPATH, "text")):
    if not fn.endswith(".txt"):
        continue
    stem = fn[:-4]
    if stem + ".tsv" not in train_tsv_names:
        continue
    data[stem] = exerpt(
        stem,
        readTXTByLine(os.path.join(TRAINPATH, "text", stem + ".txt")),
        "none",
        pd.read_csv(os.path.join(TRAINPATH, "tsv", stem + ".tsv"), sep = "\t", header = 0),
        "none",
    )

In [54]:
# Number of excerpts loaded into `data` (trial and train splits combined).
len(data)

278

In [28]:
from spacy import displacy
import os
from pathlib import Path
# displaCy rendering options for the dependency visualisations.
options = {
    "fine_grained": True,
    "collapse_punct": True,
    "collapse_phrases": False,
    "compact": True,
    "word_spacing": 25,
    "offset_x": 50,
    "arrow_stroke": 1,
    "arrow_spacing": 6,
    "distance": 90,
    "arrow_width": 4.5,
    "font": "Times",
}

# The following code generates dependency parses and saves them to a
# dependency parses folder. It is also possible to do this and have spaCy
# output it to a server port.
dep_dir = Path("data/trial/dep")
# Create the output folder up front so the first write cannot fail on a fresh checkout.
dep_dir.mkdir(parents = True, exist_ok = True)

# The loop variable must not be called `exerpt`: that would rebind the class
# name and break any later construction of new excerpts.
for parsed_excerpt in data.values():
    svg = displacy.render(parsed_excerpt.doc, jupyter = False, style = "dep", minify = True, options = options)
    output_path = dep_dir / (parsed_excerpt.name + ".svg")
    # write_text opens, writes and closes the file in one call.
    output_path.write_text(svg, encoding = "utf-8")

In [33]:

"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
numeric entities (labelled CARDINAL, MONEY, ORDINAL, PERCENT, DATE, TIME or
QUANTITY) and then check the dependency tree to find the phrase they are
referring to – for example:
$9.4 million --> Net income.

Compatible with: spaCy v2.0.0+
Last tested with: v2.2.1
"""




# Sample sentences that main() runs through the pipeline.
TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]


def main(model="en_core_web_sm"):
    """Run the relation extraction over the sample sentences in TEXTS.

    Loads the named spaCy model, parses every sentence and prints one
    tab-separated row per extracted relation: the related token's text, the
    entity label and the entity text.

    Args:
        model: name of the spaCy model to load.
    """
    pipeline = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    row_template = "{:<10}\t{}\t{}"
    for sentence in TEXTS:
        parsed = pipeline(sentence)
        for related, entity in extract_currency_relations(parsed):
            print(row_template.format(related.text, entity.ent_type_, entity.text))


def filter_spans(spans):
    """Return a non-overlapping subset of ``spans``, ordered by start.

    Longer spans win over shorter ones; between spans of equal length the one
    that starts first wins. A span is dropped only when it overlaps a span
    that was actually kept.

    For spaCy 2.1.4+ this function is available as spacy.util.filter_spans().

    Args:
        spans: iterable of objects with integer ``start`` and ``end``
            attributes, where ``end`` is exclusive.

    Returns:
        List of the kept spans sorted by ``start``.
    """
    def sort_key(span):
        # (length, -start): with reverse=True this yields longest first,
        # and the earliest start first among equal lengths.
        return (span.end - span.start, -span.start)

    result = []
    seen_tokens = set()
    for span in sorted(spans, key=sort_key, reverse=True):
        # `end` is exclusive, so the last token covered is end - 1. Because
        # candidates arrive longest-first, checking both ends is enough to
        # detect an overlap with any kept span.
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Only kept spans claim their tokens; a rejected span must not
            # block later spans that do not overlap anything in the result.
            seen_tokens.update(range(span.start, span.end))
    result.sort(key=lambda span: span.start)
    return result

# Module-level tallies of entities that matched none of the extraction rules.
# They accumulate across calls to extract_currency_relations.
skips = {}      # dependency label -> number of skipped entities
skipKind = {}   # dependency label -> list of the skipped tokens
def extract_currency_relations(doc):
    """Pair numeric entities in ``doc`` with the token they relate to.

    Entities and noun chunks are first merged into single tokens (this
    modifies ``doc`` in place). Every token whose entity label is one of
    CARDINAL, MONEY, ORDINAL, PERCENT, DATE, TIME or QUANTITY is then handled
    by its dependency label:

    * ``attr`` / ``dobj``: paired with the first ``nsubj`` to the left of its
      head, if there is one;
    * ``pobj`` whose head is a ``prep``: paired with the head of that
      preposition;
    * anything else: printed as skipped and recorded in the module-level
      ``skips`` / ``skipKind`` dictionaries.

    Args:
        doc: a parsed spaCy document.

    Returns:
        List of ``(related_token, entity_token)`` tuples.
    """
    # Merge entities and noun chunks into one token
    spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    tps = ("CARDINAL", "MONEY", "ORDINAL", "PERCENT", "DATE", "TIME", "QUANTITY")
    relations = []
    for identifiedEnt in doc:
        if identifiedEnt.ent_type_ not in tps:
            continue
        dep = identifiedEnt.dep_
        if dep in ("attr", "dobj"):
            subject = [w for w in identifiedEnt.head.lefts if w.dep_ == "nsubj"]
            if subject:
                relations.append((subject[0], identifiedEnt))
        elif dep == "pobj" and identifiedEnt.head.dep_ == "prep":
            relations.append((identifiedEnt.head.head, identifiedEnt))
        else:
            print("skipped: ", dep, identifiedEnt.pos_, identifiedEnt)
            # Update both tallies independently. The earlier try/except
            # incremented the counter and then reset it to 1 whenever the
            # label was missing from skipKind only.
            skips[dep] = skips.get(dep, 0) + 1
            skipKind.setdefault(dep, []).append(identifiedEnt)
    return relations

# NOTE(review): this loads "en_core_web_sm" again and rebinds the global `nlp`
# that the first cell already created; excerpts built before this point were
# presumably parsed with the earlier pipeline object -- confirm the reload is needed.
nlp = spacy.load("en_core_web_sm")

In [34]:
# Reset the module-level tallies first so that re-running this cell reports
# counts for one pass over the data instead of accumulating across runs.
skips.clear()
skipKind.clear()

# Print the relations found in every excerpt, one block per excerpt.
for parsed_excerpt in data.values():
    print(parsed_excerpt.name)
    for r1, r2 in extract_currency_relations(parsed_excerpt.doc):
        print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
    print()

S0012821X12004384-1302
skipped:  dep NUM 6
skipped:  conj NUM 8)
skipped:  nummod X 4
skipped:  conj NUM 2617.35
skipped:  conj NUM 2617.44
skipped:  conj NUM 2614.73
skipped:  nsubj NOUN 2614.71 m
skipped:  appos X 5
skipped:  appos NUM 2619.60
skipped:  conj NUM 2614.73
skipped:  nsubj NUM 8)
skipped:  nummod NUM 1
skipped:  nsubj NUM 8)
skipped:  nummod NUM 1
skipped:  appos NUM 2619.60
skipped:  conj NUM 2614.71
skipped:  nummod NUM 1
skipped:  dep NUM 2005
skipped:  ROOT NUM Figs. 3, 6–9
from      	CARDINAL	2619.60
Some      	CARDINAL	3
we        	CARDINAL	5a

S0012821X12004384-1405
skipped:  dep NUM 1992
skipped:  dep NUM 2005
skipped:  dep NUM 7
skipped:  nummod NUM 10
overview  	DATE	2005
ranging   	PERCENT	0% to 80%

S0012821X12004384-1415
skipped:  nummod NUM 1
skipped:  nsubj NOUN 7p
skipped:  npadvmod NUM 1994
skipped:  npadvmod NUM 2005
(DA1      	DATE	2632 to 2618 m, Fig

S0012821X12004384-1594
skipped:  nummod NUM 9
skipped:  npadvmod X 2006b
skipped:  dep NUM 2001
skipp

In [35]:
# Tokens skipped by extract_currency_relations, grouped by dependency label.
skipKind

{'dep': [6,
  2005,
  1992,
  2005,
  7,
  2001,
  2008,
  2001,
  2011,
  2010,
  2010,
  2011,
  2007,
  2013,
  1997,
  2012,
  2011c,
  6,
  2,
  2007,
  2011a,
  2011c,
  2004,
  2012,
  1993,
  2000,
  2004,
  2006,
  2000,
  2000,
  2006,
  2004,
  2006,
  2011,
  2009,
  2011,
  2006,
  2011,
  2011,
  2001,
  2008,
  1,
  2010,
  2012,
  2008,
  2008,
  2010,
  2012,
  10,
  1983,
  2,
  2007,
  2,
  2001,
  2007,
  2010,
  2010,
  2005,
  2011,
  1983,
  2009,
  2009,
  2009,
  2014,
  2],
 'conj': [8),
  2617.35,
  2617.44,
  2614.73,
  2614.73,
  2614.71,
  up to 1 week,
  385,
  5,
  9.10,
  6 kg and,
  13 kg,
  several tens,
  0.66 s,
  86 s,
  H2O+,
  H3O+,
  2008,
  3MgF2→2Mg3NF3(2)Mg3N2+MgF2→2Mg2NF],
 'nummod': [4,
  1,
  1,
  1,
  10,
  1,
  9,
  1,
  5,
  between 2 and 20,
  86,
  2,
  2,
  29,
  29,
  13 kg,
  4.5,
  6,
  14,
  27,
  16,
  2008,
  78,
  3,
  300–800,
  0.3,
  0.5,
  2,
  1.0–1.7,
  0.2–0.4,
  10,
  1,
  1536,
  1260,
  1],
 'nsubj': [2614.71 m,
  8)

In [26]:
# Number of entities skipped by extract_currency_relations per dependency label.
skips

{'dep': 65,
 'conj': 18,
 'nummod': 35,
 'nsubj': 29,
 'appos': 69,
 'ROOT': 23,
 'npadvmod': 30,
 'prep': 7,
 'meta': 5,
 'amod': 1,
 'advmod': 3,
 'pobj': 2,
 'punct': 2,
 'advcl': 1,
 'pcomp': 1,
 'compound': 1,
 'nsubjpass': 1,
 'oprd': 1}

In [6]:
# Entities spaCy recognised in one excerpt.
data["S0019103512003533-3348"].doc.ents

(Müller-Wodarg et al., 2006, 10°, 5 s)

In [8]:
# Noun chunks of the same excerpt; extract_currency_relations merges these together with the entities.
list(data["S0019103512003533-3348"].doc.noun_chunks)

[The STIM GCM,
 the transport,
 winds,
 molecular and turbulent diffusion,
 key neutral species,
 H,
 H2,
 He,
 CH4,
 H2O,
 the procedures,
 Müller-Wodarg et al.,
 The global spherical grid,
 flexible resolution,
 simulations,
 this study,
 we,
 latitude,
 longitude,
 2°,
 10°,
 a vertical resolution,
 0.4 scale heights,
 Our time integration step,
 5 s,
 we,
 the code,
 500 Saturn rotations,
 steady state]

In [12]:
# Tokens of this excerpt that carry an entity label.
[w for w in data["S0019103512003533-3348"].doc if w.ent_type_]

[Müller-Wodarg et al., 2006, 10°, 5 s]

In [15]:
# Print the entity label of every token; tokens without a label print an empty line.
for token in data["S0019103512003533-3348"].doc:
    print(token.ent_type_)


























ORG

DATE




















PRODUCT










CARDINAL










