In [1]:
from __future__ import unicode_literals, print_function
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json
import spacy
import plac
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

TRIALPATH = "data/trial"
TRAINPATH = "data/train"

class exerpt:
    def __init__(self, name, txt, ann, tsv, grobid):
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        self.doc = nlp(self.txt)
        
    def getContext(self, count = 3):
        if self.context == False: 
            beforeContext = []
            afterContext = []
            self.context = True
            for start,end in zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values ):#self.tsv.iterrows():
                a = getIndexSeperated(exerpt.txt[0],end,count = count)
                b = getIndexSeperated(exerpt.txt[0],start,count = count,forward = False)
                beforeContext.append(self.txt[0][b:start])
                afterContext.append(self.txt[0][end:a])
            self.tsv.insert(3,"beforeContext", beforeContext)
            self.tsv.insert(3,"afterContext", afterContext)
            
    def getPosTag(self):
        if self.context and self.posTag == False: 
            bTag = []
            aTag = []
            self.posTag = True
            for before, after in zip(self.tsv["beforeContext"].values,self.tsv["afterContext"].values):
                bTag.append(nltk.pos_tag(word_tokenize(before)))
                aTag.append(nltk.pos_tag(word_tokenize(after)))
            self.tsv.insert(3,"beforeTag", bTag)
            self.tsv.insert(3,"afterTag", aTag)
            


def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid")))
            )
            
for fn in [x for x in os.listdir(os.path.join(TRAINPATH,"text")) if x[:-4]+".tsv" in os.listdir(os.path.join(TRAINPATH,"tsv"))]:
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRAINPATH, "text", fn[:-4] + ".txt")),
                "none",
                pd.read_csv(os.path.join(TRAINPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRAINPATH, "grobid", fn[:-4] + ".grobid")))
            )

In [13]:
for s in data["S0012821X12004384-1302"].doc.sents:
    for t in s:
        print(t.dep_)

compound
nsubjpass
punct
appos
punct
cc
amod
compound
conj
auxpass
ROOT
prt
prep
det
amod
pobj
punct
amod
appos
prep
pobj
punct
aux
advcl
compound
dobj
punct
appos
punct
nummod
cc
conj
punct
punct
aux
conj
det
dobj
prep
det
pobj
advmod
prep
pobj
punct
cc
aux
conj
dobj
punct
appos
punct
dep
punct
punct
compound
compound
nsubjpass
punct
appos
punct
cc
compound
conj
punct
appos
punct
auxpass
ROOT
agent
advmod
amod
pobj
prep
det
compound
pobj
punct
appos
punct
conj
cc
conj
punct
cc
conj
agent
pobj
punct
appos
punct
nummod
punct
advcl
det
amod
nummod
dobj
punct
acl
det
amod
dobj
prep
pobj
punct
punct
nummod
nsubj
prep
prep
det
pobj
prep
pobj
punct
conj
punct
conj
punct
conj
punct
cc
compound
nsubj
punct
acl
prep
pobj
punct
appos
punct
ROOT
dobj
punct
prep
pobj
prep
det
amod
pobj
prep
det
pobj
punct
appos
punct
pobj
cc
conj
punct
punct
nsubj
prep
det
pobj
punct
appos
punct
conj
punct
cc
compound
conj
punct
advmod
ROOT
amod
dobj
punct
amod
prep
det
pobj
punct
appos
punct
dobj
punct
punct
aux


In [16]:
data["S0012821X12004384-1302"].doc.to_json()["tokens"]

[{'id': 0,
  'start': 0,
  'end': 14,
  'pos': 'NOUN',
  'tag': 'NN',
  'dep': 'compound',
  'head': 1},
 {'id': 1,
  'start': 15,
  'end': 23,
  'pos': 'NOUN',
  'tag': 'NN',
  'dep': 'nsubjpass',
  'head': 10},
 {'id': 2,
  'start': 24,
  'end': 25,
  'pos': 'PUNCT',
  'tag': '-LRB-',
  'dep': 'punct',
  'head': 1},
 {'id': 3,
  'start': 25,
  'end': 27,
  'pos': 'PROPN',
  'tag': 'NNP',
  'dep': 'appos',
  'head': 1},
 {'id': 4,
  'start': 27,
  'end': 28,
  'pos': 'PUNCT',
  'tag': '-RRB-',
  'dep': 'punct',
  'head': 1},
 {'id': 5,
  'start': 29,
  'end': 32,
  'pos': 'CCONJ',
  'tag': 'CC',
  'dep': 'cc',
  'head': 1},
 {'id': 6,
  'start': 33,
  'end': 44,
  'pos': 'ADJ',
  'tag': 'JJ',
  'dep': 'amod',
  'head': 7},
 {'id': 7,
  'start': 45,
  'end': 54,
  'pos': 'NOUN',
  'tag': 'NN',
  'dep': 'compound',
  'head': 8},
 {'id': 8,
  'start': 55,
  'end': 63,
  'pos': 'NOUN',
  'tag': 'NN',
  'dep': 'conj',
  'head': 1},
 {'id': 9,
  'start': 64,
  'end': 68,
  'pos': 'AUX',
  '

In [57]:
for exerpt in list(data.values()):
    print("Name:", exerpt.name)
    print("grobid")
    for x in exerpt.grobid[1]["measurements"]:
        if x["type"] == "value":
            print(x["quantity"]["offsetStart"],"-",x["quantity"]["offsetEnd"],":",x["quantity"]["rawValue"])
        elif x["type"] == "listc":
            for q in x["quantities"]:
                print(q["offsetStart"],"-",q["offsetEnd"],":",q["rawValue"])
        elif x["type"] == "interval":
            try:
                print(x["quantityLeast"]["offsetStart"],"-",x["quantityLeast"]["offsetEnd"],":",x["quantityLeast"]["rawValue"])
            except KeyError:
                continue
            try:
                print(x["quantityMost"]["offsetStart"],"-",x["quantityMost"]["offsetEnd"],":",x["quantityMost"]["rawValue"])
            except KeyError:
                continue
        else: 
            print("not implemented for type:",x["type"])
    print("gold")            
    for start, end, text, tpe in  zip(exerpt.tsv["startOffset"].values, exerpt.tsv["endOffset"].values,exerpt.tsv["text"].values,exerpt.tsv["annotType"].values):
        if tpe == "Quantity":
            print(start,"-", end, ":", text)
            
            
            
            

Name: S0012821X12004384-1302
grobid
492 - 497 : three
553 - 557 : Five
588 - 595 : 2619.60
597 - 604 : 2617.35
606 - 613 : 2617.44
615 - 622 : 2614.73
628 - 635 : 2614.71
769 - 776 : 2619.60
778 - 785 : 2614.73
791 - 798 : 2614.71
1481 - 1484 : two
1493 - 1500 : 2619.60
1505 - 1512 : 2614.71
1992 - 1995 : two
2115 - 2118 : two
gold
553 - 557 : Five
588 - 637 : 2619.60, 2617.35, 2617.44, 2614.73, and 2614.71 m
769 - 800 : 2619.60, 2614.73, and 2614.71 m
1481 - 1484 : two
1493 - 1512 : 2619.60 and 2614.71
Name: S0012821X12004384-1405
grobid
626 - 628 : 20
775 - 776 : 0
781 - 783 : 80
gold
626 - 628 : 20
775 - 784 : 0% to 80%
Name: S0012821X12004384-1415
grobid
44 - 48 : 2632
52 - 56 : 2618
291 - 292 : 5
gold
39 - 58 : from 2632 to 2618 m
280 - 293 : on average 5%
Name: S0012821X12004384-1594
grobid
162 - 166 : 2618
310 - 312 : 40
gold
156 - 168 : above 2618 m
310 - 314 : 40°N
Name: S0012821X12004384-1599
grobid
81 - 84 : 103
117 - 121 : 2618
gold
76 - 88 : from 103 yrs
111 - 123 : above 

597 - 622 : within a month and a half
649 - 666 : approximately 2 h
Name: S0019103512003995-2760
grobid
324 - 326 : 11
337 - 338 : 5
gold
324 - 333 : 11 km s−1
337 - 340 : 5Rp
Name: S0019103512003995-3548
grobid
235 - 240 : three
651 - 652 : 1
857 - 861 : 8250
1022 - 1026 : 6000
1033 - 1039 : 11,000
gold
651 - 657 : 1 μbar
851 - 863 : about 8250 K
1008 - 1028 : approximately 6000 K
1033 - 1041 : 11,000 K
Name: S0019103512004009-3488
grobid
550 - 554 : 1300
576 - 577 : 1
812 - 813 : 3
gold
181 - 187 : 3 mbar
550 - 566 : 1300 K to 3500 K
576 - 582 : 1 μbar
812 - 818 : 3 mbar
Name: S0019103512004009-3825
grobid
28 - 30 : 50
263 - 269 : 11,500
313 - 316 : 1.5
324 - 327 : 0.3
355 - 358 : 0.1
386 - 389 : 1.4
393 - 396 : 0.5
406 - 409 : 1.9
413 - 416 : 0.1
463 - 469 : 10,000
475 - 481 : 13,200
1000 - 1001 : 1
gold
28 - 31 : 50%
263 - 271 : 11,500 K
308 - 318 : near 1.5Rp
324 - 332 : 0.3 nbar
355 - 363 : 0.1 to 1
386 - 422 : 1.4Rp (0.5 nbar) to 1.9Rp (0.1 nbar)
463 - 483 : 10,000 K to 13,200 K

1298 - 1300 : 60
1357 - 1359 : 63
gold
11 - 13 : 71
512 - 519 : 1/100th
594 - 597 : 5 g
689 - 695 : 0.05 g
1116 - 1123 : >250 μm
1192 - 1196 : >90%
1283 - 1302 : between 20 and 60 g
1357 - 1362 : 63 μm
Name: S0012821X12004384-1284
grobid
146 - 149 : 225
324 - 329 : 1.167
447 - 450 : 289
748 - 750 : 24
1128 - 1129 : 5
1253 - 1256 : 250
1328 - 1330 : 35
1711 - 1714 : 0.4
gold
146 - 149 : 225
324 - 329 : 1.167
397 - 402 : ±<0.1
447 - 450 : 289
748 - 752 : 24 h
811 - 820 : 93:7, v/v
1128 - 1130 : 5%
1252 - 1259 : >250 μm
1328 - 1330 : 35
1711 - 1715 : 0.4‰
Name: S0012821X12004384-1640
grobid
483 - 484 : 4
1226 - 1228 : 54
gold
39 - 41 : 5‰
478 - 486 : from 4 m
1226 - 1231 : 54 oN
Name: S0012821X12004384-952
grobid
381 - 383 : 30
250 - 252 : 30
658 - 664 : 2614.7
669 - 675 : 2619.6
gold
249 - 253 : <30%
380 - 384 : >30%
658 - 677 : 2614.7 and 2619.6 m
Name: S0012821X13002185-994
grobid
374 - 376 : 40
1078 - 1081 : 1.5
gold
81 - 89 : ∼33.7 Ma
368 - 377 : up to 40%
895 - 899 : 0.6‰
1078 - 108

grobid
37 - 41 : five
449 - 453 : 11.4
470 - 474 : five
621 - 623 : 30
gold
37 - 41 : five
438 - 462 : range from 11.4% to 2.4%
470 - 474 : five
605 - 617 : less than 2%
621 - 632 : 30 vertices
Name: S016412121300188X-4617
grobid
227 - 229 : 52
299 - 301 : 11
gold
227 - 230 : 52%
299 - 302 : 11%
Name: S016412121300188X-5038
grobid
224 - 229 : three
gold
224 - 229 : three
Name: S016412121300188X-5066
grobid
466 - 468 : 78
624 - 626 : 30
gold
461 - 476 : from 78% to 95%
624 - 626 : 30
Name: S0165587612003680-1078
grobid
190 - 195 : three
444 - 449 : three
719 - 724 : three
1025 - 1027 : 60
gold
190 - 195 : three
444 - 449 : three
719 - 734 : three occasions
1025 - 1028 : 60%
Name: S0165587612003680-953
grobid
305 - 307 : 90
gold
295 - 308 : more than 90%
Name: S0165587612003680-998
grobid
123 - 126 : two
205 - 208 : one
258 - 261 : one
403 - 407 : 54.3
413 - 417 : 14.3
420 - 424 : 82.9
430 - 434 : 57.1
441 - 445 : 67.9
451 - 452 : 0
567 - 571 : 0.02
gold
123 - 126 : two
196 - 208 : at le

KeyError: 'measurements'

In [58]:
from spacy import displacy
import os
from pathlib import Path
options = {"fine_grained" : True,
            "collapse_punct": True,
          "collapse_phrases": False, 
          "compact" : True,
          "word_spacing" : 25,
          "offset_x" : 50,
          "arrow_stroke" : 1,
          "arrow_spacing" : 6,
          "distance" : 90,
          "arrow_width": 4.5,
          "font" : "Times"}

"""
The following code generates dependency parses and saves them to a dependency parses folder
It is also possible to do this and have spacy output it to a server port... 
"""
for exerpt in data.values(): 
    svg = displacy.render(exerpt.doc, jupyter = False, style = "dep", minify = True, options = options)
    file_name = exerpt.name + ".svg"
    output_path = Path(os.path.join("./","data/trial/dep", file_name))
    output_path.open("w", encoding="utf-8").write(svg)

In [59]:

"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Compatible with: spaCy v2.0.0+
Last tested with: v2.2.1
"""




TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]


def main(model="en_core_web_sm"):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for text in TEXTS:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))


def filter_spans(spans):
    # Filter a sequence of spans so they don't contain overlaps
    # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
        seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result

skips = {}
skipKind = {}
def extract_currency_relations(doc):
    # Merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    relations = []
    tps = ["CARDINAL", "MONEY", "ORDINAL", "PERCENT", "DATE", "TIME", "QUANTITY"]
    for identifiedEnt in filter(lambda w: w.ent_type_ in tps, doc):
        if identifiedEnt.dep_ in ("attr", "dobj"):
            subject = [w for w in identifiedEnt.head.lefts if w.dep_ == "nsubj"]
            if subject:
                subject = subject[0]
                relations.append((subject, identifiedEnt))
        elif identifiedEnt.dep_ == "pobj" and identifiedEnt.head.dep_ == "prep":
            relations.append((identifiedEnt.head.head, identifiedEnt))
        else:
            print("skipped: ", identifiedEnt.dep_,  identifiedEnt.pos_, identifiedEnt)
            try:
                skips[identifiedEnt.dep_] += 1
                skipKind[identifiedEnt.dep_].append(identifiedEnt)
            except KeyError: 
                skips[identifiedEnt.dep_] = 1
                skipKind[identifiedEnt.dep_] = [identifiedEnt]
    return relations

nlp = spacy.load("en_core_web_sm")

In [60]:
for ent in data.values():
    print(ent.name)
    relations = extract_currency_relations(ent.doc)
    for r1, r2 in relations:
        print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
    print()

S0012821X12004384-1302
skipped:  dep NUM 6
skipped:  conj NUM 8)
skipped:  nummod X 4
skipped:  conj NUM 2617.35
skipped:  conj NUM 2617.44
skipped:  conj NUM 2614.73
skipped:  nsubj NOUN 2614.71 m
skipped:  appos X 5
skipped:  appos NUM 2619.60
skipped:  conj NUM 2614.73
skipped:  nsubj NUM 8)
skipped:  nummod NUM 1
skipped:  nsubj NUM 8)
skipped:  nummod NUM 1
skipped:  appos NUM 2619.60
skipped:  conj NUM 2614.71
skipped:  nummod NUM 1
skipped:  dep NUM 2005
skipped:  ROOT NUM 3, 6–9
from      	CARDINAL	2619.60
Some      	CARDINAL	3
we        	CARDINAL	5a
these two depths	DATE	Figs.

S0012821X12004384-1405
skipped:  dep NUM 1992
skipped:  dep NUM 2005
skipped:  dep NUM 7
skipped:  nummod NUM 10
overview  	DATE	2005
ranging   	PERCENT	0% to 80%

S0012821X12004384-1415
skipped:  nummod NUM 1
skipped:  nsubj NOUN 7p
skipped:  npadvmod NUM 1994
skipped:  npadvmod NUM 2005
(DA1      	DATE	2632 to 2618 m, Fig

S0012821X12004384-1594
skipped:  nummod NUM 9
skipped:  npadvmod X 2006b
skippe

skipped:  dep NUM 1983
skipped:  nummod NUM 0.3
skipped:  nummod NUM 0.5
skipped:  advcl NUM 1
A net heating efficiency	PERCENT	50%
located   	QUANTITY	1.5Rp
varies    	CARDINAL	0.1
from      	CARDINAL	1
shifts    	CARDINAL	1.9Rp

S0019103512004009-3962
skipped:  nsubj NUM 2010a
skipped:  pcomp NUM 0.5
skipped:  appos NUM 2008

S0019103512004009-4007
skipped:  dep NUM 2009
skipped:  nsubj NUM 4
skipped:  nummod NUM 2
skipped:  dep NUM 2009
skipped:  appos NUM 2007
a constant photoelectron heating efficiency	PERCENT	93%
an electron mixing ratio	CARDINAL	0.1
based     	CARDINAL	C1

S0019103512004009-4492
skipped:  meta NUM 11
skipped:  compound NOUN 7.2 km

S0019103512004009-5019
skipped:  ROOT NUM 2009
skipped:  ROOT NUM 2009
skipped:  nsubj NUM 10

S0019103512004009-5507
skipped:  nsubj X 2010a
skipped:  npadvmod NUM 2007
skipped:  appos NUM 2008
skipped:  nsubj X 2010a
skipped:  appos NUM 2003
skipped:  npadvmod NUM 2008

S0019103513005058-1737

S0019103513005058-3094
skipped:  npadvm

skipped:  npadvmod NUM 1998
shaded    	CARDINAL	2614.7

S0012821X13002185-994
skipped:  npadvmod NOUN today
skipped:  nummod NUM 1090
skipped:  nummod NUM 0.6‰
skipped:  nummod NUM 689
account   	PERCENT	up to 40%

S0012821X13007309-1691
skipped:  appos NUM 1982
skipped:  npadvmod NUM 2007
skipped:  appos PROPN Table 1e
occurring 	CARDINAL	<0.5 m

S0016236113008041-2924

S0016236113008041-3012
skipped:  npadvmod NUM 7
skipped:  nsubjpass NOUN 82%
skipped:  nsubj NOUN 81%

S0016236113008041-3159
skipped:  npadvmod NOUN 13 kg

S0016236113008041-3161
skipped:  conj NOUN 13 kg
the lowest bed inventory	QUANTITY	4.5 kg
bed inventories	QUANTITY	4.5 kg and

S0016236113008041-872
skipped:  conj NOUN 6 kg and
skipped:  conj NOUN 13 kg
bed inventories	QUANTITY	4.5 kg

S0016236113008041-961

S0019103511004994-1399
skipped:  appos NUM one
skipped:  appos NOUN 60,268 km
skipped:  appos NUM 1980
skipped:  dep NUM 1981
skipped:  appos NUM 1982
skipped:  npadvmod NUM 14
skipped:  prep ADP between 2005 

skipped:  npadvmod NUM 2001
skipped:  appos NOUN 4–8 years
skipped:  npadvmod NUM 2003
skipped:  npadvmod NUM 2003
a 90–95% maximum yield	DATE	the year

S1359645413009816-2243
skipped:  nummod NUM 1.5
a nominal resolution	QUANTITY	20 kV

S1367912013002277-1213
skipped:  nummod NUM 2.5–10
skipped:  conj NOUN less than 2.5 μm
Molar ratios	DATE	100Ti/

S1387700313001822-661
skipped:  nummod NUM 1
skipped:  nummod NUM 260
skipped:  nummod NUM 1
skipped:  appos NUM 5
skipped:  appos NUM 12
skipped:  nummod NUM 297
skipped:  appos NUM 13

S1388248113001951-339
skipped:  appos NUM 12
skipped:  appos NUM 1/20
loading   	CARDINAL	2.4

S1389128612002496-5994
skipped:  nummod NUM 3
more      	PERCENT	the 7.6%

S1389128612002496-6119
skipped:  nsubj NUM 12
flows     	CARDINAL	10
flows     	CARDINAL	ms to 0.1

S1389128612002496-6138
skipped:  nsubj NUM 13

S1550413113004920-1509
diagnosed 	DATE	6 months of age
diagnosed 	DATE	9 months

S175058361300203X-1280
skipped:  nummod NUM 4
skipped:  appos N

In [61]:
skipKind

{'dep': [6,
  2005,
  1992,
  2005,
  7,
  2001,
  2008,
  2001,
  2011,
  2010,
  2010,
  2011,
  2007,
  2013,
  1997,
  2012,
  2011c,
  6,
  2,
  2007,
  2011a,
  2011c,
  2004,
  2012,
  1993,
  2000,
  2004,
  2006,
  2000,
  2000,
  2006,
  2004,
  2006,
  2011,
  2009,
  2011,
  2006,
  2011,
  2011,
  2001,
  2008,
  1,
  2010,
  2012,
  2008,
  2008,
  2010,
  2012,
  10,
  1983,
  2,
  2007,
  2,
  2001,
  2007,
  2010,
  2010,
  2005,
  2011,
  1983,
  2009,
  2009,
  2009,
  2014,
  2,
  2011,
  2007,
  2006a,
  2011,
  2007,
  2005,
  2006a,
  2009,
  2011,
  1995,
  2004,
  2009,
  2001,
  2,
  1996,
  2011,
  2009,
  2,
  93:7,
  1981,
  2006,
  2006,
  2006,
  2006,
  2010,
  2006,
  2009,
  H2O+,
  2009,
  2008,
  2009,
  2010a,
  2012,
  2009,
  2013,
  2013,
  2010,
  2014,
  2008,
  2006,
  1,
  2009,
  2006,
  2001,
  1997,
  2,
  2010b,
  2010a,
  0.02,
  2007,
  2012,
  1993b,
  8.9,
  2004,
  2012,
  2006,
  2003,
  4,
  2007,
  2010,
  2010,
  2012,
  2004,
  

In [66]:
skips

{'dep': 148,
 'conj': 85,
 'nummod': 134,
 'nsubj': 63,
 'appos': 257,
 'ROOT': 71,
 'npadvmod': 124,
 'prep': 17,
 'meta': 20,
 'amod': 2,
 'compound': 3,
 'advmod': 12,
 'pobj': 9,
 'punct': 7,
 'advcl': 1,
 'pcomp': 1,
 'nsubjpass': 12,
 'oprd': 1,
 'parataxis': 6,
 'acomp': 2,
 'nmod': 1}

In [63]:
data["S0019103512003533-3348"].doc.ents

(Müller-Wodarg et al., 2006, 10°, 5 s)

In [64]:
list(data["S0019103512003533-3348"].doc.noun_chunks)

[The STIM GCM,
 the transport,
 winds,
 molecular and turbulent diffusion,
 key neutral species,
 H,
 H2,
 He,
 CH4,
 H2O,
 the procedures,
 Müller-Wodarg et al.,
 The global spherical grid,
 flexible resolution,
 simulations,
 this study,
 we,
 latitude,
 longitude,
 2°,
 10°,
 a vertical resolution,
 0.4 scale heights,
 Our time integration step,
 5 s,
 we,
 the code,
 500 Saturn rotations,
 steady state]

In [12]:
list(filter(lambda w: w.ent_type_ , data["S0019103512003533-3348"].doc))

[Müller-Wodarg et al., 2006, 10°, 5 s]

In [15]:
for w in list(data["S0019103512003533-3348"].doc): 
    print(w.ent_type_)


























ORG

DATE




















PRODUCT










CARDINAL










