In [1]:
from __future__ import unicode_literals, print_function
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json
import spacy
import plac
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

TRIALPATH = "data/trial"
TRAINPATH = "data/train"

class exerpt:
    def __init__(self, name, txt, ann, tsv, grobid):
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        self.doc = nlp(self.txt)
        
    def getContext(self, count = 3):
        if self.context == False: 
            beforeContext = []
            afterContext = []
            self.context = True
            for start,end in zip(self.tsv["startOffset"].values, self.tsv["endOffset"].values ):#self.tsv.iterrows():
                a = getIndexSeperated(exerpt.txt[0],end,count = count)
                b = getIndexSeperated(exerpt.txt[0],start,count = count,forward = False)
                beforeContext.append(self.txt[0][b:start])
                afterContext.append(self.txt[0][end:a])
            self.tsv.insert(3,"beforeContext", beforeContext)
            self.tsv.insert(3,"afterContext", afterContext)
            
    def getPosTag(self):
        if self.context and self.posTag == False: 
            bTag = []
            aTag = []
            self.posTag = True
            for before, after in zip(self.tsv["beforeContext"].values,self.tsv["afterContext"].values):
                bTag.append(nltk.pos_tag(word_tokenize(before)))
                aTag.append(nltk.pos_tag(word_tokenize(after)))
            self.tsv.insert(3,"beforeTag", bTag)
            self.tsv.insert(3,"afterTag", aTag)
            


def readTXTByLine(filepath):
    ftemp = open(filepath, "r", encoding = "utf-8")
    raw = str(ftemp.read())
    ftemp.close()
    return raw

data = {}
for fn in os.listdir(os.path.join(TRIALPATH,"txt")):
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRIALPATH, "txt", fn[:-4] + ".txt")),
                readTXTByLine(os.path.join(TRIALPATH, "ann", fn[:-4] + ".ann")),
                pd.read_csv(os.path.join(TRIALPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRIALPATH, "grobid", fn[:-4] + ".grobid")))
            )
            
for fn in [x for x in os.listdir(os.path.join(TRAINPATH,"text")) if x[:-4]+".tsv" in os.listdir(os.path.join(TRAINPATH,"tsv"))]:
        if fn.endswith('.txt'):
            data[fn[:-4]] = exerpt(
                fn[:-4],
                readTXTByLine(os.path.join(TRAINPATH, "text", fn[:-4] + ".txt")),
                "none",
                pd.read_csv(os.path.join(TRAINPATH, "tsv", fn[:-4] + ".tsv"), "\t", header = 0 ),
                json.load(open(os.path.join(TRAINPATH, "grobid", fn[:-4] + ".grobid")))
            )

In [2]:
import spacy
from nltk import Tree

doc = data["S175058361300203X-1542"].doc

def tok_format(tok):
    return "_".join([tok.orth_, tok.tag_])


def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)


trees = [to_nltk_tree(sent.root) for sent in doc.sents]
#[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]


In [21]:
sentences = [[s,[(x, x.tag_) for x in s]] for s in data["S175058361300203X-1542"].doc.sents]
sentences

[[It is well known that the seismic velocity in sandstones saturated with brine does not depend on temperatures in the range of 34–38 °C (e.g. Mavko, 2005).,
  [(It, 'PRP'),
   (is, 'VBZ'),
   (well, 'RB'),
   (known, 'VBN'),
   (that, 'IN'),
   (the, 'DT'),
   (seismic, 'JJ'),
   (velocity, 'NN'),
   (in, 'IN'),
   (sandstones, 'NNS'),
   (saturated, 'VBN'),
   (with, 'IN'),
   (brine, 'NN'),
   (does, 'VBZ'),
   (not, 'RB'),
   (depend, 'VB'),
   (on, 'IN'),
   (temperatures, 'NNS'),
   (in, 'IN'),
   (the, 'DT'),
   (range, 'NN'),
   (of, 'IN'),
   (34–38, 'NN'),
   (°, ','),
   (C, 'NNP'),
   ((, '-LRB-'),
   (e.g., 'NNP'),
   (Mavko, 'NNP'),
   (,, ','),
   (2005, 'CD'),
   (), '-RRB-'),
   (., '.')]],
 [To demonstrate this we calculated the difference in Vp between both temperature scenarios present at the Ketzin site at the time of the 1st 3D seismic repeat campaign (Ivanova et al., 2012) using Gassmann's equations (1951) for 50% CO2 saturation.,
  [(To, 'TO'),
   (demonstrate, 

In [22]:

grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
grammar._calculate_indexes()


In [26]:
print("six" in "ddddsixssss")
sentences[0][0]

True


It is well known that the seismic velocity in sandstones saturated with brine does not depend on temperatures in the range of 34–38 °C (e.g. Mavko, 2005).

In [31]:

for t in sentences[0][1]:
    for x in list(grammar._lhs_index.keys()):
        if t[1] in str(x):
            lhs = nltk.grammar.Nonterminal(str(x))
            rhs = str(t[0])
            new_production = nltk.grammar.Production(lhs, [rhs])
            grammar._productions.append(new_production)
            break
        elif t[1] == '-RRB-':
            print("ger")
            sentences[0][0] = str(sentences[0][0]).replace(str(t[0]),"")
            break

sentences[0][0]

ger


'It is well known that the seismic velocity in sandstones saturated with brine does not depend on temperatures in the range of 34–38 °C (e.g. Mavko, 2005.'

In [28]:
grammar._calculate_indexes()
parser = nltk.parse.BottomUpChartParser(grammar)
chart = parser.chart_parse(sentences[0][0])
chart.draw()

ValueError: Grammar does not cover some of the input words: "'I', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'g', ' ', ' ', '3', '4', '–', '3', '8', ' ', '°', 'C', ' ', '(', 'g', ' ', 'M', ',', ' ', '2', '0', '0', '5'".

In [32]:
trees[0].draw()

In [20]:
from nltk import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget

cf = CanvasFrame()
t = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP pretty)))')
tc = TreeWidget(cf.canvas(),t)
cf.add_widget(tc,10,10) # (10,10) offsets


In [21]:
cf.print_to_file('tree.ps')


In [22]:
cf.destroy()

In [27]:
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw() 

In [24]:
nltk.corpus.sinica_treebank.parsed_sents()[3450]

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('VP', [Tree('Ndabe', ['同時']), Tree('Dd', ['就']), Tree('VC32', ['帶']), Tree('Di', ['了']), Tree('NP', [Tree('NP', [Tree('DM', ['四張']), Tree('VH11', ['熟']), Tree('Nab', ['牛皮'])]), Tree('Caa', ['和']), Tree('NP', [Tree('DM', ['十二頭']), Tree('VH16', ['肥']), Tree('Nab', ['牛'])])])])