In [1]:
from __future__ import unicode_literals, print_function
import pandas as pd
import os.path
from nltk.tokenize import word_tokenize
import nltk
import json
import spacy
import plac
# Ask spaCy to use a GPU if one is available; this is done before the pipeline is loaded.
spacy.prefer_gpu()
# Shared spaCy pipeline; exerpt.__init__ runs it over each excerpt's text.
nlp = spacy.load("en_core_web_sm")

# Root folders of the two dataset splits read by the loading loops further down.
TRIALPATH = "data/trial"
TRAINPATH = "data/train"

class exerpt:
    """One text excerpt bundled with its annotation resources.

    The constructor stores its arguments unchanged and additionally parses the
    text with the module-level ``nlp`` pipeline, keeping the result in ``doc``.
    ``context`` and ``posTag`` are flags recording whether ``getContext`` and
    ``getPosTag`` have already added their columns to ``tsv``.
    """

    def __init__(self, name, txt, ann, tsv, grobid):
        """Store the excerpt's resources and parse its text.

        Args:
            name: Identifier of the excerpt.
            txt: Text of the excerpt as a single string (it is passed to ``nlp``).
            ann: Annotation content, stored as given.
            tsv: Table with "startOffset" and "endOffset" columns; the loading
                code in this file passes the result of ``pd.read_csv``.
            grobid: Grobid content, stored as given.
        """
        self.name = name
        self.txt = txt
        self.ann = ann
        self.tsv = tsv
        self.grobid = grobid
        self.context = False
        self.posTag = False
        self.doc = nlp(self.txt)

    def getContext(self, count = 3):
        """Add "beforeContext" and "afterContext" columns to ``self.tsv``.

        For each row, the before-context is the text between the index returned
        by ``getIndexSeperated(..., forward = False)`` and ``startOffset``; the
        after-context is the text between ``endOffset`` and the index returned
        by ``getIndexSeperated``. Calling the method again after a successful
        run does nothing.

        Args:
            count: Forwarded to ``getIndexSeperated``; presumably how far the
                context window extends -- TODO confirm against that helper.
        """
        if self.context:
            return

        beforeContext = []
        afterContext = []
        startOffsets = self.tsv["startOffset"].values
        endOffsets = self.tsv["endOffset"].values
        for start, end in zip(startOffsets, endOffsets):
            # Work on this instance's full text. The previous code read
            # `exerpt.txt[0]`, i.e. an attribute of the class (or of whatever
            # the global name was rebound to) and only its first character.
            a = getIndexSeperated(self.txt, end, count = count)
            b = getIndexSeperated(self.txt, start, count = count, forward = False)
            beforeContext.append(self.txt[b:start])
            afterContext.append(self.txt[end:a])

        self.tsv.insert(3, "beforeContext", beforeContext)
        self.tsv.insert(3, "afterContext", afterContext)
        # Flag success only after the columns exist, so a failed run can be retried.
        self.context = True

    def getPosTag(self):
        """Add "beforeTag" and "afterTag" columns holding POS-tagged context.

        Each context string is tokenized with ``word_tokenize`` and tagged with
        ``nltk.pos_tag``. The method does nothing unless ``getContext`` has
        already run, and does nothing if the tags were already added.
        """
        if not self.context or self.posTag:
            return

        bTag = []
        aTag = []
        contexts = zip(self.tsv["beforeContext"].values, self.tsv["afterContext"].values)
        for before, after in contexts:
            bTag.append(nltk.pos_tag(word_tokenize(before)))
            aTag.append(nltk.pos_tag(word_tokenize(after)))

        self.tsv.insert(3, "beforeTag", bTag)
        self.tsv.insert(3, "afterTag", aTag)
        # Flag success only after the columns exist, so a failed run can be retried.
        self.posTag = True
            


def readTXTByLine(filepath):
    """Return the whole content of a UTF-8 text file as a single string.

    Despite its name, the function reads the file in one call rather than
    line by line.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded file content.
    """
    # The context manager closes the handle even if reading or decoding fails.
    with open(filepath, "r", encoding = "utf-8") as ftemp:
        return ftemp.read()

def _loadJSON(filepath):
    """Parse the JSON document stored at ``filepath`` and close the file afterwards."""
    with open(filepath) as fh:
        return json.load(fh)


# Maps an excerpt's file stem (file name without ".txt") to its exerpt object.
data = {}

# Trial split: every .txt file has matching .ann, .tsv and .grobid files.
for fn in os.listdir(os.path.join(TRIALPATH, "txt")):
    if fn.endswith('.txt'):
        stem = fn[:-4]
        data[stem] = exerpt(
            stem,
            readTXTByLine(os.path.join(TRIALPATH, "txt", stem + ".txt")),
            readTXTByLine(os.path.join(TRIALPATH, "ann", stem + ".ann")),
            # `sep` is passed by keyword: positional use is deprecated in pandas.
            pd.read_csv(os.path.join(TRIALPATH, "tsv", stem + ".tsv"), sep = "\t", header = 0),
            _loadJSON(os.path.join(TRIALPATH, "grobid", stem + ".grobid"))
        )

# Train split: only texts that have a .tsv file are loaded, and the string
# "none" is passed in place of annotation content.
# The tsv folder is listed once instead of once per candidate text file.
trainTsvNames = set(os.listdir(os.path.join(TRAINPATH, "tsv")))
for fn in os.listdir(os.path.join(TRAINPATH, "text")):
    if fn.endswith('.txt') and fn[:-4] + ".tsv" in trainTsvNames:
        stem = fn[:-4]
        data[stem] = exerpt(
            stem,
            readTXTByLine(os.path.join(TRAINPATH, "text", stem + ".txt")),
            "none",
            pd.read_csv(os.path.join(TRAINPATH, "tsv", stem + ".tsv"), sep = "\t", header = 0),
            _loadJSON(os.path.join(TRAINPATH, "grobid", stem + ".grobid"))
        )

In [3]:
# Show the identifiers of all loaded excerpts (iterating a dict yields its keys).
list(data)

['S0012821X12004384-1302',
 'S0012821X12004384-1405',
 'S0012821X12004384-1415',
 'S0012821X12004384-1594',
 'S0012821X12004384-1599',
 'S0012821X13002185-1061',
 'S0012821X13002185-1200',
 'S0012821X13002185-1217',
 'S0012821X13002185-1231',
 'S0012821X13002185-835',
 'S0012821X13007309-1482',
 'S0012821X13007309-1509',
 'S0012821X13007309-1605',
 'S0012821X13007309-1989',
 'S0016236113008041-3031',
 'S0016236113008041-3112',
 'S0016236113008041-3153',
 'S0016236113008041-3171',
 'S0016236113008041-3186',
 'S0016236113008041-3207',
 'S0016236113008041-3269',
 'S0016236113008041-3290',
 'S0016236113008041-890',
 'S0016236113008041-913',
 'S0016236113008041-967',
 'S0019103511004994-1382',
 'S0019103511004994-1511',
 'S0019103511004994-1565',
 'S0019103512002801-1342',
 'S0019103512002801-1496',
 'S0019103512002801-1608',
 'S0019103512002801-1824',
 'S0019103512002801-1849',
 'S0019103512002801-1927',
 'S0019103512003533-3299',
 'S0019103512003533-3348',
 'S0019103512003533-4685',
 'S00

The following code renders the dependency visualization inline in the Jupyter notebook.

In [19]:
from spacy import displacy

# Use a dedicated name for the instance: binding it to `exerpt` would shadow
# the exerpt class and make the loading cell fail when it is run again.
sampleExerpt = data["S0012821X13007309-1605"]

# With jupyter = True displaCy displays the markup itself instead of returning
# it, so there is no result worth assigning.
displacy.render(sampleExerpt.doc, jupyter = True, style = "dep", page = True)

spaCy lets users customize displaCy's output. The following options configure the look of displaCy's dependency visualization.

In [25]:
# displaCy settings for the dependency view: coarse POS tags, punctuation
# merged into tokens, phrases left uncollapsed, compact arrows.
options = {
    "fine_grained" : False,
    "collapse_punct": True,
    "collapse_phrases": False,
    "compact" : True,
    "word_spacing" : 25,
    "offset_x" : 50,
    "arrow_stroke" : 1,
    "arrow_spacing" : 6,
    "distance" : 90,
    "arrow_width": 4.5,
    "font" : "Times"
}

# Use a dedicated name for the instance: binding it to `exerpt` would shadow
# the exerpt class and make the loading cell fail when it is run again.
sampleExerpt = data["S0012821X13007309-1605"]

# With jupyter = True displaCy displays the markup itself instead of returning
# it, so there is no result worth assigning.
displacy.render(sampleExerpt.doc, jupyter = True, style = "dep", page = True, options = options)

Let's try visualizing this with collapsed phrases!

In [26]:
# Same settings as the plain dependency view, but with "collapse_phrases"
# switched on.
options = {
    "fine_grained" : False,
    "collapse_punct": True,
    "collapse_phrases": True,
    "compact" : True,
    "word_spacing" : 25,
    "offset_x" : 50,
    "arrow_stroke" : 1,
    "arrow_spacing" : 6,
    "distance" : 90,
    "arrow_width": 4.5,
    "font" : "Times"
}

# Use a dedicated name for the instance: binding it to `exerpt` would shadow
# the exerpt class and make the loading cell fail when it is run again.
sampleExerpt = data["S0012821X13007309-1605"]

# With jupyter = True displaCy displays the markup itself instead of returning
# it, so there is no result worth assigning.
displacy.render(sampleExerpt.doc, jupyter = True, style = "dep", page = True, options = options)

Let's try visualizing this with fine-grained POS tags!

In [29]:
# Collapsed-phrase settings with "fine_grained" switched on as well.
options = {
    "fine_grained" : True,
    "collapse_punct": True,
    "collapse_phrases": True,
    "compact" : True,
    "word_spacing" : 25,
    "offset_x" : 50,
    "arrow_stroke" : 1,
    "arrow_spacing" : 6,
    "distance" : 90,
    "arrow_width": 4.5,
    "font" : "Times"
}

# Use a dedicated name for the instance: binding it to `exerpt` would shadow
# the exerpt class and make the loading cell fail when it is run again.
sampleExerpt = data["S0012821X13007309-1605"]

# With jupyter = True displaCy displays the markup itself instead of returning
# it, so there is no result worth assigning.
displacy.render(sampleExerpt.doc, jupyter = True, style = "dep", page = True, options = options)