# Other Tools

## Server Parsing Model

In [15]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza



In [16]:
# Import stanza
import stanza

In [17]:
# Download the Stanford CoreNLP Java library and unzip it to a ./corenlp folder
!echo "Downloading CoreNLP..."
!wget "http://nlp.stanford.edu/software/stanford-corenlp-4.0.0.zip" -O corenlp.zip
!unzip corenlp.zip
!mv ./stanford-corenlp-4.0.0 ./corenlp

Downloading CoreNLP...
--2020-07-08 18:55:49--  http://nlp.stanford.edu/software/stanford-corenlp-4.0.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... failed: Connection timed out.
Retrying.

^C
Archive:  corenlp.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of corenlp.zip or
        corenlp.zip.zip, and cannot find corenlp.zip.ZIP, period.
mv: cannot stat './stanford-corenlp-4.0.0': No such file or directory


In [None]:
# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = "./corenlp"

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'depparse'], memory='4G', endpoint='http://localhost:9001')
print(client)

<stanza.server.client.CoreNLPClient object at 0x7fcb57b13828>


In [None]:
# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-9735b515903444c1.props -preload tokenize,ssplit,pos,lemma,ner,depparse


In [None]:
# Print background processes and look for java
!ps -o pid,cmd | grep java

In [None]:
# Annotate some text
text = "The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code."
document = client.annotate(text)
print(type(document))

In [None]:
# Iterate over all tokens in all sentences, and print out the word, lemma, pos and ner tags
print("{:12s}\t{:12s}\t{:6s}\t{}".format("Word", "Lemma", "POS", "NER"))

for i, sent in enumerate(document.sentence):
    print("[Sentence {}]".format(i+1))
    for t in sent.token:
        print("{:12s}\t{:12s}\t{:6s}\t{}".format(t.word, t.lemma, t.pos, t.ner))
    print("")

Word        	Lemma       	POS   	NER
[Sentence 1]
The         	the         	DT    	O
<e1>        	<e1>        	NN    	O
author      	author      	NN    	TITLE
</e1>       	</e1>       	NN    	O
of          	of          	IN    	O
a           	a           	DT    	O
keygen      	keygen      	NN    	O
uses        	use         	VBZ   	O
a           	a           	DT    	O
<e2>        	<e2>        	NN    	O
disassembler	disassembler	NN    	TITLE
</e2>       	</e2>       	ADD   	O
to          	to          	TO    	O
look        	look        	VB    	O
at          	at          	IN    	O
the         	the         	DT    	O
raw         	raw         	JJ    	O
assembly    	assembly    	NN    	O
code        	code        	NN    	O
.           	.           	.     	O



In [None]:
# Iterate over all detected entity mentions
print("{:30s}\t{}".format("Mention", "Type"))

for sent in document.sentence:
    for m in sent.mentions:
        print("{:30s}\t{}".format(m.entityMentionText, m.entityType))

Mention                       	Type
author                        	TITLE
disassembler                  	TITLE


In [None]:
# Print annotations of a token
print(document.sentence[0].token[0])

# Print annotations of a mention
print(document.sentence[0].mentions[0])

word: "The"
pos: "DT"
value: "The"
before: ""
after: " "
originalText: "The"
ner: "O"
lemma: "the"
beginChar: 0
endChar: 3
tokenBeginIndex: 0
tokenEndIndex: 1
hasXmlContext: false
isNewline: false
coarseNER: "O"
fineGrainedNER: "O"
nerLabelProbs: "O=0.9999979943339318"

sentenceIndex: 0
tokenStartInSentenceInclusive: 2
tokenEndInSentenceExclusive: 3
ner: "TITLE"
entityType: "TITLE"
entityMentionIndex: 0
canonicalEntityMentionIndex: 0
entityMentionText: "author"



In [None]:
# Shut down the background CoreNLP server
client.stop()

time.sleep(10)
!ps -o pid,cmd | grep java

   5022 /bin/bash -c ps -o pid,cmd | grep java
   5024 grep java


TEST FOR PARSER


In [None]:
client = CoreNLPClient(annotators=['tokenize', 'mwt', 'ssplit', 'pos', 'lemma', 'ner', 'depparse'], memory='4G', endpoint='http://localhost:9001')

In [None]:
client.start()
import time; time.sleep(10)
!ps -o pid,cmd | grep java

In [None]:
# text = "The suspect dumped the dead <e1>body</e1> into a local <e2>reservoir</e2>."
text = "The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code."
document = client.annotate(text)

In [None]:
for i, sent in enumerate(document.sentence):
    for t in sent.token:
        print("{:12s}\t{:12s}\t{:6s}\t{}".format(t.word, t.lemma, t.pos, t.ner))
    print("")

In [None]:
client.stop()

## Pipeline Parsing Model

In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/27/9c/60689521a971a57dd02d2925105efedefa9dccd76c9a0b92566683d43e89/stanza-1.0.1-py3-none-any.whl (193kB)
[K     |█▊                              | 10kB 28.5MB/s eta 0:00:01[K     |███▍                            | 20kB 6.3MB/s eta 0:00:01[K     |█████                           | 30kB 7.3MB/s eta 0:00:01[K     |██████▉                         | 40kB 7.8MB/s eta 0:00:01[K     |████████▌                       | 51kB 7.4MB/s eta 0:00:01[K     |██████████▏                     | 61kB 8.3MB/s eta 0:00:01[K     |███████████▉                    | 71kB 8.1MB/s eta 0:00:01[K     |█████████████▋                  | 81kB 9.1MB/s eta 0:00:01[K     |███████████████▎                | 92kB 8.6MB/s eta 0:00:01[K     |█████████████████               | 102kB 8.5MB/s eta 0:00:01[K     |██████████████████▋             | 112kB 8.5MB/s eta 0:00:01[K     |████████████████████▍           | 122kB 8.5MB/s eta 0:00

In [None]:
import stanza

In [None]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 33.8MB/s]                    
2020-06-24 16:40:09 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.0.0/en/default.zip: 100%|██████████| 402M/402M [00:23<00:00, 17.4MB/s]
2020-06-24 16:40:39 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
nlp = stanza.Pipeline('en', processors='tokenize, mwt, lemma, pos, depparse, ner')

2020-06-24 16:40:39 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-06-24 16:40:39 INFO: Use device: gpu
2020-06-24 16:40:39 INFO: Loading: tokenize
2020-06-24 16:40:49 INFO: Loading: pos
2020-06-24 16:40:50 INFO: Loading: lemma
2020-06-24 16:40:50 INFO: Loading: depparse
2020-06-24 16:40:51 INFO: Loading: ner
2020-06-24 16:40:51 INFO: Done loading processors!


In [None]:
doc = nlp("Texas-born virtuoso finds harmony, sophistication in Appalachian instrument.")
doc

[
  [
    {
      "id": "1",
      "text": "Texas",
      "lemma": "Texas",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 3,
      "deprel": "obl:npmod",
      "misc": "start_char=0|end_char=5"
    },
    {
      "id": "2",
      "text": "-",
      "lemma": "-",
      "upos": "PUNCT",
      "xpos": "HYPH",
      "head": 3,
      "deprel": "punct",
      "misc": "start_char=5|end_char=6"
    },
    {
      "id": "3",
      "text": "born",
      "lemma": "bear",
      "upos": "VERB",
      "xpos": "VBN",
      "feats": "Tense=Past|VerbForm=Part",
      "head": 4,
      "deprel": "amod",
      "misc": "start_char=6|end_char=10"
    },
    {
      "id": "4",
      "text": "virtuoso",
      "lemma": "virtuoso",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "head": 5,
      "deprel": "nsubj",
      "misc": "start_char=11|end_char=19"
    },
    {
      "id": "5",
      "text": "finds",
      "lemma": "find",
      "upo

In [None]:
for sentence in doc.sentences:
    print(sentence.ents)

[{
  "text": "16th July, 2020",
  "type": "DATE",
  "start_char": 0,
  "end_char": 15
}]


## NLTK WordNet

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
wn.synsets('motorcar')

[Synset('car.n.01')]

In [None]:
wn.synsets('fly')

[Synset('fly.n.01'),
 Synset('tent-fly.n.01'),
 Synset('fly.n.03'),
 Synset('fly.n.04'),
 Synset('fly.n.05'),
 Synset('fly.v.01'),
 Synset('fly.v.02'),
 Synset('fly.v.03'),
 Synset('fly.v.04'),
 Synset('fly.v.05'),
 Synset('fly.v.06'),
 Synset('fly.v.07'),
 Synset('fly.v.08'),
 Synset('fly.v.09'),
 Synset('fly.v.10'),
 Synset('flee.v.01'),
 Synset('fly.v.12'),
 Synset('fly.v.13'),
 Synset('vanish.v.05'),
 Synset('fly.s.01')]

# Start

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My\ Drive/NLP/FINAL
!ls -al

/content/drive/My Drive/NLP/FINAL
total 524810
-rw------- 1 root root    427732 Jul  8 18:48 Basic.ipynb
drwx------ 2 root root      4096 May 14 06:08 corenlp
-rw------- 1 root root       140 May 14 10:09 corenlp_server-2b6b6d11f50647dd.props
-rw------- 1 root root       140 May 14 10:10 corenlp_server-7300fb9a87b44690.props
-rw------- 1 root root 504479415 May  5 05:42 corenlp.zip
drwx------ 2 root root      4096 May 14 08:06 data
-rw------- 1 root root      3416 Jun 10 15:02 data_reader.py
drwx------ 2 root root      4096 Jun 15 09:26 Model
-rw------- 1 root root    195386 Jun 28 12:24 New.ipynb
drwx------ 2 root root      4096 May 14 12:11 __pycache__
-rw------- 1 root root     11540 May 14 12:46 tf_glove.py
drwx------ 2 root root      4096 Jun 14 06:31 .vector_cache
-rw------- 1 root root  32264387 Jun 12 03:27 word2vec.npy


# Rebuild Corpus
· Other :  18  
· Cause-Effect :  0, 1  
· Component-Whole :  2, 3    
· Entity-Destination :  4, 5  
· Product-Producer :  6, 7   
· Entity-Origin :  8, 9    
· Member-Collection :  10, 11  
· Message-Topic :  12, 13  
· Content-Container :  14, 15  
· Instrument-Agency :  16, 17  


## corenlp tool

In [3]:
import numpy as np
import string
from tqdm import tqdm
import os
import pickle

In [None]:
!pip install stanza
import stanza
stanza.download('en')

import spacy

In [6]:
import spacy

In [7]:
nlp = spacy.load('en_core_web_sm')

In [11]:
doc = nlp('I arrived at New York yesterday.')

In [34]:
for word in doc.doc:
    print(word.head)

arrived
arrived
arrived
York
at
arrived
arrived


In [None]:
rel_dict = {"Other": 18, "Cause-Effect": 0, "Component-Whole": 2, "Entity-Destination": 4,
            "Product-Producer": 6, "Entity-Origin": 8, "Member-Collection": 10,
            "Message-Topic": 12, "Content-Container": 14, "Instrument-Agency": 16}

In [None]:
class Node(object):
    def __init__(self, indx=None):
        self.indx = indx
        self.text = None
        self.pos = None
        self.dep = None
        self.head = None
        self.childs = []
        self.depth = None

In [None]:
class DepTree(object):
    def __init__(self, size=0):
        self.size = size
        self.root = None
        self.nodes = []
        for i in range(size):
            self.nodes.append(Node(i))
    
    def add_node(self, indx, text, pos, dep, head):
        self.nodes[indx].text = text
        self.nodes[indx].pos = pos
        self.nodes[indx].dep = dep
        if head==-1:
            self.root = indx
            self.nodes[indx].head = -1
        else:
            self.nodes[indx].head = head
            self.nodes[head].childs.append(indx)
    
    def cal_depth(self, indx=None, depth=0):
        if depth==0:
            indx = self.root
        self.nodes[indx].depth = depth
        if len(self.nodes[indx].childs)==0:
            return
        for child in self.nodes[indx].childs:
            self.cal_depth(child, depth+1)
    
    def disp(self):
        for i in range(self.size):
            print('indx: %2d\thead: %2d\ttext: %12s\tpos: %12s\tdep: %12s' % 
                  (self.nodes[i].indx, self.nodes[i].head, self.nodes[i].text, self.nodes[i].pos, self.nodes[i].dep))

## train_path

In [None]:
nlp = stanza.Pipeline('en', processors='tokenize, mwt, lemma, pos, depparse, ner')

words_seq_all = []
pos_seq_all = []
indx_path1_all = []
indx_path2_all = []
deps_seq_all = []
dep_path1_all = []
dep_path2_all = []
childs_path1_all = []
childs_path2_all = []
rels = []

fi = open("data/TRAIN_FILE.txt", 'r')
try:
    with tqdm(range(8000)) as t:
        for i in t: 
            # preprocess
            raw = fi.readline()
            raw = raw.lstrip(string.digits+'\t'+'\"').rstrip('\"\n').lower()

            ent1 = raw[raw.find('<e1>')+4:raw.find('</e1>')]
            ent2 = raw[raw.find('<e2>')+4:raw.find('</e2>')]

            raw = raw.replace('<e1>', '').replace('</e1>', '').replace('<e2>', '').replace('</e2>', '')

            doc = nlp(raw)
            doc1 = nlp(ent1)
            doc2 = nlp(ent2)

            # build tree
            dt = DepTree(doc.num_words)
            offset = 0
            for sentence in doc.sentences:
                for word in sentence.words:
                    dt.add_node(int(word.id)+offset-1, word.text, word.pos, word.deprel, int(word.head)+offset-1)
                offset += len(sentence.words)
            
            words_seq = [dt.nodes[i].text for i in range(dt.size)]
            pos_seq = [dt.nodes[i].pos for i in range(dt.size)]
            deps_seq = [dt.nodes[i].dep for i in range(dt.size)]

            words_seq_all.append(words_seq)
            pos_seq_all.append(pos_seq)
            deps_seq_all.append(deps_seq)

            # find index for entities
            ent1_text = None
            if '-' not in ent1 and ' ' not in ent1:
                ent1_text = ent1
            else:
                for sentence in doc1.sentences:
                    for word in sentence.words:
                        if word.head == 0:
                            ent1_text = word.text
                            break

            ent2_text = None
            if '-' not in ent2 and ' ' not in ent2:
                ent2_text = ent2
            elif i==6802:
                ent2_text = 'bas-reliefs'
            else:
                for sentence in doc2.sentences:
                    for word in sentence.words:
                        if word.head == 0:
                            ent2_text = word.text
                            break
            
            ent1_indx = words_seq.index(ent1_text)
            ent2_indx = words_seq.index(ent2_text)

            # find LCA and indx_path
            indx_path1 = [ent1_indx]
            indx_path2 = [ent2_indx]

            while not indx_path1[-1]==-1:
                indx_path1.append(dt.nodes[indx_path1[-1]].head)
            while indx_path2[-1] not in indx_path1:
                indx_path2.append(dt.nodes[indx_path2[-1]].head)
            indx_path1 = indx_path1[0:indx_path1.index(indx_path2[-1])+1]
            if indx_path1[-1]==-1:
                indx_path1 = indx_path1[0:-1]
                indx_path2 = indx_path2[0:-1]

            indx_path1_all.append(indx_path1)
            indx_path2_all.append(indx_path2)

            # dep_path
            dep_path1 = [dt.nodes[i].dep for i in indx_path1]
            dep_path2 = [dt.nodes[i].dep for i in indx_path2]

            dep_path1_all.append(dep_path1)
            dep_path2_all.append(dep_path2)

            # childs_path
            childs_path1 = [dt.nodes[i].childs for i in indx_path1]
            childs_path2 = [dt.nodes[i].childs for i in indx_path2]

            childs_path1_all.append(childs_path1)
            childs_path2_all.append(childs_path2)

            rel = fi.readline()
            rel = rel.rstrip('\n').replace('(', ' (').split(" ")
            if rel[0] == 'Other':
                rels.append(18)
            elif rel[1] == '(e1,e2)':
                rels.append(rel_dict[rel[0]])
            elif rel[1] == '(e2,e1)':
                rels.append(rel_dict[rel[0]]+1)

            fi.readline() # comment line
            fi.readline() # blank line
except Exception:
    t.close()
    raise
fi.close()

f = open('data/train_path_', 'wb')
pickle.dump([words_seq_all, pos_seq_all, deps_seq_all, indx_path1_all, indx_path2_all, 
             dep_path1_all, dep_path2_all, childs_path1_all, childs_path2_all, rels], f)
f.close()

2020-06-22 08:48:13 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-06-22 08:48:13 INFO: Use device: gpu
2020-06-22 08:48:13 INFO: Loading: tokenize
2020-06-22 08:48:13 INFO: Loading: pos
2020-06-22 08:48:13 INFO: Loading: lemma
2020-06-22 08:48:14 INFO: Loading: depparse
2020-06-22 08:48:15 INFO: Loading: ner
2020-06-22 08:48:15 INFO: Done loading processors!
100%|█████████▉| 7999/8000 [27:24<00:00,  4.58it/s]

In [None]:
nlp = spacy.load('en_core_web_sm')

f = open('data/train_path_', 'rb')
words_seq, pos_seq, deps_seq, indx_path1, indx_path2, dep_path1, dep_path2, childs_path1, childs_path2, rels = pickle.load(f)
f.close()

ents_seq_all = []

def intv_find(intv_seq, char_indx):
    word_indx = 0
    while char_indx >= intv_seq[word_indx+1]:
        word_indx += 1
    return word_indx

try:
    with tqdm(range(8000)) as t:
        for i in t:
            size = len(words_seq[i])
            ents_seq = [''] * size
            bilou_seq = [''] * size
            intv_seq = [0] * (size+1)

            doc = nlp(' '.join(words_seq[i]))

            sentence = ' '.join(words_seq[i])
            for j in range(1, size+1):
                intv_seq[j] = intv_seq[j-1] + len(words_seq[i][j-1]) + 1
            
            for ent in doc.ents:
                start = intv_find(intv_seq, ent.start_char)
                end = intv_find(intv_seq, ent.end_char)
                for j in range(start, end+1):
                    ents_seq[j] = ent.label_
            
            for j in range(size):
                if ents_seq[j]=='':
                    bilou_seq[j] = 'O'
                elif j==0:
                    if ents_seq[j+1]!=ents_seq[j]:
                        bilou_seq[j] = 'U-'
                    else:
                        bilou_seq[j] = 'B-'
                elif j==size-1:
                    if ents_seq[j-1]!=ents_seq[j]:
                        bilou_seq[j] = 'U-'
                    else:
                        bilou_seq[j]= 'L-'
                elif ents_seq[j-1]!=ents_seq[j] and ents_seq[j+1]!=ents_seq[j]:
                    bilou_seq[j] = 'U-'
                elif ents_seq[j-1]!=ents_seq[j] and ents_seq[j+1]==ents_seq[j]:
                    bilou_seq[j] = 'B-'
                elif ents_seq[j-1]==ents_seq[j] and ents_seq[j+1]!=ents_seq[j]:
                    bilou_seq[j] = 'L-'
                elif ents_seq[j-1]==ents_seq[j] and ents_seq[j+1]==ents_seq[j]:
                    bilou_seq[j] = 'I-'

            ents_seq_all.append([bilou_seq[i]+ents_seq[i] for i in range(size)])

except Exception:
    t.close()
    print(words_seq[i])
    raise

f = open('data/train_path', 'wb')
pickle.dump([words_seq, pos_seq, deps_seq, ents_seq_all, indx_path1, indx_path2, dep_path1, dep_path2, childs_path1, childs_path2, rels], f)
f.close()

100%|██████████| 8000/8000 [01:20<00:00, 99.92it/s] 


## test_path

In [None]:
nlp = stanza.Pipeline('en', processors='tokenize, mwt, lemma, pos, depparse, ner')

words_seq_all = []
pos_seq_all = []
deps_seq_all = []
indx_path1_all = []
indx_path2_all = []
dep_path1_all = []
dep_path2_all = []
childs_path1_all = []
childs_path2_all = []
rels = []

fi = open("data/TEST_FILE.txt", 'r')
try:
    with tqdm(range(2717)) as t:
        for i in t: 
            # preprocess
            raw = fi.readline()
            raw = raw.lstrip(string.digits+'\t'+'\"').rstrip('\"\n').lower()

            ent1 = raw[raw.find('<e1>')+4:raw.find('</e1>')]
            ent2 = raw[raw.find('<e2>')+4:raw.find('</e2>')]

            raw = raw.replace('<e1>', '').replace('</e1>', '').replace('<e2>', '').replace('</e2>', '')

            doc = nlp(raw)
            doc1 = nlp(ent1)
            doc2 = nlp(ent2)

            # build tree
            dt = DepTree(doc.num_words)
            offset = 0
            for sentence in doc.sentences:
                for word in sentence.words:
                    dt.add_node(int(word.id)+offset-1, word.text, word.pos, word.deprel, int(word.head)+offset-1)
                offset += len(sentence.words)
            
            words_seq = [dt.nodes[i].text for i in range(dt.size)]
            pos_seq = [dt.nodes[i].pos for i in range(dt.size)]
            deps_seq = [dt.nodes[i].dep for i in range(dt.size)]

            words_seq_all.append(words_seq)
            pos_seq_all.append(pos_seq)
            deps_seq_all.append(deps_seq)

            # find index for entities
            ent1_text = None
            if '-' not in ent1 and ' ' not in ent1:
                ent1_text = ent1
            else:
                for sentence in doc1.sentences:
                    for word in sentence.words:
                        if word.head == 0:
                            ent1_text = word.text
                            break

            ent2_text = None
            if '-' not in ent2 and ' ' not in ent2:
                ent2_text = ent2
            else:
                for sentence in doc2.sentences:
                    for word in sentence.words:
                        if word.head == 0:
                            ent2_text = word.text
                            break
            
            ent1_indx = words_seq.index(ent1_text)
            ent2_indx = words_seq.index(ent2_text)

            # find LCA and indx_path
            indx_path1 = [ent1_indx]
            indx_path2 = [ent2_indx]

            while not indx_path1[-1]==-1:
                indx_path1.append(dt.nodes[indx_path1[-1]].head)
            while indx_path2[-1] not in indx_path1:
                indx_path2.append(dt.nodes[indx_path2[-1]].head)
            indx_path1 = indx_path1[0:indx_path1.index(indx_path2[-1])+1]
            if indx_path1[-1]==-1:
                indx_path1 = indx_path1[0:-1]
                indx_path2 = indx_path2[0:-1]

            indx_path1_all.append(indx_path1)
            indx_path2_all.append(indx_path2)

            # dep_path
            dep_path1 = [dt.nodes[i].dep for i in indx_path1]
            dep_path2 = [dt.nodes[i].dep for i in indx_path2]

            dep_path1_all.append(dep_path1)
            dep_path2_all.append(dep_path2)

            # childs_path
            childs_path1 = [dt.nodes[i].childs for i in indx_path1]
            childs_path2 = [dt.nodes[i].childs for i in indx_path2]

            childs_path1_all.append(childs_path1)
            childs_path2_all.append(childs_path2)

            rel = fi.readline()
            rel = rel.rstrip('\n').replace('(', ' (').split(" ")
            if rel[0] == 'Other':
                rels.append(18)
            elif rel[1] == '(e1,e2)':
                rels.append(rel_dict[rel[0]])
            elif rel[1] == '(e2,e1)':
                rels.append(rel_dict[rel[0]]+1)

            fi.readline() # comment line
            fi.readline() # blank line
except Exception:
    t.close()
    raise
fi.close()

f = open('data/test_path_', 'wb')
pickle.dump([words_seq_all, pos_seq_all, dep_seq_all, indx_path1_all, indx_path2_all, 
             deps_path1_all, dep_path2_all, childs_path1_all, childs_path2_all, rels], f)
f.close()

2020-06-22 08:30:16 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-06-22 08:30:16 INFO: Use device: gpu
2020-06-22 08:30:16 INFO: Loading: tokenize
2020-06-22 08:30:26 INFO: Loading: pos
2020-06-22 08:30:27 INFO: Loading: lemma
2020-06-22 08:30:27 INFO: Loading: depparse
2020-06-22 08:30:28 INFO: Loading: ner
2020-06-22 08:30:28 INFO: Done loading processors!
100%|██████████| 2717/2717 [09:18<00:00,  4.87it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

f = open('data/test_path_', 'rb')
words_seq, pos_seq, deps_seq, indx_path1, indx_path2, dep_path1, dep_path2, childs_path1, childs_path2, rels = pickle.load(f)
f.close()

ents_seq_all = []

def intv_find(intv_seq, char_indx):
    word_indx = 0
    while char_indx >= intv_seq[word_indx+1]:
        word_indx += 1
    return word_indx

try:
    with tqdm(range(2717)) as t:
        for i in t:
            size = len(words_seq[i])
            ents_seq = [''] * size
            bilou_seq = [''] * size
            intv_seq = [0] * (size+1)

            doc = nlp(' '.join(words_seq[i]))

            sentence = ' '.join(words_seq[i])
            for j in range(1, size+1):
                intv_seq[j] = intv_seq[j-1] + len(words_seq[i][j-1]) + 1
            
            for ent in doc.ents:
                start = intv_find(intv_seq, ent.start_char)
                end = intv_find(intv_seq, ent.end_char)
                for j in range(start, end+1):
                    ents_seq[j] = ent.label_
            
            for j in range(size):
                if ents_seq[j]=='':
                    bilou_seq[j] = 'O'
                elif j==0:
                    if ents_seq[j+1]!=ents_seq[j]:
                        bilou_seq[j] = 'U-'
                    else:
                        bilou_seq[j] = 'B-'
                elif j==size-1:
                    if ents_seq[j-1]!=ents_seq[j]:
                        bilou_seq[j] = 'U-'
                    else:
                        bilou_seq[j]= 'L-'
                elif ents_seq[j-1]!=ents_seq[j] and ents_seq[j+1]!=ents_seq[j]:
                    bilou_seq[j] = 'U-'
                elif ents_seq[j-1]!=ents_seq[j] and ents_seq[j+1]==ents_seq[j]:
                    bilou_seq[j] = 'B-'
                elif ents_seq[j-1]==ents_seq[j] and ents_seq[j+1]!=ents_seq[j]:
                    bilou_seq[j] = 'L-'
                elif ents_seq[j-1]==ents_seq[j] and ents_seq[j+1]==ents_seq[j]:
                    bilou_seq[j] = 'I-'

            ents_seq_all.append([bilou_seq[i]+ents_seq[i] for i in range(size)])

except Exception:
    t.close()
    print(words_seq[i])
    raise

f = open('data/test_path', 'wb')
pickle.dump([words_seq, pos_seq, deps_seq, ents_seq_all, indx_path1, indx_path2, dep_path1, dep_path2, childs_path1, childs_path2, rels], f)
f.close()

100%|██████████| 2717/2717 [00:27<00:00, 97.58it/s] 


In [None]:
dep = []
f = open('data/train_path', 'rb')
words_seq, pos_seq, deps_seq, ents_seq, indx_path1, indx_path2, dep_path1, dep_path2, childs_path1, childs_path2, rels = pickle.load(f)
f.close()
dep.extend(deps_seq)
f = open('data/test_path', 'rb')
words_seq, pos_seq, deps_seq, ents_seq, indx_path1, indx_path2, dep_path1, dep_path2, childs_path1, childs_path2, rels = pickle.load(f)
f.close()
dep.extend(deps_seq)
dep

[['det',
  'nsubj',
  'mark',
  'acl',
  'advmod',
  'root',
  'nmod:poss',
  'amod',
  'obj',
  'case',
  'det',
  'amod',
  'nmod',
  'case',
  'compound',
  'nmod',
  'punct'],
 ['det',
  'nsubj:pass',
  'aux:pass',
  'advmod',
  'root',
  'cc',
  'conj',
  'case',
  'det',
  'obl',
  'case',
  'obl',
  'case',
  'det',
  'nmod',
  'punct'],
 ['det',
  'nsubj',
  'case',
  'det',
  'nmod',
  'root',
  'det',
  'obj',
  'mark',
  'acl',
  'case',
  'det',
  'amod',
  'compound',
  'obl',
  'punct'],
 ['det', 'amod', 'nsubj', 'root', 'case', 'det', 'obl', 'punct'],
 ['det',
  'compound',
  'nsubj',
  'cop',
  'det',
  'root',
  'case',
  'det',
  'amod',
  'compound',
  'nmod',
  'case',
  'det',
  'compound',
  'nmod',
  'case',
  'amod',
  'nmod',
  'case',
  'nmod',
  'punct'],
 ['nsubj',
  'cop',
  'det',
  'amod',
  'root',
  'nsubj',
  'cop',
  'nmod:poss',
  'case',
  'amod',
  'acl:relcl',
  'case',
  'nmod',
  'punct'],
 ['det',
  'amod',
  'nsubj',
  'root',
  'mark',
  'det

In [None]:
dep = str(dep)
dep = dep.replace('[', '').replace(']', '')

In [None]:
dep = list(eval(dep))

In [None]:
dep_freq = pd.value_counts(dep).sort_index()
dep_freq.keys()

Index(['acl', 'acl:relcl', 'advcl', 'advmod', 'amod', 'appos', 'aux',
       'aux:pass', 'case', 'cc', 'cc:preconj', 'ccomp', 'compound',
       'compound:prt', 'conj', 'cop', 'csubj', 'csubj:pass', 'det',
       'det:predet', 'discourse', 'expl', 'fixed', 'flat', 'goeswith', 'iobj',
       'list', 'mark', 'nmod', 'nmod:npmod', 'nmod:poss', 'nmod:tmod', 'nsubj',
       'nsubj:pass', 'nummod', 'obj', 'obl', 'obl:npmod', 'obl:tmod',
       'parataxis', 'punct', 'root', 'vocative', 'xcomp'],
      dtype='object')