In [1]:
import sys
sys.path.append('..')
import pandas as pd
import re

from tools.TextProcessing import nlp, find_dependency_path_from_tree, find_span, find_noun_phrases
from tools.BasicUtils import my_write, my_read, SparseRetrieveSentForPairCoOccur

In [2]:
def examine_sent(doc, path_set:set, kw1:str, kw2:str):
    """Return the first dependency path between kw1 and kw2 that is in path_set.

    Every span reported by ``find_span`` for ``kw1`` is paired with every span
    reported for ``kw2``; the path for each pair is obtained from
    ``find_dependency_path_from_tree``.

    Args:
        doc: Parsed document; it is sliced as ``doc[start:end]`` to build spans.
        path_set: Collection of accepted paths.
        kw1: First keyword to locate in ``doc``.
        kw2: Second keyword to locate in ``doc``.

    Returns:
        The first computed path that is a member of ``path_set``, or ``''``
        when no pair of spans produces such a path.
    """
    # NOTE(review): the meaning of the third argument (True) is defined by
    # find_span and is not visible from this notebook.
    first_spans = find_span(doc, kw1, True)
    second_spans = find_span(doc, kw2, True)
    for start1, end1 in first_spans:
        for start2, end2 in second_spans:
            candidate = find_dependency_path_from_tree(doc, doc[start1:end1], doc[start2:end2])
            if candidate in path_set:
                return candidate
    # No span pair yielded an accepted path
    return ''

In [5]:
# Do general analysis about possible paths:
# for every sentence, record the dependency path between each pair of distinct
# noun chunks, then keep the paths that occur more than `freq` times.
p2line_dict = {}  # path -> list of {'kw1', 'kw2', 'line'} records
freq = 10
sents = my_read('data/temp_sents.txt')
for i, line in enumerate(sents):
    doc = nlp(line.strip())
    # Ignore noun chunks whose last token is tagged as a pronoun
    l = [s for s in doc.noun_chunks if s[-1].pos_ != 'PRON']
    if len(l) < 2:
        continue
    for j in range(len(l)-1):
        # Start at j+1 so that a chunk is never paired with itself
        for k in range(j+1, len(l)):
            p = find_dependency_path_from_tree(doc, l[j], l[k])
            if not p:
                continue
            p2line_dict.setdefault(p, []).append({'kw1':str(l[j]), 'kw2':str(l[k]), 'line':i})

print("path number:", len(p2line_dict))
freq_p = [k for k, c in p2line_dict.items() if len(c) > freq]
print(len(freq_p), "paths have the frequency higher than", freq)
# Derive the file name from the threshold so the two cannot drift apart
my_write('data/freq_path_%d.txt' % freq, freq_p)

path number: 6534
141 paths have the frequency higher than 10


In [None]:
# Collect sentences containing specific path:
# one output line per recorded keyword pair, followed by its source sentence.
p = 'prep pobj conj'
path_rows = [
    '%s\t\t%s\t\t%s' % (record['kw1'], record['kw2'], sents[record['line']])
    for record in p2line_dict[p]
]
my_write('data/'+p, path_rows)

In [3]:
# Examine one sentence: print the parser's noun chunks next to the output of
# find_noun_phrases, then show the dependency path between the first two phrases.
s = 'a path prediction algorithm is tested on this ais database and the testing results show this database can be used as a standardized training resource for different trajectory prediction algorithms and other ais data mining algorithms.'
doc = nlp(s)

l = [*doc.noun_chunks]
print(l)

l_ = find_noun_phrases(doc)
print(l_)

# Last expression of the cell, so its value is displayed as the cell output
find_dependency_path_from_tree(doc, l_[0], l_[1])

[a path prediction, algorithm, this ais database, the testing results, this database, a standardized training resource, different trajectory prediction algorithms, other ais data mining algorithms]
[a path prediction algorithm, this ais database, the testing results, this database, a standardized training resource, different trajectory prediction algorithms, other ais data mining algorithms]


'i_nsubjpass prep pobj'

In [None]:
# Build the retriever used below to fetch sentences for a keyword pair.
# NOTE(review): the roles of the two files (sentence corpus, occurrence index)
# are inferred from their names only - confirm against SparseRetrieveSentForPairCoOccur.
sparse_retriever = SparseRetrieveSentForPairCoOccur(
    '../data/corpus/small_sent.txt',
    '../joint_score_func/data/occur.json',
)

In [None]:
# Retrieve the sentences for one keyword pair and look for an accepted
# dependency path between the two keywords in each of them.
kw1 = 'database'
kw2 = 'data mining'
path_set = set(my_read('paths.txt'))
# NOTE: this rebinds `sents`, which the path-analysis cells also use for the
# lines of data/temp_sents.txt; re-run that cell before indexing `sents` by
# the line numbers stored in `p2line_dict`.
sents = sparse_retriever.retrieve(kw1, kw2)
df = pd.DataFrame({'sent':sents})
# Each new column depends on a single existing column, so map over that Series
# instead of using row-wise DataFrame.apply(axis=1). On an empty retrieval the
# row-wise form returns a DataFrame rather than a Series, and assigning that
# to a single column can fail.
df['doc'] = df['sent'].map(lambda sent: nlp(sent))
df['path'] = df['doc'].map(lambda parsed: examine_sent(parsed, path_set, kw1, kw2))

In [None]:
# Display the path found for every retrieved sentence ('' when none matched)
df.loc[:, 'path']

In [None]:
# Write the current contents of `sents` to a scratch file
my_write(
    'temp.txt',
    sents,
)

In [130]:
def exact_match(pattern:re.Pattern, path:str):
    """Tell whether ``pattern`` matches the whole of ``path``.

    ``fullmatch`` is used rather than ``match`` followed by an end-position
    check. With an alternation such as ``'appos( conj)*|appos acl prep pobj'``
    ``match`` accepts the first alternative that matches a prefix ('appos'),
    so comparing its end with ``len(path)`` wrongly rejects paths that a later
    alternative matches completely.

    Args:
        pattern: Compiled regular expression.
        path: String to test.

    Returns:
        True if some way of matching ``pattern`` consumes all of ``path``,
        otherwise False.
    """
    return pattern.fullmatch(path) is not None

In [126]:
# Regular expressions describing the accepted dependency paths, one per entry.
patterns = [
    # paths beginning with `i_nsubj`
    'i_nsubj attr( prep pobj)*',
    'i_nsubj( conj)* dobj( acl prep pobj( conj)*){0,1}',
    'i_nsubj( prep pobj)+',
    'i_nsubj advcl dobj( acl attr){0,1}',
    # paths beginning with `appos`
    'appos( conj)*',
    'appos acl prep pobj( conj)*',
    # paths beginning with `i_nsubjpass`
    'i_nsubjpass( conj)*( prep pobj)+( conj)*',
    'i_nsubjpass prep pobj acl dobj',
    # `prep pobj` paths with an optional `i_dobj` / `acl` prefix
    '(i_dobj ){0,1}prep pobj( conj)*',
    '(acl ){0,1}prep pobj( conj)*',
]

# A single alternation over all patterns, tried in list order
alternation = '|'.join(patterns)
matcher = re.compile(alternation)

In [132]:
# Sanity check: a `prep pobj` path followed by two `conj` steps is accepted
exact_match(pattern=matcher, path='prep pobj conj conj')

True