In [None]:
import sys
sys.path.append('..')
import spacy
import numpy as np
import pandas as pd
import re

from tools.TextProcessing import nlp, find_dependency_path_from_tree, find_span
from tools.BasicUtils import my_write, my_read, SparseRetrieveSentForPairCoOccur

In [None]:
def find_dependency_path_from_tree(doc, kw1:spacy.tokens.span.Span, kw2:spacy.tokens.span.Span):
    """Describe the dependency-tree route between two keyword spans.

    Each span is represented by its last token. The route climbs from kw1's
    last token to the lowest token it shares with the head chain of kw2's
    last token (every label on the way up is prefixed with ``i_``) and then
    descends to kw2's last token (plain labels, listed from the top down).

    Note: this definition shadows the function of the same name imported
    from ``tools.TextProcessing`` at the top of the notebook.

    Args:
        doc: parsed document; ``doc[i]`` must expose ``head.i`` and ``dep_``.
        kw1: span where the route starts.
        kw2: span where the route ends.

    Returns:
        The labels joined by single spaces. The empty string is returned when
        both spans end on the same token, or when the walk up from kw2 reaches
        a root that is not on kw1's head chain.
    """
    start = kw1[-1].i
    goal = kw2[-1].i

    # Indices of kw1's last token and of every head above it. The walk stops
    # as soon as it revisits an index, i.e. once a self-headed root is reached.
    chain = set()
    node = start
    while node not in chain:
        chain.add(node)
        node = doc[node].head.i

    # Climb from kw2's last token until kw1's chain is hit.
    meet = goal
    while meet not in chain:
        parent = doc[meet].head.i
        if parent == meet:
            # A root outside kw1's chain: the two tokens are not connected.
            return ''
        meet = parent

    # Upward half: kw1 -> meeting token, labels marked with the i_ prefix.
    upward = []
    node = start
    while node != meet:
        upward.append('i_%s' % doc[node].dep_)
        node = doc[node].head.i

    # Downward half: gathered bottom-up from kw2, then flipped to read top-down.
    downward = []
    node = goal
    while node != meet:
        downward.append(doc[node].dep_)
        node = doc[node].head.i
    downward.reverse()

    # When one span sits on the other's head chain, one half is simply empty.
    return ' '.join(upward + downward)

def find_span(doc:"spacy.tokens.doc.Doc", phrase:str, use_lemma:bool=False):
    """Locate every occurrence of a whitespace-separated phrase in ``doc``.

    The document is scanned left to right; occurrences are reported leftmost
    first and never overlap (after a hit the scan resumes right behind it).
    When a candidate window does not match, the scan moves on by a single
    token, so an occurrence that begins inside a failed partial match is still
    found (e.g. ``"a b"`` inside ``"a a b"``).

    Note: this definition shadows the function of the same name imported
    from ``tools.TextProcessing`` at the top of the notebook. The ``doc``
    annotation is quoted so that it is not evaluated when the function is
    defined.

    Args:
        doc: token sequence supporting ``len(doc)`` and ``doc[i]``.
        phrase: phrase to look for; it is split on whitespace.
        use_lemma: compare against each token's ``lemma_`` instead of
            ``str(token)``.

    Returns:
        A list of ``(start, end)`` token-index pairs with ``end`` exclusive.
        The list is empty when nothing matches or when ``phrase`` contains no
        tokens.
    """
    tokens = phrase.split()
    if not tokens:
        # Nothing to look for (previously this raised IndexError).
        return []

    # Materialise the comparison strings once instead of once per window.
    words = [str(doc[i].lemma_ if use_lemma else doc[i]) for i in range(len(doc))]
    width = len(tokens)

    match_spans = []
    start = 0
    while start + width <= len(words):
        if words[start:start + width] == tokens:
            match_spans.append((start, start + width))
            start += width  # keep matches non-overlapping
        else:
            start += 1  # retry from the very next token
    return match_spans

def examine_sent(doc, path_set:set, kw1:str, kw2:str):
    """Return the first accepted dependency path linking kw1 to kw2 in ``doc``.

    Both keywords are located by lemma with ``find_span``. Every
    (kw1 occurrence, kw2 occurrence) pair is tried in order, kw1 occurrences
    in the outer position, and the first path found in ``path_set`` wins.

    Args:
        doc: parsed sentence.
        path_set: collection of acceptable path strings.
        kw1: keyword the path starts from.
        kw2: keyword the path ends at.

    Returns:
        The first path that belongs to ``path_set``, or ``''`` when no pair of
        occurrences produces one.
    """
    first_spans = find_span(doc, kw1, True)
    second_spans = find_span(doc, kw2, True)

    # Lazy generator: paths are computed one at a time and only until a hit.
    candidates = (
        find_dependency_path_from_tree(doc, doc[s1:e1], doc[s2:e2])
        for s1, e1 in first_spans
        for s2, e2 in second_spans
    )
    return next((path for path in candidates if path in path_set), '')

In [None]:
# Group every ordered pair of noun chunks in the sentence file by the
# dependency path that links them.
p2line_dict = {}  # path string -> list of {'kw1', 'kw2', 'line'} records
sents = my_read('data/temp_sents.txt')
for i, line in enumerate(sents):
    doc = nlp(line.strip())
    # Noun chunks whose last token is tagged as a pronoun are left out.
    l = [s for s in doc.noun_chunks if s[-1].pos_ != 'PRON']
    # With fewer than two chunks both ranges below are empty, so no extra guard is needed.
    for j in range(len(l) - 1):
        # Start at j + 1: a chunk paired with itself always yields the empty
        # path, which was thrown away anyway.
        for k in range(j + 1, len(l)):
            p = find_dependency_path_from_tree(doc, l[j], l[k])
            if not p:
                continue
            # 'line' is the index into `sents`, so the sentence can be looked up later.
            p2line_dict.setdefault(p, []).append({'kw1':str(l[j]), 'kw2':str(l[k]), 'line':i})

In [None]:
# Number of distinct dependency paths collected in p2line_dict
len(p2line_dict)

In [None]:
# Keep only the paths that were recorded more than 10 times
freq_p = [path for path, records in p2line_dict.items() if len(records) > 10]

In [None]:
# Number of paths that passed the frequency filter
len(freq_p)

In [None]:
# Persist the frequent paths to data/freq_path_10.txt
my_write('data/freq_path_10.txt', freq_p)

In [None]:
# Dump every recorded occurrence of one path: first chunk, second chunk and
# the sentence it came from, separated by double tabs.
p = 'prep pobj conj'
occurrence_rows = [
    '%s\t\t%s\t\t%s' % (rec['kw1'], rec['kw2'], sents[rec['line']])
    for rec in p2line_dict[p]
]
# The output file is named after the path itself.
my_write('data/'+p, occurrence_rows)

In [None]:
# Worked example: parse a single sentence, list its noun chunks and show the
# dependency path between two of them.
s = 'our working database is the 5 - dimensional magnitude space of the sloan digital sky survey with more than 270 million data points, where we show that these techniques can dramatically speed up data mining operations such as finding similar objects by example, classifying objects or comparing extensive simulation sets with observations.'
doc = nlp(s)
l = list(doc.noun_chunks)
print(l)
# Path from the first noun chunk to the fourth one
find_dependency_path_from_tree(doc, l[0], l[3])

In [None]:
# Build the retriever from a sentence file and a JSON occurrence file.
# NOTE(review): presumably `retrieve` returns the sentences in which both keywords occur — confirm in tools.BasicUtils
sparse_retriever = SparseRetrieveSentForPairCoOccur('../data/corpus/small_sent.txt', '../joint_score_func/data/occur.json')

In [None]:
# Retrieve sentences for one keyword pair and label each of them with the
# first accepted dependency path found between the two keywords.
kw1 = 'database'
kw2 = 'data mining'
# Accepted paths, read from paths.txt
path_set = set(my_read('paths.txt'))
# NOTE: this rebinds `sents`, which an earlier cell filled from data/temp_sents.txt
sents = sparse_retriever.retrieve(kw1, kw2)
df = pd.DataFrame({'sent':sents})
# One parsed document per retrieved sentence
df['doc'] = df.apply(lambda x: nlp(x['sent']), axis=1)
# examine_sent gives '' when no path between the keywords is in path_set
# NOTE(review): row-wise DataFrame.apply may not return a Series when `sents` is empty — confirm the empty case
df['path'] = df.apply(lambda x: examine_sent(x['doc'], path_set, kw1, kw2), axis=1)

In [None]:
# Show the path found for each retrieved sentence ('' where none was accepted)
df['path']

In [None]:
# Write the sentences currently held in `sents` to temp.txt
my_write('temp.txt', sents)

In [130]:
def exact_match(pattern:re.Pattern, path:str):
    """Tell whether ``pattern`` matches the whole of ``path``.

    ``fullmatch`` is used instead of ``match`` followed by an end-position
    check. With an alternation such as ``'a|ab'``, ``match`` settles for the
    first alternative that matches a prefix, so comparing its end with
    ``len(path)`` wrongly rejects strings that a later alternative covers
    completely. ``fullmatch`` keeps trying alternatives until one spans the
    entire string.

    Args:
        pattern: compiled regular expression.
        path: string to test, e.g. a space-joined dependency path.

    Returns:
        True if the entire string matches the pattern, False otherwise.
    """
    return pattern.fullmatch(path) is not None

In [126]:
# Regular expressions over space-joined dependency paths. Labels carrying the
# i_ prefix are the upward steps emitted by find_dependency_path_from_tree.
patterns = [
    'i_nsubj attr( prep pobj)*',
    'i_nsubj( conj)* dobj( acl prep pobj( conj)*){0,1}',
    'i_nsubj( prep pobj)+',
    'i_nsubj advcl dobj( acl attr){0,1}',
    'appos( conj)*',
    'appos acl prep pobj( conj)*',
    'i_nsubjpass( conj)*( prep pobj)+( conj)*',
    'i_nsubjpass prep pobj acl dobj',
    '(i_dobj ){0,1}prep pobj( conj)*',
    '(acl ){0,1}prep pobj( conj)*',
]

# One alternation over all patterns. Alternatives are tried from left to
# right, so an earlier pattern that matches only a prefix can win under
# `match`; full-string checks should use `fullmatch`.
matcher = re.compile('|'.join(patterns))

In [132]:
# Sanity check: this path is expected to be accepted by the combined pattern
exact_match(matcher, 'prep pobj conj conj')

True