In [1]:
import sys
sys.path.append('..')
import pandas as pd
import spacy
import numpy as np
from tools.TextProcessing import nlp, find_dependency_path_from_tree, find_span, find_root_in_span
from tools.BasicUtils import my_write, my_read, SparseRetrieveSentForPairCoOccur

In [2]:
# Dependency labels of children that are pulled in when their head is already selected.
modifier_dependencies = {
    'acl', 'advcl', 'advmod', 'amod', 'mark', 'meta', 'neg', 'nn',
    'nmod', 'npmod', 'nummod', 'poss', 'prep', 'quantmod', 'relcl',
}
adjunctive_dependencies = {
    'appos', 'aux', 'auxpass', 'compound', 'cop', 'det', 'expl', 'punct',
}

def expand_dependency_info_from_tree(doc, path:np.ndarray):
    """Grow a token-selection mask in place along the dependency tree.

    Args:
        doc: indexable sequence of tokens exposing ``i``, ``dep_``, ``head``
            and ``children`` (a spaCy ``Doc`` in this notebook).
        path: 1-D array with one entry per token; non-zero means "selected".
            It is modified in place.

    Returns:
        None. The result is left in ``path``.

    Steps:
        1. For every selected token whose dependency label is ``conj``,
           the entry of its head is reset to 0.
        2. Every unselected child of a still-selected token is selected when
           its label is in ``modifier_dependencies`` or
           ``adjunctive_dependencies``.
        3. All unselected descendants of the tokens added in step 2 are
           selected, regardless of their label.
    """
    # Step 1: iterate over a snapshot of the selection taken before any reset.
    for idx in np.flatnonzero(path).tolist():
        token = doc[idx]
        if token.dep_ == 'conj':
            path[token.head.i] = 0

    # Step 2: fresh snapshot, so heads reset in step 1 contribute no children.
    frontier = []
    for idx in np.flatnonzero(path).tolist():
        for child in doc[idx].children:
            if path[child.i] != 0:
                continue
            label = child.dep_
            if label in modifier_dependencies or label in adjunctive_dependencies:
                path[child.i] = 1
                frontier.append(child.i)

    # Step 3: first-in-first-out walk below the newly added tokens; the cursor
    # moves over the growing list instead of removing items from its front.
    cursor = 0
    while cursor < len(frontier):
        for child in doc[frontier[cursor]].children:
            if path[child.i] == 0:
                path[child.i] = 1
                frontier.append(child.i)
        cursor += 1

def find_dependency_info_from_tree(doc, kw1:spacy.tokens.span.Span, kw2:spacy.tokens.span.Span):
    """Select the tokens that link two keyword spans through the dependency tree.

    Args:
        doc: the parsed document both spans belong to.
        kw1: first keyword span.
        kw2: second keyword span.

    Returns:
        A float array of length ``len(doc)`` holding 1 for every selected
        token: the connecting chain between the spans, both spans themselves,
        and whatever ``expand_dependency_info_from_tree`` adds or removes.
        If the climb from ``kw2`` reaches a token that is its own head before
        meeting the chain climbed from ``kw1``, ``np.zeros(1)`` is returned
        instead (note the different length).
    """
    # Starting points of the two climbs (indices returned by find_root_in_span).
    root1 = find_root_in_span(kw1)
    root2 = find_root_in_span(kw2)
    kw1_lo, kw1_hi = kw1[0].i, kw1[-1].i
    kw2_lo, kw2_hi = kw2[0].i, kw2[-1].i
    branch = np.zeros(len(doc))

    def mark_spans_and_expand():
        # Shared tail of every successful exit: force both spans into the
        # mask, then let the expansion step adjust it in place.
        branch[kw1_lo : kw1_hi + 1] = 1
        branch[kw2_lo : kw2_hi + 1] = 1
        expand_dependency_info_from_tree(doc, branch)
        return branch

    # Climb from kw1 towards the top of the tree, tagging visited tokens with 1.
    # The loop stops on the first already-tagged token, i.e. after a token
    # whose head is itself.
    climb_from_kw1 = []
    node = root1
    while branch[node] == 0:
        branch[node] = 1
        climb_from_kw1.append(node)
        node = doc[node].head.i
        if kw2_lo <= node <= kw2_hi:
            # kw2 sits above kw1: the chain tagged so far is the connection.
            return mark_spans_and_expand()

    # Climb from kw2, tagging with 2, until the kw1 chain (tag 1) is met.
    node = root2
    while branch[node] != 1:
        branch[node] = 2
        parent = doc[node].head.i
        if parent == node:
            # Reached a self-headed token without touching the kw1 chain.
            return np.zeros(1)
        node = parent
        if kw1_lo <= node <= kw1_hi:
            # kw1 sits above kw2: keep only the tokens tagged during this climb.
            branch[:] = (branch == 2)
            return mark_spans_and_expand()

    # The spans hang on different sides; `node` is where the two climbs meet.
    joint_pos = climb_from_kw1.index(node)
    above_joint = climb_from_kw1[joint_pos + 1 :]
    branch[above_joint] = 0      # drop the part of the kw1 climb above the joint
    branch[branch != 0] = 1      # normalise the 2-tags of the kw2 climb to 1
    return mark_spans_and_expand()

def informativeness_demo(sent:str, kw1:str, kw2:str):
    """Return the text fragments of ``sent`` that relate ``kw1`` to ``kw2``.

    The sentence is parsed with ``nlp``, the first span found for each keyword
    is passed to ``find_dependency_info_from_tree``, and every maximal run of
    consecutive selected tokens is joined with single spaces.

    Args:
        sent: sentence text.
        kw1: first keyword to locate in the sentence.
        kw2: second keyword to locate in the sentence.

    Returns:
        A list of strings, one per run of selected tokens, in sentence order.
        The list is empty when either keyword yields no span or when no
        token is selected.
    """
    doc = nlp(sent)
    # Meaning of the two positional True flags is defined by find_span, which
    # is not visible in this notebook — TODO confirm.
    kw1_spans = find_span(doc, kw1, True, True)
    if not kw1_spans:
        # Previously `[0]` raised IndexError here; an unmatched keyword now
        # simply produces no context.
        return []
    kw2_spans = find_span(doc, kw2, True, True)
    if not kw2_spans:
        return []
    path = find_dependency_info_from_tree(doc, kw1_spans[0], kw2_spans[0])

    context = []
    temp = []
    for i, checked in enumerate(path):
        if checked:
            temp.append(doc[i].text)
        elif temp:
            # An unselected token closes the current run.
            context.append(' '.join(temp))
            temp = []
    if temp:
        # Flush a run that extends to the last token.
        context.append(' '.join(temp))
    return context

In [4]:
# Demo run: context linking the two keyword phrases in one example sentence.
demo_sentence = 'The economy of California, with a gross state product of $3.2 trillion as of 2019, is the largest sub-national economy in the world.'
informativeness_demo(demo_sentence, kw1='economy of California', kw2='sub-national economy')

['The economy of California , with a gross state product of $ 3.2 trillion as of 2019 , is the largest sub - national economy in the world .']

In [10]:
# Scratch check: parse a short sentence so individual token attributes can be inspected.
# Note that this rebinds the notebook-level name `doc`.
doc = nlp('I love machine learning')

In [11]:
# Lemma of the token at index 3 of the sentence parsed in the previous cell
# (the recorded output for this cell is 'learning').
doc[3].lemma_

'learning'

In [2]:
def examine_sent(doc, path_set:set, kw1:str, kw2:str):
    """Return the first dependency path between the two keywords that is in ``path_set``.

    Every span found for ``kw1`` is paired with every span found for ``kw2``
    (kw1 spans in the outer loop), and the path computed by
    ``find_dependency_path_from_tree`` is checked against ``path_set``.

    Args:
        doc: parsed sentence.
        path_set: set of accepted dependency-path strings.
        kw1: first keyword.
        kw2: second keyword.

    Returns:
        The first matching path, or ``''`` when no pair produces a path in
        ``path_set`` (including when either keyword has no span).
    """
    kw1_spans = find_span(doc, kw1, True)
    kw2_spans = find_span(doc, kw2, True)
    # The loop variables must not reuse the list names: previously the inner
    # loop rebound the kw2 list to its last span, so from the second kw1 span
    # onwards it iterated over that span's tokens instead of the kw2 spans.
    for span1 in kw1_spans:
        for span2 in kw2_spans:
            path = find_dependency_path_from_tree(doc, span1, span2)
            if path in path_set:
                return path
    return ''

In [5]:
# Do general analysis about possible paths
# For every sentence, compute the dependency path between each pair of distinct
# noun chunks, group the occurrences by path, and save the paths seen more than
# `freq` times.
p2line_dict = {}   # path -> list of {'kw1', 'kw2', 'line'} occurrences
freq = 10          # a path is "frequent" when it has more than this many occurrences
sents = my_read('data/temp_sents.txt')
for i, line in enumerate(sents):
    doc = nlp(line.strip())
    # Keep noun chunks whose last token is not tagged as a pronoun.
    chunks = [s for s in doc.noun_chunks if s[-1].pos_ != 'PRON']
    if len(chunks) < 2:
        continue
    for j in range(len(chunks)-1):
        # Start at j+1: the inner loop used to start at j, which also paired
        # every chunk with itself.
        for k in range(j+1, len(chunks)):
            p = find_dependency_path_from_tree(doc, chunks[j], chunks[k])
            if not p:
                continue
            p2line_dict.setdefault(p, []).append({'kw1':str(chunks[j]), 'kw2':str(chunks[k]), 'line':i})

print("path number:", len(p2line_dict))
freq_p = [k for k, c in p2line_dict.items() if len(c) > freq]
print(len(freq_p), "paths have the frequency higher than", freq)
# The "10" in the file name mirrors the current value of `freq`.
my_write('data/freq_path_10.txt', freq_p)

path number: 6534
141 paths have the frequency higher than 10


In [None]:
# Collect sentences containing specific path
# One output line per recorded occurrence: keyword 1, keyword 2 and the source
# sentence, separated by double tabs. `p` must be a key of p2line_dict.
p = 'prep pobj conj'
occurrence_rows = [
    '%s\t\t%s\t\t%s' % (d['kw1'], d['kw2'], sents[d['line']])
    for d in p2line_dict[p]
]
my_write('data/'+p, occurrence_rows)

In [13]:
# Examine one sentence: parse it and list its noun chunks.
# This rebinds the notebook-level names `s`, `doc` and `l`.
s = 'Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.'
doc = nlp(s)
l = list(doc.noun_chunks)   # materialise the chunks so they can be printed and indexed
print(l)
# Disabled experiments; `get_phrase_full_span` and `l_` are not defined in the cells shown here.
# print(get_phrase_full_span(doc, doc[-2:-1]))
# find_dependency_path_from_tree(doc, l_[0], l_[1])

[Machine learning algorithms, a model, sample data, training data, order, predictions, decisions]


In [None]:
# Build the retriever whose `retrieve(kw1, kw2)` is called in the next cell.
# Both arguments are paths relative to the notebook; what each file holds is not
# visible here — presumably a sentence corpus and an occurrence index, TODO confirm.
sparse_retriever = SparseRetrieveSentForPairCoOccur('../data/corpus/small_sent.txt', '../joint_score_func/data/occur.json')

In [None]:
# Retrieve the sentences returned for the keyword pair and label each one with
# the first dependency path from paths.txt that links the two keywords
# ('' when examine_sent finds none).
# NOTE: this rebinds `sents`, which the path-analysis cell also assigns and the
# "Collect sentences containing specific path" cell indexes into; re-run the
# analysis cell before going back to that one.
kw1 = 'database'
kw2 = 'data mining'
path_set = set(my_read('paths.txt'))
sents = sparse_retriever.retrieve(kw1, kw2)
df = pd.DataFrame({'sent':sents})
# Series.map instead of DataFrame.apply(axis=1): on an empty frame the row-wise
# apply returns an empty DataFrame rather than a Series, which made the second
# column assignment fail when nothing was retrieved.
df['doc'] = df['sent'].map(lambda text: nlp(text))
df['path'] = df['doc'].map(lambda parsed: examine_sent(parsed, path_set, kw1, kw2))

In [None]:
# Show the path matched for each retrieved sentence ('' where examine_sent found none).
df['path']

In [None]:
# Save the current `sents` list to temp.txt. `sents` is bound by both the
# path-analysis cell and the retrieval cell, so the content written depends on
# which of them ran last.
my_write('temp.txt', sents)