In [21]:
import re
from bs4 import BeautifulSoup
from typing import List
import tqdm
import pandas as pd
import sys
import wikipedia
sys.path.append('..')

from tools.BasicUtils import my_read, get_wiki_page_from_kw, get_wikipedia_entity, my_write
from tools.TextProcessing import find_noun_phrases, clean_text, my_sentence_tokenize, nlp, find_dependency_path_from_tree, exact_match

In [83]:
def collect_sents_from_wiki_page(page:wikipedia.WikipediaPage):
    remove_list = ['See also', 'References', 'Further reading', 'Sources', 'External links']
    dic = {sec : page.section(sec) for sec in page.sections}
    dic['summary'] = page.summary
    sents = []
    section_list = list(dic.keys())
    while len(section_list) > 0:
        section = section_list.pop()
        if section in remove_list:
            continue
        section_text = dic[section]
        if not section_text:
            continue
        # processed_text = clean_text(section_text)
        processed_text = ' '.join(section_text.lower().split())
        temp_sents = my_sentence_tokenize(processed_text, True)
        sents += temp_sents
    return list(sents)

def collect_entity_from_wiki_page(page:wikipedia.WikipediaPage):
    return [text.lower() for text in page.links]

def collect_keyword_from_wiki_page(page:wikipedia.WikipediaPage):
    soup = BeautifulSoup(page.html(), 'html.parser')
    main_block = soup.find('div', class_='mw-parser-output')
    keywords = set([l.text.lower() for l in main_block.findAll('a') if re.match(r'^(<a href="/wiki/)', str(l))])
    return keywords

def filter_by_path(sents:List[str], use_id:bool=False):
    patterns = [
        'i_nsubj attr( prep pobj)*', 
        'i_nsubj( conj)* dobj( acl prep pobj( conj)*){0,1}', 
        'i_nsubj( prep pobj)+( conj)*', 
        'i_nsubj advcl dobj( acl attr){0,1}', 
        'appos( conj)*', 
        'appos acl prep pobj( conj)*', 
        'i_nsubjpass( conj)*( prep pobj)+( conj)*', 
        'i_nsubjpass prep pobj acl dobj', 
        'i_dobj prep pobj( conj)*'
        # 'acl prep pobj( conj)*'
    ]
    matcher = re.compile('|'.join(patterns))

    df = pd.DataFrame(columns=['head', 'head_span', 'tail', 'tail_span', 'sent', 'path'])

    for idx, sent in enumerate(tqdm.tqdm(sents)):
        doc = nlp(sent)
        kws = find_noun_phrases(doc)
        if kws is None or len(kws) < 2:
            continue
        for i in range(len(kws)-1):
            for j in range(1, len(kws)):
                path = find_dependency_path_from_tree(doc, kws[i], kws[j])
                if not path:
                    continue
                i_path = [token[2:] if token[:2] == 'i_' else 'i_' + token for token in path.split()]
                i_path.reverse()
                i_path = ' '.join(i_path)
                if exact_match(matcher, path):
                    df = df.append({'head':kws[i],
                            'head_span':(kws[i][0].i, kws[i][-1].i),
                            'tail':kws[j],
                            'tail_span':(kws[j][0].i, kws[j][-1].i),
                            'sent':sent if not use_id else idx,
                            'path':path}, ignore_index=True)
                if exact_match(matcher, i_path):
                    df = df.append({'head':kws[j],
                            'head_span':(kws[j][0].i, kws[j][-1].i),
                            'tail':kws[i],
                            'tail_span':(kws[i][0].i, kws[i][-1].i),
                            'sent':sent if not use_id else idx,
                            'path':i_path}, ignore_index=True)
    return df

In [84]:
keyword = 'python'

p = wikipedia.page(keyword)
if p is not None:
    sents = collect_sents_from_wiki_page(p)
    keywords = collect_keyword_from_wiki_page(p)
    print('sentences collected')
    my_write('%s.txt' % keyword, sents)
    my_write('%s_kw.txt' % keyword, keywords)
    df = filter_by_path(sents)
    df.to_csv('%s_out.tsv' % keyword, sep='\t', index=False)

    dff = df[df.apply(lambda x: str(x['head']) in keywords and str(x['tail']) in keywords, axis=1)]
    dff.to_csv('%s_out_f.tsv' % keyword, sep='\t', index=False)

sentences collected


100%|██████████| 207/207 [00:01<00:00, 110.03it/s]


In [67]:
df['wanted'] = df.apply(lambda x: str(x['head']) in keywords, axis=1)

In [74]:
df.head()

Unnamed: 0,head,head_span,tail,tail_span,sent,path
0,(python),"(0, 0)","(an, interpreted, high, -, level, general, -, ...","(2, 11)",python is an interpreted high-level general-pu...,i_nsubj attr
1,"(its, design, philosophy)","(0, 2)","(code, readability)","(4, 5)",its design philosophy emphasizes code readabil...,i_nsubj dobj
2,(programmers),"(14, 14)","(clear, ,, logical, code)","(16, 19)",its language constructs as well as its object-...,i_nsubj dobj
3,(programmers),"(14, 14)","(clear, ,, logical, code)","(16, 19)",its language constructs as well as its object-...,i_nsubj dobj
4,(python),"(0, 0)",(version),"(5, 5)",python 2 was discontinued with version 2.7.18 ...,i_nsubjpass prep pobj


In [78]:
dff.head()

Unnamed: 0,head,head_span,tail,tail_span,sent,path
11,(cobra),"(0, 0)",(python),"(13, 13)","cobra uses indentation and a similar syntax, a...",i_nsubj conj dobj
21,(iterators),"(4, 4)",(python),"(8, 8)",ecmascript/javascript borrowed iterators and g...,i_dobj prep pobj
35,(python),"(0, 0)","(web, applications)","(8, 9)",python can serve as a scripting language for w...,i_nsubj prep pobj prep pobj
36,(python),"(0, 0)",(mod_wsgi),"(14, 14)",python can serve as a scripting language for w...,i_nsubj prep pobj
49,(sagemath),"(0, 0)",(python),"(12, 12)",sagemath is a computer algebra system with a n...,i_nsubj attr prep pobj prep pobj


In [81]:
len(dff)

33