In [1]:
import numpy as np
import networkx as nx
from typing import List
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
import math

import sys
sys.path.append('..')

from tools.BasicUtils import my_read, get_wiki_page_from_kw, clean_sent, my_write

In [2]:
def isf(w:str, D:int, counters:List[Counter]):
    """Inverse sentence frequency of a word.

    Args:
        w: the word to score.
        D: the value used as the total number of sentences.
        counters: one token Counter per sentence.

    Returns:
        log(D / n), where n is the number of counters that contain `w` as a key.

    Raises:
        ZeroDivisionError: if no counter contains `w`.
    """
    containing = 0
    for counter in counters:
        if w in counter:
            containing += 1
    return math.log(float(D) / containing)

def sentence_filtering(sents:List[str], kw1:str, kw2:str):
    """Keep sentences that mention each keyword exactly once and not side by side.

    A sentence is kept when all of the following hold:
      * it contains fewer than 80 space characters,
      * `kw1` occurs exactly once and `kw2` occurs exactly once,
      * neither "kw1 kw2" nor "kw2 kw1" (joined by one space) occurs.

    Args:
        sents: candidate sentences.
        kw1: first keyword.
        kw2: second keyword.

    Returns:
        The sentences that pass every check, in their original order.
    """
    adjacent_forms = ('%s %s' % (kw1, kw2), '%s %s' % (kw2, kw1))

    def keep(candidate):
        # Checks run in the order: length, kw1 count, kw2 count, adjacency.
        if candidate.count(' ') >= 80:
            return False
        if candidate.count(kw1) != 1:
            return False
        if candidate.count(kw2) != 1:
            return False
        return all(candidate.count(form) == 0 for form in adjacent_forms)

    return list(filter(keep, sents))

# Punctuation tokens dropped alongside the NLTK English stop words in do_pagerank.
self_define_stopwords = {'-', ',', '.'}

def do_pagerank(sents:List[str]):
    """Rank sentences with PageRank over an ISF-weighted cosine-similarity graph.

    Each sentence is tokenized with NLTK; English stop words and the tokens in
    `self_define_stopwords` are dropped and the remaining tokens are counted.
    The similarity of two sentences is the cosine between their
    (count * ISF) vectors, where ISF comes from `isf`. The symmetric
    similarity matrix is turned into a weighted graph and scored with
    `nx.pagerank`.

    Args:
        sents: sentences to rank.

    Returns:
        A pair `(ranked_sents, ranked_scores)`: the input sentences ordered by
        descending PageRank score, and the scores in the same order.
    """
    # Remove stop words
    sw = set(stopwords.words('english'))
    clean_sents = [[token for token in word_tokenize(sent)
                    if token not in sw and token not in self_define_stopwords]
                   for sent in sents]

    # Generate word counters
    counters = [Counter(sent) for sent in clean_sents]
    D = len(clean_sents)

    # ISF depends only on the word, so compute it once per distinct word
    # instead of once per word per sentence pair.
    isf_by_word = {}
    for counter in counters:
        for w in counter:
            if w not in isf_by_word:
                isf_by_word[w] = isf(w, D, counters)

    # Euclidean norm of every sentence's (count * ISF) vector.
    norms = [math.sqrt(sum((counter[w] * isf_by_word[w]) ** 2 for w in counter))
             for counter in counters]

    # Build similarity matrix (upper triangle first, mirrored afterwards)
    sim_matrix = np.zeros((D, D))
    for i in range(D - 1):
        for j in range(i + 1, D):
            denominator = norms[i] * norms[j]
            if denominator == 0:
                # A zero norm means the sentence has no tokens left, or only
                # words that occur in every sentence (ISF = log(1) = 0). The
                # cosine is undefined there; leave the similarity at 0 rather
                # than dividing by zero.
                continue
            sent_1 = counters[i]
            sent_2 = counters[j]
            share_word_set = sent_1 & sent_2
            numerator = sum(sent_1[w] * sent_2[w] * (isf_by_word[w] ** 2) for w in share_word_set)
            sim_matrix[i, j] = numerator / denominator
    sim_matrix = sim_matrix + sim_matrix.T

    g = nx.from_numpy_array(sim_matrix)
    score = nx.pagerank(g)
    # Node ids are sentence indices; order them from highest to lowest score.
    ranked = sorted(score.items(), key=lambda x: x[1], reverse=True)
    idx = [item[0] for item in ranked]
    return [sents[i] for i in idx], [score[i] for i in idx]

In [3]:
# Collect candidate sentences from arxiv documents
!grep 'python' ../data/corpus/small_sent.txt > temp.txt
!grep 'programming language' temp.txt > all_occurance.txt
sent_list = my_read('all_occurance.txt')
while '' in sent_list:
    sent_list.remove('')
sent_list = sentence_filtering(sent_list, 'python', 'programming language')
len(sent_list)

53

In [11]:
# Collect candidate sentences from wikipedia page
remove_list = ['See also', 'References', 'Further reading']

page_entity_list = ['python (programming language)', 'programming language']
pages = [get_wiki_page_from_kw(entity) for entity in page_entity_list]
kw_1 = 'python'
kw_2 = 'programming language'


def _sentences_mentioning_both(text, first_kw, second_kw):
    """Clean and lowercase `text`, split it into sentences, keep those containing both keywords."""
    sentences = sent_tokenize(clean_sent(text).lower())
    return [sent for sent in sentences if first_kw in sent and second_kw in sent]


sent_list = []
if None in pages:
    # At least one lookup returned None: report the entities and leave sent_list empty.
    print('Below entities have missing page:')
    print([page_entity_list[i] for i in range(len(page_entity_list)) if pages[i] is None])
else:
    for p in pages:
        # Work on a copy so the page's own section list is not modified.
        sections = p.sections.copy()
        for item in remove_list:
            if item in sections:
                sections.remove(item)
        # Sentences from the remaining sections first, then from the page summary.
        for section in sections:
            sent_list += _sentences_mentioning_both(p.section(section), kw_1, kw_2)
        sent_list += _sentences_mentioning_both(p.summary, kw_1, kw_2)

    # Use the same keywords as the collection step above instead of repeating the literals.
    sent_list = sentence_filtering(sent_list, kw_1, kw_2)
    print(len(sent_list))

9


In [4]:
# Rank the sentences currently bound to `sent_list`. Both collection cells above
# assign that name, so the ranking reflects whichever of them ran last.
sents, score = do_pagerank(sent_list)

In [5]:
# Inspect the last ten entries of the ranking, i.e. the lowest-scored sentences
# (do_pagerank returns the sentences in descending score order).
sents[-10:]

['however, many other programming languages were written in non - english languages, for instance, the chinese basic, the chinese python, the russian rapira, and the arabic loughaty.',
 'python, as a popular programming language in recent years, has not been realized in gui design.',
 'this database can be conveniently searched and accessed from a wide variety of programming languages, such as c++, python, java, matlab, and r. this contribution provides some details about the successful conversion of the exfor library to a mongodb database and shows simple usage examples to underline its merits.',
 'python, a popular and fast - growing programming language, sees heavy use on both sites, with nearly one million questions asked on stack overflow and 400 thousand public gists on github.',
 'this engine is released as part of stocpy, a new turing - complete probabilistic programming language, available as a python library.',
 'we consider the overhead of function calls in the programming l

In [14]:
# Hand the five top-ranked sentences to my_write with the target 'top_5_from_wiki.txt'.
# NOTE(review): the file name implies `sents` was ranked from the wikipedia sentences;
# confirm the wikipedia cell was the last one to assign `sent_list` before ranking.
my_write('top_5_from_wiki.txt', sents[:5])