In [2]:
import json
import os
import tqdm

# hypers
dataset_names = ['kp20k', 'inspec', 'krapivin', 'semeval', 'nus']
root_path = "/zf18/yw9fm/KPG_Project"
data_path = os.path.join(root_path,"data")
DEC_MODEL = "decode_model_500000_1587513120"

In [10]:
import pke
import json
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

text = "object-oriented design. II For pt.I. see Vestn. KhGPU, no.81, p.15-18 (2000). The paper presents the results of development of an object-oriented systemological method used to design complex systems. A formal system representation, as well as an axiomatics of the calculus of systems as functional flow-type objects based on a Node-Function-Object class hierarchy are proposed. A formalized NFO/UFO analysis algorithm and CASE tools used to support it are considered"
# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a TextRank extractor.
extractor = pke.unsupervised.TextRank()
text = "A new hybrid evolutionary based RBF networks method for forecasting time series: A case study of forecasting emergency supply demand time series This paper presents a new hybrid evolutionary based RBF networks method for forecasting time series. A new hybrid evolutionary algorithm is developed to determine both architecture and network parameters of radial basis functions neural networks simultaneously. The applicability and capability are demonstrated for several existing benchmark time series modeling and algorithms. The proposed method is applied for forecasting emergency supply demand time series."
# 2. load the content of the document.
extractor.load_document(input=text,
                        language='en',
                        normalization=None)

# 3. build the graph representation of the document and rank the words.
#    Keyphrase candidates are composed from the 33-percent
#    highest-ranked words.
extractor.candidate_weighting(window=2,
                              pos=pos,
                              top_percent=0.33)

# 4. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)
print(keyphrases)



[('basis functions neural networks', 0.2130397787189081), ('time series', 0.13635237204466238), ('hybrid evolutionary', 0.1086398203344952), ('networks', 0.07048082576712096), ('supply', 0.048554125375535745)]


In [4]:
import nltk
import spacy
stemmer = nltk.stem.porter.PorterStemmer()
spacy_nlp = spacy.load('en_core_web_sm')
doc = spacy_nlp(u'keyphrase generator')
for token in doc:
#     print(token.text)
    print(' '.join([stemmer.stem(w) for w in token.text.split()]))

keyphras
gener


In [5]:
def evaluate(K, gold, result):
    precision = []
    recall = []
    assert len(gold)==len(result)
    for gt, rank in zip(gold, result):
        if len(gt)>=1:
            gt = [' '.join(ls) for ls in gt]
            ref = set()
            i = 0
            while len(ref)<K and len(rank)>i:
                ref.add(stem_process(rank[i]))
                i += 1
            tmp = 0
            for rk in ref:
                if rk in gt:
                    tmp += 1
            precision.append(tmp*1.0/min(K,len(gt)))
            recall.append(tmp*1.0/len(gt))

    p = sum(precision)/len(precision)
    r = sum(recall)/len(recall)
#     print('precision:',p)
#     print('recall:',r)
    print('f1:',K,2*p*r/(p+r))
    return precision,recall

def stem_process(text):
    doc = spacy_nlp(text)
    return ' '.join([stemmer.stem(w) for w in doc.text.split()])

def evaluate_oracle(gold,result):
    precision = []
    recall = []
    assert len(gold)==len(result)
    for gt, rank in zip(gold, result):
        if len(gt)>=1:
            gt = [' '.join(ls) for ls in gt]
            k = len(gt)
            ref = set()
            i = 0
            while len(ref)<k and len(rank)>i:
                ref.add(stem_process(rank[i]))
                i += 1
            tmp = 0
            for rk in ref:
                if rk in gt:
                    tmp += 1
            precision.append(tmp*1.0/min(k,len(gt)))
            recall.append(tmp*1.0/len(gt))

    p = sum(precision)/len(precision)
    r = sum(recall)/len(recall)
#     print('precision:',p)
#     print('recall:',r)
    print('oracle f1:',2*p*r/(p+r))
    return precision,recall

f1: 5 0.07425170798686137


In [8]:
results = []
for name in dataset_names:
    print(name)
    test_path = os.path.join(data_path,name,name+'_test_spacynp.json')
    test_data = [json.loads(line) for line in open(test_path, 'r')]
    present = [[[stemmer.stem(word) for word in words] for words in data['present_tgt_phrases']] for data in test_data]  
    result = []
    for data in test_data:
        abstract = data['abstract']
        title = data['title']
        text = title + ' ' + abstract
        

        extractor = pke.unsupervised.TfIdf()

        # 2. load the content of the document.
        extractor.load_document(input=text,
                                language='en',
                                normalization=None)

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=list(string.punctuation))

        # 4. weight the candidates using a `tf` x `idf`
        df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
        extractor.candidate_weighting(df=df)

        # 5. get the 10-highest scored candidates as keyphrases
        keyphrases = extractor.get_n_best(n=20)
        print(keyphrases)
        result.append([k[0] for k in keyphrases])
    results.append(result)

kp20k


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/df.tsv.gz'

In [21]:
for i in range(5):
    print(len(results[i]))

19987
500
460
100
211


In [None]:
for idx, name in enumerate(dataset_names):
    print(name)
    test_path = os.path.join(data_path,name,name+'_test_spacynp.json')
    test_data = [json.loads(line) for line in open(test_path, 'r')]
    print(len(test_data))
    assert len(results[idx]) == len(test_data)
    present = [[[stemmer.stem(word) for word in words] for words in data['present_tgt_phrases']] for data in test_data]
    evaluate(5,present,results[idx])
    evaluate(10,present,results[idx])
    evaluate_oracle(present,results[idx])

kp20k
19987


In [None]:
# singlerank
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a SingleRank extractor.
extractor = pke.unsupervised.SingleRank()

# 2. load the content of the document.
extractor.load_document(input='path/to/input',
                        language='en',
                        normalization=None)

# 3. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection(pos=pos)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk. In the graph, nodes are words of
#    certain part-of-speech (nouns and adjectives) that are connected if
#    they occur in a window of 10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)

In [7]:
# tf-idf
import string
import pke

# 1. create a TfIdf extractor.
extractor = pke.unsupervised.TfIdf()

# 2. load the content of the document.
extractor.load_document(input='path/to/input',
                        language='en',
                        normalization=None)

# 3. select {1-3}-grams not containing punctuation marks as candidates.
extractor.candidate_selection(n=3, stoplist=list(string.punctuation))

# 4. weight the candidates using a `tf` x `idf`
df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
extractor.candidate_weighting(df=df)

# 5. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)

FileNotFoundError: [Errno 2] No such file or directory: 'path/to/df.tsv.gz'

In [None]:
#positionrank

# define the valid Part-of-Speeches to occur in the graph
pos = {'NOUN', 'PROPN', 'ADJ'}

# define the grammar for selecting the keyphrase candidates
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# 1. create a PositionRank extractor.
extractor = pke.unsupervised.PositionRank()

# 2. load the content of the document.
extractor.load_document(input='path/to/input',
                        language='en',
                        normalization=None)

# 3. select the noun phrases up to 3 words as keyphrase candidates.
extractor.candidate_selection(grammar=grammar,
                              maximum_word_number=3)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk biaised with the position of the words
#    in the document. In the graph, nodes are words (nouns and
#    adjectives only) that are connected if they occur in a window of
#    10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)