In [1]:
import ast

import rake
import rake_nltk
from lemmagen import DICTIONARY_SLOVENE
from lemmagen.lemmatizer import Lemmatizer
from textsemantics.server_api import ServerAPI

In [2]:
def rake1(file, stoppath="utils/slovenian-stopwords.txt", min_char_length=5,
          max_words_length=3, min_keyword_frequency=4):
    """Extract keywords with the ``rake`` module from the RAKE-tutorial repository.

    The implementation lives at https://github.com/zelandiya/RAKE-tutorial
    (this is not the ``rake_nltk`` package used by ``rake2``).

    Args:
        file: The document text to extract keywords from (despite the name,
            this is the text itself, not a path; it is handed to ``Rake.run``).
        stoppath: Path to the stopword file handed to ``rake.Rake``.
        min_char_length: Second positional argument of ``rake.Rake``
            (minimum characters per word in the upstream signature).
        max_words_length: Third positional argument of ``rake.Rake``
            (maximum words per phrase in the upstream signature).
        min_keyword_frequency: Fourth positional argument of ``rake.Rake``
            (minimum occurrences of a phrase in the upstream signature).

    Returns:
        Whatever ``Rake.run`` returns; callers in this notebook unpack it as
        ``(keyword, score)`` pairs.
    """
    # Arguments are passed positionally, exactly as the upstream examples do.
    rake_object = rake.Rake(stoppath, min_char_length, max_words_length,
                            min_keyword_frequency)
    return rake_object.run(file)

In [3]:
def rake2(text, language="slovene", max_n_grams=3, words=20,
          stopwords_path="utils/slovenian-stopwords.txt"):
    """Extract keywords with the ``rake_nltk`` package (https://pypi.org/project/rake-nltk/).

    Args:
        text: The document text to extract keywords from.
        language: Forwarded to ``rake_nltk.Rake`` as ``language``.
        max_n_grams: Forwarded to ``rake_nltk.Rake`` as ``max_length``
            (upper bound on the number of words in a phrase).
        words: Maximum number of ranked phrases to return.
        stopwords_path: Path to a whitespace/newline separated stopword file.
            Lines starting with ``#`` are skipped.

    Returns:
        A list of at most ``words`` ``(phrase, score)`` tuples, in the order
        ``get_ranked_phrases_with_scores`` yields them.
    """
    # The stopwords must be a collection of words. Passing the raw file
    # contents (one big string) makes rake_nltk iterate over single
    # characters, so no real stopword is ever filtered out.
    # NOTE(review): assumes the stopword file is UTF-8 encoded -- confirm.
    with open(stopwords_path, encoding="utf-8") as f:
        stopwords = {
            word
            for line in f
            if not line.lstrip().startswith("#")
            for word in line.split()
        }

    r = rake_nltk.Rake(stopwords=stopwords, language=language,
                       max_length=max_n_grams)
    r.extract_keywords_from_text(text)

    # rake_nltk yields (score, phrase); callers expect (phrase, score).
    ranked = r.get_ranked_phrases_with_scores()
    return [(phrase, score) for score, phrase in ranked[:words]]

In [4]:
def evaluate(keywords, proposed_keywords):
    """Compute precision and recall of proposed keywords against reference keywords.

    Both inputs are compared as sets, so repeated entries count once in the
    numerator and in the denominators.

    Args:
        keywords: Iterable of reference (gold) keywords.
        proposed_keywords: Iterable of keywords produced by an extractor.

    Returns:
        A ``(precision, recall)`` tuple of floats. Precision is 0.0 when
        nothing was proposed and recall is 0.0 when there are no reference
        keywords, instead of raising ``ZeroDivisionError``.
    """
    reference = set(keywords)
    proposed = set(proposed_keywords)
    correct = len(reference & proposed)

    precision = correct / float(len(proposed)) if proposed else 0.0
    recall = correct / float(len(reference)) if reference else 0.0
    return precision, recall

In [5]:
def evaluate_corpus(corpus, method):
    """Score a keyword-extraction method over every document of a corpus.

    For each row, the reference keywords and the proposed phrases are split
    into single words, lower-cased and lemmatized (Slovene dictionary); the
    resulting sets of lemmas are compared with ``evaluate``.

    Args:
        corpus: Object exposing ``iterrows()`` and ``len()`` whose rows have a
            ``"Keywords"`` entry (string holding a Python list literal of
            keyword phrases) and a ``"Text"`` entry (the document text).
        method: Callable taking the document text and returning an iterable
            of ``(phrase, score)`` pairs.

    Returns:
        ``(avg_precision, avg_recall, f_measure)``, each rounded to two
        decimals. The F-measure is computed from the unrounded averages.
        All three are 0.0 for an empty corpus.
    """
    n_docs = len(corpus)
    if n_docs == 0:
        return 0.0, 0.0, 0.0

    # Building the lemmatizer does not depend on the row, so do it once.
    lemmatizer = Lemmatizer(dictionary=DICTIONARY_SLOVENE)

    def lemma_set(phrases):
        # split() (rather than split(' ')) avoids empty tokens on repeated
        # whitespace; a set removes words that collapse to the same lemma.
        return {
            lemmatizer.lemmatize(word.lower())
            for phrase in phrases
            for word in phrase.split()
        }

    total_precision = 0.0
    total_recall = 0.0
    for _, row in corpus.iterrows():
        # ast.literal_eval replaces eval here: the column comes from an
        # external server and must only be parsed as a literal, never executed.
        reference = lemma_set(ast.literal_eval(row["Keywords"]))
        proposed = lemma_set(phrase for phrase, _score in method(row["Text"]))

        # A document with no reference or no proposed words scores zero
        # rather than triggering a division by zero.
        if reference and proposed:
            precision, recall = evaluate(reference, proposed)
        else:
            precision, recall = 0.0, 0.0
        total_precision += precision
        total_recall += recall

    mean_precision = total_precision / float(n_docs)
    mean_recall = total_recall / float(n_docs)

    # Use the unrounded means so rounding error does not compound, and guard
    # the case where both are zero.
    denominator = mean_precision + mean_recall
    if denominator:
        f_measure = 2 * mean_precision * mean_recall / denominator
    else:
        f_measure = 0.0

    return round(mean_precision, 2), round(mean_recall, 2), round(f_measure, 2)

Naložimo CTCH podatke in preizkusimo obe RAKE knjižnici.

In [6]:
# Create the server API client and fetch the metadata of the CTCH dataset.
api = ServerAPI()
datasets = api.list_datasets()  # NOTE(review): not used in the cells shown here
api.get_dataset_info('CTCH')  # NOTE(review): return value is discarded
metadata = api.get_metadata('CTCH')
# Fetch the texts for the entries of the "File" column and store them in a
# "Text" column next to the metadata; evaluate_corpus reads "Text" and
# "Keywords" from each row.
# NOTE(review): presumably get_texts returns one text per "File" entry, in
# the same order -- confirm.
corpus = api.get_texts(metadata["File"])
metadata["Text"] = corpus

In [7]:
# Score both RAKE implementations on the same corpus. Each result is an
# (average precision, average recall, F-measure) tuple.
res1 = evaluate_corpus(metadata, rake1)
res2 = evaluate_corpus(metadata, rake2)