In [13]:
from os.path import normpath, basename, join, exists, expanduser
import gzip
import json
from glob import glob
from termcolor import colored, cprint
from pythonrouge.pythonrouge import Pythonrouge

In [14]:
# Root of the ROUGE-1.5.5 release directory (located under the pythonrouge checkout).
ROUGE = expanduser(
    "/root/sharefolder/playgrounds/pku/text-mining/project1-summrazation/rouge/pythonrouge/pythonrouge/RELEASE-1.5.5"
)
# The ROUGE script and its data directory both sit directly under that root.
ROUGE_PATH, ROUGE_DATA = join(ROUGE, "ROUGE-1.5.5.pl"), join(ROUGE, "data")

### Evaluate on our model output

In [15]:
# Score the model output stored in this tab-separated results file.
# NOTE: evaluate_rouge_scores is defined in a cell that appears further down in
# this notebook (execution count In [11]); on a fresh kernel that cell must be
# run before this one.
evaluation_file = './data/abstractive_output/test_result.txt'
evaluate_rouge_scores(evaluation_file)

38 entries...
38 entries are used for evaluation.


{'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-SU4': 0.0}

In [11]:
"""
Computes ROUGE scores for models and datasets that have outputs available
"""

# Presumably the file names used for stored model predictions and evaluation
# results — TODO confirm.
# NOTE(review): neither constant is referenced by the code visible in this part
# of the notebook; verify they are still needed.
prediction_filename = "prediction.json.gz"
evaluation_filename = "evaluation.json"

def remove_prefix_and_suffix(text, prefix, suffix):
    """Strip one leading ``prefix`` and one trailing ``suffix`` from ``text``.

    Each affix is removed at most once, and only when it is actually present.
    The prefix is removed first, so the suffix check runs on the remainder.

    Args:
        text: String to trim.
        prefix: Leading marker to drop; an empty string is a no-op.
        suffix: Trailing marker to drop; an empty string is a no-op.

    Returns:
        ``text`` without the matched prefix and/or suffix.
    """
    if prefix and text.startswith(prefix):
        text = text[len(prefix):]
    # An empty suffix must be skipped explicitly: every string "ends with" "",
    # and ``text[:-0]`` is ``text[:0]`` (i.e. ""), which would silently discard
    # the entire string.
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    return text

def remove_tags(sentence):
    """Drop the ``<d> <p> <s>`` / ``</s> </p> </d>`` wrapper around ``sentence``.

    The opening and closing tag sequences are removed when present, then
    surrounding whitespace is stripped from what is left.
    """
    untagged = remove_prefix_and_suffix(sentence, "<d> <p> <s>", "</s> </p> </d>")
    return untagged.strip()

def evaluate_rouge_scores(evaluation_file):
    """Score the model summaries in ``evaluation_file`` against their references.

    The file is read as UTF-8 with one entry per line. Within a line, fields
    are tab-separated: the first field is the model-generated summary and every
    remaining field is a human-written reference for it. Wrapper tags are
    removed with ``remove_tags`` and each text is split on whitespace.
    Whitespace-only lines are ignored.

    Args:
        evaluation_file: Path to the tab-separated results file.

    Returns:
        The value returned by ``Pythonrouge.calc_score()`` for the options
        configured below.
    """
    summaries = []   # model-generated: one token list per entry
    references = []  # human-generated: one list of token lists per entry
    with open(evaluation_file, encoding='utf8') as results:
        # Skip whitespace-only lines so an empty file (or stray blank lines)
        # does not produce entries with no summary and no references.
        evaluation_lines = [
            line for line in results.read().strip().split('\n') if line.strip()
        ]
    print("%d entries..." % len(evaluation_lines))

    for line in evaluation_lines:
        sum_line, *ref_lines = line.split('\t')
        # Tokens are kept as ``str``. Under Python 3, ``.encode('utf-8')``
        # would turn them into ``bytes``, so the scorer would no longer be
        # given the text itself.
        # NOTE(review): pythonrouge's README passes each summary as a list of
        # sentence strings; here each whitespace token is one list item —
        # confirm this granularity is intended.
        summaries.append(remove_tags(sum_line).split())
        references.append([remove_tags(example).split() for example in ref_lines])
    print("%d entries are used for evaluation." % len(summaries))

    # Scorer options are passed through unchanged.
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summaries, reference=references,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                        recall_only=True, stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    return rouge.calc_score()