In [21]:
import utils
from transformers import GPT2Tokenizer
from rouge_score import rouge_scorer, scoring

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Build the ROUGE scorer and the GPT-2 tokenizer used by the evaluation cells.

# ROUGE variants to compute; which of them is most useful is still an open question.
ROUGE_METRICS = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
scorer = rouge_scorer.RougeScorer(ROUGE_METRICS, use_stemmer=True)

# Pretrained GPT-2 tokenizer; its decode() turns token ids back into text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [49]:
# prepare the headline data in the string format used by the scorer
# TODO: get the actual and predicted headlines from the model once it is run
actual_headlines = {} # GET TESTING HEADLINES FOR EACH DATASET
predicted_headlines = {} # GET PREDICTED HEADLINES FOR EACH DATASET
# NOTE: the above dicts should be 1 to 1, (the keys and headline lists should be the same)


def decode_headline_pairs(actual_ids, predicted_ids, decode):
    """Decode two parallel ``{dataset: [token ids, ...]}`` dicts into text.

    Args:
        actual_ids: maps each dataset name to its list of reference headlines,
            each headline being whatever ``decode`` accepts (token ids).
        predicted_ids: same layout as ``actual_ids`` for the predictions.
        decode: callable turning one headline's token ids into a string.

    Returns:
        ``(actual_text, predicted_text)``: two new dicts with the same keys and
        list lengths as the inputs, holding decoded strings. The inputs are
        not modified.

    Raises:
        ValueError: if the two dicts do not have the same dataset names, or a
            dataset has a different number of actual and predicted headlines.
    """
    if set(actual_ids) != set(predicted_ids):
        only_actual = sorted(set(actual_ids) - set(predicted_ids), key=repr)
        only_predicted = sorted(set(predicted_ids) - set(actual_ids), key=repr)
        raise ValueError(
            "actual and predicted headlines cover different datasets "
            f"(only in actual: {only_actual}, only in predicted: {only_predicted})"
        )

    actual_text = {}
    predicted_text = {}
    for name, references in actual_ids.items():
        candidates = predicted_ids[name]
        if len(references) != len(candidates):
            raise ValueError(
                f"dataset {name!r} has {len(references)} actual headlines "
                f"but {len(candidates)} predicted headlines"
            )
        # Build fresh lists instead of overwriting the token-id lists in place.
        actual_text[name] = [decode(ids) for ids in references]
        predicted_text[name] = [decode(ids) for ids in candidates]
    return actual_text, predicted_text


# The lambda defers the `tokenizer` lookup until there is a headline to decode.
actual_headlines, predicted_headlines = decode_headline_pairs(
    actual_headlines,
    predicted_headlines,
    lambda token_ids: tokenizer.decode(token_ids),
)


In [26]:
# Score every actual/predicted headline pair, grouped by dataset.
rouge_scores = {}
for dataset, references in actual_headlines.items():
    # The actual headline is passed first, the matching prediction second.
    rouge_scores[dataset] = [
        scorer.score(reference, predicted_headlines[dataset][idx])
        for idx, reference in enumerate(references)
    ]

In [47]:
# calculate average ROUGE score(s) for each dataset
# NOTE: empty input still fails with ZeroDivisionError (kept on purpose to make
# debugging easier); the message now names what was empty.


def mean_score(scores, what):
    """Average precision, recall and fmeasure over a collection of scores.

    Args:
        scores: iterable of objects exposing ``precision``, ``recall`` and
            ``fmeasure`` attributes.
        what: short description of the collection, used in the error message.

    Returns:
        A ``scoring.Score`` holding the three arithmetic means.

    Raises:
        ZeroDivisionError: if ``scores`` is empty.
    """
    scores = list(scores)
    if not scores:
        raise ZeroDivisionError(f"cannot average ROUGE scores: {what} is empty")
    count = len(scores)
    return scoring.Score(
        precision=sum(s.precision for s in scores) / count,
        recall=sum(s.recall for s in scores) / count,
        fmeasure=sum(s.fmeasure for s in scores) / count,
    )


# Only the 'rouge1' entry of each result is averaged; other metric keys are ignored.
avg_rouge_scores = {
    dataset: mean_score(
        (result['rouge1'] for result in results),
        f"the score list for dataset {dataset!r}",
    )
    for dataset, results in rouge_scores.items()
}

# also calculate overall average score
# This is the unweighted mean of the per-dataset averages, so every dataset
# counts equally regardless of how many headlines it contains.
overall_score = mean_score(avg_rouge_scores.values(), "the set of scored datasets")

ZeroDivisionError: division by zero

In [50]:
# example usage

# Use a separate name so this demo does not replace the `scorer` that the
# evaluation cells use (that one is configured with more ROUGE variants).
example_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# First argument is the reference sentence, second is the candidate to score.
scores = example_scorer.score('The quick brown fox jumps over the lazy dog',
                              'The quick brown dog jumps on the log.')

In [39]:
# Display the example result: a dict mapping each ROUGE type to its Score.
scores

{'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765),
 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)}

In [45]:
# Build a Score by hand to check its field names and repr.
scoring.Score(precision=0.1, recall=0.2, fmeasure=0.3)

Score(precision=0.1, recall=0.2, fmeasure=0.3)