## Benchmarking Table

Sources:   

https://github.com/Tiiiger/bert_score     
https://arxiv.org/pdf/1904.09675.pdf      

Usage Examples:    
https://github.com/Tiiiger/bert_score/blob/master/example/Demo.ipynb     
https://colab.research.google.com/drive/1kpL8Y_AnUUiCxFjhxSrxCsc6-sDMNb_Q       

In [6]:
import logging
import transformers
import numpy as np

# visuaization
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams

rcParams["xtick.major.size"] = 0
rcParams["xtick.minor.size"] = 0
rcParams["ytick.major.size"] = 0
rcParams["ytick.minor.size"] = 0

rcParams["axes.labelsize"] = "large"
rcParams["axes.axisbelow"] = True
rcParams["axes.grid"] = True

# check bert_score installation
import bert_score
from bert_score import BERTScorer
# bert_score.__version__

import tensorflow as tf
from bleurt import score

# use metrics from HF to calc BLEU, ROUGE
import datasets

### Evaluate BART generations

In [10]:

home = "/home/angelo_ziletti/nlg-ra/T5_experiments"

model = "T5_condition"



'/home/angelo_ziletti/nlg-ra/T5_experiments/T5_condition/T5_condition'

In [8]:
import os.path


ROOT_FOLDER = "/home/angelo_ziletti/nlg-ra/T5_experiments"

MODEL = "T5_condition"

PARTIAL_PATH_GENERATED = "outputs/test_generations_beam_1.txt"

PARTIAL_PATH_ORIGINAL = "input_data/test.target"

PATH_GENERATED = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_GENERATED)))
PATH_ORIGINAL = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_ORIGINAL)))

test_generations_beam_1.txt'

p

SyntaxError: EOL while scanning string literal (<ipython-input-8-851118b69cb6>, line 5)

In [3]:
# read candidates
with open(PATH_GENERATED) as f:
    bart_generations = [line.strip() for line in f]

In [4]:
# read references
with open(PATH_ORIGINAL) as f:
    gold_references = [line.strip() for line in f]

In [5]:
assert len(bart_generations) == len(gold_references)

### BERTScore

In [6]:
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

In [9]:
bart_precision, bart_recall, bart_F1_score = scorer.score(bart_generations, gold_references)


The outputs of the score function are Tensors of precision, recall, and F1 respectively. Each Tensor has the same number of items with the candidate and reference lists. Each item in the list is a scalar, representing the score for the corresponding candidates and references.

In [11]:
print(f"BART BERTScore (F1 score): {bart_F1_score.mean():.3f}")

BART BERTScore (F1 score): 0.370


### BLEURT

In [15]:
# run this line to avoid bugs
# https://github.com/google-research/bleurt/issues/7
tf.compat.v1.flags.DEFINE_string('f','','')

<absl.flags._flagvalues.FlagHolder at 0x7f9881e23150>

In [16]:
checkpoint_base = '/home/ruslan_yermakov/nlg-ra/evaluations/bleurt/bleurt-base-512'

In [30]:
scorer = score.BleurtScorer(checkpoint=checkpoint_base)
bart_bleurt_score = scorer.score(bart_generations, gold_references)

bart_bleurt_score

[-0.28073549270629883,
 -0.6532983183860779,
 -0.8578920364379883,
 -0.7483250498771667,
 0.04863320291042328,
 -0.18075129389762878,
 -0.49433743953704834,
 -0.725640594959259,
 -1.0101701021194458,
 -0.5460492372512817,
 -0.8862862586975098,
 -0.13390004634857178,
 -0.2583967447280884,
 -0.8037494421005249,
 -1.0095574855804443,
 -0.364612340927124,
 0.06424427777528763,
 -0.2753830850124359,
 0.2042696177959442,
 -0.8002218008041382,
 -0.8538126349449158,
 -1.008170247077942,
 0.07150513678789139,
 -0.6042264103889465,
 0.14394447207450867,
 -0.9141138195991516,
 -0.4035884439945221,
 -0.517869234085083,
 -0.2706965208053589,
 0.13398124277591705,
 -0.8207862377166748,
 -0.3488457202911377,
 -0.6457170248031616,
 -0.2191348671913147,
 -0.7718898057937622,
 -0.3507400155067444,
 -0.7824611663818359,
 -0.995337188243866,
 -0.21979808807373047,
 -0.46998587250709534,
 -0.18503564596176147,
 -0.18927882611751556,
 -0.8707035183906555,
 -0.568454384803772,
 -0.5808593034744263,
 0.125921

In [31]:
print("BART BLEURT SCore:", np.mean(bart_bleurt_score))

BART BLEURT SCore: -0.4382374292375103


### ROUGE Score

In [25]:
# load rouge metric
rouge_metric = datasets.load_metric('rouge')

# add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
rouge_metric.add_batch(predictions=bart_generations, references=gold_references)

# length of a Metric object will return the number of examples (predictions or predictions/references pair)
assert len(rouge_metric) == len(bart_generations)

# gathers all the cached predictions and references to compute the metric score
final_score = rouge_metric.compute()

bart_rouge_scores = {k: round(v.mid.fmeasure * 100, 4) for k, v in final_score.items()}

print('BART ROUGE results: ', bart_rouge_scores)

# get only the rouge2
bart_rouge2 = bart_rouge_scores['rouge2']

BART ROUGE results:  {'rouge1': 45.2693, 'rouge2': 33.5639, 'rougeL': 39.4679, 'rougeLsum': 39.4814}


### BLEU Score

In [37]:
# possible fix to calculate BLEU ?

# for ind in range(len(bart_generations)):
#     tmp_list = []
#     tmp_list.append(bart_generations[ind])
#     bart_generations[ind] = tmp_list

    
# for ind in range(len(gold_references)):
#     tmp_list = []
#     tmp_list.append(gold_references[ind])
#     gold_references[ind] = tmp_list

In [None]:
# # load BLEU metric
# bleu_metric = datasets.load_metric('bleu')

# # add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
# bleu_metric.add_batch(predictions=bart_generations, references=gold_references)

# # length of a Metric object will return the number of examples (predictions or predictions/references pair)
# assert len(bleu_metric) == len(bart_generations)

# # gathers all the cached predictions and references to compute the metric score
# bart_bleu_score = bleu_metric.compute()

# print('BART BLEU results: ', bart_bleu_score)

### Evaluate T5 generations

In [43]:
PATH_GENERATED = '/home/ruslan_yermakov/nlg-ra/results/t5_plain/test_generations_beam_1.txt'
PATH_ORIGINAL = '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.target'

In [48]:
# read candidates
with open(PATH_GENERATED) as f:
    t5_generations = [line.strip() for line in f]

In [45]:
# read references
with open(PATH_ORIGINAL) as f:
    gold_references = [line.strip() for line in f]

In [46]:
assert len(t5_generations) == len(gold_references)

### BERTScore

In [47]:
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

In [50]:
t5_precision, t5_recall, t5_F1_score = scorer.score(t5_generations, gold_references)

In [51]:
print(f"T5 BERTScore (F1 score): {t5_F1_score.mean():.3f}")

T5 BERTScore (F1 score): 0.363


### BLEURT

In [52]:
scorer = score.BleurtScorer(checkpoint=checkpoint_base)
t5_bleurt_score = scorer.score(t5_generations, gold_references)

t5_bleurt_score

INFO:tensorflow:Reading checkpoint /home/ruslan_yermakov/nlg-ra/evaluations/bleurt/bleurt-base-512.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Performs basic checks...
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Loading model...
INFO:tensorflow:BLEURT initialized.


[-0.10160675644874573,
 -0.4783018231391907,
 -0.5819965600967407,
 -0.6186337471008301,
 0.034514687955379486,
 -0.16736236214637756,
 -0.38427573442459106,
 -0.4744120240211487,
 -0.565367579460144,
 -0.42565080523490906,
 -0.7510333061218262,
 -0.07255915552377701,
 -0.18887914717197418,
 -0.622505247592926,
 -0.9077445268630981,
 0.07297306507825851,
 -0.33511197566986084,
 -0.42029085755348206,
 0.11334118247032166,
 -0.6032075881958008,
 -0.5796566605567932,
 -0.8064694404602051,
 0.07483282685279846,
 -0.43672826886177063,
 0.09288763254880905,
 -0.45380938053131104,
 -0.2817057967185974,
 -0.501639187335968,
 -0.25569063425064087,
 0.30657386779785156,
 -0.4640360176563263,
 -0.21618866920471191,
 -0.651645302772522,
 -0.027234897017478943,
 -0.5506269931793213,
 -0.11951619386672974,
 -0.47704607248306274,
 -0.6827062368392944,
 0.08198373019695282,
 -0.2868940234184265,
 0.0414331778883934,
 -0.22030022740364075,
 -0.5775912404060364,
 -0.4899177551269531,
 -0.423904836177825

In [53]:
print("T5 BLEURT Score:", np.mean(t5_bleurt_score))

T5 BLEURT Score: -0.31759179322766967


### ROUGE Score

In [54]:
# load rouge metric
rouge_metric = datasets.load_metric('rouge')

# add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
rouge_metric.add_batch(predictions=t5_generations, references=gold_references)

# length of a Metric object will return the number of examples (predictions or predictions/references pair)
assert len(rouge_metric) == len(t5_generations)

# gathers all the cached predictions and references to compute the metric score
final_score = rouge_metric.compute()

t5_rouge_scores = {k: round(v.mid.fmeasure * 100, 4) for k, v in final_score.items()}

print('T5 ROUGE results: ', t5_rouge_scores)

# get only the rouge2
t5_rouge2 = bart_rouge_scores['rouge2']

T5 ROUGE results:  {'rouge1': 49.0801, 'rouge2': 35.8582, 'rougeL': 42.0133, 'rougeLsum': 42.0334}


### BLEU Score ? 