## Benchmarking Table

Sources:   

https://github.com/Tiiiger/bert_score     
https://arxiv.org/pdf/1904.09675.pdf      

Usage Examples:    
https://github.com/Tiiiger/bert_score/blob/master/example/Demo.ipynb     
https://colab.research.google.com/drive/1kpL8Y_AnUUiCxFjhxSrxCsc6-sDMNb_Q       

In [7]:
import logging
import transformers
import numpy as np

# check bert_score installation
import bert_score
from bert_score import BERTScorer
# bert_score.__version__

import tensorflow as tf
#physical_devices = tf.config.list_physical_devices('GPU') 
#tf.config.experimental.set_memory_growth(physical_devices[0], True)

#configuration = tf.compat.v1.ConfigProto()
#configuration.gpu_options.allow_growth = True
#session = tf.compat.v1.Session(config=configuration)

from bleurt import score

# use metrics from HF to calc BLEU, ROUGE
import datasets

### Evaluate BART generations

In [261]:
import os.path


ROOT_FOLDER = "/home/angelo_ziletti/nlg-ra/T5_experiments"

#MODEL = "T5_plain"
#PARTIAL_PATH_GENERATED = "hp_1e_3_1/test_generations_beam_1.txt"
#MODEL = "T5_condition"
#MODEL = "T5_condition_semantics"
MODEL = "BART_base"
#MODEL = "BART_condition"
#MODEL = "BART_condition_semantics"

seed=7
#PARTIAL_PATH_GENERATED = "outputs/test_generations_beam_1.txt".format(seed)
PARTIAL_PATH_GENERATED = "outputs_seed_{}/test_generations_beam_1.txt".format(seed)
PARTIAL_PATH_ORIGINAL = "input_data/test.target"

PATH_GENERATED = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_GENERATED)))
PATH_ORIGINAL = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_ORIGINAL)))

# for content planner only
PATH_GENERATED = '/home/angelo_ziletti/nlg-ra/T5_experiments/content_planner/all_generations_Content_Planner.txt'
PATH_ORIGINAL = '/home/angelo_ziletti/nlg-ra/T5_experiments/content_planner/all_references_Content_Planner.txt'

print(PATH_GENERATED)
print(PATH_ORIGINAL)

/home/angelo_ziletti/nlg-ra/T5_experiments/content_planner/all_generations_Content_Planner.txt
/home/angelo_ziletti/nlg-ra/T5_experiments/content_planner/all_references_Content_Planner.txt


In [262]:
# read candidates
with open(PATH_GENERATED) as f:
    bart_generations = [line.strip() for line in f]
    print(len(bart_generations))

742


In [263]:
# read references
with open(PATH_ORIGINAL) as f:
    gold_references = [line.strip() for line in f]
    print(len(gold_references))

742


In [264]:
assert len(bart_generations) == len(gold_references)

### BERTScore

In [265]:
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

In [266]:
bart_precision, bart_recall, bart_F1_score = scorer.score(bart_generations, gold_references)


The outputs of the score function are Tensors of precision, recall, and F1 respectively. Each Tensor has the same number of items with the candidate and reference lists. Each item in the list is a scalar, representing the score for the corresponding candidates and references.

In [267]:
print(f"BART BERTScore (F1 score): {bart_F1_score.mean():.3f}")

BART BERTScore (F1 score): 0.214


### BLEURT

In [268]:
# run this line to avoid bugs
# https://github.com/google-research/bleurt/issues/7
#tf.compat.v1.flags.DEFINE_string('f','','')

In [269]:
#checkpoint_base = '/home/angelo_ziletti/nlg-ra/evaluations/bleurt/bleurt-base-512'
checkpoint_base = '/home/angelo_ziletti/nlg-ra/evaluations/bleurt/bleurt-tiny-512'

In [270]:
scorer = score.BleurtScorer(checkpoint=checkpoint_base)
bart_bleurt_score = scorer.score(bart_generations, gold_references)

bart_bleurt_score

INFO:tensorflow:Reading checkpoint /home/angelo_ziletti/nlg-ra/evaluations/bleurt/bleurt-tiny-512.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Performs basic checks...
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Loading model...
INFO:tensorflow:Loading model.
INFO:tensorflow:Done.
INFO:tensorflow:BLEURT initialized.


[-0.3157899081707001,
 0.1509823501110077,
 -0.618306040763855,
 -0.28988733887672424,
 0.3167155086994171,
 0.5447707176208496,
 -0.33844855427742004,
 -0.09300699084997177,
 0.5467036962509155,
 -0.29770973324775696,
 -0.22845354676246643,
 -0.359506756067276,
 -0.3919811546802521,
 0.26108279824256897,
 0.701481819152832,
 -0.21999028325080872,
 0.7054827809333801,
 -0.7342337369918823,
 0.1740386188030243,
 -0.2682139575481415,
 0.597314715385437,
 -0.08225705474615097,
 -0.4868786633014679,
 -0.8725941181182861,
 -0.18235787749290466,
 -0.0031959041953086853,
 -0.19042328000068665,
 0.5587823390960693,
 -0.5390888452529907,
 -0.4991314113140106,
 0.7184403538703918,
 -0.9019498229026794,
 0.7172349691390991,
 -0.3653845489025116,
 -0.568967342376709,
 -0.640125036239624,
 -0.3532160818576813,
 -0.033290885388851166,
 -0.08995714038610458,
 0.088911272585392,
 -0.37355074286460876,
 0.27846357226371765,
 -0.20746830105781555,
 0.5124648809432983,
 -0.8003880381584167,
 0.6846399903

In [271]:
print("BART BLEURT Score:", np.mean(bart_bleurt_score))

BART BLEURT Score: -0.05547609260541731


### ROUGE Score

In [22]:
# load rouge metric
rouge_metric = datasets.load_metric('rouge')

# add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
rouge_metric.add_batch(predictions=bart_generations, references=gold_references)

# length of a Metric object will return the number of examples (predictions or predictions/references pair)
assert len(rouge_metric) == len(bart_generations)

# gathers all the cached predictions and references to compute the metric score
final_score = rouge_metric.compute()

bart_rouge_scores = {k: round(v.mid.fmeasure * 100, 4) for k, v in final_score.items()}

print('BART ROUGE results: ', bart_rouge_scores)

# get only the rouge2
bart_rouge2 = bart_rouge_scores['rouge2']

BART ROUGE results:  {'rouge1': 50.6679, 'rouge2': 33.7483, 'rougeL': 39.3234, 'rougeLsum': 39.3519}


### BLEU Score

In [None]:
# possible fix to calculate BLEU ?

# for ind in range(len(bart_generations)):
#     tmp_list = []
#     tmp_list.append(bart_generations[ind])
#     bart_generations[ind] = tmp_list

    
# for ind in range(len(gold_references)):
#     tmp_list = []
#     tmp_list.append(gold_references[ind])
#     gold_references[ind] = tmp_list

In [185]:
# # load BLEU metric
#metric = datasets.load_metric('sacrebleu')

# bleu_metric = datasets.load_metric('bleu')

# # add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
# bleu_metric.add_batch(predictions=bart_generations, references=gold_references)

# # length of a Metric object will return the number of examples (predictions or predictions/references pair)
# assert len(bleu_metric) == len(bart_generations)

# # gathers all the cached predictions and references to compute the metric score
# bart_bleu_score = bleu_metric.compute()

# print('BART BLEU results: ', bart_bleu_score)