Choose the best BART model among the saved checkpoints based on the ROUGE-2 score.

Find out which **beam_search** setting (beam size) worked best.

In [1]:
import os

# use metrics from HF
import datasets

In [2]:
# list available metrics

from datasets import list_metrics
metrics_list = list_metrics()

metrics_list

['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'comet',
 'coval',
 'f1',
 'gleu',
 'glue',
 'indic_glue',
 'meteor',
 'precision',
 'recall',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'squad',
 'squad_v2',
 'super_glue',
 'wer',
 'xnli']

In [3]:
# load the rouge metric; this single `metric` object is reused by the cells below
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases
# in favour of the separate `evaluate` package - confirm against the installed version
metric = datasets.load_metric('rouge')

In [4]:
# example: sanity-check the metric on one prediction/reference pair
test1 = ['this is test and I believe it works']
test2 = ['this is test and I hope it works well enough']

# test1 is passed as the prediction, test2 as the reference
metric.add_batch(predictions=test1, references=test2)

# compute the rouge score
res = metric.compute()

# show the `mid.fmeasure` of every returned score as a percentage rounded to 4 decimals
{k: round(v.mid.fmeasure * 100, 4) for k, v in res.items()}

{'rouge1': 77.7778, 'rouge2': 62.5, 'rougeL': 77.7778, 'rougeLsum': 77.7778}

In [27]:
def calc_rouge_bart(PATH_ORIGINAL, PATH_GENERATED, rouge_metric=None):
    """
    Compute ROUGE scores of a generated file against a reference file.

    Both files are read line by line; line i of the generated file is scored
    against line i of the reference file (surrounding whitespace is stripped).

    PATH_ORIGINAL - path to the target (reference) test file
    PATH_GENERATED - path to the generated sections
    rouge_metric - optional metric object exposing `add_batch(predictions=..., references=...)`,
                   `__len__` and `compute()`; when None, the notebook-level `metric` is used

    Returns a one-item dict:
        {PATH_GENERATED: (rouge2, {score name: rounded percentage, ...})}
    where every value is `mid.fmeasure * 100` rounded to 4 decimals.

    Raises AssertionError when the two files have a different number of lines,
    or when the metric does not hold exactly one entry per candidate after add_batch.
    """

    # fall back to the metric loaded at notebook level so existing two-argument calls keep working
    scorer = metric if rouge_metric is None else rouge_metric

    # read references (explicit encoding so the result does not depend on the machine's locale)
    with open(PATH_ORIGINAL, encoding="utf-8") as f:
        gold_references = [line.strip() for line in f]

    # read candidates
    with open(PATH_GENERATED, encoding="utf-8") as f:
        current_candidates = [line.strip() for line in f]

    # check same length
    assert len(current_candidates) == len(gold_references), (
        "line count mismatch: {} candidates vs {} references".format(
            len(current_candidates), len(gold_references)
        )
    )

    ### calculate ROUGE scores

    # add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
    scorer.add_batch(predictions=current_candidates, references=gold_references)

    # length of a Metric object will return the number of examples (predictions or predictions/references pair);
    # a mismatch here means the metric still holds entries from an earlier add_batch
    assert len(scorer) == len(current_candidates), (
        "metric holds {} examples, expected {}".format(len(scorer), len(current_candidates))
    )

    # gathers all the cached predictions and references to compute the metric score
    final_score = scorer.compute()

    final_rouge_scores = {k: round(v.mid.fmeasure * 100, 4) for k, v in final_score.items()}

    # get only the rouge2
    rouge2 = final_rouge_scores['rouge2']

    return {PATH_GENERATED: (rouge2, final_rouge_scores)}

In [28]:
# reference file; passed as the first argument to every calc_rouge_bart call below,
# where it is read line by line as the gold references
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
PATH_ORIGINAL = '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.target'

In [32]:
# keep results of different BART models (checkpoints) and beam sizes
bart_all_results = []

In [33]:
# BART outputs final

generations_dir = '/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base'

# one loop instead of three copy-pasted stanzas; beam sizes are visited in the
# order 1, 10, 5 so bart_all_results is filled in the same order as the recorded output
for beam_size in (1, 10, 5):
    PATH_GENERATED = os.path.join(generations_dir, 'test_generations_beam_{}.txt'.format(beam_size))

    res = calc_rouge_bart(PATH_ORIGINAL, PATH_GENERATED)
    print(res)

    # NOTE: re-running this cell appends the same results again
    bart_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_1.txt': (33.5639, {'rouge1': 45.2693, 'rouge2': 33.5639, 'rougeL': 39.4679, 'rougeLsum': 39.4814})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_10.txt': (30.9003, {'rouge1': 43.437, 'rouge2': 30.9003, 'rougeL': 36.0984, 'rougeLsum': 36.1253})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_5.txt': (31.9611, {'rouge1': 44.2596, 'rouge2': 31.9611, 'rougeL': 37.2803, 'rougeLsum': 37.2782})}


ROUGE-score results when running ./run_eval:    

"Beam_size 1": {"rouge1": 45.9848, "rouge2": 34.1539, "rougeL": 39.9457, "rougeLsum": 45.1163, "n_obs": 742, "runtime": 95, "seconds_per_sample": 0.128}   

"Beam_size 10": {"rouge1": 44.0955, "rouge2": 31.2756, "rougeL": 36.4828, "rougeLsum": 43.0052, "n_obs": 742, "runtime": 569, "seconds_per_sample": 0.7668}   

"Beam_size 5": {"rouge1": 44.8606, "rouge2": 32.4029, "rougeL": 37.6062, "rougeLsum": 43.8227, "n_obs": 742, "runtime": 285, "seconds_per_sample": 0.3841}    


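**Note**: The scores reported by `./run_eval` differ slightly from the ones computed above for the corresponding beam sizes; the gap is largest for `rougeLsum` (e.g. 45.12 vs. 39.48 for beam size 1). The cause of the difference has not been investigated here.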

In [35]:
# BART outputs - checkpoint 1000

generations_dir = '/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-1000'

# one loop instead of three copy-pasted stanzas; beam sizes are visited in the
# order 1, 10, 5 so bart_all_results is filled in the same order as the recorded output
for beam_size in (1, 10, 5):
    PATH_GENERATED = os.path.join(generations_dir, 'test_generations_beam_{}.txt'.format(beam_size))

    res = calc_rouge_bart(PATH_ORIGINAL, PATH_GENERATED)
    print(res)

    # NOTE: re-running this cell appends the same results again
    bart_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-1000/test_generations_beam_1.txt': (30.4052, {'rouge1': 43.5828, 'rouge2': 30.4052, 'rougeL': 36.7879, 'rougeLsum': 36.7659})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-1000/test_generations_beam_10.txt': (29.1274, {'rouge1': 42.5189, 'rouge2': 29.1274, 'rougeL': 34.776, 'rougeLsum': 34.7649})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-1000/test_generations_beam_5.txt': (29.38, {'rouge1': 42.8437, 'rouge2': 29.38, 'rougeL': 35.1359, 'rougeLsum': 35.1595})}


ROUGE-score results when running ./run_eval:      

"Beam_size 1": {"rouge1": 44.3901, "rouge2": 31.0958, "rougeL": 37.3945, "rougeLsum": 43.3649, "n_obs": 742, "runtime": 91, "seconds_per_sample": 0.1226}       

"Beam_size 10": {"rouge1": 43.2283, "rouge2": 29.5757, "rougeL": 35.1969, "rougeLsum": 42.0986, "n_obs": 742, "runtime": 559, "seconds_per_sample": 0.7534}      

"Beam_size 5": {"rouge1": 43.5783, "rouge2": 29.8195, "rougeL": 35.6655, "rougeLsum": 42.4785, "n_obs": 742, "runtime": 282, "seconds_per_sample": 0.3801}       


In [37]:
# BART outputs - checkpoint 2000

generations_dir = '/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-2000'

# one loop instead of three copy-pasted stanzas; beam sizes are visited in the
# order 1, 10, 5 so bart_all_results is filled in the same order as the recorded output
for beam_size in (1, 10, 5):
    PATH_GENERATED = os.path.join(generations_dir, 'test_generations_beam_{}.txt'.format(beam_size))

    res = calc_rouge_bart(PATH_ORIGINAL, PATH_GENERATED)
    print(res)

    # NOTE: re-running this cell appends the same results again
    bart_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-2000/test_generations_beam_1.txt': (32.3359, {'rouge1': 44.6612, 'rouge2': 32.3359, 'rougeL': 38.5314, 'rougeLsum': 38.5622})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-2000/test_generations_beam_10.txt': (30.1754, {'rouge1': 42.9894, 'rouge2': 30.1754, 'rougeL': 35.6597, 'rougeLsum': 35.6787})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-2000/test_generations_beam_5.txt': (30.7662, {'rouge1': 43.5571, 'rouge2': 30.7662, 'rougeL': 36.3665, 'rougeLsum': 36.3722})}


In [38]:
# BART outputs - checkpoint 3000

generations_dir = '/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-3000'

# one loop instead of three copy-pasted stanzas; beam sizes are visited in the
# order 1, 10, 5 so bart_all_results is filled in the same order as the recorded output
for beam_size in (1, 10, 5):
    PATH_GENERATED = os.path.join(generations_dir, 'test_generations_beam_{}.txt'.format(beam_size))

    res = calc_rouge_bart(PATH_ORIGINAL, PATH_GENERATED)
    print(res)

    # NOTE: re-running this cell appends the same results again
    bart_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-3000/test_generations_beam_1.txt': (33.1498, {'rouge1': 45.0864, 'rouge2': 33.1498, 'rougeL': 39.1962, 'rougeLsum': 39.2036})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-3000/test_generations_beam_10.txt': (30.7035, {'rouge1': 43.42, 'rouge2': 30.7035, 'rougeL': 36.0595, 'rougeLsum': 36.078})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/checkpoint-3000/test_generations_beam_5.txt': (31.4999, {'rouge1': 43.8639, 'rouge2': 31.4999, 'rougeL': 36.8585, 'rougeLsum': 36.8544})}


### Choose the best-performing model

In [52]:
# sanity check: 4 evaluated models (final + 3 checkpoints) x 3 beam sizes each are expected
len(bart_all_results)

12

In [49]:
def _sorting_fun_rouge1(result):
    for key in result:
        return result[key][1]['rouge1']

def _sorting_fun_rouge2(result):
    for key in result:
        return result[key][1]['rouge2']
    
def _sorting_fun_rougeL(result):
    for key in result:
        return result[key][1]['rougeL']

def _sorting_fun_rougeLsum(result):
    for key in result:
        return result[key][1]['rougeLsum']

In [50]:
# max() returns the first entry with the highest ROUGE-1 score without sorting the whole list
print('Best generated text according to ROUGE-1: ', max(bart_all_results, key=_sorting_fun_rouge1))

Best generated text according to ROUGE1:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_1.txt': (33.5639, {'rouge1': 45.2693, 'rouge2': 33.5639, 'rougeL': 39.4679, 'rougeLsum': 39.4814})}


In [53]:
# max() returns the first entry with the highest ROUGE-2 score without sorting the whole list
print('Best generated text according to ROUGE-2: ', max(bart_all_results, key=_sorting_fun_rouge2))

Best generated text according to ROUGE2:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_1.txt': (33.5639, {'rouge1': 45.2693, 'rouge2': 33.5639, 'rougeL': 39.4679, 'rougeLsum': 39.4814})}


In [54]:
# max() returns the first entry with the highest ROUGE-L score without sorting the whole list
print('Best generated text according to ROUGE-L: ', max(bart_all_results, key=_sorting_fun_rougeL))

Best generated text according to ROUGE-L:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_1.txt': (33.5639, {'rouge1': 45.2693, 'rouge2': 33.5639, 'rougeL': 39.4679, 'rougeLsum': 39.4814})}


In [55]:
# max() returns the first entry with the highest ROUGE-Lsum score without sorting the whole list
print('Best generated text according to ROUGE-Lsum: ', max(bart_all_results, key=_sorting_fun_rougeLsum))

Best generated text according to ROUGE-Lsum:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/BART_base/test_generations_beam_1.txt': (33.5639, {'rouge1': 45.2693, 'rouge2': 33.5639, 'rougeL': 39.4679, 'rougeLsum': 39.4814})}
