Choose T5 model from checkpoints based on ROUGE-2 score.    

Find out which **beam size** (beam-search width: 1, 5 or 10) worked best.

In [1]:
import os

# use metrics from HF
import datasets

In [2]:
# list available metrics
# NOTE(review): newer `datasets` releases deprecate the metrics API in favour of
# the separate `evaluate` package - confirm against the installed version.

from datasets import list_metrics
metrics_list = list_metrics()

metrics_list

['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'comet',
 'coval',
 'f1',
 'gleu',
 'glue',
 'indic_glue',
 'meteor',
 'precision',
 'recall',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'squad',
 'squad_v2',
 'super_glue',
 'wer',
 'xnli']

In [3]:
# load rouge metric
# This module-level `metric` object is shared: the example cell and
# calc_rouge_t5 both feed it via add_batch() and read it via compute().
metric = datasets.load_metric('rouge')

In [4]:
# example: score one hand-written prediction against one reference
test1 = ['this is test and I believe it works']
test2 = ['this is test and I hope it works well enough']

# queue the prediction/reference pair, then compute the ROUGE score
metric.add_batch(references=test2, predictions=test1)
res = metric.compute()

# mid f-measure of every returned score, as a percentage rounded to 4 decimals
dict((name, round(score.mid.fmeasure * 100, 4)) for name, score in res.items())

{'rouge1': 77.7778, 'rouge2': 62.5, 'rougeL': 77.7778, 'rougeLsum': 77.7778}

In [5]:
def calc_rouge_t5(PATH_ORIGINAL, PATH_GENERATED, rouge_metric=None):
    """
    Score one file of generated texts against the reference file with ROUGE.

    PATH_ORIGINAL - path to the target (reference) test texts, one per line
    PATH_GENERATED - path to the generated sections, one per line
    rouge_metric - optional metric object exposing add_batch(), __len__() and
                   compute(); when omitted, the notebook-level `metric` is used

    Returns {PATH_GENERATED: (rouge2, scores)}, where `scores` maps every key
    returned by compute() to its mid f-measure * 100 rounded to 4 decimals and
    `rouge2` is scores['rouge2'].

    Raises AssertionError when the two files have a different number of lines
    or when the metric cache does not hold exactly the pairs just added.
    """
    scorer = metric if rouge_metric is None else rouge_metric

    # read references (explicit encoding: the default depends on the locale)
    with open(PATH_ORIGINAL, encoding='utf-8') as f:
        gold_references = [line.strip() for line in f]

    # read candidates
    with open(PATH_GENERATED, encoding='utf-8') as f:
        current_candidates = [line.strip() for line in f]

    # check same length
    assert len(current_candidates) == len(gold_references), (
        f"{PATH_GENERATED} has {len(current_candidates)} lines but "
        f"{PATH_ORIGINAL} has {len(gold_references)}"
    )

    ### calculate ROUGE scores

    # add pairs of predictions/reference to a temporary and memory efficient cache table (HF)
    scorer.add_batch(predictions=current_candidates, references=gold_references)

    # the metric's length is its number of cached examples; a mismatch means
    # pairs left over from an earlier, un-computed add_batch() would be mixed in
    assert len(scorer) == len(current_candidates), (
        f"metric cache holds {len(scorer)} examples, expected {len(current_candidates)}"
    )

    # gathers all the cached predictions and references to compute the metric score
    final_score = scorer.compute()

    final_rouge_scores = {k: round(v.mid.fmeasure * 100, 4) for k, v in final_score.items()}

    # get only the rouge2
    rouge2 = final_rouge_scores['rouge2']

    return {PATH_GENERATED: (rouge2, final_rouge_scores)}

In [6]:
# Reference (target) file that every generated file below is scored against.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
PATH_ORIGINAL = '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.target'

In [7]:
# keep results of the different T5 models (checkpoints) and beam sizes;
# each entry appended below is the dict returned by calc_rouge_t5
t5_all_results = []

In [8]:
# T5 hp_1e_3_1 outputs final
# Score the beam-1, beam-10 and beam-5 generations (in that order), print each
# result and collect it in t5_all_results.
for PATH_GENERATED in (
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_1.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_10.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_5.txt',
):
    res = calc_rouge_t5(PATH_ORIGINAL, PATH_GENERATED)
    print(res)
    t5_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_1.txt': (35.8582, {'rouge1': 49.0801, 'rouge2': 35.8582, 'rougeL': 42.0133, 'rougeLsum': 42.0334})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_10.txt': (31.3907, {'rouge1': 46.609, 'rouge2': 31.3907, 'rougeL': 37.1173, 'rougeLsum': 37.1775})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_5.txt': (32.7242, {'rouge1': 47.8031, 'rouge2': 32.7242, 'rougeL': 38.6072, 'rougeLsum': 38.6208})}


ROUGE-score results when running ./run_eval:    

"Beam_size 1": {"rouge1": 49.9203, "rouge2": 36.5194, "rougeL": 42.626, "rougeLsum": 48.9432, "n_obs": 742, "runtime": 344, "seconds_per_sample": 0.4636}   

"Beam_size 10": {"rouge1": 47.4838, "rouge2": 31.7817, "rougeL": 37.6113, "rougeLsum": 46.0008, "n_obs": 742, "runtime": 1650, "seconds_per_sample": 2.2237}   

"Beam_size 5": {"rouge1": 48.6738, "rouge2": 33.2454, "rougeL": 39.1518, "rougeLsum": 47.3546, "n_obs": 742, "runtime": 844, "seconds_per_sample": 1.1375}    


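**Note**: The `run_eval` results differ from the ones computed above: ROUGE-1/2/L are roughly 0.4–0.9 points higher, and `rougeLsum` is about 7–9 points higher (e.g. 48.94 vs. 42.03 for beam size 1). The cause has not been investigated here.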

In [10]:
# T5 hp_1e_3_1 outputs checkpoint-2000
# Score the beam-1, beam-10 and beam-5 generations (in that order), print each
# result and collect it in t5_all_results.
for PATH_GENERATED in (
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/checkpoint-2000/test_generations_beam_1.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/checkpoint-2000/test_generations_beam_10.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/checkpoint-2000/test_generations_beam_5.txt',
):
    res = calc_rouge_t5(PATH_ORIGINAL, PATH_GENERATED)
    print(res)
    t5_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/checkpoint-2000/test_generations_beam_1.txt': (34.5407, {'rouge1': 48.5086, 'rouge2': 34.5407, 'rougeL': 40.9011, 'rougeLsum': 40.9303})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/checkpoint-2000/test_generations_beam_10.txt': (31.6091, {'rouge1': 46.9026, 'rouge2': 31.6091, 'rougeL': 37.3589, 'rougeLsum': 37.3751})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/checkpoint-2000/test_generations_beam_5.txt': (32.3363, {'rouge1': 47.6245, 'rouge2': 32.3363, 'rougeL': 38.2515, 'rougeLsum': 38.2648})}


In [12]:
# T5 hp_5e_5_1 outputs final
# Score the beam-1, beam-10 and beam-5 generations (in that order), print each
# result and collect it in t5_all_results.
for PATH_GENERATED in (
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/test_generations_beam_1.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/test_generations_beam_10.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/test_generations_beam_5.txt',
):
    res = calc_rouge_t5(PATH_ORIGINAL, PATH_GENERATED)
    print(res)
    t5_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/test_generations_beam_1.txt': (29.8581, {'rouge1': 46.0089, 'rouge2': 29.8581, 'rougeL': 36.7163, 'rougeLsum': 36.6954})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/test_generations_beam_10.txt': (29.7606, {'rouge1': 46.4836, 'rouge2': 29.7606, 'rougeL': 35.8134, 'rougeLsum': 35.8189})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/test_generations_beam_5.txt': (29.9955, {'rouge1': 47.0268, 'rouge2': 29.9955, 'rougeL': 36.1874, 'rougeLsum': 36.1776})}


In [14]:
# T5 hp_5e_5_1 outputs checkpoint-1000
# Score the beam-1, beam-10 and beam-5 generations (in that order), print each
# result and collect it in t5_all_results.
for PATH_GENERATED in (
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/checkpoint-1000/test_generations_beam_1.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/checkpoint-1000/test_generations_beam_10.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/checkpoint-1000/test_generations_beam_5.txt',
):
    res = calc_rouge_t5(PATH_ORIGINAL, PATH_GENERATED)
    print(res)
    t5_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/checkpoint-1000/test_generations_beam_1.txt': (28.4462, {'rouge1': 44.7807, 'rouge2': 28.4462, 'rougeL': 35.5207, 'rougeLsum': 35.5078})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/checkpoint-1000/test_generations_beam_10.txt': (29.4525, {'rouge1': 46.0201, 'rouge2': 29.4525, 'rougeL': 35.3518, 'rougeLsum': 35.3531})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_5e_5_1/checkpoint-1000/test_generations_beam_5.txt': (29.6051, {'rouge1': 46.3869, 'rouge2': 29.6051, 'rougeL': 35.7923, 'rougeLsum': 35.7673})}


In [15]:
# T5 hp_1e_5_1_1 outputs final
# Score the beam-1, beam-10 and beam-5 generations (in that order), print each
# result and collect it in t5_all_results.
for PATH_GENERATED in (
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_5_1_1/test_generations_beam_1.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_5_1_1/test_generations_beam_10.txt',
    '/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_5_1_1/test_generations_beam_5.txt',
):
    res = calc_rouge_t5(PATH_ORIGINAL, PATH_GENERATED)
    print(res)
    t5_all_results.append(res)

{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_5_1_1/test_generations_beam_1.txt': (27.5561, {'rouge1': 44.2003, 'rouge2': 27.5561, 'rougeL': 34.3786, 'rougeLsum': 34.3755})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_5_1_1/test_generations_beam_10.txt': (29.1512, {'rouge1': 45.8374, 'rouge2': 29.1512, 'rougeL': 35.0529, 'rougeLsum': 35.0477})}
{'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_5_1_1/test_generations_beam_5.txt': (29.1522, {'rouge1': 46.0351, 'rouge2': 29.1522, 'rougeL': 35.2759, 'rougeLsum': 35.2766})}


In [16]:
# sanity check: the 5 scoring cells above each append 3 results (one per beam size)
len(t5_all_results)

15

In [17]:
def _sorting_fun_rouge1(result):
    for key in result:
        return result[key][1]['rouge1']

def _sorting_fun_rouge2(result):
    for key in result:
        return result[key][1]['rouge2']
    
def _sorting_fun_rougeL(result):
    for key in result:
        return result[key][1]['rougeL']

def _sorting_fun_rougeLsum(result):
    for key in result:
        return result[key][1]['rougeLsum']

In [18]:
# Pick the top entry directly instead of sorting the whole list; max() returns the
# first maximal entry, the same one sorted(..., reverse=True)[0] selects on ties.
print('Best generated text according to ROUGE-1: ', max(t5_all_results, key=_sorting_fun_rouge1))

Best generated text according to ROUGE-1:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_1.txt': (35.8582, {'rouge1': 49.0801, 'rouge2': 35.8582, 'rougeL': 42.0133, 'rougeLsum': 42.0334})}


In [19]:
# Pick the top entry directly instead of sorting the whole list; max() returns the
# first maximal entry, the same one sorted(..., reverse=True)[0] selects on ties.
print('Best generated text according to ROUGE-2: ', max(t5_all_results, key=_sorting_fun_rouge2))

Best generated text according to ROUGE-2:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_1.txt': (35.8582, {'rouge1': 49.0801, 'rouge2': 35.8582, 'rougeL': 42.0133, 'rougeLsum': 42.0334})}


In [20]:
# Pick the top entry directly instead of sorting the whole list; max() returns the
# first maximal entry, the same one sorted(..., reverse=True)[0] selects on ties.
print('Best generated text according to ROUGE-L: ', max(t5_all_results, key=_sorting_fun_rougeL))

Best generated text according to ROUGE-L:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_1.txt': (35.8582, {'rouge1': 49.0801, 'rouge2': 35.8582, 'rougeL': 42.0133, 'rougeLsum': 42.0334})}


In [21]:
# Pick the top entry directly instead of sorting the whole list; max() returns the
# first maximal entry, the same one sorted(..., reverse=True)[0] selects on ties.
print('Best generated text according to ROUGE-Lsum: ', max(t5_all_results, key=_sorting_fun_rougeLsum))

Best generated text according to ROUGE-Lsum:  {'/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/hp_1e_3_1/test_generations_beam_1.txt': (35.8582, {'rouge1': 49.0801, 'rouge2': 35.8582, 'rougeL': 42.0133, 'rougeLsum': 42.0334})}
