In [None]:
# Install the third-party packages used by this notebook into the kernel's
# environment (`%pip` targets the running kernel, unlike `!pip`).
# NOTE(review): none of these installs is version-pinned, so a fresh run may
# pull releases different from the ones that produced the saved outputs.
%pip install datasets
# NOTE(review): py7zr is not imported in the cells shown here; presumably it
# is needed by `datasets` to unpack a 7z archive — confirm.
%pip install py7zr
%pip install tabulate
%pip install numpy

In [18]:
import json
import os

import numpy as np
from datasets import load_dataset
from tabulate import tabulate

# Load ACU Datasets

In [None]:
# Hugging Face access token, read from the environment so that no secret is
# stored in the notebook. When HF_TOKEN is unset this is None, i.e. no explicit
# token is passed to `load_dataset`.
# NOTE(review): recent `datasets` releases deprecate `use_auth_token` in favour
# of `token` — confirm against the installed version before renaming.
AUTH_TOKEN = os.environ.get("HF_TOKEN")

# Each configuration of "Salesforce/rose" is read through its "data" split.
cnndm_test = load_dataset("Salesforce/rose", "cnndm_test", use_auth_token=AUTH_TOKEN)["data"]
cnndm_val = load_dataset("Salesforce/rose", "cnndm_validation", use_auth_token=AUTH_TOKEN)["data"]
# xsum = load_dataset("Salesforce/rose", "xsum", use_auth_token=AUTH_TOKEN, download_mode="force_redownload")["data"]
samsum = load_dataset("Salesforce/rose", "samsum", use_auth_token=AUTH_TOKEN)["data"]
cnndm_protocol = load_dataset("Salesforce/rose", "cnndm_protocol", use_auth_token=AUTH_TOKEN)["data"]
cnndm_protocol_gpt3 = load_dataset("Salesforce/rose", "cnndm_protocol_gpt3", use_auth_token=AUTH_TOKEN)["data"]

# Human Annotations with ACU Protocol
## Check the dataset information of the ACU annotations on CNNDM

In [7]:
# Print the dataset's summary repr (feature names and number of rows).
print(cnndm_test)

Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 500
})


## Check an example of ACU annotations

In [10]:
# Pretty-print the first test example as indented JSON to inspect its fields.
first_example = cnndm_test[0]
first_example_json = json.dumps(first_example, indent=2)
print(first_example_json)

{
  "source": "West Ham wonderkid Reece Oxford pushed himself closer to a long-awaited first-team debut after scoring for the Under 21s against Manchester United. The 16-year-old, who has been on the bench for the senior side this season, scored with a header in the Hammers 3-2 defeat against the Red Devils on Tuesday night. The goal will push Oxford's claims for a senior debut before the end of the season. Reece Oxford, pictured here earlier in the season, has moved closer to a first-team debut for West Ham . Oxford scored for West Ham's under 21 team in their 3-2 defeat to Manchester United on Tuesday night . Oxford signed his first professional contract earlier this season, snubbing interest from all of the Premier League's big guns. The central defender, who has been likened to Rio Ferdinand, has tipped to be a future England international. Oxford regularly trains with the first team and is hopeful Sam Allardyce will give him his first team debut before the end of campaign.",
  "re

## Check the available systems

In [11]:
# The keys of the first example's "annotations" mapping are the system names.
test_annotations = cnndm_test[0]["annotations"]
print(test_annotations.keys())

dict_keys(['bart', 'gold', 'pegasus', 'brio', 'gsum', 'simcls', 'cliff', 'ctrlsum', 'frost', 'glob', 'matchsum', 'brio-ext'])


## Check the available annotations 

In [12]:
# List the annotation fields stored for one system ("brio") on the first example.
brio_test_annotation = cnndm_test[0]["annotations"]["brio"]
print(brio_test_annotation.keys())

dict_keys(['acu_labels', 'acu', 'normalized_acu'])


## Calculate the average ACU scores of each system

In [23]:
# Average the per-example "acu" value of every annotated system over the
# whole test set, then print the systems from highest to lowest mean.
systems = list(cnndm_test[0]["annotations"])
mean_acu_by_system = {
    name: np.mean([example["annotations"][name]["acu"] for example in cnndm_test]).item()
    for name in systems
}
system_scores = sorted(mean_acu_by_system.items(), key=lambda pair: pair[1], reverse=True)
print(tabulate(system_scores, headers=["System", "ACU Score"]))

System      ACU Score
--------  -----------
ctrlsum      0.445805
gsum         0.444687
brio         0.440345
matchsum     0.425005
brio-ext     0.417233
simcls       0.404669
bart         0.38832
cliff        0.385072
frost        0.384386
gold         0.381019
pegasus      0.375578
glob         0.364016


## Calculate the average *normalized* ACU scores of each system

In [24]:
# Average the per-example "normalized_acu" value of every annotated system
# over the whole test set, then print the systems from highest to lowest mean.
systems = list(cnndm_test[0]["annotations"].keys())
system_scores = [
    (system, np.mean([x["annotations"][system]["normalized_acu"] for x in cnndm_test]).item())
    for system in systems
]
system_scores = sorted(system_scores, key=lambda x: x[1], reverse=True)
# Label the column with the metric that is actually averaged here, so this
# table cannot be mistaken for the un-normalized ACU table.
print(tabulate(system_scores, headers=["System", "Normalized ACU Score"]))

System      ACU Score
--------  -----------
brio         0.371826
ctrlsum      0.36131
simcls       0.359992
gsum         0.348765
glob         0.340702
gold         0.33799
frost        0.336847
matchsum     0.336829
brio-ext     0.335767
cliff        0.329551
bart         0.323364
pegasus      0.319942


# Human Annotation with Four Different Protocols
## Check available systems

In [26]:
# The keys of the first example's "annotations" mapping are the system names.
protocol_annotations = cnndm_protocol[0]["annotations"]
print(protocol_annotations.keys())

dict_keys(['bart', 'gold', 'pegasus', 'brio', 'gsum', 'simcls', 'cliff', 'ctrlsum', 'frost', 'glob', 'matchsum', 'brio-ext'])


## Check available protocols

In [27]:
# List the annotation fields stored for one system ("brio") on the first
# example of the multi-protocol dataset.
brio_protocol_annotation = cnndm_protocol[0]["annotations"]["brio"]
print(brio_protocol_annotation.keys())

dict_keys(['prior', 'ref_based', 'ref_free', 'acu_labels', 'acu', 'normalized_acu'])


## Check GPT-3 performance under different protocols

In [33]:
# Mean score of the "gpt3" system under each protocol, shown as a one-row table.
protocols = ["prior", "ref_based", "ref_free", "acu"]
gpt3_annotations = [example["annotations"]["gpt3"] for example in cnndm_protocol_gpt3]
result = [np.mean([scores[p] for scores in gpt3_annotations]).item() for p in protocols]
print(tabulate([result], headers=protocols))

[3.723333364725113, 2.74000000834465, 3.7633333563804627, 0.26899106994271277]
  prior    ref_based    ref_free       acu
-------  -----------  ----------  --------
3.72333         2.74     3.76333  0.268991


## Calculate *LLM* performance under different protocols (with baseline fine-tuned models)

In [35]:
# Build one table row per system: the system name followed by its mean score
# under each protocol, computed over the GPT-3 protocol dataset.
protocols = ["prior", "ref_based", "ref_free", "acu"]
systems = ["gpt3", "brio", "t0", "bart"]
result = []
for s in systems:
    protocol_means = [
        np.mean([x["annotations"][s][p] for x in cnndm_protocol_gpt3]).item()
        for p in protocols
    ]
    result.append([s] + protocol_means)
print(tabulate(result, headers=["System"] + protocols))

System      prior    ref_based    ref_free       acu
--------  -------  -----------  ----------  --------
gpt3      3.72333      2.74        3.76333  0.268991
brio      3.51333      3.07333     3.49     0.429045
t0        3.33333      2.84333     3.24333  0.294712
bart      3.58333      2.92667     3.52333  0.36711


## Calculate *fine-tuned* system performance under different protocols

In [36]:
# Build one table row per annotated system: the system name followed by its
# mean score under each protocol, computed over the multi-protocol dataset.
protocols = ["prior", "ref_based", "ref_free", "acu"]
systems = list(cnndm_protocol[0]["annotations"])


def protocol_means(system_name):
    """Return the mean score of `system_name` under each entry of `protocols`."""
    return [
        np.mean([example["annotations"][system_name][protocol] for example in cnndm_protocol]).item()
        for protocol in protocols
    ]


result = [[name] + protocol_means(name) for name in systems]
print(tabulate(result, headers=["System"] + protocols))

System      prior    ref_based    ref_free       acu
--------  -------  -----------  ----------  --------
bart         3.12         2.47        3.4   0.36711
gold         2.74         2.52        2.98  0.370999
pegasus      2.74         2.44        2.99  0.378522
brio         3.09         2.52        3.29  0.429045
gsum         3.3          2.54        3.53  0.432917
simcls       2.93         2.53        3.18  0.397583
cliff        2.92         2.49        3.3   0.393997
ctrlsum      3.3          2.56        3.26  0.451103
frost        2.79         2.58        2.96  0.404047
glob         2.79         2.55        2.91  0.349981
matchsum     3.05         2.36        3.38  0.396766
brio-ext     3.1          2.38        3.31  0.370685
