In [46]:
import os
import csv
import pandas as pd
import seaborn as sns
import simpledorff
from collections import defaultdict, Counter
from pprint import pprint

"""COLUMNS
['docid',
 'model',
 'Background',
 'Target Summary',
 'Generated Summary',
 'Is the generated summary fluent?',
 'Is the *population* in the generated summary the same as the population in '
 'the target summary?',
 'Is the *intervention* in the generated summary the same as the intervention '
 'in the target summary?',
 'Is the *outcome* in the generated summary the same as the outcome in the '
 'target summary?',
 'Comments about PIO agreement (optional)',
 'What is the effect direction in the *target* summary?',
 'What is the effect direction in the *generated* summary?',
 'Comments about effect directions (optional)',
 'What is the strength of the claim made in the *target* summary?',
 'What is the strength of the claim made in the *generated* summary?',
 'Comments about strength of claim (optional)',
 '',
 'annotator']
"""

BASE_DIR = '../data/'
DATA_FILES = os.listdir(BASE_DIR)
ANNOTATOR_ORDER = ['Bailey', 'Erin', 'Madeleine']

ANNOT_KEYS = [
 'Is the generated summary fluent?',
 'Is the *population* in the generated summary the same as the population in '
 'the target summary?',
 'Is the *intervention* in the generated summary the same as the intervention '
 'in the target summary?',
 'Is the *outcome* in the generated summary the same as the outcome in the '
 'target summary?',
 'What is the effect direction in the *target* summary?',
 'What is the effect direction in the *generated* summary?',
 'What is the strength of the claim made in the *target* summary?',
 'What is the strength of the claim made in the *generated* summary?',
]

ANSWER_KEYS = {
    'Is the generated summary fluent?': {
        2: ['2: Yes--there are no errors that impact comprehension of the summary'],
        1: ['1: Somewhat--there are some grammatical or lexical errors but I can understand the meaning'],
        0: ['0: No--there are major grammatical or lexical errors that impact comprehension']
    }, 
    'Is the *population* in the generated summary the same as the population in the target summary?': {
        0: ['0: No'], 
        1: ['1: Partially'], 
        2: ['2: Yes']
    },
    'Is the *intervention* in the generated summary the same as the intervention in the target summary?': {
        0: ['0: No'], 
        1: ['1: Partially'], 
        2: ['2: Yes']
    }, 
    'Is the *outcome* in the generated summary the same as the outcome in the target summary?': {
        0: ['0: No'], 
        1: ['1: Partially'], 
        2: ['2: Yes']
    },
    'What is the effect direction in the *target* summary?': {
        2: ['(+1): Positive effect'], 
        1: ['0: No effect'], 
        0: ['(-1): Negative effect']
    }, 
    'What is the effect direction in the *generated* summary?': {
        2: ['(+1): Positive effect'], 
        1: ['0: No effect'],
        0: ['(-1): Negative effect']    
    }, 
    'What is the strength of the claim made in the *target* summary?': {
        3: ['3: Strong claim'],
        2: ['2: Moderate claim'],
        1: ['1: Weak claim'],
        0: ['0: Not enough evidence (there is insufficient evidence to draw a conclusion)']
    }, 
    'What is the strength of the claim made in the *generated* summary?': {
        3: ['3: Strong claim'],
        2: ['2: Moderate claim'],
        1: ['1: Weak claim'],
        0: ['0: Not enough evidence (there is insufficient evidence to draw a conclusion)']
    }
}

REV_ANSWER_MAP = {
    question: {atext: num for num, atexts in answers.items() for atext in atexts} for question, answers in ANSWER_KEYS.items()
}


def df_to_experiment_annotator_table(df, experiment_col, annotator_col, class_col):
    return df.pivot_table(
        index=annotator_col, columns=experiment_col, values=class_col, aggfunc="first"
    )

In [4]:
# Make data frame

all_data = []
for fname in DATA_FILES:
    annotator = fname.split('-')[-1].strip()[:-4]
    print(fname)
    with open(os.path.join(BASE_DIR, fname), 'r') as f:
        reader = csv.DictReader(f, delimiter='\t', quotechar='"')
        for row in reader:
            if not row['Is the generated summary fluent?']:
                continue
            row['annotator'] = annotator
            row['docid_model'] = f"{row['docid']}_{row['model']}"
            all_data.append(row)

all_data.sort(key=lambda x: ANNOTATOR_ORDER.index(x['annotator']))

df = pd.DataFrame(all_data)

display(df)

Data for MSLR Annotation - 100 instances - Bailey.tsv
Data for MSLR Annotation - 100 instances - Erin.tsv
Data for MSLR Annotation - 100 instances - Madeleine.tsv


Unnamed: 0,docid,model,Background,Target Summary,Generated Summary,Is the generated summary fluent?,Is the *population* in the generated summary the same as the population in the target summary?,Is the *intervention* in the generated summary the same as the intervention in the target summary?,Is the *outcome* in the generated summary the same as the outcome in the target summary?,Comments about PIO agreement (optional),What is the effect direction in the *target* summary?,What is the effect direction in the *generated* summary?,Comments about effect directions (optional),What is the strength of the claim made in the *target* summary?,What is the strength of the claim made in the *generated* summary?,Comments about strength of claim (optional),Unnamed: 17,annotator,docid_model
0,31042097,longformer_predictions,Objectives Since the peptidyl arginine deimina...,The present findings indicate that RA is often...,Theibody titers of P. gingivalis are significa...,1: Somewhat--there are some grammatical or lex...,2: Yes,2: Yes,2: Yes,,(+1): Positive effect,(+1): Positive effect,,3: Strong claim,3: Strong claim,,,Bailey,31042097_longformer_predictions
1,1989168,longformer_predictions,PURPOSE To specify the effectiveness of head a...,"Concerning the treatment , surgical resection ...",Theirradiation of recurrent head and neck canc...,1: Somewhat--there are some grammatical or lex...,2: Yes,1: Partially,2: Yes,,(+1): Positive effect,(+1): Positive effect,,2: Moderate claim,2: Moderate claim,,,Bailey,1989168_longformer_predictions
2,1989168,bart_predictions,PURPOSE To specify the effectiveness of head a...,"Concerning the treatment , surgical resection ...",Theirradiation of recurrent head and neck canc...,1: Somewhat--there are some grammatical or lex...,2: Yes,2: Yes,1: Partially,,(+1): Positive effect,(+1): Positive effect,,2: Moderate claim,2: Moderate claim,,,Bailey,1989168_bart_predictions
3,78091788,bart_predictions,Background Denosumab is a human monoclonal ant...,The use of denosumab as an adjuvant treatment ...,Theosumab is an effective neoadjuvant therapy ...,1: Somewhat--there are some grammatical or lex...,1: Partially,2: Yes,2: Yes,,(+1): Positive effect,(+1): Positive effect,target summary has mixed effects but 2/3 are p...,1: Weak claim,2: Moderate claim,,,Bailey,78091788_bart_predictions
4,34774880,longformer_predictions,BACKGROUND Consensus statements recommend the ...,In steroid-naive patients with mild to moderat...,The is insufficient evidence to support the ad...,1: Somewhat--there are some grammatical or lex...,1: Partially,2: Yes,N/A: No outcome in generated summary,steroid-naive missing missing rate of exacerb...,0: No effect,N/A: no effect direction is specified in the g...,insufficient evidence to support?,2: Moderate claim,0: Not enough evidence (there is insufficient ...,does not significantly reduce = moderate?,,Bailey,34774880_longformer_predictions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,10220655,bart_predictions,Platinum agents such as cisplatin and carbopla...,"In conclusion , pCR rates increase significant...",Theplatin and carboplatin were associated with...,2: Yes--there are no errors that impact compre...,,,,,,,,,,,,Madeleine,10220655_bart_predictions
197,207941217,bart_predictions,A bioartificial endocrine pancreas is proposed...,Accumulating evidence shows that scaffold-base...,The present systematic review provides an over...,2: Yes--there are no errors that impact compre...,,,,,,,,,,,,Madeleine,207941217_bart_predictions
198,3272236,longformer_predictions,BACKGROUND Cytotoxic chemotherapy has a limite...,"OS was improved when doxorubicin , cisplatin a...",The is the first systematic review to evaluate...,1: Somewhat--there are some grammatical or lex...,,,,,,,,,,,,Madeleine,3272236_longformer_predictions
199,270616,bart_predictions,Introduction Approximately 11.1 million cancer...,A small to moderate positive effect of physica...,The systematic review and meta- analysis sugge...,2: Yes--there are no errors that impact compre...,,,,,,,,,,,,Madeleine,270616_bart_predictions


In [10]:
# Compute Krippendorff's alpha

alphas = dict()
krippendorff_tables = dict()
for question in ANNOT_KEYS:
    alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
        df,
        experiment_col='docid_model',
        annotator_col='annotator',
        class_col=question
    )
    alphas[question] = alpha
    table = df_to_experiment_annotator_table(df, 'docid_model', 'annotator', question)
    krippendorff_tables[question] = table
    

In [14]:
# Compute Agreement

by_docid = defaultdict(list)
for entry in all_data:
    by_docid[(entry['docid'], entry['model'])].append(entry)

# IAA
num_more_than_2 = 0
num_with_2 = 0
num_less_than_2 = 0
agreement_per_question = defaultdict(list)
split_by_model = {
    'bart_predictions': defaultdict(list),
    'longformer_predictions': defaultdict(list)
}
for (docid, model), entries in by_docid.items():
    for question in ANNOT_KEYS:
        answers = [entry[question] for entry in entries]
        if len(answers) > 2:
            num_more_than_2 += 1
            agreement_per_question[question].append(answers)
            split_by_model[model][question].append(answers)
            continue
        elif len(answers) == 2:
            num_with_2 += 1
            agreement_per_question[question].append(answers)
            split_by_model[model][question].append(answers)
        else:
            num_less_than_2 += 1
            continue
            
print(f'Number entries w/ more than 2 annotations: {num_more_than_2}')
print(f'Number entries w/ exactly 2 annotations: {num_with_2}')
print(f'Number entries w/ fewer than 2 annotations: {num_less_than_2}')

# agreement of first two annotations
agreements = dict()
ns = dict()
for question, answers in agreement_per_question.items():
    agrees = [a[0] == a[1] for a in answers]
    perc_agree = sum(agrees) / len(agrees)
    agreements[question] = perc_agree
    ns[question] = len(agrees)

Number entries w/ more than 2 annotations: 128
Number entries w/ exactly 2 annotations: 568
Number entries w/ fewer than 2 annotations: 88


In [15]:
# Manually compute alphas with only first two annotations

manual_alphas = dict()
for question, answers in agreement_per_question.items():
    entries = []
    for i, aaa in enumerate(answers):
        a0 = aaa[0]
        a1 = aaa[1]
        entries.append((i, a0, 'A'))
        entries.append((i, a1, 'B'))
    df = pd.DataFrame(entries, columns=['exp', 'class', 'ann'])
    alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
        df,
        experiment_col='exp',
        annotator_col='ann',
        class_col='class'
    )
    manual_alphas[question] = alpha

In [36]:
# IAA
qlen = 50
print('Question\t\t\t\t\t\tAlpha\tAgreement\tBest Annotators Alpha')
print('----------------------------------------------------------------------------------------------------')
for question in alphas:
    alpha = alphas[question]
    agreement = agreements[question]
    manual_alpha = manual_alphas[question]
    spaces = ''
    if len(question) < qlen:
        spaces = ' ' * (qlen - len(question) + 3 - 1)
    print(f'{question[:qlen]}...{spaces}\t{alpha:.3f}\t{agreement:.3f}\t\t{manual_alpha:.3f}')
    

Question						Alpha	Agreement	Best Annotators Alpha
----------------------------------------------------------------------------------------------------
Is the generated summary fluent?...                    	0.703	0.851		0.697
Is the *population* in the generated summary the s...	0.246	0.805		0.427
Is the *intervention* in the generated summary the...	0.264	0.782		0.449
Is the *outcome* in the generated summary the same...	0.173	0.563		0.270
What is the effect direction in the *target* summa...	0.357	0.759		0.544
What is the effect direction in the *generated* su...	0.597	0.885		0.807
What is the strength of the claim made in the *tar...	0.241	0.632		0.364
What is the strength of the claim made in the *gen...	0.544	0.828		0.720


In [111]:
# show results per model
results_by_model = defaultdict(dict)

for model, splits in split_by_model.items():
    for question, answers in splits.items():
        use_answers = [a[0] for a in answers]
        ans_count = Counter(use_answers)
        results_by_model[model][question] = ans_count

models = list(results_by_model.keys())

for question in ANNOT_KEYS:
    comparisons = [results_by_model[model][question] for model in models]
    qa_map = REV_ANSWER_MAP[question]
    print(question)
    print('-----------')
    print(f"Answer\tText\t\t\t\t{'        '.join(models)}")
    for text, num in qa_map.items():
        print(f'{num}\t', end='')
        if text.startswith('0: Not enough evidence'):
            disp_text = '0: Not enough evidence'
        else:
            disp_text = text
        print(disp_text.split('--')[0] + ' '*(32 - len(disp_text.split('--')[0])), end='')
        for m_ind, model in enumerate(models):
            total = sum(comparisons[m_ind].values())
            count = comparisons[m_ind].get(text, 0)
            perc = 100 * count / total
            print(f'{count} ({perc:.1f}%)\t\t', end='')
        print()
    print()
    

Is the generated summary fluent?
-----------
Answer	Text				bart_predictions        longformer_predictions
2	2: Yes                          21 (47.7%)		20 (46.5%)		
1	1: Somewhat                     23 (52.3%)		23 (53.5%)		
0	0: No                           0 (0.0%)		0 (0.0%)		

Is the *population* in the generated summary the same as the population in the target summary?
-----------
Answer	Text				bart_predictions        longformer_predictions
0	0: No                           0 (0.0%)		1 (2.3%)		
1	1: Partially                    11 (25.0%)		9 (20.9%)		
2	2: Yes                          33 (75.0%)		33 (76.7%)		

Is the *intervention* in the generated summary the same as the intervention in the target summary?
-----------
Answer	Text				bart_predictions        longformer_predictions
0	0: No                           0 (0.0%)		1 (2.3%)		
1	1: Partially                    11 (25.0%)		4 (9.3%)		
2	2: Yes                          32 (72.7%)		36 (83.7%)		

Is the *outcome* in the generate

In [138]:
columns = df.columns
values = df.values.tolist()
seen_ids = set([])
keep_first = []
for row in values:
    if (row[0], row[1]) not in seen_ids:
        keep_first.append(row)
        seen_ids.add((row[0], row[1]))


98 98


In [161]:
df = pd.DataFrame(keep_first, columns=columns)

df['p_okay'] = df['Is the *population* in the generated summary the same as the population in the target summary?'].isin(['2: Yes', '1: Partially']) 
df['i_okay'] = df['Is the *intervention* in the generated summary the same as the intervention in the target summary?'].isin(['2: Yes', '1: Partially']) 
df['o_okay'] = df['Is the *outcome* in the generated summary the same as the outcome in the target summary?'].isin(['2: Yes', '1: Partially']) 
df['pio_okay'] = df['p_okay'] & df['i_okay'] & df['o_okay']
df['effect_direction_same'] = df['What is the effect direction in the *target* summary?'] == df['What is the effect direction in the *generated* summary?']
# df['claim_strength_same'] = df['What is the strength of the claim made in the *target* summary?'] == df['What is the strength of the claim made in the *generated* summary?']

df_bart = df[df['model'] == 'bart_predictions']
df_led = df[df['model'] == 'longformer_predictions']

print('\t\tbart\t\tled')
for key in ['pio_okay', 'effect_direction_same']:
    print(f'{key}')
    vals_bart = df_bart[key].value_counts(normalize=True).mul(100).astype(str)
    vals_led = df_led[key].value_counts(normalize=True).mul(100).astype(str)
    print(f'True\t\t{float(vals_bart[True]):.2f}%\t\t{float(vals_led[True]):.2f}%')
    print(f'False\t\t{float(vals_bart[False]):.2f}%\t\t{float(vals_led[False]):.2f}%')
    print()




		bart		led
pio_okay
True		71.43%		67.35%
False		28.57%		32.65%

effect_direction_same
True		51.02%		40.82%
False		48.98%		59.18%

