In [11]:
import os
import csv
import pandas as pd
import seaborn as sns
import simpledorff
from sklearn.metrics import cohen_kappa_score
from collections import defaultdict, Counter
from pprint import pprint
import numpy as np

"""COLUMNS
['ExpID',
 'ReviewID',
 'Background',
 'Target Summary',
 'Generated Summary',
 'Is the generated summary fluent?',
 'Is the *population* in the generated summary the same as the population in '
 'the target summary?',
 'Is the *intervention* in the generated summary the same as the intervention '
 'in the target summary?',
 'Is the *outcome* in the generated summary the same as the outcome in the '
 'target summary?',
 'Comments about PIO agreement (optional)',
 'What is the effect direction in the *target* summary?',
 'What is the effect direction in the *generated* summary?',
 'Comments about effect directions (optional)',
 'What is the strength of the claim made in the *target* summary?',
 'What is the strength of the claim made in the *generated* summary?',
 'Comments about strength of claim (optional)',
 '',
 'annotator']
"""

BASE_DIR = '../data/Annotations/'
DATA_FILES = os.listdir(BASE_DIR)
ANNOTATOR_ORDER = ['Bailey', 'Erin']

ANNOT_KEYS = [
 'Is the generated summary fluent?',
 'Is the *population* in the generated summary the same as the population in '
 'the target summary?',
 'Is the *intervention* in the generated summary the same as the intervention '
 'in the target summary?',
 'Is the *outcome* in the generated summary the same as the outcome in the '
 'target summary?',
 'What is the effect direction in the *target* summary?',
 'What is the effect direction in the *generated* summary?',
 'What is the strength of the claim made in the *target* summary?',
 'What is the strength of the claim made in the *generated* summary?',
]

ANNOT_ABBREV = {
 'Is the generated summary fluent?': 'Fluency',
 'Is the *population* in the generated summary the same as the population in '
 'the target summary?': 'Population',
 'Is the *intervention* in the generated summary the same as the intervention '
 'in the target summary?': 'Intervention',
 'Is the *outcome* in the generated summary the same as the outcome in the '
 'target summary?': 'Outcome',
 'What is the effect direction in the *target* summary?': 'Direction-target',
 'What is the effect direction in the *generated* summary?': 'Direction-generated',
 'What is the strength of the claim made in the *target* summary?': 'Strength-target',
 'What is the strength of the claim made in the *generated* summary?': 'Strength-generated',
}

ANSWER_KEYS = {
    'Is the generated summary fluent?': {
        2: ['2: Yes--there are no errors that impact comprehension of the summary'],
        1: ['1: Somewhat--there are some grammatical or lexical errors but I can understand the meaning'],
        0: ['0: No--there are major grammatical or lexical errors that impact comprehension']
    }, 
    'Is the *population* in the generated summary the same as the population in the target summary?': {
        0: ['0: No'], 
        1: ['1: Partially'], 
        2: ['2: Yes']
    },
    'Is the *intervention* in the generated summary the same as the intervention in the target summary?': {
        0: ['0: No'], 
        1: ['1: Partially'], 
        2: ['2: Yes']
    }, 
    'Is the *outcome* in the generated summary the same as the outcome in the target summary?': {
        0: ['0: No'], 
        1: ['1: Partially'], 
        2: ['2: Yes']
    },
    'What is the effect direction in the *target* summary?': {
        2: ['(+1): Positive effect'], 
        1: ['0: No effect'], 
        0: ['(-1): Negative effect']
    }, 
    'What is the effect direction in the *generated* summary?': {
        2: ['(+1): Positive effect'], 
        1: ['0: No effect'],
        0: ['(-1): Negative effect']    
    }, 
    'What is the strength of the claim made in the *target* summary?': {
        3: ['3: Strong claim'],
        2: ['2: Moderate claim'],
        1: ['1: Weak claim'],
        0: ['0: Not enough evidence (there is insufficient evidence to draw a conclusion)']
    }, 
    'What is the strength of the claim made in the *generated* summary?': {
        3: ['3: Strong claim'],
        2: ['2: Moderate claim'],
        1: ['1: Weak claim'],
        0: ['0: Not enough evidence (there is insufficient evidence to draw a conclusion)']
    }
}

REV_ANSWER_MAP = {
    question: {atext: num for num, atexts in answers.items() for atext in atexts} for question, answers in ANSWER_KEYS.items()
}


def df_to_experiment_annotator_table(df, experiment_col, annotator_col, class_col):
    return df.pivot_table(
        index=annotator_col, columns=experiment_col, values=class_col, aggfunc="first"
    )

In [12]:
# Make data frame

all_data = []
for fname in DATA_FILES:
    if not fname.endswith('tsv'):
        continue
    annotator = fname.split('-')[-1].strip()[:-4]
    print(fname)
    with open(os.path.join(BASE_DIR, fname), 'r') as f:
        reader = csv.DictReader(f, delimiter='\t', quotechar='"')
        for row in reader:
            if not row['Is the generated summary fluent?']:
                continue
            row['annotator'] = annotator
            row['docid_model'] = f"{row['ReviewID']}_{row['ExpID']}"
            row['ExpID_short'] = row['ExpID'][-6:]
            all_data.append(row)

all_data.sort(key=lambda x: ANNOTATOR_ORDER.index(x['annotator']))

df = pd.DataFrame(all_data)

display(df)

Data for MSLR Annotation - Cochrane Subtask - Erin.tsv
Data for MSLR Annotation - Cochrane Subtask - Bailey.tsv


Unnamed: 0,ExpID,ReviewID,Target Summary,Generated Summary,Is the generated summary fluent?,Is the *population* in the generated summary the same as the population in the target summary?,Is the *intervention* in the generated summary the same as the intervention in the target summary?,Is the *outcome* in the generated summary the same as the outcome in the target summary?,Comments about PIO agreement (optional),What is the effect direction in the *target* summary?,What is the effect direction in the *generated* summary?,Comments about effect directions (optional),What is the strength of the claim made in the *target* summary?,What is the strength of the claim made in the *generated* summary?,Comments about strength of claim (optional),annotator,docid_model,ExpID_short
0,01G8WPZRN2E3EHA2WENHVNCH8M,CD000024,"Since the last version of the review, neither ...",There is insufficient evidence to support the ...,2: Yes--there are no errors that impact compre...,2: Yes,1: Partially,1: Partially,Herapin not in target summary,(-1): Negative effect,N/A: no effect direction is specified in the t...,Data do not support,2: Moderate claim,0: Not enough evidence (there is insufficient ...,,Bailey,CD000024_01G8WPZRN2E3EHA2WENHVNCH8M,VNCH8M
1,01GA1HEQEJHQHEAQD8YX8FWF5T,CD000123,Lipid-lowering therapy is effective in reducin...,There is insufficient evidence to support the ...,2: Yes--there are no errors that impact compre...,2: Yes,2: Yes,1: Partially,,(+1): Positive effect,N/A: no effect direction is specified in the t...,,2: Moderate claim,0: Not enough evidence (there is insufficient ...,,Bailey,CD000123_01GA1HEQEJHQHEAQD8YX8FWF5T,8FWF5T
2,01G4NE2DDS5G6Q047M97PX7SGV,CD000123,Lipid-lowering therapy is effective in reducin...,There is insufficient evidence from randomised...,2: Yes--there are no errors that impact compre...,2: Yes,2: Yes,1: Partially,,(+1): Positive effect,N/A: no effect direction is specified in the t...,,2: Moderate claim,0: Not enough evidence (there is insufficient ...,,Bailey,CD000123_01G4NE2DDS5G6Q047M97PX7SGV,PX7SGV
3,01G9JE4STYHQ2136MCATAQ85CE,CD000123,Lipid-lowering therapy is effective in reducin...,Atorvastatin appears to be effective in reduci...,2: Yes--there are no errors that impact compre...,0: No,1: Partially,2: Yes,,(+1): Positive effect,(+1): Positive effect,,2: Moderate claim,1: Weak claim,,Bailey,CD000123_01G9JE4STYHQ2136MCATAQ85CE,AQ85CE
4,01G9RKHTAQVPR038VTDCJB6Z8F,CD000123,Lipid-lowering therapy is effective in reducin...,There is insufficient evidence to support the ...,2: Yes--there are no errors that impact compre...,0: No,1: Partially,1: Partially,,(+1): Positive effect,N/A: no effect direction is specified in the t...,,2: Moderate claim,0: Not enough evidence (there is insufficient ...,,Bailey,CD000123_01G9RKHTAQVPR038VTDCJB6Z8F,JB6Z8F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,01G8WPZRN2E3EHA2WENHVNCH8M,CD003917,No definitive conclusions can be made about th...,Dexrazoxane reduces cardiotoxicity in patients...,1: Somewhat--there are some grammatical or lex...,1: Partially,2: Yes,1: Partially,,Other / uncertain (please comment),(+1): Positive effect,multiple outcomes in target w/ different effec...,Other / uncertain (please comment),2: Moderate claim,multiple outcomes in target w/ different effec...,Erin,CD003917_01G8WPZRN2E3EHA2WENHVNCH8M,VNCH8M
633,01G9RKHTAQVPR038VTDCJB6Z8F,CD003917,No definitive conclusions can be made about th...,There is insufficient evidence to support the ...,2: Yes--there are no errors that impact compre...,1: Partially,2: Yes,1: Partially,,Other / uncertain (please comment),N/A: no effect direction is specified in the t...,multiple outcomes in target w/ different effec...,Other / uncertain (please comment),0: Not enough evidence (there is insufficient ...,multiple outcomes in target w/ different effec...,Erin,CD003917_01G9RKHTAQVPR038VTDCJB6Z8F,JB6Z8F
634,01GCRZERDX9XKMDWQ5GDSPNXTA,CD003937,There is insufficient evidence to support the ...,There is insufficient evidence to support or r...,2: Yes--there are no errors that impact compre...,0: No,1: Partially,N/A: No outcome in generated summary,,N/A: no effect direction is specified in the t...,N/A: no effect direction is specified in the t...,,0: Not enough evidence (there is insufficient ...,0: Not enough evidence (there is insufficient ...,,Erin,CD003937_01GCRZERDX9XKMDWQ5GDSPNXTA,SPNXTA
635,01GCRZERDX9XKMDWQ5GDSPNXTA,CD003948,There is weak evidence from one small trial th...,The results of this review do not support the ...,2: Yes--there are no errors that impact compre...,1: Partially,0: No,N/A: No outcome in generated summary,,(+1): Positive effect,0: No effect,,1: Weak claim,1: Weak claim,,Erin,CD003948_01GCRZERDX9XKMDWQ5GDSPNXTA,SPNXTA


In [18]:
# Compute Krippendorff's alpha
alphas = dict()
krippendorff_tables = dict()
for question in ANNOT_KEYS:
    alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
        df,
        experiment_col='docid_model',
        annotator_col='annotator',
        class_col=question
    )
    alphas[question] = alpha
    table = df_to_experiment_annotator_table(df, 'docid_model', 'annotator', question)
    krippendorff_tables[question] = table

for question, abbrev in ANNOT_ABBREV.items():
    print(f'{abbrev}: {alphas[question]:.3f}')

Fluency: 0.521
Population: 0.336
Intervention: 0.604
Outcome: 0.197
Direction-target: 0.850
Direction-generated: 0.786
Strength-target: 0.298
Strength-generated: 0.776


In [19]:
# Compute Agreement

by_docid = defaultdict(list)
for entry in all_data:
    by_docid[(entry['ReviewID'], entry['ExpID'])].append(entry)

# IAA
num_more_than_2 = 0
num_with_2 = 0
num_less_than_2 = 0
agreement_per_question = defaultdict(list)
split_by_model = defaultdict(lambda: defaultdict(list))
for (docid, model), entries in by_docid.items():
    for question in ANNOT_KEYS:
        answers = [entry[question] for entry in entries]
        if len(answers) > 2:
            num_more_than_2 += 1
            agreement_per_question[question].append(answers)
            split_by_model[model][question].append(answers)
            continue
        elif len(answers) == 2:
            num_with_2 += 1
            agreement_per_question[question].append(answers)
            split_by_model[model][question].append(answers)
        else:
            num_less_than_2 += 1
            continue
            
print(f'Number entries w/ more than 2 annotations: {num_more_than_2}')
print(f'Number entries w/ exactly 2 annotations: {num_with_2}')
print(f'Number entries w/ fewer than 2 annotations: {num_less_than_2}')

# agreement of first two annotations
agreements = dict()
ns = dict()
for question, answers in agreement_per_question.items():
    agrees = [a[0] == a[1] for a in answers]
    perc_agree = sum(agrees) / len(agrees)
    agreements[question] = perc_agree
    ns[question] = len(agrees)

Number entries w/ more than 2 annotations: 0
Number entries w/ exactly 2 annotations: 312
Number entries w/ fewer than 2 annotations: 4472


In [21]:
# Manually compute alphas with only first two annotations
manual_alphas = dict()
for question, answers in agreement_per_question.items():
    entries = []
    for i, aaa in enumerate(answers):
        a0 = aaa[0]
        a1 = aaa[1]
        entries.append((i, a0, 'A'))
        entries.append((i, a1, 'B'))
    df = pd.DataFrame(entries, columns=['exp', 'class', 'ann'])
    alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
        df,
        experiment_col='exp',
        annotator_col='ann',
        class_col='class'
    )
    manual_alphas[question] = alpha
    
for question, abbrev in ANNOT_ABBREV.items():
    print(f'{abbrev}: {manual_alphas[question]:.3f}')

Fluency: 0.521
Population: 0.336
Intervention: 0.604
Outcome: 0.197
Direction-target: 0.850
Direction-generated: 0.786
Strength-target: 0.298
Strength-generated: 0.776


In [26]:
# Cohen's kappas
kappas = dict()
for question, answers in agreement_per_question.items():
    ann1 = [aaa[0] for aaa in answers]
    ann2 = [aaa[1] for aaa in answers] 
    k = cohen_kappa_score(ann1, ann2)
    kappas[question] = k
    
for question, abbrev in ANNOT_ABBREV.items():
    print(f'{abbrev}: {kappas[question]:.3f}')

Fluency: 0.519
Population: 0.334
Intervention: 0.601
Outcome: 0.244
Direction-target: 0.848
Direction-generated: 0.784
Strength-target: 0.304
Strength-generated: 0.774


In [27]:
# IAA
qlen = 50
print('Question\t\t\t\t\t\tAlpha\tKappa\tAgreement\tBest Annotators Alpha')
print('----------------------------------------------------------------------------------------------------')
for question in alphas:
    alpha = alphas[question]
    kappa = kappas[question]
    agreement = agreements[question]
    manual_alpha = manual_alphas[question]
    spaces = ''
    if len(question) < qlen:
        spaces = ' ' * (qlen - len(question) + 3 - 1)
    print(f'{question[:qlen]}...{spaces}\t{alpha:.3f}\t{kappa:.3f}\t{agreement:.3f}\t\t{manual_alpha:.3f}')
    

Question						Alpha	Kappa	Agreement	Best Annotators Alpha
----------------------------------------------------------------------------------------------------
Is the generated summary fluent?...                    	0.521	0.519	0.872		0.521
Is the *population* in the generated summary the s...	0.336	0.334	0.564		0.336
Is the *intervention* in the generated summary the...	0.604	0.601	0.769		0.604
Is the *outcome* in the generated summary the same...	0.197	0.244	0.359		0.197
What is the effect direction in the *target* summa...	0.850	0.848	0.897		0.850
What is the effect direction in the *generated* su...	0.786	0.784	0.897		0.786
What is the strength of the claim made in the *tar...	0.298	0.304	0.538		0.298
What is the strength of the claim made in the *gen...	0.776	0.774	0.897		0.776


In [28]:
# show results per model
results_by_model = defaultdict(dict)

for model, splits in split_by_model.items():
    for question, answers in splits.items():
        use_answers = [a[0] for a in answers]
        ans_count = Counter(use_answers)
        results_by_model[model][question] = ans_count

models = list(results_by_model.keys())        
model_names = [model_name[-6:] for model_name in models]

for question in ANNOT_KEYS:
    comparisons = [results_by_model[model][question] for model in models]
    qa_map = REV_ANSWER_MAP[question]
    print(question)
    print('-----------')
    print(f"Answer\tText\t\t  {'          '.join(model_names)}")
    for text, num in qa_map.items():
        print(f'{num}  ', end='')
        if text.startswith('0: Not enough evidence'):
            disp_text = '0: Not enough evidence'
        else:
            disp_text = text
        print(disp_text.split('--')[0] + ' '*(23 - len(disp_text.split('--')[0])), end='')
        for m_ind, model in enumerate(models):
            total = sum(comparisons[m_ind].values())
            count = comparisons[m_ind].get(text, 0)
            perc = 100 * count / total
            print(f'{count} ({perc:.1f}%)\t', end='')
        print()
    print()
    

Is the generated summary fluent?
-----------
Answer	Text		  JB6Z8F          SPNXTA          PX7SGV          8FWF5T          AQ85CE          VNCH8M
2  2: Yes                 8 (100.0%)	4 (66.7%)	8 (100.0%)	4 (50.0%)	4 (100.0%)	4 (80.0%)	
1  1: Somewhat            0 (0.0%)	2 (33.3%)	0 (0.0%)	4 (50.0%)	0 (0.0%)	1 (20.0%)	
0  0: No                  0 (0.0%)	0 (0.0%)	0 (0.0%)	0 (0.0%)	0 (0.0%)	0 (0.0%)	

Is the *population* in the generated summary the same as the population in the target summary?
-----------
Answer	Text		  JB6Z8F          SPNXTA          PX7SGV          8FWF5T          AQ85CE          VNCH8M
0  0: No                  2 (25.0%)	0 (0.0%)	2 (25.0%)	3 (37.5%)	1 (25.0%)	1 (20.0%)	
1  1: Partially           4 (50.0%)	2 (33.3%)	3 (37.5%)	3 (37.5%)	2 (50.0%)	4 (80.0%)	
2  2: Yes                 1 (12.5%)	2 (33.3%)	2 (25.0%)	0 (0.0%)	1 (25.0%)	0 (0.0%)	

Is the *intervention* in the generated summary the same as the intervention in the target summary?
-----------
Answer	Text		  JB6

In [29]:
columns = df.columns
values = df.values.tolist()
seen_ids = set([])
keep_first = []
for row in values:
    if (row[0], row[1]) not in seen_ids:
        keep_first.append(row)
        seen_ids.add((row[0], row[1]))

In [31]:
df = pd.DataFrame(keep_first, columns=columns)
exp_ids = set(df['ExpID'])

df['p_okay'] = df['Is the *population* in the generated summary the same as the population in the target summary?'].isin(['2: Yes', '1: Partially']) 
df['i_okay'] = df['Is the *intervention* in the generated summary the same as the intervention in the target summary?'].isin(['2: Yes', '1: Partially']) 
df['o_okay'] = df['Is the *outcome* in the generated summary the same as the outcome in the target summary?'].isin(['2: Yes', '1: Partially']) 
df['pio_okay'] = df['p_okay'] & df['i_okay'] & df['o_okay']
df['effect_direction_same'] = df['What is the effect direction in the *target* summary?'] == df['What is the effect direction in the *generated* summary?']
df['claim_strength_same'] = df['What is the strength of the claim made in the *target* summary?'] == df['What is the strength of the claim made in the *generated* summary?']

for exp_id in exp_ids:
    df_exp = df[df['ExpID'] == exp_id]
    print(exp_id)
    scores = []
    for key in ['pio_okay', 'effect_direction_same', 'claim_strength_same']:
        vals = df_exp[key].value_counts(normalize=True).astype(str)
        print(f'{key}: {float(vals[True]):.3f}')
        scores.append(float(vals[True]))
    print(f'Mean: {np.mean(scores):.3f}')
    print()




Unnamed: 0,exp,class,ann
0,0,2: Yes--there are no errors that impact compre...,A
1,1,1: Somewhat--there are some grammatical or lex...,A
2,1,0: No--there are major grammatical or lexical ...,B
3,2,2: Yes--there are no errors that impact compre...,A
4,3,2: Yes--there are no errors that impact compre...,A
5,4,2: Yes--there are no errors that impact compre...,A
6,5,2: Yes--there are no errors that impact compre...,A
7,5,1: Somewhat--there are some grammatical or lex...,B
8,6,2: Yes--there are no errors that impact compre...,A
9,7,2: Yes--there are no errors that impact compre...,A


KeyError: 'ExpID'