In [218]:
# imports + loading data
# several CSVs are compressed for size constraints
# HEY YOU
# OUTPUT FILES ARE ZIPPED HERE:
# https://drive.google.com/file/d/13xA5W3DuuVVOOnorpcdQfagUk2yV-iRn/view?usp=sharing

import itertools
import json
import os

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch

import rrnlp

from collections import Counter

from transformers import AutoTokenizer, RobertaForSequenceClassification

assert torch.cuda.is_available()

DATAPATH = '../data/'

def load_array(array_column):
    def load(row_elem):
        if isinstance(row_elem, (list, tuple)):
            return row_elem
        else:
            res = row_elem.replace("['", '').replace("']", '').split('\n')
            return res
    return load

test_sets_converters = {x: load_array(x) for x in 
                        ['title', 'abstract', 
                         'significantly_decreased_of_input', 'no_significant_difference_of_input', 'significantly_increased_of_input', 'significant_effect_of_input']}
# there are other elements like this now!
joined_converters = {x:load_array(x) for x in 
    ['significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction', 'title', 'abstract', 'source', 'background',
       'significantly_decreased_of_input',
       'no_significant_difference_of_input',
       'significantly_increased_of_input', 'significant_effect_of_input']
}
       

In [2]:
# load test sets + decodings over them if available
test_sets_file = f'{DATAPATH}/test_set_inputs.csv'
test_sets_file_rr = f'{DATAPATH}/test_set_inputs_rr.csv'
test_sets_file_rr_ei = f'{DATAPATH}/test_set_inputs_rr_ei.csv'

if not os.path.exists(test_sets_file):
    # load MSLR input data, prepare a dataframe and a couple of convenience dictionaries
    cochrane_test = datasets.load_dataset('allenai/mslr2022', 'cochrane')['test'].to_pandas()
    cochrane_test['source'] = 'cochrane'
    ms2_test = datasets.load_dataset('allenai/mslr2022', 'ms2')['test'].to_pandas()
    ms2_test['source'] = 'ms2'
    test_sets = pd.concat([cochrane_test, ms2_test], ignore_index=True)
    test_sets['review_id'] = test_sets['review_id']
    del test_sets['target']
    # del test_sets['pmid']
    # del test_sets['title']
    # del test_sets['abstract']
    # del test_sets['background']

    print(test_sets.columns)
    test_sets.head()
print(test_sets['pmid'][0][0])
test_sets.head()



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Index(['review_id', 'pmid', 'title', 'abstract', 'source', 'background'], dtype='object')
11519502


Unnamed: 0,review_id,pmid,title,abstract,source,background
0,CD000220,"[11519502, 6845046]",[Failure of metronidazole to prevent preterm d...,[Infection with Trichomonas vaginalis during p...,cochrane,
1,CD008120,"[16259547, 18246327, 18007568, 16139813, 12503...",[Adjunctive risperidone in generalized anxiety...,[Although significant advances have been made ...,cochrane,
2,CD002968,"[9699091, 12351469, 12829654, 11891019, 145782...",[Effects of a short-term circuit weight traini...,[This study assessed the effects of short-term...,cochrane,
3,CD008472,"[20484064, 19995115, 20484066, 17846333]",[Remote ischemic preconditioning for cerebral ...,[Remote ischemic preconditioning (RIPC) is a p...,cochrane,
4,CD006373,"[16293396, 15567053, 15963687, 11566828, 10223...",[The effectiveness of a voice treatment approa...,[Teachers are considered the professional grou...,cochrane,


In [3]:
# Using RRNLP:
# - extract a punchline from each input abstract
# - extract an evidence inference signifance direction (+/0/-, also NSD/SD) from each input abstract
# - save these to a csv

trial_reader = rrnlp.TrialReader(tasks=['punchline_bot'])

classes = ["significantly_decreased", "no_significant_difference", "significantly_increased"]

rr_to_classes = {
    "↓ sig. decrease": "significantly_decreased",
    "— no diff": "no_significant_difference",
    "↑ sig. increase": "significantly_increased",
}
def compute_evidence_inference_scores_for_inputs_rr(row):
    titles = row['title']
    abstracts = row['abstract']
    
    ret = { kls + "_of_input_rr":[] for kls in classes }
    ret['punchlines'] = []
    for title, abstract in zip(titles, abstracts):
        ti_abs = {"ti": titles[0], 'ab': abstracts[0]}
        preds = trial_reader.read_trial(ti_abs, process_rcts_only=False, task_list=['punchline_bot'])
        punchline = preds['punchline_bot']['punchline_text']
        ret['punchlines'].append(punchline)
        effect = preds['punchline_bot']['effect']
        effect_probs = preds['punchline_bot']['effect_probs']
        for instance in effect_probs:
            for kls, prob in zip(classes, instance):
                ret[kls + '_of_input_rr'].append(prob)
    return ret

if os.path.exists(test_sets_file_rr):
    test_sets = pd.read_csv(test_sets_file, converters=test_sets_converters)
else:
    print('Computing evidence inference punchlines and scores over all input abstracts')
    test_sets_evidence_inference = test_sets.apply(
        compute_evidence_inference_scores_for_inputs_rr,
        result_type='expand',
        axis=1)
    test_sets = pd.concat([test_sets, test_sets_evidence_inference], axis=1)
    test_sets['significant_effect_of_input_rr'] = test_sets["significantly_decreased_of_input_rr"] + test_sets["significantly_increased_of_input_rr"]
    test_sets.to_csv(test_sets_file_rr)

review_id_to_abstract = dict(zip(test_sets['review_id'], test_sets['abstract']))
review_id_to_title = dict(zip(test_sets['review_id'], test_sets['title']))

test_sets.head()


Computing evidence inference punchlines and scores over all input abstracts


Unnamed: 0,review_id,pmid,title,abstract,source,background,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,punchlines,significant_effect_of_input_rr
0,CD000220,"[11519502, 6845046]",[Failure of metronidazole to prevent preterm d...,[Infection with Trichomonas vaginalis during p...,cochrane,,"[0.716418, 0.716418]","[0.010048083, 0.010048083]","[0.2735339, 0.2735339]",[Delivery occurred before 37 weeks of gestatio...,"[0.716418, 0.716418, 0.2735339, 0.2735339]"
1,CD008120,"[16259547, 18246327, 18007568, 16139813, 12503...",[Adjunctive risperidone in generalized anxiety...,[Although significant advances have been made ...,cochrane,,"[0.8884527, 0.8884527, 0.8884527, 0.8884527, 0...","[0.009669417, 0.009669417, 0.009669417, 0.0096...","[0.10187782, 0.10187782, 0.10187782, 0.1018778...",[Adjunctive risperidone was associated with st...,"[0.8884527, 0.8884527, 0.8884527, 0.8884527, 0..."
2,CD002968,"[9699091, 12351469, 12829654, 11891019, 145782...",[Effects of a short-term circuit weight traini...,[This study assessed the effects of short-term...,cochrane,,"[0.96397215, 0.96397215, 0.96397215, 0.9639721...","[0.013200386, 0.013200386, 0.013200386, 0.0132...","[0.022827525, 0.022827525, 0.022827525, 0.0228...",[Significant reductions from baseline values w...,"[0.96397215, 0.96397215, 0.96397215, 0.9639721..."
3,CD008472,"[20484064, 19995115, 20484066, 17846333]",[Remote ischemic preconditioning for cerebral ...,[Remote ischemic preconditioning (RIPC) is a p...,cochrane,,"[0.0017564379, 0.0017564379, 0.0017564379, 0.0...","[0.9969964, 0.9969964, 0.9969964, 0.9969964]","[0.00124719, 0.00124719, 0.00124719, 0.00124719]",[Although there were fewer saccadic latency de...,"[0.0017564379, 0.0017564379, 0.0017564379, 0.0..."
4,CD006373,"[16293396, 15567053, 15963687, 11566828, 10223...",[The effectiveness of a voice treatment approa...,[Teachers are considered the professional grou...,cochrane,,"[0.13579932, 0.13579932, 0.13579932, 0.1357993...","[0.0069637527, 0.0069637527, 0.0069637527, 0.0...","[0.857237, 0.857237, 0.857237, 0.857237, 0.857...",[The difference in voice care knowledge areas ...,"[0.13579932, 0.13579932, 0.13579932, 0.1357993..."


In [6]:
# using evidence inference
# - find an evidence inference signifance direction (+/0/-, also NSD/SD) from each input abstract
# - save these to a csv

# n.b. targets have to come later :frown:
evidence_inference_tokenizer = AutoTokenizer.from_pretrained('allenai/biomed_roberta_base')
evidence_inference_classifier = RobertaForSequenceClassification.from_pretrained(
    'allenai/biomed_roberta_base',
    num_labels=3,
)
evidence_inference_classifier = evidence_inference_classifier.cuda()
state_dict_ei = torch.load('../evidence_inference_models/unconditioned_evidence_classifier/unconditioned_evidence_classifier.pt')
state_dict_ei = {k.replace('bert.', ''):v for k,v in state_dict_ei.items()}
evidence_inference_classifier.load_state_dict(state_dict_ei)

# classes = ["significantly_decreased", "no_significant_difference", "significantly_increased"]
def compute_evidence_inference_scores_for_inputs(row):
    titles = row['title']
    abstracts = row['abstract']
    
    # definitely not how that was trained...
    title_abstracts = [x[0] + ' ' + x[1] for x in zip(titles, abstracts)]
    ret = { kls + "_of_input_ei":[] for kls in classes }
    for i in range(0, len(title_abstracts), 16):
        inputs = evidence_inference_tokenizer(title_abstracts[i:i+6], return_tensors='pt', padding=True, max_length=512, truncation=True)
        inputs = inputs.to(evidence_inference_classifier.device)
        with torch.no_grad():
            logits = evidence_inference_classifier(**inputs).logits
            instance_probs = torch.softmax(logits, axis=1).cpu().tolist()
            for instance in instance_probs:
                for kls, prob in zip(classes, instance):
                    ret[kls + '_of_input_ei'].append(prob)
    return ret

    
if os.path.exists(test_sets_file_rr_ei):
    test_sets = pd.read_csv(test_sets_file, converters=test_sets_converters)
else:
    test_sets_evidence_inference = test_sets.apply(
        compute_evidence_inference_scores_for_inputs,
        result_type='expand',
        axis=1)
    test_sets = pd.concat([test_sets, test_sets_evidence_inference], axis=1)
    test_sets['significant_effect_of_input_ei'] = test_sets["significantly_decreased_of_input_ei"] + test_sets["significantly_increased_of_input_ei"]
    test_sets.to_csv(test_sets_file_rr_ei)

review_id_to_abstract = dict(zip(test_sets['review_id'], test_sets['abstract']))
review_id_to_title = dict(zip(test_sets['review_id'], test_sets['title']))

test_sets.head()


Unnamed: 0,review_id,pmid,title,abstract,source,background,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,punchlines,significant_effect_of_input_rr,significantly_decreased_of_input_ei,no_significant_difference_of_input_ei,significantly_increased_of_input_ei,significant_effect_of_input_ei
0,CD000220,"[11519502, 6845046]",[Failure of metronidazole to prevent preterm d...,[Infection with Trichomonas vaginalis during p...,cochrane,,"[0.716418, 0.716418]","[0.010048083, 0.010048083]","[0.2735339, 0.2735339]",[Delivery occurred before 37 weeks of gestatio...,"[0.716418, 0.716418, 0.2735339, 0.2735339]","[0.1268150359392166, 0.17824169993400574]","[0.21311891078948975, 0.5389404296875]","[0.6600660681724548, 0.28281787037849426]","[0.1268150359392166, 0.17824169993400574, 0.66..."
1,CD008120,"[16259547, 18246327, 18007568, 16139813, 12503...",[Adjunctive risperidone in generalized anxiety...,[Although significant advances have been made ...,cochrane,,"[0.8884527, 0.8884527, 0.8884527, 0.8884527, 0...","[0.009669417, 0.009669417, 0.009669417, 0.0096...","[0.10187782, 0.10187782, 0.10187782, 0.1018778...",[Adjunctive risperidone was associated with st...,"[0.8884527, 0.8884527, 0.8884527, 0.8884527, 0...","[0.3804469704627991, 0.35312458872795105, 0.09...","[0.18082500994205475, 0.34517616033554077, 0.4...","[0.4387280344963074, 0.3016992211341858, 0.480...","[0.3804469704627991, 0.35312458872795105, 0.09..."
2,CD002968,"[9699091, 12351469, 12829654, 11891019, 145782...",[Effects of a short-term circuit weight traini...,[This study assessed the effects of short-term...,cochrane,,"[0.96397215, 0.96397215, 0.96397215, 0.9639721...","[0.013200386, 0.013200386, 0.013200386, 0.0132...","[0.022827525, 0.022827525, 0.022827525, 0.0228...",[Significant reductions from baseline values w...,"[0.96397215, 0.96397215, 0.96397215, 0.9639721...","[0.7270705103874207, 0.6144201159477234, 0.145...","[0.0399714931845665, 0.12528373301029205, 0.05...","[0.23295795917510986, 0.26029613614082336, 0.7...","[0.7270705103874207, 0.6144201159477234, 0.145..."
3,CD008472,"[20484064, 19995115, 20484066, 17846333]",[Remote ischemic preconditioning for cerebral ...,[Remote ischemic preconditioning (RIPC) is a p...,cochrane,,"[0.0017564379, 0.0017564379, 0.0017564379, 0.0...","[0.9969964, 0.9969964, 0.9969964, 0.9969964]","[0.00124719, 0.00124719, 0.00124719, 0.00124719]",[Although there were fewer saccadic latency de...,"[0.0017564379, 0.0017564379, 0.0017564379, 0.0...","[0.04978577420115471, 0.2443036437034607, 0.17...","[0.8936231136322021, 0.3732493817806244, 0.533...","[0.056591182947158813, 0.3824469745159149, 0.2...","[0.04978577420115471, 0.2443036437034607, 0.17..."
4,CD006373,"[16293396, 15567053, 15963687, 11566828, 10223...",[The effectiveness of a voice treatment approa...,[Teachers are considered the professional grou...,cochrane,,"[0.13579932, 0.13579932, 0.13579932, 0.1357993...","[0.0069637527, 0.0069637527, 0.0069637527, 0.0...","[0.857237, 0.857237, 0.857237, 0.857237, 0.857...",[The difference in voice care knowledge areas ...,"[0.13579932, 0.13579932, 0.13579932, 0.1357993...","[0.08196700364351273, 0.13510577380657196, 0.1...","[0.07671602070331573, 0.02303481660783291, 0.3...","[0.8413169384002686, 0.8418593406677246, 0.513...","[0.08196700364351273, 0.13510577380657196, 0.1..."


In [23]:
# 1. update the stored json files with the source data abstracts and titles
# 2. create a dataframe of the system productions and whatever outputs are already present
# 3. I should probably feel bad about co-mingling these two, but...I don't?

def rr_classifcation_scores_for_outputs(title, summary):
    ti_abs = {"ti": title, 'ab': summary}
    preds = trial_reader.read_trial(ti_abs, process_rcts_only=False, task_list=['punchline_bot'])
    preds['punchline_bot']['effect_probs'] = preds['punchline_bot']['effect_probs'].tolist()
    return preds


def evidence_inference_scores_for_outputs(summary):
    inputs = evidence_inference_tokenizer(summary, return_tensors='pt', padding=True, max_length=512, truncation=True)
    inputs = inputs.to(evidence_inference_classifier.device)
    with torch.no_grad():
        logits = evidence_inference_classifier(**inputs).logits
        instance_probs = torch.softmax(logits, axis=1).cpu().tolist()
    return instance_probs[0]

# this is a hack
# after this runs, we have the output with metrics
for datafile, new_datafile in [
    (f'{DATAPATH}/processed_data.json', f'{DATAPATH}/processed_data_w_inputs.json'),
    (f'{DATAPATH}/processed_data_w_metrics.json', f'{DATAPATH}/processed_data_w_metrics_w_inputs.json')]:
    system_productions = []
    system_productions_lines = []
    with open(datafile, 'r') as inf, open(new_datafile, 'w') as of:
        for line in inf:
            instance = json.loads(line)
            instance['abstract'] = review_id_to_abstract[instance['review_id']].tolist()
            instance['title'] = review_id_to_title[instance['review_id']].tolist()
            for prediction in instance['predictions']:
                # only compute if needed
                if 'evidence_inference_target' not in instance:
                    instance['evidence_inference_target'] = evidence_inference_scores_for_outputs(instance['target'])
                # only compute if needed
                if 'evidence_inference_target_rr' not in instance:
                    rr_predictions = rr_classifcation_scores_for_outputs('', instance['target'])
                    instance['evidence_inference_target_rr'] = rr_predictions['punchline_bot']['effect_probs']
                    instance['evidence_inference_target_punchline_rr'] = rr_predictions['punchline_bot']['punchline_text']

                prediction_dict = {
                    'review_id': instance['review_id'],
                    'subtask': instance['subtask'],
                    'target': instance['target'],
                    'prediction': prediction['prediction'],
#                     'evidence_inference_prediction': evidence_inference_prediction,
                    'exp_short': prediction['exp_short'],
                }
                
                # only compute if needed: 
                if 'no_significant_difference_prediction' not in prediction:
                    for kls, score in zip(classes, evidence_inference_scores_for_outputs(prediction['prediction'])):
                        prediction_dict[kls + '_prediction'] = score
                        prediction[kls + '_prediction'] = score
                else:
                    for kls in classes:
                        prediction_dict[kls + '_prediction'] = prediction[kls + '_prediction']
                
                if 'no_significant_difference_prediction_rr' not in prediction:
                    if len(prediction['prediction'].strip()) == 0:
                        for kls in classes:
                            prediction_dict[kls + '_prediction_rr'] = 0
                            prediction[kls + '_prediction_rr'] = 0
                    else:
                        rr_preds = rr_classifcation_scores_for_outputs('', prediction['prediction'])
                        for kls, score in zip(classes, rr_preds['punchline_bot']['effect_probs']):
                            prediction_dict[kls + '_prediction_rr'] = score
                            prediction[kls + '_prediction_rr'] = score
                else:
                    for kls in classes:
                        prediction_dict[kls + '_prediction_rr'] = prediction[kls + '_prediction_rr']
                
                # copy automatic scoring over
                if 'scores' in prediction:
                    for k, v in prediction['scores'].items():
                        assert k not in prediction_dict
                        prediction_dict[k] = v
                # TODO annotations?
                
                system_productions.append(prediction_dict)
            of.write(json.dumps(instance))
            of.write('\n')
    system_productions = pd.DataFrame(system_productions)
    system_productions['review_id'] = system_productions['review_id'].astype(str)
    
    os.replace(new_datafile, datafile)

print(system_productions.columns)
print(system_productions['review_id'][0])
system_productions.head()

Index(['review_id', 'subtask', 'target', 'prediction', 'exp_short',
       'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction',
       'significantly_decreased_prediction_rr', 'bertscore_p', 'bertscore_r',
       'bertscore_f', 'rouge1_p', 'rouge1_r', 'rouge1_f', 'rouge2_p',
       'rouge2_r', 'rouge2_f', 'rougeL_p', 'rougeL_r', 'rougeL_f',
       'rougeLsum_p', 'rougeLsum_r', 'rougeLsum_f', 'ei_score', 'claimver',
       'sts', 'nli', 'no_significant_difference_prediction_rr',
       'significantly_increased_prediction_rr'],
      dtype='object')
24894943


Unnamed: 0,review_id,subtask,target,prediction,exp_short,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,significantly_decreased_prediction_rr,bertscore_p,...,rougeL_f,rougeLsum_p,rougeLsum_r,rougeLsum_f,ei_score,claimver,sts,nli,no_significant_difference_prediction_rr,significantly_increased_prediction_rr
0,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,EFD8HX,0.053528,0.049263,0.897209,"[0.06685246527194977, 0.21206951141357422, 0.7...",0.816605,...,0.15625,0.171429,0.206897,0.1875,0.836226,0.476795,0.631644,0.804932,,
1,24894943,MS2,Significant OS differences were observed in ol...,INTRODUCTION / BACKGROUND An individual patien...,MG3N0D,0.25475,0.220718,0.524532,"[0.43473660945892334, 0.256255179643631, 0.309...",0.841731,...,0.133333,0.129032,0.137931,0.133333,0.078508,0.770375,0.605519,0.712774,,
2,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,W6E2CQ,0.189049,0.180032,0.630919,"[0.18661248683929443, 0.5224055051803589, 0.29...",0.817489,...,0.2,0.363636,0.137931,0.2,0.032762,0.721033,0.633378,0.739471,,
3,24894943,MS2,Significant OS differences were observed in ol...,backgroundthe aim of this study was to compare...,RDZ7K5,0.229608,0.35056,0.419832,"[0.17127756774425507, 0.7187755703926086, 0.10...",0.848861,...,0.090909,0.133333,0.068966,0.090909,0.036774,0.365915,0.397677,0.613397,,
4,24894943,MS2,Significant OS differences were observed in ol...,Conclusions : The results of this meta- analys...,9EKG14,0.029836,0.016749,0.953415,"[0.07271230220794678, 0.1009221002459526, 0.82...",0.854012,...,0.166667,0.16129,0.172414,0.166667,0.985187,0.230184,0.488337,0.693682,,


In [24]:
print(set(test_sets.columns) & set(system_productions.columns))
print(len(set(system_productions['review_id']) & set(test_sets['review_id'])))

df = system_productions.merge(right=test_sets, on='review_id', how='inner')
print(df.columns, len(df))
df.head()

{'review_id'}
2137
Index(['review_id', 'subtask', 'target', 'prediction', 'exp_short',
       'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction',
       'significantly_decreased_prediction_rr', 'bertscore_p', 'bertscore_r',
       'bertscore_f', 'rouge1_p', 'rouge1_r', 'rouge1_f', 'rouge2_p',
       'rouge2_r', 'rouge2_f', 'rougeL_p', 'rougeL_r', 'rougeL_f',
       'rougeLsum_p', 'rougeLsum_r', 'rougeLsum_f', 'ei_score', 'claimver',
       'sts', 'nli', 'no_significant_difference_prediction_rr',
       'significantly_increased_prediction_rr', 'pmid', 'title', 'abstract',
       'source', 'background', 'significantly_decreased_of_input_rr',
       'no_significant_difference_of_input_rr',
       'significantly_increased_of_input_rr', 'punchlines',
       'significant_effect_of_input_rr', 'significantly_decreased_of_input_ei',
       'no_significant_difference_of_input_ei',
       'significantly_increased_of_inp

Unnamed: 0,review_id,subtask,target,prediction,exp_short,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,significantly_decreased_prediction_rr,bertscore_p,...,background,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,punchlines,significant_effect_of_input_rr,significantly_decreased_of_input_ei,no_significant_difference_of_input_ei,significantly_increased_of_input_ei,significant_effect_of_input_ei
0,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,EFD8HX,0.053528,0.049263,0.897209,"[0.06685246527194977, 0.21206951141357422, 0.7...",0.816605,...,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
1,24894943,MS2,Significant OS differences were observed in ol...,INTRODUCTION / BACKGROUND An individual patien...,MG3N0D,0.25475,0.220718,0.524532,"[0.43473660945892334, 0.256255179643631, 0.309...",0.841731,...,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
2,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,W6E2CQ,0.189049,0.180032,0.630919,"[0.18661248683929443, 0.5224055051803589, 0.29...",0.817489,...,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
3,24894943,MS2,Significant OS differences were observed in ol...,backgroundthe aim of this study was to compare...,RDZ7K5,0.229608,0.35056,0.419832,"[0.17127756774425507, 0.7187755703926086, 0.10...",0.848861,...,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
4,24894943,MS2,Significant OS differences were observed in ol...,Conclusions : The results of this meta- analys...,9EKG14,0.029836,0.016749,0.953415,"[0.07271230220794678, 0.1009221002459526, 0.82...",0.854012,...,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."


In [None]:
# compute ROUGE of (input, prediction) in several variations:
# - (single abstract, prediction)
# - (single abstract punchline, prediction)
# - (title, prediction)
# - (background, prediction)

# note to self: robot reviewer should have the "using default tokenizer" set to warn Once or info Once
rouge = evaluate.load('rouge')

def input_copy_rouge1(row):
    # vs input abstracts
    ret = {
        'rouge1_vs_input_abstract': [],
        'rouge2_vs_input_abstract': [],
        'rougeL_vs_input_abstract': [],
        'rougeLsum_vs_input_abstract': [],
    }
    
    for inp in row['punchlines']:
        rouge_scores = rouge.compute(references=[inp], predictions=[row['prediction']])
        for k, v in rouge_scores.items():
            ret[k + '_vs_input_abstract'].append(v)

    ret = {
        **ret, 
        'rouge1_vs_input_punchlines': [],
        'rouge2_vs_input_punchlines': [],
        'rougeL_vs_input_punchlines': [],
        'rougeLsum_vs_input_punchlines': [],
    }        
    for inp in row['abstract']:
        rouge_scores = rouge.compute(references=[inp], predictions=[row['prediction']])
        for k, v in rouge_scores.items():
            ret[k + '_vs_input_punchlines'].append(v)
            
    # vs input titles
    ret = {
        **ret, 
        'rouge1_vs_input_title': [],
        'rouge2_vs_input_title': [],
        'rougeL_vs_input_title': [],
        'rougeLsum_vs_input_title': [],
    }
    
    for inp in row['title']:
        rouge_scores = rouge.compute(references=[inp], predictions=[row['prediction']])
        for k, v in rouge_scores.items():
            ret[k + '_vs_input_title'].append(v)
    
    # vs background
    if row['background'] == row['background'] and row['background'] is not None:
        background_rouge_scores = rouge.compute(references=[row['background']], predictions=[row['prediction']])
        for k, v in background_rouge_scores.items():
            ret[k + '_vs_background'] = v
    
    return ret

input_rouge_result_file = f'{DATAPATH}/processed_data_w_input_rouge_scores.csv'
if os.path.exists(input_rouge_result_file):
    df = pd.read_json(input_rouge_result_file, converters=joined_converters)
else:
    copying_eval = df.apply(input_copy_rouge1, result_type='expand', axis=1)
    df = pd.concat([df, copying_eval], axis=1)
    df.to_csv(input_rouge_result_file)

df.head()

In [53]:
# oops! we forgot to load some elements before, so rather than redo anything expensive, let's load them from the source
df_copy = df.copy()

updated_df_elements = []
with open(f'{DATAPATH}/processed_data_w_metrics.json', 'r') as inf, open(f'{DATAPATH}/processed_data_w_metrics_w_inputs.json', 'w') as of:
    for line in inf:
        contents = json.loads(line)
        for prediction in contents['predictions']:
            # fixing an oops!
            rr_result = prediction['significantly_decreased_prediction_rr']
            if rr_result != 0:
                assert isinstance(rr_result, list), (contents['review_id'] + str(prediction))
                update_elem = {
                    'review_id': contents['review_id'],
                    'exp_short': prediction['exp_short'],
                }
                for kls, score in zip(classes, rr_result):
                    prediction[kls + '_prediction_rr'] = score
                    update_elem[kls + '_prediction_rr'] = score
                updated_df_elements.append(update_elem)
            else:
                update_elem = {
                    'review_id': contents['review_id'],
                    'exp_short': prediction['exp_short'],
                }
                # no annotations done
                for kls, score in zip(classes, [0,0,0]):
                    update_elem[kls + '_prediction_rr'] = score
                updated_df_elements.append(update_elem)
        of.write(json.dumps(instance))
        of.write('\n')

update_df = pd.DataFrame(updated_df_elements)
assert (df_copy[['review_id', 'exp_short']] == update_df[['review_id', 'exp_short']]).all().all()
df_copy[['significantly_decreased_prediction_rr',
        'no_significant_difference_prediction_rr',
       'significantly_increased_prediction_rr',]] = update_df[['significantly_decreased_prediction_rr',
        'no_significant_difference_prediction_rr',
       'significantly_increased_prediction_rr',]]
os.replace(f'{DATAPATH}/processed_data_w_metrics_w_inputs.json', 'processed_data_w_metrics.json')

df_copy[['significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction',
       'significantly_decreased_prediction_rr',
        'no_significant_difference_prediction_rr',
       'significantly_increased_prediction_rr',
    'significantly_decreased_of_input_rr',
       'no_significant_difference_of_input_rr',
       'significantly_increased_of_input_rr',
    'significant_effect_of_input_rr', 'significantly_decreased_of_input_ei',
       'no_significant_difference_of_input_ei',
       'significantly_increased_of_input_ei', 'significant_effect_of_input_ei',
   ]]



Unnamed: 0,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,significantly_decreased_prediction_rr,no_significant_difference_prediction_rr,significantly_increased_prediction_rr,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,significant_effect_of_input_rr,significantly_decreased_of_input_ei,no_significant_difference_of_input_ei,significantly_increased_of_input_ei,significant_effect_of_input_ei
0,0.053528,0.049263,0.897209,0.066852,0.212070,0.721078,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
1,0.254750,0.220718,0.524532,0.434737,0.256255,0.309008,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
2,0.189049,0.180032,0.630919,0.186612,0.522406,0.290982,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
3,0.229608,0.350560,0.419832,0.171278,0.718776,0.109947,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
4,0.029836,0.016749,0.953415,0.072712,0.100922,0.826366,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17988,0.238323,0.274627,0.487050,0.180022,0.678657,0.141321,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241..."
17989,0.026364,0.901532,0.072104,0.005692,0.979640,0.014668,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241..."
17990,0.013497,0.964115,0.022388,0.009610,0.979698,0.010692,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241..."
17991,0.042273,0.825310,0.132418,0.009395,0.965898,0.024707,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241..."


In [54]:
df = df_copy
df.to_csv('large_results_df.csv')

In [63]:
pd.set_option('display.max_columns', None)
review_ids = Counter(df['review_id'])
view_df = pd.concat([df[:3], df[100:103]])
view_df.head()

Unnamed: 0,review_id,subtask,target,prediction,exp_short,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,significantly_decreased_prediction_rr,bertscore_p,bertscore_r,bertscore_f,rouge1_p,rouge1_r,rouge1_f,rouge2_p,rouge2_r,rouge2_f,rougeL_p,rougeL_r,rougeL_f,rougeLsum_p,rougeLsum_r,rougeLsum_f,ei_score,claimver,sts,nli,no_significant_difference_prediction_rr,significantly_increased_prediction_rr,pmid,title,abstract,source,background,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,punchlines,significant_effect_of_input_rr,significantly_decreased_of_input_ei,no_significant_difference_of_input_ei,significantly_increased_of_input_ei,significant_effect_of_input_ei,rouge1_vs_input_abstract,rouge2_vs_input_abstract,rougeL_vs_input_abstract,rougeLsum_vs_input_abstract,rouge1_vs_input_punchlines,rouge2_vs_input_punchlines,rougeL_vs_input_punchlines,rougeLsum_vs_input_punchlines,rouge1_vs_input_title,rouge2_vs_input_title,rougeL_vs_input_title,rougeLsum_vs_input_title,rouge1_vs_background,rouge2_vs_background,rougeL_vs_background,rougeLsum_vs_background,max_rouge1_vs_input,max_rouge1_vs_input_position,rouge1_vs_reference_is_higher,rouge1_vs_reference_delta
0,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,EFD8HX,0.053528,0.049263,0.897209,0.066852,0.816605,0.841818,0.82902,0.285714,0.344828,0.3125,0.029412,0.035714,0.032258,0.142857,0.172414,0.15625,0.171429,0.206897,0.1875,0.836226,0.476795,0.631644,0.804932,0.21207,0.721078,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.1263157894736842, 0.1263157894736842, 0.126...","[0.0, 0.0, 0.0, 0.0]","[0.0631578947368421, 0.0631578947368421, 0.063...","[0.0631578947368421, 0.0631578947368421, 0.063...","[0.08730158730158731, 0.09248554913294797, 0.1...","[0.02390438247011952, 0.017441860465116282, 0....","[0.05952380952380953, 0.06936416184971099, 0.0...","[0.05952380952380953, 0.06936416184971099, 0.0...","[0.17543859649122806, 0.0851063829787234, 0.04...","[0.07272727272727271, 0.0, 0.0, 0.039215686274...","[0.17543859649122806, 0.0851063829787234, 0.04...","[0.17543859649122806, 0.0851063829787234, 0.04...",0.242424,0.09375,0.151515,0.151515,0.126316,0,False,-0.186184
1,24894943,MS2,Significant OS differences were observed in ol...,INTRODUCTION / BACKGROUND An individual patien...,MG3N0D,0.25475,0.220718,0.524532,0.434737,0.841731,0.859274,0.850412,0.193548,0.206897,0.2,0.033333,0.035714,0.034483,0.129032,0.137931,0.133333,0.129032,0.137931,0.133333,0.078508,0.770375,0.605519,0.712774,0.256255,0.309008,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.08791208791208792, 0.08791208791208792, 0.0...","[0.02247191011235955, 0.02247191011235955, 0.0...","[0.08791208791208792, 0.08791208791208792, 0.0...","[0.08791208791208792, 0.08791208791208792, 0.0...","[0.064, 0.08187134502923976, 0.081743869209809...","[0.012048192771084336, 0.0058823529411764705, ...","[0.044000000000000004, 0.05263157894736842, 0....","[0.044000000000000004, 0.05263157894736842, 0....","[0.07547169811320754, 0.046511627906976744, 0....","[0.0, 0.0, 0.0, 0.0]","[0.03773584905660377, 0.046511627906976744, 0....","[0.03773584905660377, 0.046511627906976744, 0....",1.0,1.0,1.0,1.0,0.087912,0,False,-0.112088
2,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,W6E2CQ,0.189049,0.180032,0.630919,0.186612,0.817489,0.82791,0.822666,0.454545,0.172414,0.25,0.1,0.035714,0.052632,0.363636,0.137931,0.2,0.363636,0.137931,0.2,0.032762,0.721033,0.633378,0.739471,0.522406,0.290982,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]",[Progression-free survival ( PFS ) was better ...,"[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.08450704225352113, 0.08450704225352113, 0.0...","[0.0, 0.0, 0.0, 0.0]","[0.056338028169014086, 0.056338028169014086, 0...","[0.056338028169014086, 0.056338028169014086, 0...","[0.020833333333333332, 0.024844720496894408, 0...","[0.008368200836820083, 0.00625, 0.011594202898...","[0.020833333333333332, 0.024844720496894408, 0...","[0.020833333333333332, 0.024844720496894408, 0...","[0.060606060606060615, 0.08695652173913043, 0....","[0.0, 0.0, 0.0, 0.0]","[0.060606060606060615, 0.08695652173913043, 0....","[0.060606060606060615, 0.08695652173913043, 0....",0.238095,0.15,0.238095,0.238095,0.084507,0,False,-0.165493
100,19821336,MS2,"PCV is effective in preventing IPD , X-ray def...",The results of this review suggest that pneumo...,9EKG14,0.047323,0.887084,0.065593,0.001909,0.83781,0.818859,0.828226,0.138614,0.297872,0.189189,0.05,0.108696,0.068493,0.09901,0.212766,0.135135,0.128713,0.276596,0.175676,0.990367,0.592028,0.645171,0.744843,0.994096,0.003996,"[15702037, 15780443, 7095883, 1968589, 1073809...",[Immunogenicity and Tolerability of a Heptaval...,[Background : The recommended vaccination sche...,ms2,"BACKGROUND Pneumonia , caused by Streptococcus...","[0.14282429, 0.14282429, 0.14282429, 0.1428242...","[0.41341034, 0.41341034, 0.41341034, 0.4134103...","[0.44376537, 0.44376537, 0.44376537, 0.4437653...",[The results suggest that the implementation o...,"[0.14282429, 0.14282429, 0.14282429, 0.1428242...","[0.12145879119634628, 0.07524408400058746, 0.4...","[0.37345144152641296, 0.14628413319587708, 0.0...","[0.5050898194313049, 0.7784718871116638, 0.441...","[0.12145879119634628, 0.07524408400058746, 0.4...","[0.16949152542372883, 0.16949152542372883, 0.1...","[0.06896551724137932, 0.06896551724137932, 0.0...","[0.16949152542372883, 0.16949152542372883, 0.1...","[0.16949152542372883, 0.16949152542372883, 0.1...","[0.24936386768447838, 0.25790754257907544, 0.3...","[0.04603580562659847, 0.05378973105134474, 0.0...","[0.1475826972010178, 0.16058394160583941, 0.21...","[0.16284987277353688, 0.1751824817518248, 0.25...","[0.15126050420168066, 0.15126050420168066, 0.0...","[0.03418803418803419, 0.03418803418803419, 0.0...","[0.11764705882352942, 0.10084033613445377, 0.0...","[0.13445378151260506, 0.13445378151260506, 0.0...",0.267442,0.035294,0.151163,0.244186,0.169492,0,False,-0.019698
101,19821336,MS2,"PCV is effective in preventing IPD , X-ray def...",There-valent Pneumococcal conjugate vaccine is...,V0PMWV,0.141022,0.747162,0.111816,0.471431,0.849282,0.827403,0.8382,0.183333,0.234043,0.205607,0.050847,0.065217,0.057143,0.133333,0.170213,0.149533,0.166667,0.212766,0.186916,0.995243,0.66143,0.660025,0.772111,0.333464,0.195105,"[15702037, 15780443, 7095883, 1968589, 1073809...",[Immunogenicity and Tolerability of a Heptaval...,[Background : The recommended vaccination sche...,ms2,"BACKGROUND Pneumonia , caused by Streptococcus...","[0.14282429, 0.14282429, 0.14282429, 0.1428242...","[0.41341034, 0.41341034, 0.41341034, 0.4134103...","[0.44376537, 0.44376537, 0.44376537, 0.4437653...",[The results suggest that the implementation o...,"[0.14282429, 0.14282429, 0.14282429, 0.1428242...","[0.12145879119634628, 0.07524408400058746, 0.4...","[0.37345144152641296, 0.14628413319587708, 0.0...","[0.5050898194313049, 0.7784718871116638, 0.441...","[0.12145879119634628, 0.07524408400058746, 0.4...","[0.07792207792207792, 0.07792207792207792, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.07792207792207792, 0.07792207792207792, 0.0...","[0.07792207792207792, 0.07792207792207792, 0.0...","[0.15909090909090906, 0.1837837837837838, 0.24...","[0.04, 0.03804347826086957, 0.0662983425414364...","[0.11363636363636363, 0.11351351351351353, 0.1...","[0.11363636363636363, 0.11351351351351353, 0.1...","[0.23076923076923075, 0.10256410256410256, 0.0...","[0.13157894736842105, 0.05263157894736842, 0.0...","[0.20512820512820512, 0.10256410256410256, 0.0...","[0.20512820512820512, 0.10256410256410256, 0.0...",0.305344,0.046512,0.183206,0.244275,0.077922,0,False,-0.127685


In [None]:
# copying information vs. the target!
rouge = evaluate.load('rouge')

def input_target_copy_rouge1(row):
    # vs input abstracts
    ret = {
        'rouge1_target_vs_input_abstract': [],
        'rouge2_target_vs_input_abstract': [],
        'rougeL_target_vs_input_abstract': [],
        'rougeLsum_target_vs_input_abstract': [],
    }
    
    target = df[df['review_id'] == row['review_id']]['target'].tolist()
    assert len(set(target)) == 1
    target = target[0]
    for inp in row['punchlines']:
        rouge_scores = rouge.compute(references=[inp], predictions=[target])
        for k, v in rouge_scores.items():
            ret[k + '_target_vs_input_abstract'].append(v)

    ret = {
        **ret, 
        'rouge1_target_vs_input_punchlines': [],
        'rouge2_target_vs_input_punchlines': [],
        'rougeL_target_vs_input_punchlines': [],
        'rougeLsum_target_vs_input_punchlines': [],
    }        
    for inp in row['abstract']:
        rouge_scores = rouge.compute(references=[inp], predictions=[target])
        for k, v in rouge_scores.items():
            ret[k + '_target_vs_input_punchlines'].append(v)
            
    # vs input titles
    ret = {
        **ret, 
        'rouge1_target_vs_input_title': [],
        'rouge2_target_vs_input_title': [],
        'rougeL_target_vs_input_title': [],
        'rougeLsum_target_vs_input_title': [],
    }
    
    for inp in row['title']:
        rouge_scores = rouge.compute(references=[inp], predictions=[target])
        for k, v in rouge_scores.items():
            ret[k + '_target_vs_input_title'].append(v)
    
    # vs background
    if row['background'] == row['background'] and row['background'] is not None:
        background_rouge_scores = rouge.compute(references=[row['background']], predictions=[target])
        for k, v in background_rouge_scores.items():
            ret[k + '_target_vs_background'] = v
    
    return ret

updated_test_rows_with_additional_rouges = []
for _, row in test_sets.iterrows():
    updated_test_rows_with_additional_rouges.append(input_target_copy_rouge1(row))

print(len(updated_test_rows_with_additional_rouges))

In [210]:
print(len(updated_test_rows_with_additional_rouges))
test_scores = []
# classes = ["significantly_decreased", "no_significant_difference", "significantly_increased"]

with open(f'{DATAPATH}/processed_data_w_metrics.json', 'r') as inf:
    for row in inf:
        row = json.loads(row)
        evidence_inference_target_scores = row['evidence_inference_target']
        evidence_inference_target_rr_scores = row['evidence_inference_target_rr'][0]
        review_id = row['review_id']
        row_res = {
            'review_id': review_id,
        }
        for kls, score in zip(classes, evidence_inference_target_scores):
            row_res[kls + '_target_ei'] = score
        row_res['significant_difference_target_ei'] = row_res["significantly_decreased_target_ei"] + row_res["significantly_increased_target_ei"]
        for kls, score in zip(classes, evidence_inference_target_rr_scores):
            row_res[kls + '_target_rr'] = score
        row_res['significant_difference_target_rr'] = row_res["significantly_decreased_target_rr"] + row_res["significantly_increased_target_rr"]
        test_scores.append(row_res)
test_scores_df = pd.DataFrame(test_scores)
        
test_analysis_df = pd.concat([test_sets, pd.DataFrame(updated_test_rows_with_additional_rouges)], axis=1)
test_analysis_df = test_analysis_df.merge(test_scores_df, on='review_id')

test_analysis_df['prediction_kls_of_target_ei'] = [x.replace('_target_ei', '') for x in test_analysis_df[['significantly_decreased_target_ei', 'no_significant_difference_target_ei', 'significantly_increased_target_ei']].idxmax(axis=1)]
test_analysis_df['prediction_kls_of_target_rr'] = [x.replace('_target_rr', '') for x in test_analysis_df[['significantly_decreased_target_rr', 'no_significant_difference_target_rr', 'significantly_increased_target_rr']].idxmax(axis=1)]

input_classifications = []
for _, row in test_analysis_df[['significantly_decreased_of_input_ei','no_significant_difference_of_input_ei', 'significantly_increased_of_input_ei']].iterrows():
    # preds per input!
    input_preds = list(zip(*row))
    max_position_per_pred = list(map(np.argmax, input_preds))
    assert len(max_position_per_pred) == len(input_preds)
    kls = [classes[x] for x in max_position_per_pred]
    input_classifications.append(kls)
test_analysis_df['input_classifications_ei'] = input_classifications

input_classifications = []
for _, row in test_analysis_df[['significantly_decreased_of_input_rr','no_significant_difference_of_input_rr', 'significantly_increased_of_input_rr']].iterrows():
    # preds per input!
    input_preds = list(zip(*row))
    max_position_per_pred = list(map(np.argmax, input_preds))
    assert len(max_position_per_pred) == len(input_preds)
    kls = [classes[x] for x in max_position_per_pred]
    input_classifications.append(kls)
test_analysis_df['input_classifications_rr'] = input_classifications


test_analysis_df['max_rouge1_target_vs_input'] = test_analysis_df['rouge1_target_vs_input_abstract'].apply(max)
test_analysis_df['max_rouge1_target_vs_input_position'] = test_analysis_df['rouge1_target_vs_input_abstract'].apply(np.argmax)
test_analysis_df['max_rouge1_target_vs_input_kls_ei'] = [x[pos] for (x, pos) in zip(test_analysis_df['input_classifications_ei'], test_analysis_df['max_rouge1_target_vs_input_position'])]

test_analysis_df['max_rouge1_target_vs_input_punchlines'] = test_analysis_df['rouge1_target_vs_input_punchlines'].apply(max)
test_analysis_df['max_rouge1_target_vs_input_position_punchlines'] = test_analysis_df['rouge1_target_vs_input_punchlines'].apply(np.argmax)
test_analysis_df['max_rouge1_target_vs_input_kls_rr_punchlines'] = [x[pos] for (x, pos) in zip(test_analysis_df['input_classifications_rr'], test_analysis_df['max_rouge1_target_vs_input_position_punchlines'])]




print(test_analysis_df.columns)
print('ei target dist', Counter(test_analysis_df['prediction_kls_of_target_ei']))
all_ei_inputs = list(itertools.chain.from_iterable(test_analysis_df['input_classifications_ei']))
print('ei input dist', Counter(all_ei_inputs))
print('rr target dist', Counter(test_analysis_df['prediction_kls_of_target_rr']))
all_rr_inputs = list(itertools.chain.from_iterable(test_analysis_df['input_classifications_rr']))
print('rr input dist', Counter(all_rr_inputs))
test_analysis_df.to_csv(f'{DATAPATH}/test_scores_updated.csv')
test_analysis_df.head()


2137
Index(['review_id', 'pmid', 'title', 'abstract', 'source', 'background',
       'significantly_decreased_of_input_rr',
       'no_significant_difference_of_input_rr',
       'significantly_increased_of_input_rr', 'punchlines',
       'significant_effect_of_input_rr', 'significantly_decreased_of_input_ei',
       'no_significant_difference_of_input_ei',
       'significantly_increased_of_input_ei', 'significant_effect_of_input_ei',
       'rouge1_target_vs_input_abstract', 'rouge2_target_vs_input_abstract',
       'rougeL_target_vs_input_abstract', 'rougeLsum_target_vs_input_abstract',
       'rouge1_target_vs_input_punchlines',
       'rouge2_target_vs_input_punchlines',
       'rougeL_target_vs_input_punchlines',
       'rougeLsum_target_vs_input_punchlines', 'rouge1_target_vs_input_title',
       'rouge2_target_vs_input_title', 'rougeL_target_vs_input_title',
       'rougeLsum_target_vs_input_title', 'rouge1_target_vs_background',
       'rouge2_target_vs_background', 'rougeL_ta

Unnamed: 0,review_id,pmid,title,abstract,source,background,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,punchlines,significant_effect_of_input_rr,significantly_decreased_of_input_ei,no_significant_difference_of_input_ei,significantly_increased_of_input_ei,significant_effect_of_input_ei,rouge1_target_vs_input_abstract,rouge2_target_vs_input_abstract,rougeL_target_vs_input_abstract,rougeLsum_target_vs_input_abstract,rouge1_target_vs_input_punchlines,rouge2_target_vs_input_punchlines,rougeL_target_vs_input_punchlines,rougeLsum_target_vs_input_punchlines,rouge1_target_vs_input_title,rouge2_target_vs_input_title,rougeL_target_vs_input_title,rougeLsum_target_vs_input_title,rouge1_target_vs_background,rouge2_target_vs_background,rougeL_target_vs_background,rougeLsum_target_vs_background,significantly_decreased_target_ei,no_significant_difference_target_ei,significantly_increased_target_ei,significant_difference_target_ei,significantly_decreased_target_rr,no_significant_difference_target_rr,significantly_increased_target_rr,significant_difference_target_rr,prediction_kls_of_target_ei,prediction_kls_of_target_rr,input_classifications_ei,input_classifications_rr,max_rouge1_target_vs_input,max_rouge1_target_vs_input_position,max_rouge1_target_vs_input_kls_ei,max_rouge1_target_vs_input_punchlines,max_rouge1_target_vs_input_position_punchlines,max_rouge1_target_vs_input_kls_rr_punchlines
0,CD000220,"[11519502, 6845046]",[Failure of metronidazole to prevent preterm d...,[Infection with Trichomonas vaginalis during p...,cochrane,,"[0.716418, 0.716418]","[0.010048083, 0.010048083]","[0.2735339, 0.2735339]",[Delivery occurred before 37 weeks of gestatio...,"[0.716418, 0.716418, 0.2735339, 0.2735339]","[0.1268150359392166, 0.17824169993400574]","[0.21311891078948975, 0.5389404296875]","[0.6600660681724548, 0.28281787037849426]","[0.1268150359392166, 0.17824169993400574, 0.66...","[0.09411764705882353, 0.09411764705882353]","[0.0, 0.0]","[0.047058823529411764, 0.047058823529411764]","[0.047058823529411764, 0.047058823529411764]","[0.10897435897435898, 0.205607476635514]","[0.012903225806451613, 0.05714285714285714]","[0.05128205128205128, 0.14953271028037382]","[0.07692307692307693, 0.14953271028037382]","[0.07017543859649122, 0.0784313725490196]","[0.0, 0.0]","[0.07017543859649122, 0.0392156862745098]","[0.07017543859649122, 0.0392156862745098]",,,,,0.093894,0.680258,0.225848,0.319742,0.047458,0.300456,0.652085,0.699544,no_significant_difference,significantly_increased,"[significantly_increased, no_significant_diffe...","[significantly_decreased, significantly_decrea...",0.094118,0,significantly_increased,0.205607,1,significantly_decreased
1,CD008120,"[16259547, 18246327, 18007568, 16139813, 12503...",[Adjunctive risperidone in generalized anxiety...,[Although significant advances have been made ...,cochrane,,"[0.8884527, 0.8884527, 0.8884527, 0.8884527, 0...","[0.009669417, 0.009669417, 0.009669417, 0.0096...","[0.10187782, 0.10187782, 0.10187782, 0.1018778...",[Adjunctive risperidone was associated with st...,"[0.8884527, 0.8884527, 0.8884527, 0.8884527, 0...","[0.3804469704627991, 0.35312458872795105, 0.09...","[0.18082500994205475, 0.34517616033554077, 0.4...","[0.4387280344963074, 0.3016992211341858, 0.480...","[0.3804469704627991, 0.35312458872795105, 0.09...","[0.1276595744680851, 0.1276595744680851, 0.127...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.10638297872340427, 0.10638297872340427, 0.1...","[0.10638297872340427, 0.10638297872340427, 0.1...","[0.1371841155234657, 0.14953271028037382, 0.13...","[0.007272727272727273, 0.006269592476489028, 0...","[0.07942238267148014, 0.07476635514018691, 0.0...","[0.10830324909747291, 0.12461059190031153, 0.1...","[0.1142857142857143, 0.1643835616438356, 0.16,...","[0.029411764705882353, 0.028169014084507043, 0...","[0.1142857142857143, 0.136986301369863, 0.1333...","[0.1142857142857143, 0.136986301369863, 0.1333...",,,,,0.511284,0.246139,0.242576,0.753861,0.158286,0.756909,0.084805,0.243091,significantly_decreased,no_significant_difference,"[significantly_increased, significantly_decrea...","[significantly_decreased, significantly_decrea...",0.12766,0,significantly_increased,0.168498,4,significantly_decreased
2,CD002968,"[9699091, 12351469, 12829654, 11891019, 145782...",[Effects of a short-term circuit weight traini...,[This study assessed the effects of short-term...,cochrane,,"[0.96397215, 0.96397215, 0.96397215, 0.9639721...","[0.013200386, 0.013200386, 0.013200386, 0.0132...","[0.022827525, 0.022827525, 0.022827525, 0.0228...",[Significant reductions from baseline values w...,"[0.96397215, 0.96397215, 0.96397215, 0.9639721...","[0.7270705103874207, 0.6144201159477234, 0.145...","[0.0399714931845665, 0.12528373301029205, 0.05...","[0.23295795917510986, 0.26029613614082336, 0.7...","[0.7270705103874207, 0.6144201159477234, 0.145...","[0.08108108108108107, 0.08108108108108107, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05405405405405405, 0.05405405405405405, 0.0...","[0.05405405405405405, 0.05405405405405405, 0.0...","[0.10526315789473684, 0.08108108108108107, 0.0...","[0.016326530612244896, 0.03401360544217687, 0....","[0.06477732793522267, 0.06756756756756756, 0.0...","[0.06477732793522267, 0.08108108108108107, 0.0...","[0.17391304347826086, 0.30434782608695654, 0.2...","[0.045454545454545456, 0.13636363636363635, 0....","[0.13043478260869562, 0.30434782608695654, 0.2...","[0.13043478260869562, 0.30434782608695654, 0.2...",,,,,0.740417,0.067822,0.191761,0.932178,0.612875,0.172689,0.214436,0.827311,significantly_decreased,significantly_decreased,"[significantly_decreased, significantly_decrea...","[significantly_decreased, significantly_decrea...",0.081081,0,significantly_decreased,0.130841,7,significantly_decreased
3,CD008472,"[20484064, 19995115, 20484066, 17846333]",[Remote ischemic preconditioning for cerebral ...,[Remote ischemic preconditioning (RIPC) is a p...,cochrane,,"[0.0017564379, 0.0017564379, 0.0017564379, 0.0...","[0.9969964, 0.9969964, 0.9969964, 0.9969964]","[0.00124719, 0.00124719, 0.00124719, 0.00124719]",[Although there were fewer saccadic latency de...,"[0.0017564379, 0.0017564379, 0.0017564379, 0.0...","[0.04978577420115471, 0.2443036437034607, 0.17...","[0.8936231136322021, 0.3732493817806244, 0.533...","[0.056591182947158813, 0.3824469745159149, 0.2...","[0.04978577420115471, 0.2443036437034607, 0.17...","[0.1111111111111111, 0.1111111111111111, 0.111...","[0.0, 0.0, 0.0, 0.0]","[0.08333333333333333, 0.08333333333333333, 0.0...","[0.08333333333333333, 0.08333333333333333, 0.0...","[0.14354066985645933, 0.1310344827586207, 0.09...","[0.009661835748792272, 0.006944444444444443, 0...","[0.10526315789473684, 0.08275862068965519, 0.0...","[0.10526315789473684, 0.09655172413793103, 0.0...","[0.11764705882352941, 0.06060606060606061, 0.0...","[0.0, 0.0, 0.0, 0.0]","[0.058823529411764705, 0.06060606060606061, 0....","[0.058823529411764705, 0.06060606060606061, 0....",,,,,0.193724,0.558236,0.24804,0.441764,0.007782,0.978182,0.014035,0.021818,no_significant_difference,no_significant_difference,"[no_significant_difference, significantly_incr...","[no_significant_difference, no_significant_dif...",0.111111,0,no_significant_difference,0.143541,0,no_significant_difference
4,CD006373,"[16293396, 15567053, 15963687, 11566828, 10223...",[The effectiveness of a voice treatment approa...,[Teachers are considered the professional grou...,cochrane,,"[0.13579932, 0.13579932, 0.13579932, 0.1357993...","[0.0069637527, 0.0069637527, 0.0069637527, 0.0...","[0.857237, 0.857237, 0.857237, 0.857237, 0.857...",[The difference in voice care knowledge areas ...,"[0.13579932, 0.13579932, 0.13579932, 0.1357993...","[0.08196700364351273, 0.13510577380657196, 0.1...","[0.07671602070331573, 0.02303481660783291, 0.3...","[0.8413169384002686, 0.8418593406677246, 0.513...","[0.08196700364351273, 0.13510577380657196, 0.1...","[0.15873015873015875, 0.15873015873015875, 0.1...","[0.03278688524590164, 0.03278688524590164, 0.0...","[0.12698412698412698, 0.12698412698412698, 0.1...","[0.12698412698412698, 0.12698412698412698, 0.1...","[0.19047619047619047, 0.17167381974248927, 0.2...","[0.04, 0.03463203463203463, 0.0208333333333333...","[0.10317460317460317, 0.10300429184549359, 0.1...","[0.10317460317460317, 0.10300429184549359, 0.1...","[0.3333333333333333, 0.19354838709677416, 0.22...","[0.10344827586206898, 0.06666666666666667, 0.0...","[0.23333333333333334, 0.16129032258064516, 0.1...","[0.23333333333333334, 0.16129032258064516, 0.1...",,,,,0.072695,0.735804,0.191501,0.264196,0.003557,0.991941,0.004501,0.008059,no_significant_difference,no_significant_difference,"[significantly_increased, significantly_increa...","[significantly_increased, significantly_increa...",0.15873,0,significantly_increased,0.237113,2,significantly_increased


In [211]:
print(test_analysis_df['max_rouge1_target_vs_input_kls_ei'].isnull().any())

print(test_analysis_df['prediction_kls_of_target_ei'].isnull().any())
print((test_analysis_df['max_rouge1_target_vs_input_kls_ei'] == test_analysis_df['prediction_kls_of_target_ei']).mean())

print(Counter(test_analysis_df['source']))
test_analysis_df_ = test_analysis_df[test_analysis_df['source'] == 'ms2']

avg_synthesis_times_ei = (test_analysis_df_['max_rouge1_target_vs_input_kls_ei'] == test_analysis_df_['prediction_kls_of_target_ei']).mean()
print(avg_synthesis_times_ei)

False
False
0.45016378100140386
Counter({'ms2': 1667, 'cochrane': 470})
0.46070785842831435


In [212]:
print(df.columns)
df['significant_difference_prediction'] = df['significantly_decreased_prediction'] + df['significantly_increased_prediction']
df['prediction_kls_ei'] = [x.replace('_prediction', '') for x in df[['significantly_decreased_prediction', 'no_significant_difference_prediction', 'significantly_increased_prediction']].idxmax(axis=1)]
df['significant_difference_prediction_rr'] = df['significantly_decreased_prediction_rr'] + df['significantly_increased_prediction_rr']
df['prediction_kls_rr'] = [x.replace('_prediction_rr', '') for x in df[['significantly_decreased_prediction_rr', 'no_significant_difference_prediction_rr', 'significantly_increased_prediction_rr']].idxmax(axis=1)]

input_classifications = []
for _, row in df[['significantly_decreased_of_input_ei', 'no_significant_difference_of_input_ei', 'significantly_increased_of_input_ei']].iterrows():
    # preds per input!
    input_preds = list(zip(*row))
    max_position_per_pred = list(map(np.argmax, input_preds))
    assert len(max_position_per_pred) == len(input_preds)
    kls = [classes[x] for x in max_position_per_pred]
    input_classifications.append(kls)
df['prediction_kls_of_input_ei'] = input_classifications

input_classifications = []
for _, row in df[['significantly_decreased_of_input_rr', 'no_significant_difference_of_input_rr', 'significantly_increased_of_input_rr']].iterrows():
    # preds per input!
    input_preds = list(zip(*row))
    for y in input_preds:
        assert len(y) == 3
    max_position_per_pred = list(map(np.argmax, input_preds))
    assert len(max_position_per_pred) == len(input_preds)
    kls = [classes[x] for x in max_position_per_pred]
    input_classifications.append(kls)
df['prediction_kls_of_input_rr'] = input_classifications


df[[
    # types: float
    # evidence inference prediction_scores
    'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction',
        'significant_difference_prediction',
        'prediction_kls_ei',
    # robot reviewer prediction scores
       'significantly_decreased_prediction_rr',
        'no_significant_difference_prediction_rr',
       'significantly_decreased_prediction_rr',
        'significant_difference_prediction_rr',
        'prediction_kls_rr',
    # types: list[float, str]
    # robot reviewer input scores
        'significantly_decreased_of_input_rr',
       'no_significant_difference_of_input_rr',
        'significantly_increased_of_input_rr',
        'significant_effect_of_input_rr', 
        'prediction_kls_of_input_rr',
        
    # evidence inference input scores
    'significantly_decreased_of_input_ei',
    'no_significant_difference_of_input_ei',
    'significantly_increased_of_input_ei',
    'significant_effect_of_input_ei',
    'prediction_kls_of_input_ei',
   ]]



Index(['review_id', 'subtask', 'target', 'prediction', 'exp_short',
       'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction',
       'significantly_decreased_prediction_rr', 'bertscore_p', 'bertscore_r',
       'bertscore_f', 'rouge1_p', 'rouge1_r', 'rouge1_f', 'rouge2_p',
       'rouge2_r', 'rouge2_f', 'rougeL_p', 'rougeL_r', 'rougeL_f',
       'rougeLsum_p', 'rougeLsum_r', 'rougeLsum_f', 'ei_score', 'claimver',
       'sts', 'nli', 'no_significant_difference_prediction_rr',
       'significantly_increased_prediction_rr', 'pmid', 'title', 'abstract',
       'source', 'background', 'significantly_decreased_of_input_rr',
       'no_significant_difference_of_input_rr',
       'significantly_increased_of_input_rr', 'punchlines',
       'significant_effect_of_input_rr', 'significantly_decreased_of_input_ei',
       'no_significant_difference_of_input_ei',
       'significantly_increased_of_input_ei', 'significan

Unnamed: 0,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,significant_difference_prediction,prediction_kls_ei,significantly_decreased_prediction_rr,no_significant_difference_prediction_rr,significantly_decreased_prediction_rr.1,significant_difference_prediction_rr,prediction_kls_rr,significantly_decreased_of_input_rr,no_significant_difference_of_input_rr,significantly_increased_of_input_rr,significant_effect_of_input_rr,prediction_kls_of_input_rr,significantly_decreased_of_input_ei,no_significant_difference_of_input_ei,significantly_increased_of_input_ei,significant_effect_of_input_ei,prediction_kls_of_input_ei
0,0.053528,0.049263,0.897209,0.950737,significantly_increased,0.066852,0.212070,0.066852,0.787931,significantly_increased,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[significantly_increased, significantly_increa...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[no_significant_difference, significantly_incr..."
1,0.254750,0.220718,0.524532,0.779282,significantly_increased,0.434737,0.256255,0.434737,0.743745,significantly_decreased,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[significantly_increased, significantly_increa...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[no_significant_difference, significantly_incr..."
2,0.189049,0.180032,0.630919,0.819968,significantly_increased,0.186612,0.522406,0.186612,0.477595,no_significant_difference,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[significantly_increased, significantly_increa...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[no_significant_difference, significantly_incr..."
3,0.229608,0.350560,0.419832,0.649440,significantly_increased,0.171278,0.718776,0.171278,0.281224,no_significant_difference,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[significantly_increased, significantly_increa...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[no_significant_difference, significantly_incr..."
4,0.029836,0.016749,0.953415,0.983251,significantly_increased,0.072712,0.100922,0.072712,0.899078,significantly_increased,"[0.19205841, 0.19205841, 0.19205841, 0.19205841]","[0.0069376007, 0.0069376007, 0.0069376007, 0.0...","[0.801004, 0.801004, 0.801004, 0.801004]","[0.19205841, 0.19205841, 0.19205841, 0.1920584...","[significantly_increased, significantly_increa...","[0.03182065859436989, 0.05880654975771904, 0.1...","[0.8062215447425842, 0.08273926377296448, 0.16...","[0.16195781528949738, 0.8584542274475098, 0.64...","[0.03182065859436989, 0.05880654975771904, 0.1...","[no_significant_difference, significantly_incr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17988,0.238323,0.274627,0.487050,0.725373,significantly_increased,0.180022,0.678657,0.180022,0.321343,no_significant_difference,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[no_significant_difference, no_significant_dif...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241...","[no_significant_difference, no_significant_dif..."
17989,0.026364,0.901532,0.072104,0.098468,no_significant_difference,0.005692,0.979640,0.005692,0.020360,no_significant_difference,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[no_significant_difference, no_significant_dif...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241...","[no_significant_difference, no_significant_dif..."
17990,0.013497,0.964115,0.022388,0.035885,no_significant_difference,0.009610,0.979698,0.009610,0.020303,no_significant_difference,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[no_significant_difference, no_significant_dif...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241...","[no_significant_difference, no_significant_dif..."
17991,0.042273,0.825310,0.132418,0.174690,no_significant_difference,0.009395,0.965898,0.009395,0.034102,no_significant_difference,"[0.0015160621, 0.0015160621]","[0.9973514, 0.9973514]","[0.0011325332, 0.0011325332]","[0.0015160621, 0.0015160621, 0.0011325332, 0.0...","[no_significant_difference, no_significant_dif...","[0.1988724321126938, 0.2142958790063858]","[0.5597877502441406, 0.6097846031188965]","[0.24133983254432678, 0.17591948807239532]","[0.1988724321126938, 0.2142958790063858, 0.241...","[no_significant_difference, no_significant_dif..."


In [213]:
test_analysis_df['max_rouge1_target_vs_input_punchlines'].mean()


0.1919956607097028

In [215]:
df['max_rouge1_vs_input'] = df['rouge1_vs_input_abstract'].apply(max)
df['max_rouge1_vs_input_position'] = df['rouge1_vs_input_abstract'].apply(np.argmax)
df['max_rouge1_vs_input_kls_ei'] = [x[pos] for (x, pos) in zip(df['prediction_kls_of_input_ei'], df['max_rouge1_vs_input_position'])]
df['rouge1_vs_reference_is_higher'] = df['max_rouge1_vs_input'] > df['rouge1_f']
df['rouge1_vs_reference_delta'] = df['max_rouge1_vs_input'] - df['rouge1_f']


df['max_rouge1_vs_input_punchlines'] = df['rouge1_vs_input_punchlines'].apply(max)
df['max_rouge1_vs_input_position_punchlines'] = df['rouge1_vs_input_punchlines'].apply(np.argmax)
df['max_rouge1_vs_input_kls_rr_punchlines'] = [x[pos] for (x, pos) in zip(df['prediction_kls_of_input_rr'], df['max_rouge1_vs_input_position_punchlines'])]
df['rouge1_vs_reference_is_higher_punchlines'] = df['max_rouge1_vs_input_punchlines'] > df['rouge1_f']
df['rouge1_vs_reference_delta_punchlines'] = df['max_rouge1_vs_input_punchlines'] - df['rouge1_f']

random_input_effect_agrees = []
lengths = []
all_inputs = []
for _, row in df.iterrows():
    input_dist = Counter(row['prediction_kls_of_input_rr'])
    prediction_kls = row['prediction_kls_rr']
    prediction_prob = input_dist[prediction_kls] / len(row['prediction_kls_of_input_rr'])
    random_input_effect_agrees.append(prediction_prob)
    lengths.append(len(row['prediction_kls_of_input_rr']))
    all_inputs.extend(row['prediction_kls_of_input_rr'])
df['random_input_effect_agrees_with_prediction_rr'] = random_input_effect_agrees
print('num rows', len(df))
print(np.mean(lengths))
print('input dist', Counter(all_inputs))
exp_counts = Counter(df['exp_short'])
print(exp_counts)
print(df.columns)


datasets = Counter(df['subtask'])
subtask_dfs = {}
for subtask in datasets.keys():
    res = []
    test_analysis_df_ = test_analysis_df[test_analysis_df['source'] == subtask.lower()]
    avg_synthesis_times_ei = (test_analysis_df_['max_rouge1_target_vs_input_kls_ei'] == test_analysis_df_['prediction_kls_of_target_ei']).mean()
    assert avg_synthesis_times_ei == avg_synthesis_times_ei
    avg_synthesis_times_rr_punchline = (test_analysis_df_['max_rouge1_target_vs_input_kls_rr_punchlines'] == test_analysis_df_['prediction_kls_of_target_rr']).mean()
    assert avg_synthesis_times_rr_punchline == avg_synthesis_times_rr_punchline
    res.append(
        {
            'subtask': subtask,
            'exp': 'target',
            'avg_synthesis_times_ei': avg_synthesis_times_ei,
            'avg_synthesis_times_rr_punchline': avg_synthesis_times_rr_punchline,
        }
    )
    print(res[-1])
    
#     if subtask == 'MS2':
#         continue
    
    print('subtask', subtask)
    for exp in exp_counts.keys():
        # TODO: blank results?
        print(subtask, exp)
        df_ = df[df['subtask'] == subtask]
        df_ = df_[df_['exp_short'] == exp]
        if len(df_) == 0:
            continue
        print(Counter(df_['prediction_kls_ei']))
            
        # copying an input?
        avg_times_closer = np.mean(df_['rouge1_vs_reference_is_higher'])
        avg_rouge1_vs_reference_delta = np.mean(df_['rouge1_vs_reference_delta'])
        avg_rouge1_vs_reference_delta_std = np.std(df_['rouge1_vs_reference_delta'])

       # doing a synthesis?
        avg_synthesis_times_ei = (df_['max_rouge1_vs_input_kls_ei'] == df_['prediction_kls_ei']).mean()

        print("Is it copying an input?")
        print('fraction of times input is closer than target', avg_times_closer)
        print('mean/std diff max rouge1(input, pred) vs rouge1(input, pred)', avg_rouge1_vs_reference_delta, avg_rouge1_vs_reference_delta_std)
        print('Is it doing a synthesis?', avg_synthesis_times_ei)
 
        # copying an input (punchline)?
        avg_times_closer_punchlines = np.mean(df_['rouge1_vs_reference_is_higher_punchlines'])
        avg_rouge1_vs_reference_delta_punchlines = np.mean(df_['rouge1_vs_reference_delta_punchlines'])
        avg_rouge1_vs_reference_delta_std_punchlines = np.std(df_['rouge1_vs_reference_delta_punchlines'])

        # doing a synthesis?
        avg_synthesis_times_rr_punchline = (df_['max_rouge1_vs_input_kls_rr_punchlines'] == df_['prediction_kls_rr']).mean()

        
        print("Is it copying an input punchline?")
        print('fraction of times input punchline is closer than target', avg_times_closer_punchlines)
        print('mean/std diff max rouge1(input(punchline), pred) vs rouge1(input(punchline), pred)', avg_rouge1_vs_reference_delta_punchlines, avg_rouge1_vs_reference_delta_std_punchlines)
        print('Is it doing a synthesis (punchline)?', avg_synthesis_times_rr_punchline)
        
        # TODO baseline synthesis?
        res.append({
            'subtask': subtask,
            'exp': exp,
            'avg_times_closer': avg_times_closer,
            'avg_rouge1_vs_reference_delta': avg_rouge1_vs_reference_delta,
            'avg_rouge1_vs_reference_delta_std': avg_rouge1_vs_reference_delta_std,
            # times the synthesis result would agree
            'avg_synthesis_times_ei': avg_synthesis_times_ei,
            # punchlines
            'avg_times_closer_punchlines': avg_times_closer_punchlines,
            'avg_rouge1_vs_reference_delta_punchlines': avg_rouge1_vs_reference_delta_punchlines,
            'avg_rouge1_vs_reference_delta_std_punchlines': avg_rouge1_vs_reference_delta_std_punchlines,
            'avg_synthesis_times_rr_punchline': avg_synthesis_times_rr_punchline,
        })

    res = pd.DataFrame(res)
    res.to_csv(f'{DATAPATH}/result_table_{subtask}.csv')
    print(res.to_latex(index=False))
    subtask_dfs[subtask] = res


num rows 17993
22.13616406380259
input dist Counter({'no_significant_difference': 175655, 'significantly_increased': 148123, 'significantly_decreased': 74518})
Counter({'EFD8HX': 1667, 'MG3N0D': 1667, 'W6E2CQ': 1667, '9EKG14': 1667, 'V0PMWV': 1667, '8ZAR37': 1667, 'WHTYYD': 1667, 'RDZ7K5': 1666, 'SPNXTA': 470, '6GBRY0': 470, '8FWF5T': 470, 'JB6Z8F': 470, 'VNCH8M': 470, 'AQ85CE': 466, '5VR9DD': 466, 'JX1AJ6': 466, 'PX7SGV': 466, 'RQD4RK': 444})
Index(['review_id', 'subtask', 'target', 'prediction', 'exp_short',
       'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction',
       'significantly_decreased_prediction_rr', 'bertscore_p', 'bertscore_r',
       'bertscore_f', 'rouge1_p', 'rouge1_r', 'rouge1_f', 'rouge2_p',
       'rouge2_r', 'rouge2_f', 'rougeL_p', 'rougeL_r', 'rougeL_f',
       'rougeLsum_p', 'rougeLsum_r', 'rougeLsum_f', 'ei_score', 'claimver',
       'sts', 'nli', 'no_significant_difference_predicti

Cochrane MG3N0D
Cochrane W6E2CQ
Cochrane RDZ7K5
Cochrane 9EKG14
Cochrane V0PMWV
Cochrane 8ZAR37
Cochrane WHTYYD
Cochrane SPNXTA
Counter({'no_significant_difference': 365, 'significantly_increased': 58, 'significantly_decreased': 47})
Is it copying an input?
fraction of times input is closer than target 0.274468085106383
mean/std diff max rouge1(input, pred) vs rouge1(input, pred) -0.08310186403058861 0.13017359579596963
Is it doing a synthesis? 0.39148936170212767
Is it copying an input punchline?
fraction of times input punchline is closer than target 0.3617021276595745
mean/std diff max rouge1(input(punchline), pred) vs rouge1(input(punchline), pred) -0.04650387791556527 0.1209424463032407
Is it doing a synthesis (punchline)? 0.44893617021276594
Cochrane 6GBRY0
Counter({'no_significant_difference': 208, 'significantly_increased': 188, 'significantly_decreased': 74})
Is it copying an input?
fraction of times input is closer than target 0.17872340425531916
mean/std diff max rouge1(inpu

  print(res.to_latex(index=False))
  print(res.to_latex(index=False))


In [216]:
submission_names = {}
with open(f'{DATAPATH}/submission_info.json', 'r') as inf:
    submissions = json.loads(inf.read())
    for exp, attrs in submissions.items():
        submission_names[exp] = attrs['name']
print(submission_names)

{'SPNXTA': 'led-base-16384-cochrane', '6GBRY0': 'Concatenate last sentence of each abstract', '8FWF5T': 'SciSpace', 'JB6Z8F': 'ittc2', 'AQ85CE': 'bart-large-finetuned', 'RQD4RK': 'longt5_pubmed', 'VNCH8M': 'ittc1', '5VR9DD': 'AI2/Longformer BART/train MS2/decode Cochrane', 'JX1AJ6': 'AI2/Longformer BART/Train Cochrane/Decode Cochrane', 'PX7SGV': 'AI2/BART/train Cochrane/decode Cochrane', 'EFD8HX': 'led-base-16384-ms2', 'MG3N0D': 'Copying the background section', 'W6E2CQ': 'led-base-16384-ms2-old', 'RDZ7K5': 'longt5_pubmed', '9EKG14': 'bart-large-finetuned', 'V0PMWV': 'AI2/Longformer BART/train Cochrane/decode MS2', '8ZAR37': 'AI2/Longformer BART/train MS2/decode MS2', 'WHTYYD': 'AI2/BART/train MS2/decode MS2'}


In [217]:
print(subtask_dfs.keys())

# for k, subtask_df in subtask_dfs.items():
#     print(subtask_df.to_latex(index=False))
# subtask_dfs['MS2']['exp'] = subtask_dfs['MS2']['exp'].replace(submission_names)
subtask_dfs['Cochrane']['exp'] = subtask_dfs['Cochrane']['exp'].replace(submission_names)
print(subtask_dfs['Cochrane'][['exp', 'avg_synthesis_times_ei', 'avg_synthesis_times_rr_punchline', 'avg_times_closer', 'avg_times_closer_punchlines', 'avg_rouge1_vs_reference_delta_punchlines', 'avg_rouge1_vs_reference_delta_std_punchlines']].to_latex(index=False, float_format="{:0.2f}".format))

dict_keys(['MS2', 'Cochrane'])
\begin{tabular}{lrrrrrr}
\toprule
                                               exp &  avg\_synthesis\_times\_ei &  avg\_synthesis\_times\_rr\_punchline &  avg\_times\_closer &  avg\_times\_closer\_punchlines &  avg\_rouge1\_vs\_reference\_delta\_punchlines &  avg\_rouge1\_vs\_reference\_delta\_std\_punchlines \\
\midrule
                                            target &                    0.41 &                              0.48 &               NaN &                          NaN &                                       NaN &                                           NaN \\
                           led-base-16384-cochrane &                    0.39 &                              0.45 &              0.27 &                         0.36 &                                     -0.05 &                                          0.12 \\
        Concatenate last sentence of each abstract &                    0.53 &                              0.49 &            

  print(subtask_dfs['Cochrane'][['exp', 'avg_synthesis_times_ei', 'avg_synthesis_times_rr_punchline', 'avg_times_closer', 'avg_times_closer_punchlines', 'avg_rouge1_vs_reference_delta_punchlines', 'avg_rouge1_vs_reference_delta_std_punchlines']].to_latex(index=False, float_format="{:0.2f}".format))


In [206]:
subtask_dfs['Cochrane'].head()

Unnamed: 0,subtask,exp,avg_synthesis_times_ei,avg_synthesis_times_rr_punchline,avg_times_closer,avg_rouge1_vs_reference_delta,avg_rouge1_vs_reference_delta_std,avg_times_closer_punchlines,avg_rouge1_vs_reference_delta_punchlines,avg_rouge1_vs_reference_delta_std_punchlines,avg_times_random_choice_synthesis_agrees_rr
0,Cochrane,target,0.412766,0.482979,,,,,,,
1,Cochrane,led-base-16384-cochrane,0.391489,0.448936,0.274468,-0.083102,0.130174,0.361702,-0.046504,0.120942,0.44206
2,Cochrane,Concatenate last sentence of each abstract,0.529787,0.491489,0.178723,-0.10992,0.161426,0.814894,0.13387,0.14647,0.44206
3,Cochrane,SciSpace,0.355319,0.438298,0.159574,-0.11225,0.11347,0.478723,-0.010215,0.116173,0.44206
4,Cochrane,ittc2,0.342553,0.446809,0.148936,-0.112159,0.114336,0.151064,-0.101542,0.109192,0.44206


In [207]:

print("For conditioned models, do these tend to copy from the prompt?")
print("For unconditioned models, do the outputs tend to look like the prompt (that they did not have access to!)?")
# does not say the background was used!
background_available = df[df['background'].notnull()].copy()
exp_counts_with_available_background = Counter(background_available['exp_short'])
print(exp_counts_with_available_background)

background_available['rouge1_vs_background_is_higher'] = background_available['rouge1_vs_background'] > background_available['rouge1_f']
background_available['rouge1_vs_background_diff'] = background_available['rouge1_vs_background'] - background_available['rouge1_f']

for exp in exp_counts_with_available_background.keys():
    # yes yes I should use a groupby
    background_available_ = background_available[background_available['exp_short'] == exp]
    print(exp)
    print('fraction of times the background is closer than the target', np.mean(background_available_['rouge1_vs_background_is_higher']))
    print('mean/std diff rouge1(background, pred) - rouge1(target, pred)', np.mean(background_available_['rouge1_vs_background_diff']), np.std(background_available_['rouge1_vs_background_diff']))

For conditioned models, do these tend to copy from the prompt?
For unconditioned models, do the outputs tend to look like the prompt (that they did not have access to!)?
Counter({'EFD8HX': 1667, 'MG3N0D': 1667, 'W6E2CQ': 1667, '9EKG14': 1667, 'V0PMWV': 1667, '8ZAR37': 1667, 'WHTYYD': 1667, 'RDZ7K5': 1666})
EFD8HX
fraction of times the background is closer than the target 0.6664667066586683
mean/std diff rouge1(background, pred) - rouge1(target, pred) 0.06477661251324623 0.1601050581250956
MG3N0D
fraction of times the background is closer than the target 1.0
mean/std diff rouge1(background, pred) - rouge1(target, pred) 0.7280145722774497 0.11052521521978258
W6E2CQ
fraction of times the background is closer than the target 0.5872825434913017
mean/std diff rouge1(background, pred) - rouge1(target, pred) 0.036911253704006775 0.14299161578338265
RDZ7K5
fraction of times the background is closer than the target 0.5342136854741897
mean/std diff rouge1(background, pred) - rouge1(target, pred) 