In [84]:
import json
import os

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch

from collections import Counter

from transformers import AutoTokenizer, RobertaForSequenceClassification

assert torch.cuda.is_available()

DATAPATH = '../data/'

def load_array(array_column):
    def load(row):
        res = row[array_column].replace("['", '').replace("']", '').split('\n')
        return res
    return load

test_sets_converters = {x: load_array(x) for x in 
                        ['title', 'abstract', 
                         'significantly_decreased_of_input', 'no_significant_difference_of_input', 'significantly_increased_of_input', 'significant_effect_of_input']}
joined_converters = {x:load_array(x) for x in 
    ['significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction', 'title', 'abstract', 'source', 'background',
       'significantly_decreased_of_input',
       'no_significant_difference_of_input',
       'significantly_increased_of_input', 'significant_effect_of_input']
}
       

In [68]:
test_sets_file = f'{DATAPATH}/test_set_inputs.csv'

if not os.path.exists(test_sets_file):
    # load MSLR input data, prepare a dataframe and a couple of convenience dictionaries
    cochrane_test = datasets.load_dataset('allenai/mslr2022', 'cochrane')['test'].to_pandas()
    cochrane_test['source'] = 'cochrane'
    ms2_test = datasets.load_dataset('allenai/mslr2022', 'ms2')['test'].to_pandas()
    ms2_test['source'] = 'ms2'
    test_sets = pd.concat([cochrane_test, ms2_test], ignore_index=True)
    test_sets['review_id'] = test_sets['review_id']
    del test_sets['target']
    # del test_sets['pmid']
    # del test_sets['title']
    # del test_sets['abstract']
    # del test_sets['background']

    print(test_sets.columns)
    test_sets.head()

Found cached dataset mslr2022 (/home/jay/.cache/huggingface/datasets/allenai___mslr2022/cochrane/1.0.0/383847f6631ddefc5b6ed7df606b6f17078bdd51f642209158ed6e4bea951bbb)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mslr2022 (/home/jay/.cache/huggingface/datasets/allenai___mslr2022/ms2/1.0.0/383847f6631ddefc5b6ed7df606b6f17078bdd51f642209158ed6e4bea951bbb)


  0%|          | 0/3 [00:00<?, ?it/s]

Index(['review_id', 'pmid', 'title', 'abstract', 'source', 'background'], dtype='object')


In [69]:
# n.b. references have to come later :frown:
evidence_inference_tokenizer = AutoTokenizer.from_pretrained('allenai/biomed_roberta_base')

evidence_inference_classifier = RobertaForSequenceClassification.from_pretrained(
    'allenai/biomed_roberta_base',
    num_labels=3,
)
evidence_inference_classifier = evidence_inference_classifier.cuda()
state_dict = torch.load('../evidence_inference_models/unconditioned_evidence_classifier/unconditioned_evidence_classifier.pt')
state_dict = {k.replace('bert.', ''):v for k,v in state_dict.items()}
evidence_inference_classifier.load_state_dict(state_dict)

classes = ["significantly_decreased", "no_significant_difference", "significantly_increased"]
def compute_evidence_inference_scores_for_inputs(row):
    titles = row['title']
    abstracts = row['abstract']
    
    # definitely not how that was trained...
    title_abstracts = [x[0] + ' ' + x[1] for x in zip(titles, abstracts)]
    ret = { kls + "_of_input":[] for kls in classes }
    for i in range(0, len(title_abstracts), 16):
        inputs = evidence_inference_tokenizer(title_abstracts[i:i+6], return_tensors='pt', padding=True, max_length=512, truncation=True)
        inputs = inputs.to(evidence_inference_classifier.device)
        with torch.no_grad():
            logits = evidence_inference_classifier(**inputs).logits
            instance_probs = torch.softmax(logits, axis=1).cpu().tolist()
            for instance in instance_probs:
                for kls, prob in zip(classes, instance):
                    ret[kls + '_of_input'].append(prob)
    return ret

    
if os.path.exists(test_sets_file):
    test_sets = pd.read_csv(test_sets_file, converters=test_sets_converters)
else:
    test_sets_evidence_inference = test_sets.apply(
        compute_evidence_inference_scores_for_inputs,
        result_type='expand',
        axis=1)
#     print(test_sets_evidence_inference)
#     pd.concat([test_sets[:2], test_sets_evidence_inference], axis=1)
    test_sets = pd.concat([test_sets, test_sets_evidence_inference], axis=1)
    test_sets['significant_effect_of_input'] = test_sets["significantly_decreased_of_input"] + test_sets["significantly_increased_of_input"]
    test_sets.to_csv(test_sets_file)

review_id_to_abstract = dict(zip(test_sets['review_id'], test_sets['abstract']))
review_id_to_title = dict(zip(test_sets['review_id'], test_sets['title']))

test_sets.head()


Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classi

Unnamed: 0,review_id,pmid,title,abstract,source,background,significantly_decreased_of_input,no_significant_difference_of_input,significantly_increased_of_input,significant_effect_of_input
0,CD000220,"[11519502, 6845046]",[Failure of metronidazole to prevent preterm d...,[Infection with Trichomonas vaginalis during p...,cochrane,,"[0.12681518495082855, 0.1782418191432953]","[0.21311910450458527, 0.5389403700828552]","[0.6600657105445862, 0.2828177809715271]","[0.12681518495082855, 0.1782418191432953, 0.66..."
1,CD008120,"[16259547, 18246327, 18007568, 16139813, 12503...",[Adjunctive risperidone in generalized anxiety...,[Although significant advances have been made ...,cochrane,,"[0.38044750690460205, 0.3531244993209839, 0.09...","[0.18082502484321594, 0.345176100730896, 0.423...","[0.438727468252182, 0.30169937014579773, 0.480...","[0.38044750690460205, 0.3531244993209839, 0.09..."
2,CD002968,"[9699091, 12351469, 12829654, 11891019, 145782...",[Effects of a short-term circuit weight traini...,[This study assessed the effects of short-term...,cochrane,,"[0.7270705699920654, 0.6144202947616577, 0.145...","[0.0399714857339859, 0.12528376281261444, 0.05...","[0.2329578697681427, 0.2602958679199219, 0.795...","[0.7270705699920654, 0.6144202947616577, 0.145..."
3,CD008472,"[20484064, 19995115, 20484066, 17846333]",[Remote ischemic preconditioning for cerebral ...,[Remote ischemic preconditioning (RIPC) is a p...,cochrane,,"[0.04978576675057411, 0.24430350959300995, 0.1...","[0.8936229944229126, 0.37324902415275574, 0.53...","[0.05659119039773941, 0.3824475109577179, 0.29...","[0.04978576675057411, 0.24430350959300995, 0.1..."
4,CD006373,"[16293396, 15567053, 15963687, 11566828, 10223...",[The effectiveness of a voice treatment approa...,[Teachers are considered the professional grou...,cochrane,,"[0.08196703344583511, 0.1351054608821869, 0.13...","[0.07671593874692917, 0.023034831508994102, 0....","[0.8413169980049133, 0.8418596982955933, 0.513...","[0.08196703344583511, 0.1351054608821869, 0.13..."


In [None]:
# def load_array(array_column):
#     def load(row):
#         res = row[array_column].replace("['", '').replace("']", '').split('\n')
#         return res
#     return load
# test_sets_ = pd.read_csv(f'{DATAPATH}/test_set_inputs.csv')
# for column in ['title', 'abstract', 'significantly_decreased_of_input', 'no_significant_difference_of_input',
#        'significantly_increased_of_input', 'significant_effect_of_input']:
#     fixed_column = test_sets_.apply(load_array(column), axis=1)
#     test_sets_[column] = fixed_column
# test_sets_.to_json(test_sets_file)

In [70]:
# 1. update the stored json files with the source data abstracts and titles
# 2. create a dataframe of the system productions and whatever outputs are already present
# 3. I should probably feel bad about co-mingling these two, but...I don't?

def evidence_inference_scores_for_outputs(summary):
    inputs = evidence_inference_tokenizer(summary, return_tensors='pt', padding=True, max_length=512, truncation=True)
    inputs = inputs.to(evidence_inference_classifier.device)
    with torch.no_grad():
        logits = evidence_inference_classifier(**inputs).logits
        instance_probs = torch.softmax(logits, axis=1).cpu().tolist()
    return instance_probs[0]

# this is a hack
# after this runs, we have the output with metrics
for datafile, new_datafile in [
    (f'{DATAPATH}/processed_data.json', f'{DATAPATH}/processed_data_w_inputs.json'),
    (f'{DATAPATH}/processed_data_w_metrics.json', f'{DATAPATH}/processed_data_w_metrics_w_inputs.json')]:
    system_productions = []
    system_productions_lines = []
    with open(datafile, 'r') as inf, open(new_datafile, 'w') as of:
        for line in inf:
            instance = json.loads(line)
            instance['abstract'] = review_id_to_abstract[instance['review_id']].tolist()
            instance['title'] = review_id_to_title[instance['review_id']].tolist()
            for prediction in instance['predictions']:
                # only compute if needed
                if 'evidence_inference_target' not in instance:
                    instance['evidence_inference_target'] = evidence_inference_scores_for_outputs(instance['target'])

                prediction_dict = {
                    'review_id': instance['review_id'],
                    'subtask': instance['subtask'],
                    'target': instance['target'],
                    'prediction': prediction['prediction'],
#                     'evidence_inference_prediction': evidence_inference_prediction,
                    'exp_short': prediction['exp_short'],
                }
                # only compute if needed
                if 'no_significant_difference_prediction' not in prediction:
                    for kls, score in zip(classes, evidence_inference_scores_for_outputs(prediction['prediction'])):
                        prediction_dict[kls + '_prediction'] = score
                        prediction[kls + '_prediction'] = score
                else:
                    for kls in classes:
                        prediction_dict[kls + '_prediction'] = prediction[kls + '_prediction']
                
                # copy automatic scoring over
                if 'scores' in prediction:
                    for k, v in prediction['scores'].items():
                        assert k not in prediction_dict
                        prediction_dict[k] = v
                # TODO annotations?
                
                system_productions.append(prediction_dict)
            of.write(json.dumps(instance))
            of.write('\n')
    system_productions = pd.DataFrame(system_productions)
    system_productions['review_id'] = system_productions['review_id'].astype(str)
    
    os.replace(new_datafile, datafile)

print(system_productions.columns)
print(system_productions['review_id'][0])
system_productions.head()

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

Index(['review_id', 'subtask', 'target', 'prediction', 'exp_short',
       'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction', 'bertscore_p', 'bertscore_r',
       'bertscore_f', 'rouge1_p', 'rouge1_r', 'rouge1_f', 'rouge2_p',
       'rouge2_r', 'rouge2_f', 'rougeL_p', 'rougeL_r', 'rougeL_f',
       'rougeLsum_p', 'rougeLsum_r', 'rougeLsum_f', 'ei_score', 'claimver',
       'sts', 'nli'],
      dtype='object')
24894943


Unnamed: 0,review_id,subtask,target,prediction,exp_short,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,bertscore_p,bertscore_r,...,rougeL_p,rougeL_r,rougeL_f,rougeLsum_p,rougeLsum_r,rougeLsum_f,ei_score,claimver,sts,nli
0,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,EFD8HX,0.053528,0.049263,0.897209,0.816605,0.841818,...,0.142857,0.172414,0.15625,0.171429,0.206897,0.1875,0.836226,0.476795,0.631644,0.804932
1,24894943,MS2,Significant OS differences were observed in ol...,INTRODUCTION / BACKGROUND An individual patien...,MG3N0D,0.25475,0.220718,0.524532,0.841731,0.859274,...,0.129032,0.137931,0.133333,0.129032,0.137931,0.133333,0.078508,0.770375,0.605519,0.712774
2,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,W6E2CQ,0.189049,0.180032,0.630919,0.817489,0.82791,...,0.363636,0.137931,0.2,0.363636,0.137931,0.2,0.032762,0.721033,0.633378,0.739471
3,24894943,MS2,Significant OS differences were observed in ol...,backgroundthe aim of this study was to compare...,RDZ7K5,0.229608,0.35056,0.419832,0.848861,0.807682,...,0.133333,0.068966,0.090909,0.133333,0.068966,0.090909,0.036774,0.365915,0.397677,0.613397
4,24894943,MS2,Significant OS differences were observed in ol...,Conclusions : The results of this meta- analys...,9EKG14,0.029836,0.016749,0.953415,0.854012,0.833715,...,0.16129,0.172414,0.166667,0.16129,0.172414,0.166667,0.985187,0.230184,0.488337,0.693682


In [73]:
print(set(test_sets.columns) & set(system_productions.columns))
print(len(set(system_productions['review_id']) & set(test_sets['review_id'])))

df = system_productions.merge(right=test_sets, on='review_id', how='inner')
print(df.columns, len(df))
# df.to_csv('../data/system_productions_w_metrics.csv')
df.head()

{'review_id'}
2137
Index(['review_id', 'subtask', 'target', 'prediction', 'exp_short',
       'significantly_decreased_prediction',
       'no_significant_difference_prediction',
       'significantly_increased_prediction', 'bertscore_p', 'bertscore_r',
       'bertscore_f', 'rouge1_p', 'rouge1_r', 'rouge1_f', 'rouge2_p',
       'rouge2_r', 'rouge2_f', 'rougeL_p', 'rougeL_r', 'rougeL_f',
       'rougeLsum_p', 'rougeLsum_r', 'rougeLsum_f', 'ei_score', 'claimver',
       'sts', 'nli', 'pmid', 'title', 'abstract', 'source', 'background',
       'significantly_decreased_of_input',
       'no_significant_difference_of_input',
       'significantly_increased_of_input', 'significant_effect_of_input'],
      dtype='object') 17993


Unnamed: 0,review_id,subtask,target,prediction,exp_short,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,bertscore_p,bertscore_r,...,nli,pmid,title,abstract,source,background,significantly_decreased_of_input,no_significant_difference_of_input,significantly_increased_of_input,significant_effect_of_input
0,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,EFD8HX,0.053528,0.049263,0.897209,0.816605,0.841818,...,0.804932,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.031820639967918396, 0.05880654603242874, 0....","[0.8062216639518738, 0.08273934572935104, 0.16...","[0.16195771098136902, 0.858454167842865, 0.647...","[0.031820639967918396, 0.05880654603242874, 0...."
1,24894943,MS2,Significant OS differences were observed in ol...,INTRODUCTION / BACKGROUND An individual patien...,MG3N0D,0.25475,0.220718,0.524532,0.841731,0.859274,...,0.712774,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.031820639967918396, 0.05880654603242874, 0....","[0.8062216639518738, 0.08273934572935104, 0.16...","[0.16195771098136902, 0.858454167842865, 0.647...","[0.031820639967918396, 0.05880654603242874, 0...."
2,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,W6E2CQ,0.189049,0.180032,0.630919,0.817489,0.82791,...,0.739471,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.031820639967918396, 0.05880654603242874, 0....","[0.8062216639518738, 0.08273934572935104, 0.16...","[0.16195771098136902, 0.858454167842865, 0.647...","[0.031820639967918396, 0.05880654603242874, 0...."
3,24894943,MS2,Significant OS differences were observed in ol...,backgroundthe aim of this study was to compare...,RDZ7K5,0.229608,0.35056,0.419832,0.848861,0.807682,...,0.613397,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.031820639967918396, 0.05880654603242874, 0....","[0.8062216639518738, 0.08273934572935104, 0.16...","[0.16195771098136902, 0.858454167842865, 0.647...","[0.031820639967918396, 0.05880654603242874, 0...."
4,24894943,MS2,Significant OS differences were observed in ol...,Conclusions : The results of this meta- analys...,9EKG14,0.029836,0.016749,0.953415,0.854012,0.833715,...,0.693682,"[19632716, 20573926, 23724913, 21903745]",[Radiotherapy plus chemotherapy with or withou...,[BACKGROUND Results from phase II studies in p...,ms2,INTRODUCTION / BACKGROUND An individual patien...,"[0.031820639967918396, 0.05880654603242874, 0....","[0.8062216639518738, 0.08273934572935104, 0.16...","[0.16195771098136902, 0.858454167842865, 0.647...","[0.031820639967918396, 0.05880654603242874, 0...."


In [74]:
rouge = evaluate.load('rouge')

def input_copy_rouge1(row):
    # vs input abstracts
    ret = {
        'rouge1_vs_input_abstract': [],
        'rouge2_vs_input_abstract': [],
        'rougeL_vs_input_abstract': [],
        'rougeLsum_vs_input_abstract': [],
    }
    
    for inp in row['abstract']:
        rouge_scores = rouge.compute(references=[inp], predictions=[row['prediction']])
        for k, v in rouge_scores.items():
            ret[k + '_vs_input_abstract'].append(v)

    # vs input titles
    ret = {
        **ret, 
        'rouge1_vs_input_title': [],
        'rouge2_vs_input_title': [],
        'rougeL_vs_input_title': [],
        'rougeLsum_vs_input_title': [],
    }
    
    for inp in row['title']:
        rouge_scores = rouge.compute(references=[inp], predictions=[row['prediction']])
        for k, v in rouge_scores.items():
            ret[k + '_vs_input_title'].append(v)
    
    # vs background
    if row['background'] == row['background'] and row['background'] is not None:
        background_rouge_scores = rouge.compute(references=[row['background']], predictions=[row['prediction']])
        for k, v in background_rouge_scores.items():
            ret[k + '_vs_background'] = v
    
    return ret

input_rouge_result_file = f'{DATAPATH}/processed_data_w_input_rouge_scores.csv'
if os.path.exists(input_rouge_result_file):
    df = pd.read_json(input_rouge_result_file, converters=joined_converters)
else:
    copying_eval = df.apply(input_copy_rouge1, result_type='expand', axis=1)
    df = pd.concat([df, copying_eval], axis=1)
    df.to_csv(input_rouge_result_file)

df.head()

Unnamed: 0,review_id,subtask,target,prediction,exp_short,significantly_decreased_prediction,no_significant_difference_prediction,significantly_increased_prediction,bertscore_p,bertscore_r,...,rougeL_vs_input_abstract,rougeLsum_vs_input_abstract,rouge1_vs_input_title,rouge2_vs_input_title,rougeL_vs_input_title,rougeLsum_vs_input_title,rouge1_vs_background,rouge2_vs_background,rougeL_vs_background,rougeLsum_vs_background
0,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,EFD8HX,0.053528,0.049263,0.897209,0.816605,0.841818,...,"[0.05952380952380953, 0.06936416184971099, 0.0...","[0.05952380952380953, 0.06936416184971099, 0.0...","[0.17543859649122806, 0.0851063829787234, 0.04...","[0.07272727272727271, 0.0, 0.0, 0.039215686274...","[0.17543859649122806, 0.0851063829787234, 0.04...","[0.17543859649122806, 0.0851063829787234, 0.04...",0.242424,0.09375,0.151515,0.151515
1,24894943,MS2,Significant OS differences were observed in ol...,INTRODUCTION / BACKGROUND An individual patien...,MG3N0D,0.25475,0.220718,0.524532,0.841731,0.859274,...,"[0.044000000000000004, 0.05263157894736842, 0....","[0.044000000000000004, 0.05263157894736842, 0....","[0.07547169811320754, 0.046511627906976744, 0....","[0.0, 0.0, 0.0, 0.0]","[0.03773584905660377, 0.046511627906976744, 0....","[0.03773584905660377, 0.046511627906976744, 0....",1.0,1.0,1.0,1.0
2,24894943,MS2,Significant OS differences were observed in ol...,CONCLUSION / IMPLICATION S OF KEY FINDINGS In ...,W6E2CQ,0.189049,0.180032,0.630919,0.817489,0.82791,...,"[0.020833333333333332, 0.024844720496894408, 0...","[0.020833333333333332, 0.024844720496894408, 0...","[0.060606060606060615, 0.08695652173913043, 0....","[0.0, 0.0, 0.0, 0.0]","[0.060606060606060615, 0.08695652173913043, 0....","[0.060606060606060615, 0.08695652173913043, 0....",0.238095,0.15,0.238095,0.238095
3,24894943,MS2,Significant OS differences were observed in ol...,backgroundthe aim of this study was to compare...,RDZ7K5,0.229608,0.35056,0.419832,0.848861,0.807682,...,"[0.045454545454545456, 0.030674846625766868, 0...","[0.045454545454545456, 0.030674846625766868, 0...","[0.16216216216216214, 0.07407407407407407, 0.0...","[0.0, 0.0, 0.0, 0.0]","[0.10810810810810811, 0.07407407407407407, 0.0...","[0.10810810810810811, 0.07407407407407407, 0.0...",0.173913,0.0,0.130435,0.130435
4,24894943,MS2,Significant OS differences were observed in ol...,Conclusions : The results of this meta- analys...,9EKG14,0.029836,0.016749,0.953415,0.854012,0.833715,...,"[0.048, 0.07017543859649124, 0.065395095367847...","[0.048, 0.07017543859649124, 0.065395095367847...","[0.07547169811320754, 0.046511627906976744, 0....","[0.0, 0.0, 0.0, 0.0]","[0.07547169811320754, 0.046511627906976744, 0....","[0.07547169811320754, 0.046511627906976744, 0....",0.322581,0.066667,0.258065,0.258065


In [88]:
df['max_rouge1_vs_input'] = df['rouge1_vs_input_abstract'].apply(max)
df['max_rouge1_vs_input_position'] = df['rouge1_vs_input_abstract'].apply(np.argmax)
df['rouge1_vs_reference_is_higher'] = df['max_rouge1_vs_input'] > df['rouge1_f']
df['rouge1_vs_reference_delta'] = df['max_rouge1_vs_input'] - df['rouge1_f']

exp_counts = Counter(df['exp_short'])
print(exp_counts)

for exp in exp_counts.keys():
    print(exp)
    df_ = df[df['exp_short'] == exp]
    print("Is it copying an input?")
    print('fraction of times input is closer than target', np.mean(df_['rouge1_vs_reference_is_higher']))
    print('mean/std diff max rouge1(input, pred) vs rouge1(input, pred)', np.mean(df_['rouge1_vs_reference_delta']), np.std(df_['rouge1_vs_reference_delta']))

# print("Is it doing an implicit synthesis by selecting an input?")
# compute what the target says by ei
# compute what the best input/prediction match (by rouge) says by ei
# compute what the prediction says by ei
# how many times does the prediction ei equal the best target?
# vs how many times does the target ei match the prediction?
# vs how many times does the target ei match the best input/prediction?

# should this use the class or the delta ei?
# TODO use the evidence inference nonsense here

Counter({'EFD8HX': 1667, 'MG3N0D': 1667, 'W6E2CQ': 1667, '9EKG14': 1667, 'V0PMWV': 1667, '8ZAR37': 1667, 'WHTYYD': 1667, 'RDZ7K5': 1666, 'SPNXTA': 470, '6GBRY0': 470, '8FWF5T': 470, 'JB6Z8F': 470, 'VNCH8M': 470, 'AQ85CE': 466, '5VR9DD': 466, 'JX1AJ6': 466, 'PX7SGV': 466, 'RQD4RK': 444})
EFD8HX
Is it copying an input?
number of times input is closer than target 0.1943611277744451
mean/std diff max rouge1(input, pred) vs rouge1(input, pred) -0.10258055586849103 0.12725985617078314
MG3N0D
Is it copying an input?
number of times input is closer than target 0.35332933413317336
mean/std diff max rouge1(input, pred) vs rouge1(input, pred) -0.04158278760996608 0.12232860255005393
W6E2CQ
Is it copying an input?
number of times input is closer than target 0.1739652069586083
mean/std diff max rouge1(input, pred) vs rouge1(input, pred) -0.11015035681379805 0.12554688522177185
RDZ7K5
Is it copying an input?
number of times input is closer than target 0.40456182472989194
mean/std diff max rouge1(inp

In [87]:

print("For conditioned models, do these tend to copy from the prompt?")
print("For unconditioned models, do the outputs tend to look like the prompt (that they did not have access to!)?")
# does not say the background was used!
background_available = df[df['background'].notnull()].copy()
exp_counts_with_available_background = Counter(background_available['exp_short'])
print(exp_counts_with_available_background)

background_available['rouge1_vs_background_is_higher'] = background_available['rouge1_vs_background'] > background_available['rouge1_f']
background_available['rouge1_vs_background_diff'] = background_available['rouge1_vs_background'] - background_available['rouge1_f']

for exp in exp_counts_with_available_background.keys():
    # yes yes I should use a groupby
    background_available_ = background_available[background_available['exp_short'] == exp]
    print(exp)
    print('fraction of times the background is closer than the target', np.mean(background_available_['rouge1_vs_background_is_higher']))
    print('mean/std diff rouge1(background, pred) - rouge1(target, pred)', np.mean(background_available_['rouge1_vs_background_diff']), np.std(background_available_['rouge1_vs_background_diff']))

For conditioned models, do these tend to copy from the prompt?
For unconditioned models, do the outputs tend to look like the prompt (that they did not have access to!)?
Counter({'EFD8HX': 1667, 'MG3N0D': 1667, 'W6E2CQ': 1667, '9EKG14': 1667, 'V0PMWV': 1667, '8ZAR37': 1667, 'WHTYYD': 1667, 'RDZ7K5': 1666})
EFD8HX
number of times the background is closer than the target 0.6664667066586683
mean/std diff rouge1(background, pred) - rouge1(target, pred) 0.06477661251324623 0.1601050581250956
MG3N0D
number of times the background is closer than the target 1.0
mean/std diff rouge1(background, pred) - rouge1(target, pred) 0.7280145722774497 0.11052521521978258
W6E2CQ
number of times the background is closer than the target 0.5872825434913017
mean/std diff rouge1(background, pred) - rouge1(target, pred) 0.036911253704006775 0.14299161578338265
RDZ7K5
number of times the background is closer than the target 0.5342136854741897
mean/std diff rouge1(background, pred) - rouge1(target, pred) -0.00413