## Evaluation Metrics using Entities    


Our own metrics to capture/measure the hallucination, missing facts effect as well as faithfulness of a model.   

In [12]:
import re
import numpy as np

# for matching entities in generated/original sections
from nltk.tokenize import wordpunct_tokenize

### Evaluate BART generations

**Note**: inputs are same as for the T5 model

In [13]:
# old - from Ruslan
#PATH_FINE_TUNING_ENTITIES = "/home/angelo_ziletti/nlg-ra/T5_experiments/T5_plain/input_data/train.source"
#PATH_TEST_INPUTS = "/home/angelo_ziletti/nlg-ra/T5_experiments/T5_plain/input_data/test.source"
#PATH_TEST_OUTPUTS = "/home/angelo_ziletti/nlg-ra/results/bart/test_generations_beam_1.txt"
#PATH_ORIGINAL_OUTPUTS = "/home/angelo_ziletti/nlg-ra/T5_experiments/T5_plain/input_data/test.target"

In [224]:
import os.path

# this is fixed 
PATH_FINE_TUNING_ENTITIES = "/home/angelo_ziletti/nlg-ra/T5_experiments/T5_plain/input_data/train.source"

ROOT_FOLDER = "/home/angelo_ziletti/nlg-ra/T5_experiments"

#MODEL = "T5_plain"
#PARTIAL_PATH_GENERATED = "hp_1e_3_1/test_generations_beam_1.txt"
#MODEL = "T5_condition"
MODEL = "T5_condition_semantics"
#MODEL = "BART_base"
#MODEL = "BART_condition"
#MODEL = "BART_condition_semantics"


PARTIAL_PATH_GENERATED = "outputs/test_generations_beam_1.txt"
PARTIAL_PATH_ORIGINAL = "input_data/test.target"
PARTIAL_PATH_TEST_INPUTS = "input_data/test.source"

PATH_GENERATED = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_GENERATED)))
PATH_ORIGINAL = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_ORIGINAL)))
PATH_TEST_INPUTS = os.path.abspath(os.path.normpath(os.path.join(ROOT_FOLDER, MODEL, PARTIAL_PATH_TEST_INPUTS)))


PATH_TEST_OUTPUTS = PATH_GENERATED
PATH_ORIGINAL_OUTPUTS = PATH_ORIGINAL

print("PATH_TEST_INPUTS: {}".format(PATH_TEST_INPUTS))
print("PATH_TEST_OUTPUTS: {}".format(PATH_TEST_OUTPUTS))
print("PATH_ORIGINAL_OUTPUTS: {}".format(PATH_ORIGINAL_OUTPUTS))
print("PATH_FINE_TUNING_ENTITIES: {}".format(PATH_FINE_TUNING_ENTITIES))

PATH_TEST_INPUTS: /home/angelo_ziletti/nlg-ra/T5_experiments/T5_condition_semantics/input_data/test.source
PATH_TEST_OUTPUTS: /home/angelo_ziletti/nlg-ra/T5_experiments/T5_condition_semantics/outputs/test_generations_beam_1.txt
PATH_ORIGINAL_OUTPUTS: /home/angelo_ziletti/nlg-ra/T5_experiments/T5_condition_semantics/input_data/test.target
PATH_FINE_TUNING_ENTITIES: /home/angelo_ziletti/nlg-ra/T5_experiments/T5_plain/input_data/train.source


Load data

In [225]:
with open(PATH_TEST_OUTPUTS) as f:
    test_outputs = [line.strip() for line in f]

In [226]:
with open(PATH_ORIGINAL_OUTPUTS) as f:
    original_outputs = [line.strip() for line in f]

In [227]:
with open(PATH_FINE_TUNING_ENTITIES) as f:
    finetuning_entities = [line.strip() for line in f]

In [228]:
with open(PATH_TEST_INPUTS) as f:
    test_inputs = [line.strip() for line in f]

In [229]:
# check 
assert len(original_outputs) == len(test_outputs)
assert len(test_inputs) == len(test_outputs)

### Get all the entities used in fine-tuning step

(under the assumption that that's where hallucination comes from)    

Basically get all the unique entities used during fine-tuning a model aka all the unique entities from train input dataset

In [230]:
# keys - unique entities from all the section across train dataset
entities_finetuning = dict()

for section_entities in finetuning_entities:
    
    # take into account only entities - uncomment in case there is conditioning on a section
    # section_entities = section_entities.split(":", 1)[1]
    
    # get only the entity_value
    section_entities = re.sub('<[^>]+>', '', section_entities)
    section_entities = section_entities.split()
    
    for entity in section_entities:
        
        # case when entity is long
        words_in_entity = entity.split("_")
        detected_entity = " ".join(words_in_entity)
        
        if detected_entity in entities_finetuning: entities_finetuning[detected_entity] += 1
        else: entities_finetuning[detected_entity] = 1

In [231]:
# number of unique entities in fine-tuning
len(entities_finetuning)

54797

In [234]:
# check out the most popular entities (TOP_ENTITIES) in train input dataset

sorted_entities_finetuning = {k: v for k, v in sorted(entities_finetuning.items(), key=lambda item: item[1], reverse=True)}

TOP_ENTITIES = 20

for entity in sorted_entities_finetuning:
    print(entity, sorted_entities_finetuning[entity])
    TOP_ENTITIES -= 1
    if TOP_ENTITIES == 0: break

side effects 9361
this medicine 8540
1 6815
pregnant 3635
10 3372
treatment 3212
medicines 2618
2 2434
100 2194
pregnancy 2085
other medicines 2021
symptoms 1629
fever 1557
4 1537
3 1434
6 1424
vomiting 1321
dizziness 1282
skin 1276
pain 1234


### Get unique entities per each sample in test set   

In [235]:
print('Num. of test inputs:', len(test_inputs))

Num. of test inputs: 742


In [236]:
# list of dicts where each dict is a collection of (entity, num. of times it appears in corresponding *test_inputs*)
entities_inputs = list()

for section_entities in test_inputs:
    
    # take into account only entities (remove section) - uncomment in case there is conditioning on a section
    # section_entities = section_entities.split(":", 1)[1]
    
    # get only the entity_value
    section_entities = re.sub('<[^>]+>', '', section_entities)
    section_entities = section_entities.split()
    
    # keep unique entities for the current section_entities in test_input
    current_section_entities = dict()
    
    for entity in section_entities:
        
        # case when entity is long
        words_in_entity = entity.split("_")
        detected_entity = " ".join(words_in_entity)
        
        if detected_entity in current_section_entities: current_section_entities[detected_entity] += 1
        else: current_section_entities[detected_entity] = 1
    
    # add to the list
    entities_inputs.append(current_section_entities)

In [237]:
print('[Before] Test input of one section looks like: ')
print(test_inputs[0])

[Before] Test input of one section looks like: 
what the medicine is and what it is used for: <PRODUCT_NAME> incivo </PRODUCT_NAME> <PROBLEM> the_virus </PROBLEM> <DX_NAME> hepatitis_c_infection </DX_NAME> <PROBLEM> chronic_hepatitis_c_infection </PROBLEM> <AGE> 1865 </AGE> <GENERIC_NAME> peginterferon_alfa </GENERIC_NAME> <GENERIC_NAME> ribavirin </GENERIC_NAME> <GENERIC_NAME> telaprevir </GENERIC_NAME> <TREATMENT> medicines </TREATMENT> <TREATMENT_NAME> ns3-4a_protease_inhibitors </TREATMENT_NAME> <TREATMENT> the_ns3-4a_protease_inhibitor </TREATMENT> <DX_NAME> hepatitis_c_virus </DX_NAME> <GENERIC_NAME> peginterferon_alfa </GENERIC_NAME> <GENERIC_NAME> ribavirin </GENERIC_NAME> <TREATMENT> incivo </TREATMENT> <PROBLEM> chronic_hepatitis_c_infection </PROBLEM> <PROBLEM> chronic_hepatitis_c_infection </PROBLEM> <TIME_TO_TREATMENT_NAME> previously </TIME_TO_TREATMENT_NAME> <TREATMENT> an_interferon-based_regimen </TREATMENT>


In [238]:
print('[After] Test input of one section looks like: ')
print(entities_inputs[0])

[After] Test input of one section looks like: 
{'what': 2, 'the': 1, 'medicine': 1, 'is': 2, 'and': 1, 'it': 1, 'used': 1, 'for:': 1, 'incivo': 2, 'the virus': 1, 'hepatitis c infection': 1, 'chronic hepatitis c infection': 3, '1865': 1, 'peginterferon alfa': 2, 'ribavirin': 2, 'telaprevir': 1, 'medicines': 1, 'ns3-4a protease inhibitors': 1, 'the ns3-4a protease inhibitor': 1, 'hepatitis c virus': 1, 'previously': 1, 'an interferon-based regimen': 1}


In [239]:
assert len(entities_inputs) == len(test_inputs)

## Now the fun part - metrics calculation - Faithfulness, Missing facts, Hallucination

In [240]:
test_outputs[0]

'incivo works by stopping the virus from multiplying in the body . inciva is used to treat chronic hepatitis c infection in adults ( including those of 1865 years of age ) in combination with peginterferon alfa - 2b , ribavirin and telaprevir , which are medicines called ns3 - 4a protease inhibitors . the nn3 -4a proteace inhibitor is a natural antibody that helps to protect you from hepatis b virus . pegInterferon atfa and rib virin are the active ingredients of inciVO . what is chronic : incive is used for treating chronic hempatiti c disease in adults . it is used in addition to long - term patients with chronic hc infection who have not previously responded to an'

**Note**: in the generations, punctuations are treated separately.

In [241]:
entities_inputs[0]

{'what': 2,
 'the': 1,
 'medicine': 1,
 'is': 2,
 'and': 1,
 'it': 1,
 'used': 1,
 'for:': 1,
 'incivo': 2,
 'the virus': 1,
 'hepatitis c infection': 1,
 'chronic hepatitis c infection': 3,
 '1865': 1,
 'peginterferon alfa': 2,
 'ribavirin': 2,
 'telaprevir': 1,
 'medicines': 1,
 'ns3-4a protease inhibitors': 1,
 'the ns3-4a protease inhibitor': 1,
 'hepatitis c virus': 1,
 'previously': 1,
 'an interferon-based regimen': 1}

In [242]:
# for each test sample, keep track on entities from entities_inputs included in the test_output
ENTITIES_INCLUDED = []

# for each test sample, keep track on entities from entities_inputs NOT included in the test_output
ENTITIES_NOT_INCLUDED = []

for section_index, section_entities in enumerate(entities_inputs):
    
    # get the corresponding test output for the current section_entities
    curr_output = test_outputs[section_index]
    
    # for each test sample, keep trach of entities included in generated text and not included
    curr_entities_present = []
    curr_entities_not_present = []
    
    for entity in section_entities:
        
        # in case entity has punctuation, tokenize it in order to match the style of test_output
        entity = wordpunct_tokenize(entity)
        entity = " ".join(entity)  
        
        if entity in curr_output:
            
            # important - check that there is a whitespace around entity in curr_output
            start_entity = curr_output.index(entity)
            end_entity = curr_output.index(entity) + len(entity)
            
            ## make sure that entity is not a substring of a word
            
            if not curr_output[start_entity-1].isalpha() and end_entity >= len(curr_output):
                curr_entities_present.append(entity)
            elif not curr_output[start_entity-1].isalpha() and not curr_output[end_entity].isalpha():
                curr_entities_present.append(entity)
            else:
                curr_entities_not_present.append(entity)
        else:
            curr_entities_not_present.append(entity)
    
    ENTITIES_INCLUDED.append(curr_entities_present)
    ENTITIES_NOT_INCLUDED.append(curr_entities_not_present)

In [243]:
len(ENTITIES_INCLUDED)

742

In [244]:
len(ENTITIES_NOT_INCLUDED)

742

### Measuring the Hallucination Effect

In [214]:
def _is_digit_letter(char):
    """ Helper function to check whether character is digit or a letter"""
    if char.isdigit() or char.isalpha():
        return True
    
    return False

In [215]:
# save hallucination entities per each test sample
# list of lists

HALLUCINATED_ENTITIES = []

for sample_ind in range(len(test_outputs)):
    
    # get generated text at this index
    generated_section = test_outputs[sample_ind]
    
    # get input entities at this index
    input_entities = entities_inputs[sample_ind]
    
    curr_hallucination_entities = []
    
    # for each unique from train input entities (used in fine-tuning)
    # find out entities present in test_output but not in corresponding entities_inputs
    for entity in entities_finetuning:
        
        # tokenize entity in order to check whether it is present in text generation
        entity_token = wordpunct_tokenize(entity)
        entity_token = " ".join(entity)
        
        # input_entities and entity have same format so do not check whether entity_token in input_entities
        if entity_token in generated_section and entity not in input_entities:
            
            # make sure that it is indeed <entity> in generated_section and not a substring of a token
            # important - check that there is a whitespace around entity in curr_output
            start_entity = generated_section.index(entity_token)
            end_entity = generated_section.index(entity_token) + len(entity_token)
            
            # edge cases when entity - at the begining
            if start_entity == 0 and not _is_digit_letter(generated_section[end_entity]):
                curr_hallucination_entities.append(entity_token)
            # edge cases when entity - at the end
            elif not _is_digit_letter(generated_section[start_entity-1]) and end_entity >= len(generated_section):
                curr_hallucination_entities.append(entity_token)
            elif not _is_digit_letter(generated_section[start_entity-1]) and not _is_digit_letter(generated_section[end_entity]):
                curr_hallucination_entities.append(entity_token)
            
            # else - current entity is part of a token in generated text - do not count such entity
    
    HALLUCINATED_ENTITIES.append(curr_hallucination_entities)

In [216]:
# save hallucination entities per each test sample
# list of lists

# old version - I am checking only whether char to the left/right is not letter, but need also to check it not a digit!
"""
HALLUCINATED_ENTITIES = []

for sample_ind in range(len(test_outputs)):
    
    # get generated text at this index
    generated_section = test_outputs[sample_ind]
    
    # get input entities at this index
    input_entities = entities_inputs[sample_ind]
    
    curr_hallucination_entities = []
    
    # for each unique from train input entities (used in fine-tuning)
    # find out entities present in test_output but not in corresponding entities_inputs
    for entity in entities_finetuning:
        
        # tokenize entity in order to check whether it is present in text generation
        entity_token = wordpunct_tokenize(entity)
        entity_token = " ".join(entity)
        
        # input_entities and entity have same format so do not check whether entity_token in input_entities
        if entity_token in generated_section and entity not in input_entities:
            
            # make sure that it is indeed <entity> in generated_section and not a substring of a token
            # important - check that there is a whitespace around entity in curr_output
            start_entity = generated_section.index(entity_token)
            end_entity = generated_section.index(entity_token) + len(entity_token)
            
            # edge cases when entity - at the begining
            if start_entity == 0 and not generated_section[end_entity].isalpha():
                curr_hallucination_entities.append(entity_token)
            # edge cases when entity - at the end
            elif not generated_section[start_entity-1].isalpha() and end_entity >= len(generated_section):
                curr_hallucination_entities.append(entity_token)
            elif not generated_section[start_entity-1].isalpha() and not generated_section[end_entity].isalpha():
                curr_hallucination_entities.append(entity_token)
            
            # else - current entity is part of a token in generated text - do not count such entity
    
    HALLUCINATED_ENTITIES.append(curr_hallucination_entities)
"""

'\nHALLUCINATED_ENTITIES = []\n\nfor sample_ind in range(len(test_outputs)):\n    \n    # get generated text at this index\n    generated_section = test_outputs[sample_ind]\n    \n    # get input entities at this index\n    input_entities = entities_inputs[sample_ind]\n    \n    curr_hallucination_entities = []\n    \n    # for each unique from train input entities (used in fine-tuning)\n    # find out entities present in test_output but not in corresponding entities_inputs\n    for entity in entities_finetuning:\n        \n        # tokenize entity in order to check whether it is present in text generation\n        entity_token = wordpunct_tokenize(entity)\n        entity_token = " ".join(entity)\n        \n        # input_entities and entity have same format so do not check whether entity_token in input_entities\n        if entity_token in generated_section and entity not in input_entities:\n            \n            # make sure that it is indeed <entity> in generated_section and not

### Calculate metrics

In [217]:
# each element - entities from input included in generated output
len(ENTITIES_INCLUDED)

742

In [218]:
# each element - entities from input NOT included in generated output
len(ENTITIES_NOT_INCLUDED)

742

In [219]:
# each element - entities from the whole train entities present in generated output but not in the corresponding input
len(HALLUCINATED_ENTITIES)

742

In [220]:
ENTITIES_INCLUDED

[['incivo',
  'the virus',
  'hepatitis c infection',
  'peginterferon alfa',
  'ribavirin',
  'telaprevir',
  'medicines',
  'ns3 - 4a protease inhibitors',
  'hepatitis c virus'],
 ['incivo',
  'allergic',
  'telaprevir',
  'this medicine',
  'peginterferon alfa',
  'ribavirin',
  'severe side effects',
  '56',
  'the medicine alfuzosin',
  'symptoms',
  'an enlarged prostate',
  '1',
  'these medicines',
  'side effects',
  'medicine',
  'medicines'],
 ['this medicine',
  'the appropriate dose regimen',
  'the recommended dose regimen',
  '3',
  '6',
  '2',
  '8',
  'both hepatitis c virus infection',
  'human immunodeficiency virus infection',
  'efavirenz'],
 ['incivo',
  'all medicines',
  'this medicine',
  'side effects',
  'rash',
  'an itchy skin rash',
  'the rash',
  'other symptoms',
  'a severe skin reaction',
  'immediately',
  'a skin rash',
  'your rash',
  'a rash',
  'fever',
  'tiredness',
  'swelling of the face',
  'a wide - spread rash',
  'peeling skin',
  'flu 

In [221]:
ENTITIES_NOT_INCLUDED

[['chronic hepatitis c infection',
  '1865',
  'the ns3 - 4a protease inhibitor',
  'previously',
  'an interferon - based regimen'],
 ['their contraindications ( e . g . pregnancy precautions',
  'the following medicines',
  'alpha - 1 - adrenoreceptor antagonists',
  'amiodarone',
  'bepridil',
  'quinidine',
  'iii antiarrhythmics',
  'certain heart disorders',
  'irregular heart beat',
  'heart',
  'antiarrhythmics',
  'astemizole',
  'terfenadine',
  'allergy symptoms',
  'antihistamines',
  'rifampicin',
  'some infections',
  'tuberculosis',
  'antimycobacterial ) dihydroergotamine',
  'ergonovine',
  'ergotamine',
  'methylergonovine',
  'migraine',
  'headaches',
  'ergot derivatives',
  'cisapride',
  'some stomach conditions',
  'gastrointestinal motility agents',
  "st john ' s wort ( hypericum perforatum",
  'an herbal product',
  'anxiety',
  'atorvastatin',
  'lovastatin',
  'simvastatin',
  'lower cholesterol levels',
  'hmg coa reductase inhibitors',
  'pimozide',
  'p

In [222]:
# calculate metrics

FAITHFULNESS = []
MISSING_FACTS = []
HALLUCINATION = []

for section_idx in range(len(test_outputs)):
        
    # get generated text at this index
    generated_section = test_outputs[sample_ind]
    
    # get input entities at this index
    input_entities = entities_inputs[sample_ind]
    
    entities_included = ENTITIES_INCLUDED[sample_ind]
    
    entities_not_included = ENTITIES_NOT_INCLUDED[sample_ind]
    
    entities_hallucinated = HALLUCINATED_ENTITIES[sample_ind]
    
    FAITHFULNESS.append(len(entities_included) / len(input_entities))
    MISSING_FACTS.append(len(entities_not_included) / len(input_entities))
    HALLUCINATION.append(len(entities_hallucinated) / len(input_entities))

In [223]:
print("FAITHFULNESS:", np.mean(FAITHFULNESS))
print("MISSING_FACTS:", np.mean(MISSING_FACTS))
print("HALLUCINATION:", np.mean(HALLUCINATION))

FAITHFULNESS: 0.6399999999999999
MISSING_FACTS: 0.3599999999999999
HALLUCINATION: 0.0
