# Model evaluation based on predictions

In [None]:
import json
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
import evaluate
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Download the NLTK 'punkt' resource.
# NOTE(review): the code below only uses TreebankWordTokenizer; confirm whether 'punkt' is still required.
nltk.download('punkt')
# Load the seqeval metric through the `evaluate` library; it scores the label sequences further down.
seqeval = evaluate.load("seqeval")

In [None]:
def predicted_data(test_file, result_file):
    '''
    Pair every prediction in the result file with its test example.

    Both files are read as JSON Lines (one JSON object per line, blank lines
    are ignored). A result row is matched to the first not-yet-used test row
    whose gold annotations (``messages[2]["content"]``) equal the result's
    ``"original"`` field. Matching is order preserving: once a test row has
    been used, the search for the next result starts right after it, so two
    consecutive results with identical annotations are paired with two
    different test rows.

    A result that matches no remaining test row is skipped and does not
    affect the matching of later results.

    Parameters:
        test_file: path to the test set in JSONL format.
        result_file: path to the model output in JSONL format.

    Returns:
        A list of dicts with the keys ``"text"`` (``messages[1]``),
        ``"original"`` (``messages[2]``), ``"predicted"`` (from the result
        row) and ``"original_index"`` (``messages[3]``).
    '''
    with open(test_file, 'r', encoding='utf-8') as f:
        testdata = [json.loads(line) for line in f if line.strip()]

    with open(result_file, 'r', encoding='utf-8') as f:
        resdata = [json.loads(line) for line in f if line.strip()]

    combined_data = []
    next_test = 0  # index of the first test row that has not been matched yet

    for res in resdata:
        res_og = res["original"]

        for j in range(next_test, len(testdata)):
            messages = testdata[j]["messages"]
            test_og = messages[2]["content"]

            if test_og != res_og:
                continue

            combined_data.append({
                "text": messages[1]["content"],
                "original": test_og,
                "predicted": res["predicted"],
                "original_index": messages[3]["content"],
            })
            # Consume the matched row so that the next result cannot reuse it.
            next_test = j + 1
            break
        # No match: next_test is left untouched, only this result is dropped.

    return combined_data

In [None]:
def get_label_span(labels, text, model, count_pred):
    '''
    Locate the predicted entities in the text and return their token spans.

    Parameters:
        labels: predicted entities as a dict ``{label: entities}`` or as a
            JSON string encoding such a dict. ``entities`` is a list of
            strings or a single ``", "``-separated string.
        text: the example text the entities are searched in.
        model: key into the module-level ``in_text`` / ``not_in_text``
            statistics; only used when ``count_pred`` is true.
        count_pred: when true, update ``in_text[model][label]`` (counter of
            entities found in the text) and ``not_in_text[model][label]``
            (list of entities that were not found or are too short).

    Returns:
        A dict mapping ``(start, end)`` character offsets to the label, or
        ``None`` when the prediction is unusable (not valid JSON, not a dict,
        or an error occurred while processing it).

    Only labels listed in the module-level ``data_labels`` are considered.
    Matching is case-insensitive. Offsets refer to ``text`` exactly as it was
    passed in, so they can be compared with token spans computed on the same
    string by the caller.
    NOTE(review): lower-casing can change the length of a few non-ASCII
    characters, which would shift offsets - confirm this cannot occur in the data.
    '''
    if isinstance(labels, str):
        try:
            labels = json.loads(labels)
        except ValueError:
            # A malformed prediction is treated like any other unusable one
            # instead of aborting the evaluation of the whole file.
            return None

    if not isinstance(labels, dict):
        return None

    # Only the case is normalised. Stripping the text would shift every
    # offset whenever the text starts with whitespace.
    text = text.lower()
    tokenizer = TreebankWordTokenizer()
    label_spans = {}

    for label, entities in labels.items():
        try:
            if label not in data_labels or not entities:
                continue

            # In case the entities are given as one string, not a list
            if isinstance(entities, str):
                entities = entities.split(", ")

            # Normalise first, then drop duplicates, so that variants that
            # differ only in case or surrounding whitespace count once.
            entities = list(dict.fromkeys(entity.strip().lower() for entity in entities))

            for entity in entities:
                # Entities shorter than 2 characters are excluded, because
                # they would produce too many false positives.
                if len(entity) >= 2:
                    matches = list(re.finditer(re.escape(entity), text))
                else:
                    matches = []

                if not matches:
                    if count_pred:
                        not_in_text[model][label].append(entity)
                    continue

                if count_pred:
                    in_text[model][label] += 1

                # Token boundaries inside the entity are the same for every
                # occurrence, so they are computed once per entity.
                parts = list(tokenizer.span_tokenize(entity))

                for match in matches:
                    start, end = match.span()

                    # Entities that consist of multiple tokens are split
                    if len(parts) > 1:
                        for part_start, part_end in parts:
                            label_spans[(start + part_start, start + part_end)] = label
                    else:
                        label_spans[(start, end)] = label
        except Exception:
            # Best effort: only complete results are returned.
            return None

    return label_spans

In [None]:
def _model_from_result_file(result_file):
    '''
    Return the model key encoded in a result file name.

    The file name (without directories) is split on "_". The first field that
    is a key of the module-level ``in_text`` statistics is returned, e.g.
    "GPT-150" for "GPT-150_predictions.jsonl". If no field is a known model,
    the first field is returned.
    '''
    import os

    fields = os.path.basename(result_file).split("_")
    for field in fields:
        if field in in_text:
            return field
    return fields[0]


def _label_for_token(token, labels, token_spans):
    '''
    Return the label of one token span, or "O" when it has none.

    An exact span match wins. Otherwise the first labelled span that is not
    itself a token span and lies completely inside the token is used; this
    covers entities that the tokenizer split differently from the text.
    '''
    if token in labels:
        return labels[token]

    for span, label in labels.items():
        if span in token_spans:
            continue
        if int(span[0]) >= token[0] and int(span[1]) <= token[1]:
            return label

    return 'O'


def format_predictions(test_file, result_file):
    '''
    Format original and predicted labels token by token for seqeval.

    The examples are loaded with ``predicted_data``. Each text is tokenized
    with ``TreebankWordTokenizer`` and every token receives its original and
    its predicted label ("O" when there is none). Examples whose prediction
    cannot be processed (``get_label_span`` returns ``None``) are skipped.

    Parameters:
        test_file: path to the test set in JSONL format.
        result_file: path to the model output in JSONL format; the model key
            used for the ``in_text`` / ``not_in_text`` statistics is taken
            from its file name.

    Returns:
        Three parallel lists with one entry per evaluated example:
        the original labels, the predicted labels and the tokens.

    NOTE(review): labels are emitted without IOB prefixes ("Drug", not
    "B-Drug"); fix_labels documents that seqeval then drops the first letter
    of each label - confirm the resulting scores are the intended ones.
    '''
    combined_data = predicted_data(test_file, result_file)
    model = _model_from_result_file(result_file)
    tokenizer = TreebankWordTokenizer()

    original_by_token = []  # Original labels
    predicted_by_token = []  # Predicted labels
    examples_by_token = []  # Tokens

    for example in combined_data:
        original = example["original"]
        text = example["text"]
        original_label_indexes = example["original_index"]

        # The text is tokenized; tokens are identified by their (start, end) span
        token_spans = list(tokenizer.span_tokenize(text))
        known_spans = set(token_spans)

        # Predicted labels are split and assigned to spans
        predicted_labels = get_label_span(example["predicted"], text, model, True)
        if predicted_labels is None:
            continue

        # Original labels are split and assigned to spans
        original_labels = {}
        for key, spans in original_label_indexes.items():
            for i, span in enumerate(spans):
                parts = list(tokenizer.span_tokenize(original[key][i]))

                if len(parts) > 1:
                    # Entity consists of multiple tokens: one span per token,
                    # shifted by the position of the entity in the text.
                    beginning = int(span[0])
                    for part_start, part_end in parts:
                        original_labels[(part_start + beginning, part_end + beginning)] = key
                else:
                    original_labels[tuple(span)] = key

        # Labels are formatted as lists aligned with the tokens
        original_by_token.append(
            [_label_for_token(token, original_labels, known_spans) for token in token_spans]
        )
        predicted_by_token.append(
            [_label_for_token(token, predicted_labels, known_spans) for token in token_spans]
        )
        examples_by_token.append(list(tokenizer.tokenize(text)))

    return original_by_token, predicted_by_token, examples_by_token

## n2c2 data evaluation

In [None]:
# Configuration of the n2c2 run: test set, entity labels and evaluated models.
test_file = ""
data_labels = ["Drug", "Frequency", "Dosage", "ADE", "Reason", "Form", "Duration", "Route", "Strength"]
models = ["GPT", "GPT-150", "GPT-300", "GPT-700"]

# Per model and label statistics filled in by get_label_span:
# not_in_text collects predicted entities that were not found in the text,
# in_text counts the predicted entities that were found.
not_in_text = {}
in_text = {}

for model in models:
    not_in_text[model] = {name: [] for name in data_labels}
    in_text[model] = {name: 0 for name in data_labels}

# Prediction files, one per model.
res_file = "GPT_predictions.jsonl"
res_file150 = "GPT-150_predictions.jsonl"
res_file300 = "GPT-300_predictions.jsonl"
res_file700 = "GPT-700_predictions.jsonl"

# Original labels, predicted labels and tokens for every model.
ot, pt, t = format_predictions(test_file, res_file)
ot_150, pt_150, t_150 = format_predictions(test_file, res_file150)
ot_300, pt_300, t_300 = format_predictions(test_file, res_file300)
ot_700, pt_700, t_700 = format_predictions(test_file, res_file700)

data_per_model = {
    "GPT": [ot, pt, t],
    "GPT-150": [ot_150, pt_150, t_150],
    "GPT-300": [ot_300, pt_300, t_300],
    "GPT-700": [ot_700, pt_700, t_700],
}

## Synthetic data evaluation

In [None]:
# Configuration of the synthetic-data run: test set, entity labels and evaluated models.
test_file = ""
data_labels = ["DISEASE", "DRUG", "SMOKING", "PROCEDURE"]
models = ["GPT", "GPT-150", "GPT-300", "GPT-700"]

# Per model and label statistics filled in by get_label_span:
# not_in_text collects predicted entities that were not found in the text,
# in_text counts the predicted entities that were found.
not_in_text = {}
in_text = {}

for model in models:
    not_in_text[model] = {}
    in_text[model] = {}
    for label in data_labels:
        not_in_text[model][label] = []
        in_text[model][label] = 0

# Prediction files, one per model.
res_file = "GPT_synth_responses_format.jsonl"
res_file150 = "GPT-150_synth_responses.jsonl"
res_file300 = "GPT-300_synth_responses.jsonl"
res_file700 = "GPT-700_synth_responses.jsonl"

# Original labels, predicted labels and tokens for every model.
ot, pt, t = format_predictions(test_file, res_file)
ot_150, pt_150, t_150 = format_predictions(test_file, res_file150)
ot_300, pt_300, t_300 = format_predictions(test_file, res_file300)
ot_700, pt_700, t_700 = format_predictions(test_file, res_file700)

# Every model keeps the same triple [original, predicted, tokens].
data_per_model = {}
data_per_model["GPT"] = [ot, pt, t]
data_per_model["GPT-150"] = [ot_150, pt_150, t_150]
data_per_model["GPT-300"] = [ot_300, pt_300, t_300]
data_per_model["GPT-700"] = [ot_700, pt_700, t_700]

In [None]:
# Score every model with seqeval: predicted labels against the original labels.
res_gpt, res_150, res_300, res_700 = [
    seqeval.compute(predictions=predicted, references=gold)
    for gold, predicted in [(ot, pt), (ot_150, pt_150), (ot_300, pt_300), (ot_700, pt_700)]
]

# Plotting

In [None]:
def compare_models_by_label(metrics, result_list, models, title):
    '''
    Draw one grouped bar chart per metric comparing the models label by label.

    Parameters:
        metrics: metric names to plot; each must be one of "precision",
            "recall" or "f1".
        result_list: one dict per model mapping a label (including
            "Overall") to a dict of metric values, as produced by fix_labels.
        models: model names used in the legend, parallel to result_list.
        title: dataset name inserted into the plot title.

    The x axis shows the labels in the order in which they first appear in
    result_list, translated to Estonian. Labels without a known translation
    are shown as they are. A label or metric that is missing for a model is
    drawn as a bar of height 0.
    '''
    m_dict = {
        "precision": "täpsus",
        "recall": "saagis",
        "f1": "f1-skoor",
    }

    # Estonian display names, looked up by label instead of by position so
    # that the same function serves both datasets.
    label_names = {
        "Overall": "Üldine",
        # n2c2 data labels
        "ADE": "Kõrvalmõju",
        "Reason": "Põhjus",
        "Form": "Vorm",
        "Dosage": "Doos",
        "Route": "Viis",
        "Frequency": "Sagedus",
        "Drug": "Ravim",
        "Strength": "Tugevus",
        "Duration": "Kestus",
        # Synthetic data labels
        "DISEASE": "Haigus",
        "SMOKING": "Suitsetamine",
        "PROCEDURE": "Protseduur",
        "DRUG": "Ravim",
    }

    # Label keys in order of first appearance over all models.
    keys = list(dict.fromkeys(key for res in result_list for key in res))
    labels = [label_names.get(key, key) for key in keys]

    colors = ['blue', 'red', 'green', 'mediumvioletred']
    bar_width = 0.175
    index = np.arange(len(labels))

    for metric in metrics:
        plt.figure(figsize=(12, 6))

        for i, res in enumerate(result_list):
            values = [res.get(key, {}).get(metric, 0) for key in keys]
            # Colours repeat when there are more models than colours.
            plt.bar(index + i * bar_width, values, bar_width, label=models[i], color=colors[i % len(colors)])

        plt.title("Mudelite " + m_dict[metric] + " " + title + " andmetel")
        plt.xticks(index + bar_width + 0.074, labels)
        plt.ylim(0, 1)
        plt.legend()
        plt.grid(axis='y')
        plt.tight_layout()
        plt.show()

In [None]:
def fix_labels(res):
    '''
    Restore the full label names in a seqeval result.

    The seqeval package removes the first letter from the labels, so every
    result key is mapped back to the first entry of data_labels that contains
    it (case-insensitive). The "overall_*" scores are gathered under the key
    "Overall" with the prefix removed; the accuracy score is left out.
    '''
    fixed = {"Overall": {}}

    for name, value in res.items():
        lowered = name.lower()

        full_name = next(
            (candidate for candidate in data_labels if lowered in candidate.lower()),
            None,
        )
        if full_name is not None:
            fixed[full_name] = value

        if "overall" not in lowered:
            continue

        metric_name = name.split("_")[1]
        if "acc" in metric_name:
            continue
        fixed["Overall"][metric_name] = value

    return fixed

In [None]:
models = ["GPT", "GPT-150", "GPT-300", "GPT-700"]

# Raw seqeval results per model and the same results with restored label names.
result_list_b = [res_gpt, res_150, res_300, res_700]
result_list = [fix_labels(r) for r in result_list_b]

# Synthetic data evaluation: use these two lines instead of the n2c2 pair below.
#title = "sünteetilistel"
#metrics = list(result_list[0]['DRUG'].keys())[:-1]

# n2c2 data evaluation
# The last key of a label's result is dropped (presumably the entity count "number" - confirm).
title = "n2c2"
metrics = list(result_list[0]['Drug'].keys())[:-1]

compare_models_by_label(metrics, result_list, models, title)

In [None]:
# Overall scores of every model, in the order of result_list
recalls = [result['Overall']['recall'] for result in result_list]
precisions = [result['Overall']['precision'] for result in result_list]
f1 = [result['Overall']['f1'] for result in result_list]

x_values = range(1, len(result_list) + 1)

# One line per metric; the plotting order must match the legend entries below
plt.plot(x_values, recalls, marker='o')
plt.plot(x_values, precisions, marker='o')
plt.plot(x_values, f1, marker='o')

# Axes, legend and title
plt.ylim(0, 1)
plt.xticks(x_values, models)

plt.legend(["Saagis", "Täpsus", "F1-skoor"])
plt.grid(axis = 'y')
plt.xlabel('Mudel')
plt.title('Mudelid n2c2 andmestikul')

# The figure is saved before plt.show()
plt.savefig("mid_results/overall_plot_"+ title +"_est.jpg")
plt.show()