# Evaluation of predictions

## Loading the dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 23.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 48.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 32.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 24.5 M

In [19]:
from datasets import load_dataset
cuad = load_dataset("cuad")



  0%|          | 0/2 [00:00<?, ?it/s]

## Loading the predictions
The predictions were made using JSON files. We must load them and store them in values.

In [20]:
import json

predicted_values = []
confidence_scores = []
# Opening JSON file
f = open("/content/drive/MyDrive/CSI5386-Assignment_2/trained-models/bert-CUAD/predictions-bert-cuad-checkpoint-4000.json")

# returns JSON object as 
# a dictionary
data = json.load(f)

print(len(data))

for i in range(len(data)):
    # res = list(i.keys())[1]
    # print(res)
    # print(data[i].keys())
    predicted_value  = data[i].get(list(data[i].keys())[0])
    confidence_score = data[i].get(list(data[i].keys())[1])

    if (confidence_score<0.000001):
        predicted_value=""
    
    # print(str(predicted_value)+" with confidence of "+str(confidence_score))
    predicted_values.append(predicted_value)
    confidence_scores.append(confidence_score)
  
# Closing file
f.close()

1897


## Create the new n_best prediction file


In [18]:
file_name  = "/content/drive/MyDrive/CSI5386 : Assignment 2/trained-models/legal-bert-CUAD/nbest_predictions_.json"

file = {}
for index in range(len(data)):

    title = (list(data[index].keys()))[0]
    
    inputs = {   
        "text": predicted_values[index],
        "probability": confidence_scores[index],
    }

    
    file[title]=[inputs]
with open(file_name, "w") as outfile:
    json.dump(file,outfile,indent=4)


## Evaluation of the models

In [21]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        #print("TEXT : "+text)
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

def get_gold_answers(example):
    """helper function that retrieves all possible true answers from a cuad example"""
    
    if (len(example["text"])==0):
      gold_answers = [""]
    else:
      gold_answers = example["text"]

    # if gold_answers doesn't exist it's because this is a negative example - 
    # the only correct answer is an empty string
    
        
    return gold_answers

In [22]:
import numpy as np

f1_scores = []
em_scores = []

for index in range(len(predicted_values)):
    gold_answers = get_gold_answers(cuad["test"]["answers"][index])
    f1_score = max((compute_f1(predicted_values[index], answer)) for answer in gold_answers)
    em_score = max((compute_exact_match(predicted_values[index], answer)) for answer in gold_answers)
    f1_scores.append(f1_score)
    em_scores.append(em_score)

print("Average F1 : "+str(np.mean(f1_scores)))
print("Average EM : "+str(np.mean(em_scores)))
    

Average F1 : 0.7367830797976926
Average EM : 0.7111228255139694
