In [None]:
import json
import os
import pickle
import subprocess
import sys

import numpy as np
from sklearn.metrics import classification_report
import tensorflow as tf

sys.path.append("../..")
from src.training.dataset_utils import read_conll_file, examples_to_indices
from src.training.train_utils import evaluate_ner, form_ner_pred_matrix

In [None]:
# All inputs for this evaluation run live under one base directory; change
# `base_path` to relocate every path below.
base_path = "/Users/Carol/Google Drive/"
dev_file = os.path.join(base_path, "nlp_data/recipe_data/20200523_food_gold_dev.conll")

# Directory holding the artifacts (model weights + vocab mappings) of the training run being evaluated.
run_dir = os.path.join(base_path, "nlp_data/output/20200523_22_06_28")
model_file = os.path.join(run_dir, "20200523_22_06_28_food_ner_epoch_5_dev_f1_0.9851520816163143.h5")
mappings_file = os.path.join(run_dir, "20200523_22_06_28_food_ner_mappings.pkl")

In [None]:
# Load the saved Keras model from the .h5 file configured in `model_file`.
model = tf.keras.models.load_model(model_file)

In [None]:
# Load the vocabularies saved alongside the model.
# NOTE: pickle can execute arbitrary code on load -- only open mapping files you produced yourself.
with open(mappings_file, "rb") as mappings_handle:  # context manager closes the file handle
    mappings = pickle.load(mappings_handle)
label_to_index = mappings['label_to_index']
token_to_index = mappings['token_to_index']
# Inverse lookup, used to turn predicted/true label indices back into label strings.
index_to_label = {v: k for k, v in label_to_index.items()}

In [None]:
# Read the dev set from its CoNLL file, then pass it through examples_to_indices
# together with the label and token vocabularies loaded from the mappings pickle.
# Each resulting item is read below via its 'tokens', 'labels' and 'raw_tokens' keys.
# NOTE(review): presumably 'tokens'/'labels' hold integer ids -- confirm in src.training.dataset_utils.
dev_dataset = read_conll_file(dev_file)
dev_sentences = examples_to_indices(dev_dataset, label_to_index, token_to_index)

In [None]:
# (index, label) pairs ordered by label index, so that report rows can be named reliably.
label_mappings = sorted(index_to_label.items())
label_indices = [index for index, _ in label_mappings]
label_strings = [name for _, name in label_mappings]

# Flattened, token-level predictions and gold labels over the whole dev set.
y_pred = []
y_true = []
for sent in dev_sentences:
    preds = model.predict_on_batch(form_ner_pred_matrix(sent['tokens']))
    # argmax over the last axis picks the best label per token; [0] takes the first row of the batch
    # (assumes form_ner_pred_matrix builds a batch of exactly one sentence -- TODO confirm).
    y_pred.extend(np.argmax(preds, axis=-1)[0])
    y_true.extend(sent['labels'])

# Pass `labels` explicitly: without it, target_names are matched against whichever
# label indices happen to occur in the data, which mislabels (or rejects) the report
# when some label never appears in y_true/y_pred.
metrics = classification_report(y_true, y_pred, labels=label_indices,
                                target_names=label_strings,
                                output_dict=False)


In [None]:
# Token-level metrics: the text report produced by classification_report in the previous cell.
print(metrics)

              precision    recall  f1-score   support

      I-FOOD       0.93      0.86      0.89       278
           O       0.99      0.99      0.99      9464
      B-FOOD       0.95      0.94      0.95      1175

    accuracy                           0.99     10917
   macro avg       0.96      0.93      0.94     10917
weighted avg       0.99      0.99      0.99     10917



In [None]:
# Dump "token gold predicted" lines (blank line between sentences) so the conll perl
# script can compute entity-level metrics.
outfile = "/Users/Carol/Google Drive/nlp_data/output/20200523_22_06_28/dev_conll.txt"
with open(outfile, "w") as out:
    # y_true / y_pred are flat across sentences, so keep a running position into them
    ctr = 0
    for doc in dev_sentences:
        for token in doc['raw_tokens']:
            gold_label = index_to_label[y_true[ctr]]
            predicted_label = index_to_label[y_pred[ctr]]
            out.write(" ".join([f"{token}", gold_label, predicted_label]) + "\n")
            ctr += 1
        # sentence boundary
        out.write("\n")


In [None]:
# Run the conlleval perl script over the CoNLL-format predictions to get entity-level metrics.
infile = "/Users/Carol/Google Drive/nlp_data/output/20200523_22_06_28/dev_conll.txt"
outfile = '/Users/Carol/Google Drive/nlp_data/output/20200523_22_06_28/eval.txt'
conlleval_script_path = "/Users/Carol/Dropbox/repos/food/src/evaluation"

# Feed the files through stdin/stdout handles instead of building a shell command:
# no quoting issues with spaces in the paths, and no shell involved.
# `cwd=` runs perl inside the script directory without changing the notebook's own
# working directory (which relative paths such as "../.." depend on).
with open(infile, "rb") as conll_in, open(outfile, "wb") as eval_out:
    completed = subprocess.run(
        ["perl", "conlleval.pl"],
        stdin=conll_in,
        stdout=eval_out,
        cwd=conlleval_script_path,
        check=True,  # fail loudly instead of leaving an empty/partial eval file
    )
completed.returncode

0

# Update June 2023 for BERT

In [1]:
LOCAL = False   # training on local Mac vs. in Colab

import json
import os
import sys
import subprocess



if LOCAL:
    BASE_PATH = "/Users/carolanderson/Dropbox/"

else:
    BASE_PATH = "/content/drive/My Drive/"
    from google.colab import drive
    drive.mount('/content/drive')
    ! pip install transformers[torch]
    ! pip install sklearn


from sklearn.metrics import classification_report
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

Mounted at /content/drive
Collecting transformers[torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [60]:
def read_conll_file(file):
    """
    Given a file in CoNLL format, read in tokens and labels. Treat each sentence as a training example.

    Sentences are separated by blank (whitespace-only) lines. The final sentence is returned
    even when the file does not end with a blank line.

    :param file: path to a file in CoNLL format; tokens are assumed to be in the first column
        and labels in the last column (any columns in between are ignored).
    :returns a nested list, in which each sublist is a sentence and contains a sublist [token, label] for each token.
    :raises ValueError: if a non-blank line has fewer than two whitespace-separated columns.

    .. note:: Ignores document boundaries and treats each sentence as an independent training example.
    """
    documents = []  # holds all sentences read so far
    sentence = []  # tokens of the sentence currently being read
    with open(file, 'r') as infile:
        for line in infile:
            if '-DOCSTART-' in line:  # beginning of a new document; ignore this since we will treat each sentence as a training example
                continue
            columns = line.split()
            if not columns:  # blank line: end of the current sentence
                if sentence:
                    documents.append(sentence)
                sentence = []
            else:
                token, *other_columns, label = columns
                sentence.append([token, label])
    # Flush the last sentence: without this it is silently dropped when the file
    # has no trailing blank line.
    if sentence:
        documents.append(sentence)
    return documents


def detokenize_conll_input(dataset):
    '''
    Rebuild plain-text sentences from CoNLL tokens and record each token's character span.

    Every sentence's tokens are joined with single spaces. For each token a dict is
    created holding its text, its true label, and its start/end character offsets
    within the rejoined sentence (end is exclusive).

    Parameters
    ----------
    dataset: a list of sentences, each a list of [token, label] pairs, e.g.
    [[['Melt', 'O'],
     ['chocolate', 'B-FOOD'],
     ['in', 'O']]]

    Returns
    -------
    dev_tokens: a list with one entry per sentence; each entry is a list of token dicts, e.g.
    [[{'text': 'Melt', 'true_label': 'O', 'start': 0, 'end': 4},
      {'text': 'chocolate', 'true_label': 'B-FOOD', 'start': 5, 'end': 14},
      {'text': 'in', 'true_label': 'O', 'start': 15, 'end': 17}]]
    dev_sentences: a list of strings, where each one is a rejoined sentence
    '''
    dev_tokens, dev_sentences = [], []

    for example in dataset:
        sentence_text = " ".join(pair[0] for pair in example)
        token_dicts = []
        cursor = 0  # character offset where the next token begins
        for pair in example:
            word, gold = pair[0], pair[1]
            stop = cursor + len(word)
            # sanity check: the computed span must reproduce the token text
            assert word == sentence_text[cursor:stop]
            token_dicts.append(
                {"text": word, "true_label": gold, "start": cursor, "end": stop}
            )
            cursor = stop + 1  # step over the joining space
        dev_tokens.append(token_dicts)
        dev_sentences.append(sentence_text)

    return dev_tokens, dev_sentences


def add_predictions_to_tokens(predictions, token_list):
    '''Assign a predicted label to every original token of a single sentence.

    A token receives the label of the prediction whose 'start' offset equals the token's
    'start' offset, i.e. the prediction for the token's first subtoken. Predictions that
    begin in the middle of a token (later subtokens) are ignored. Tokens without a
    matching prediction are labelled 'O'.

    Any 'pred_label' already present on a token is overwritten, so calling this again
    with different predictions never leaves stale labels behind.

    Parameters
    ----------
    predictions: a list of dicts produced by HuggingFace NER pipeline, e.g.
    [{'entity': 'B-FOOD',
      'score': 0.9986027,
      'index': 13,
      'word': 'salsa',
      'start': 63,
      'end': 68},
     {'entity': 'B-FOOD',
      'score': 0.9981828,
      'index': 17,
      'word': 'salt',
      'start': 83,
      'end': 87}]
    token_list: a list of dicts produced by detokenize_conll_input

    Returns
    -------
    no explicit return; token_list is changed in place, with predictions added to the existing dicts
    '''
    # One pass over the predictions builds an offset -> label lookup, replacing the
    # repeated scan of the remaining predictions for every token.
    label_by_start = {}
    for result in predictions:
        # setdefault keeps the first prediction seen for a given start offset
        label_by_start.setdefault(result['start'], result['entity'])

    for token in token_list:
        token['pred_label'] = label_by_start.get(token['start'], 'O')


def split_label(label):
    """Split a label into the BIO tag and entity type.

    :param label: a BIO-encoded label string such as "B-FOOD", "I-FOOD" or "O".
    :returns a (tag, entity_type) tuple: ("B-", type) or ("I-", type) for entity labels,
        and ("O", "O") for the outside label.
    :raises ValueError: if the label is not "O" and does not start with "B-" or "I-".
    """
    if label.startswith("B-"):
        return "B-", label[2:]
    elif label.startswith("I-"):
        return "I-", label[2:]
    elif label == "O":
        # exact match: a prefix test would silently accept non-BIO labels such as "ORG"
        return "O", "O"
    else:
        # ValueError is a subclass of Exception, so existing `except Exception` handlers still work
        raise ValueError("Found non-BIO label: {}!".format(label))


def correct_BIO_encodings(labels):
    """Repair invalid BIO sequences by promoting orphan I- labels to B- labels.

    An "I-X" label is kept only when the entity currently open (started by the most
    recent B-/I- label, and closed by any "O") has the same type X; otherwise it is
    rewritten as "B-X". "B-" and "O" labels are passed through unchanged.

    :param labels: a list of BIO label strings for one sentence.
    :returns a new list of the same length with corrected labels.
    """
    fixed = []
    open_type = "O"  # entity type currently being continued; "O" when outside any entity
    for label in labels:
        # split_label only ever returns the tags "B-", "I-" or "O" (entity type "O" for the latter)
        prefix, entity_type = split_label(label)
        if prefix == "I-" and entity_type != open_type:
            # continuation without a matching start: treat it as the start of a new entity
            label = "B-" + entity_type
        fixed.append(label)
        open_type = entity_type
    return fixed


In [48]:
# Load the gold dev set from its CoNLL file under BASE_PATH.
dev_file = os.path.join(BASE_PATH, "nlp_data/recipe_data/20200523_food_gold_dev.conll")
dev_dataset = read_conll_file(dev_file)
# Rejoin each sentence's tokens into plain text (input for the NER pipeline) and keep
# per-token dicts with character offsets and true labels for aligning predictions later.
dev_tokens, dev_sentences = detokenize_conll_input(dev_dataset)

In [49]:
# Load the fine-tuned token-classification model from its checkpoint directory under BASE_PATH.
model_ckpt = os.path.join(BASE_PATH, 'food_ner_models/20230705_03_08_29-roberta-base-finetuned-ner/checkpoint-740')
model = AutoModelForTokenClassification.from_pretrained(model_ckpt)
# NOTE(review): the tokenizer is loaded by the name "roberta-base" rather than from the
# checkpoint directory -- presumably it matches the one used for fine-tuning; confirm.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

In [50]:
# Build a HuggingFace "ner" pipeline from the model and tokenizer loaded above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [51]:
# Run the pipeline over all rejoined dev sentences.
# The results are later zipped with dev_tokens, i.e. one result entry per sentence is expected.
# NOTE(review): aggregation_strategy=None presumably yields one prediction per subtoken
# rather than merged entities -- confirm against the transformers pipeline docs.
ner_results = nlp(dev_sentences, aggregation_strategy=None)

In [52]:
# Map each sentence's predictions onto its original tokens; add_predictions_to_tokens
# modifies the token dicts in dev_tokens in place by setting their 'pred_label'.
for result, token_list in zip(ner_results, dev_tokens):
    add_predictions_to_tokens(result, token_list)

In [63]:
# Per-sentence (nested) label lists; these are reused when writing out the conll file.
y_true_nested = [[token['true_label'] for token in sent] for sent in dev_tokens]
# Predicted labels go through correct_BIO_encodings before they are used anywhere.
y_pred_nested = [
    correct_BIO_encodings([token['pred_label'] for token in sent])
    for sent in dev_tokens
]

# Flattened versions (one single list each) for token-level metrics.
y_true_flat = [label for sent_labels in y_true_nested for label in sent_labels]
y_pred_flat = [label for sent_labels in y_pred_nested for label in sent_labels]

In [79]:
# Token-level report over the flattened label strings; output_dict=False requests
# the formatted text form, which is printed and written to disk in later cells.
metrics = classification_report(y_true_flat, y_pred_flat, output_dict=False)

In [80]:
# token-level metrics for the fine-tuned roberta-base model
print(metrics)

              precision    recall  f1-score   support

      B-FOOD       0.96      0.97      0.97      1175
      I-FOOD       0.95      0.94      0.94       278
           O       1.00      0.99      1.00      9464

    accuracy                           0.99     10917
   macro avg       0.97      0.97      0.97     10917
weighted avg       0.99      0.99      0.99     10917



In [81]:
# Persist the token-level report text under BASE_PATH/nlp_data/results.
outfile = os.path.join(
    BASE_PATH,
    "nlp_data",
    "results",
    "20230705_03_08_29-roberta-base-finetuned-ner_dev_token_metrics.txt",
)
with open(outfile, "w") as metrics_file:
    metrics_file.write(metrics)

In [72]:
# write to conll format for use of conll perl script to calculate entity-level metrics
outfile = os.path.join(BASE_PATH, "nlp_data", "results", "20230705_03_08_29-roberta-base-finetuned-ner_dev_conll.txt")

# zip() stops at the shortest input, so verify everything lines up *before* opening
# (and truncating) the output file; a mismatch would otherwise drop lines silently.
assert len(dev_tokens) == len(y_true_nested) == len(y_pred_nested)
for doc, true, preds in zip(dev_tokens, y_true_nested, y_pred_nested):
    assert len(doc) == len(true) == len(preds)

with open(outfile, "w") as out:
    for doc, true, preds in zip(dev_tokens, y_true_nested, y_pred_nested):
        for token, true_label, pred_label in zip(doc, true, preds):
            assert token['true_label'] == true_label
            out.write(f"{token['text']} {token['true_label']} {pred_label}\n")  # use predictions with corrected BIO encodings
        out.write("\n")  # blank line marks the sentence boundary

In [None]:
import os
import subprocess

In [None]:
# Run the conlleval perl script over the CoNLL-format predictions to get entity-level metrics.
# NOTE(review): these paths are hard-coded to the local Dropbox folder; they match the
# files written under BASE_PATH above only when running locally.
infile = "/Users/carolanderson/Dropbox/nlp_data/results/20230705_03_08_29-roberta-base-finetuned-ner_dev_conll.txt"
outfile = '/Users/carolanderson/Dropbox/nlp_data/results/20230705_03_08_29-roberta-base-eval.txt'
conlleval_script_path = "/Users/carolanderson/Dropbox/repos/food/src/evaluation"

# Feed the files through stdin/stdout handles instead of building a shell command:
# no quoting issues with the paths, no shell involved, and no dependency on `json`
# (which the import cell just above does not import).
# `cwd=` runs perl inside the script directory without changing the notebook's own
# working directory.
with open(infile, "rb") as conll_in, open(outfile, "wb") as eval_out:
    completed = subprocess.run(
        ["perl", "conlleval.pl"],
        stdin=conll_in,
        stdout=eval_out,
        cwd=conlleval_script_path,
        check=True,  # fail loudly instead of leaving an empty/partial eval file
    )
completed.returncode