### Antoine EDY
# Natural Language Processing (COMM061) - Coursework

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch
import nltk
from torch.utils.tensorboard import SummaryWriter

In [2]:
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [3]:
TEXT2ID = {
    "B-O": 0,
    "B-AC": 1,
    "B-LF": 2,
    "I-LF": 3,
}
ID2TEXT = {v: k for k, v in TEXT2ID.items()}

In [4]:
# map the labels to integers

def map_labels_to_int(example):
    example["ner_tags"] = [TEXT2ID[label] for label in example["ner_tags"]]
    return example

dataset = dataset.map(map_labels_to_int)


In [18]:
dataset['test'][0]

{'tokens': ['Abbreviations',
  ':',
  'GEMS',
  ',',
  'Global',
  'Enteric',
  'Multicenter',
  'Study',
  ';',
  'VIP',
  ',',
  'ventilated',
  'improved',
  'pit',
  '.'],
 'pos_tags': ['NOUN',
  'PUNCT',
  'PROPN',
  'PUNCT',
  'PROPN',
  'PROPN',
  'PROPN',
  'PROPN',
  'PUNCT',
  'PROPN',
  'PUNCT',
  'VERB',
  'ADJ',
  'NOUN',
  'PUNCT'],
 'ner_tags': [0, 0, 1, 0, 2, 3, 3, 3, 0, 1, 0, 2, 3, 3, 0]}

In [5]:
from transformers import AutoTokenizer

MODEL = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [6]:
train_texts = [item["tokens"] for item in dataset["train"]]
dev_texts = [item["tokens"] for item in dataset["validation"]]
test_texts = [item["tokens"] for item in dataset["test"]]

train_texts_encoded = tokenizer(train_texts, padding=True, truncation=True, max_length=256, is_split_into_words=True)
dev_texts_encoded = tokenizer(dev_texts, padding=True, truncation=True, max_length=256, is_split_into_words=True)
test_texts_encoded = tokenizer(test_texts, padding=True, truncation=True, max_length=256, is_split_into_words=True)

In [7]:
all_labels = list(set([label for item in dataset["train"] for label in item["ner_tags"]]))
all_labels

[0, 1, 2, 3]

In [8]:
import numpy as np

def map_entities_to_tokens(items, encodings):
    
    labels = [item["ner_tags"] for item in items]
    offsets = [encoding.offsets for encoding in encodings]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, offsets):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        if len(doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)]) != len(doc_labels):
            print("ERROR")
        else:
            doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
            encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = map_entities_to_tokens(dataset["train"], train_texts_encoded.encodings)
dev_labels = map_entities_to_tokens(dataset["validation"], dev_texts_encoded.encodings)
test_labels = map_entities_to_tokens(dataset["test"], test_texts_encoded.encodings)

ERROR
ERROR
ERROR
ERROR
ERROR


In [17]:
train_labels

[[-100,
  0,
  0,
  0,
  0,
  2,
  3,
  3,
  3,
  3,
  0,
  1,
  -100,
  -100,
  0,
  0,
  0,
  0,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -10

In [9]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

train_dataset = NERDataset(train_texts_encoded, train_labels)
dev_dataset = NERDataset(dev_texts_encoded, dev_labels)
test_dataset = NERDataset(test_texts_encoded, test_labels)

print(f"Train items: {len(train_dataset)}")
print(f"Dev items: {len(dev_dataset)}")
print(f"Test items: {len(test_dataset)}")

Train items: 1068
Dev items: 125
Test items: 153


In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    flat_labels, flat_preds = [], []
    flat_ent_labels, flat_ent_preds = [], []
    for label_row, pred_row in zip(labels, preds):
        for label, pred_label in zip(label_row, pred_row):
            if label != -100:
                flat_labels.append(label)
                flat_preds.append(pred_label)
                if label != 0 or pred_label != 0:
                    flat_ent_labels.append(label)
                    flat_ent_preds.append(pred_label)
                    
        
    precision, recall, f1, _ = precision_recall_fscore_support(flat_ent_labels, flat_ent_preds, average='micro')
    acc = accuracy_score(flat_labels, flat_preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, BertForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(all_labels))


#device = torch.device("cpu")
#model.to(device)

training_args = TrainingArguments(
    output_dir='model_saves/',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=int(len(train_dataset)/8),  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='runs/transformer',            # directory for storing logs
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=10,
    load_best_model_at_end=True,
    no_cuda=False
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,            # evaluation dataset
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/402 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.511648416519165, 'eval_accuracy': 0.848585690515807, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 6.0004, 'eval_samples_per_second': 20.832, 'eval_steps_per_second': 2.667, 'epoch': 1.49}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.48478496074676514, 'eval_accuracy': 0.8504575707154742, 'eval_f1': 0.024423337856173677, 'eval_precision': 0.024423337856173677, 'eval_recall': 0.024423337856173677, 'eval_runtime': 7.2665, 'eval_samples_per_second': 17.202, 'eval_steps_per_second': 2.202, 'epoch': 2.99}
{'train_runtime': 753.8618, 'train_samples_per_second': 4.25, 'train_steps_per_second': 0.533, 'train_loss': 0.6856517886641014, 'epoch': 3.0}


TrainOutput(global_step=402, training_loss=0.6856517886641014, metrics={'train_runtime': 753.8618, 'train_samples_per_second': 4.25, 'train_steps_per_second': 0.533, 'train_loss': 0.6856517886641014, 'epoch': 3.0})

In [14]:
trainer.evaluate(test_dataset)

  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.40094903111457825,
 'eval_accuracy': 0.8604,
 'eval_f1': 0.02920723226703755,
 'eval_precision': 0.02920723226703755,
 'eval_recall': 0.02920723226703755,
 'eval_runtime': 5.2691,
 'eval_samples_per_second': 29.037,
 'eval_steps_per_second': 3.796,
 'epoch': 3.0}

In [20]:
from transformers import pipeline

model = AutoModelForTokenClassification.from_pretrained("model_saves/checkpoint-400")
nlp = pipeline("ner", tokenizer=tokenizer, model=model)

In [21]:
print(dataset["test"][1])

nlp(dataset["test"][1]["tokens"])

{'tokens': ['Fractions', 'from', 'FPLC', 'purification', 'were', 'treated', 'with', 'Laemmli', 'buffer', '[', '82', ']', 'with', '10', 'mM', '1,4', '-', 'dithiothreitol', '(', 'DTT', ')', 'and', 'heated', 'for', '5', 'm', 'at', '85', '°', 'C', 'then', 'analyzed', 'on', 'a', '4', '%', 'to', '15', '%', 'discontinuous', 'SDS', 'gel', 'with', 'a', '6', '%', 'stacking', 'gel', 'run', 'at', 'ambient', 'temperature', 'at', 'a', 'constant', '100', 'V.', 'Two', 'epithelial', 'cytokines', 'other', 'than', 'IL33', ',', 'IL25', ',', 'and', 'thymic', 'stromal', 'lymphopoietin', '(', 'TSLP', ')', 'are', 'known', 'to', 'activate', 'ILC2', 'in', 'the', 'lung', '[', '22,24', ']', '.'], 'pos_tags': ['NOUN', 'ADP', 'PROPN', 'NOUN', 'AUX', 'VERB', 'ADP', 'PROPN', 'NOUN', 'X', 'X', 'X', 'ADP', 'NUM', 'NOUN', 'NUM', 'PUNCT', 'NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'CCONJ', 'VERB', 'ADP', 'NUM', 'NOUN', 'ADP', 'NUM', 'NOUN', 'NOUN', 'ADV', 'VERB', 'ADP', 'DET', 'NUM', 'NOUN', 'PART', 'NUM', 'NOUN', 'ADJ', 'PROPN'

[[{'entity': 'LABEL_0',
   'score': 0.77826846,
   'index': 1,
   'word': 'fraction',
   'start': 0,
   'end': 8},
  {'entity': 'LABEL_0',
   'score': 0.8472568,
   'index': 2,
   'word': '##s',
   'start': 8,
   'end': 9}],
 [{'entity': 'LABEL_0',
   'score': 0.7298088,
   'index': 1,
   'word': 'from',
   'start': 0,
   'end': 4}],
 [{'entity': 'LABEL_1',
   'score': 0.4226853,
   'index': 1,
   'word': 'f',
   'start': 0,
   'end': 1},
  {'entity': 'LABEL_0',
   'score': 0.7609227,
   'index': 2,
   'word': '##pl',
   'start': 1,
   'end': 3},
  {'entity': 'LABEL_0',
   'score': 0.84351,
   'index': 3,
   'word': '##c',
   'start': 3,
   'end': 4}],
 [{'entity': 'LABEL_0',
   'score': 0.7632109,
   'index': 1,
   'word': 'purification',
   'start': 0,
   'end': 12}],
 [{'entity': 'LABEL_0',
   'score': 0.59802824,
   'index': 1,
   'word': 'were',
   'start': 0,
   'end': 4}],
 [{'entity': 'LABEL_0',
   'score': 0.8522196,
   'index': 1,
   'word': 'treated',
   'start': 0,
   'end'

In [12]:
%load_ext tensorboard

In [13]:
tagger = torch.load(OUTPUT_PATH)
tagger.eval()

NameError: name 'OUTPUT_PATH' is not defined

In [None]:
labels = label_field.vocab.itos[2:]
labels = sorted(labels, key=lambda x: x.split("-")[-1])
label_idxs = [label_field.vocab.stoi[l] for l in labels]

test(tagger, test_iter, BATCH_SIZE, labels = label_idxs, target_names = labels)

              precision    recall  f1-score   support

        B-AC       0.66      0.33      0.44       270
        I-LF       0.55      0.35      0.43       288
        B-LF       0.37      0.24      0.29       150
         B-O       0.91      0.97      0.93      4292

    accuracy                           0.88      5000
   macro avg       0.62      0.47      0.52      5000
weighted avg       0.86      0.88      0.86      5000



In [None]:
from colorama import Back, Style

def vizu(words, output, truth):
    if isinstance(output, torch.Tensor):
        output = output.squeeze().tolist()
    col = {0: Back.BLACK, 1: Back.RED, 2: Back.GREEN, 3: Back.BLUE, 4: Back.MAGENTA}
    colors1 = [col[i] for i in output]
    colors2 = [col[i] for i in truth]
    words = [word.replace("Ġ", "") for word in words]
    print(Style.RESET_ALL + "Output:")
    for i, word in enumerate(words):
        print(colors1[i] + word, end=" ")
    print(Style.RESET_ALL + "\nTruth:")
    for i, word in enumerate(words):
        print(colors2[i] + word, end=" ")