In [None]:
import pandas as pd
import nltk
import string
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification
! pip install datasets
%pip install transformers
%pip install spacy
%pip install torch
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix

In [None]:
# Load the pretrained BERT tokenizer and a token-classification model with 4 output
# classes (the `label_encoding` dict defined further down also has four tags).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

In [None]:
# Load the PLOD-CW dataset by its Hub id; the "train", "validation" and "test" splits are read below.
dataset = load_dataset("surrey-nlp/PLOD-CW")

In [None]:
# Project-local data helpers; only load_plod_cw and label_names are called in this cell.
from src.dataio import load_plod_cw, label_names, export_parquet, export_conll

# NOTE(review): `ds` is loaded in addition to `dataset` above, but the later visible cells
# only read from `dataset` -- confirm whether both copies are needed.
ds = load_plod_cw()

print(ds)
print(label_names(ds))

In [None]:
# Training uses only the first 200 examples of the train split; validation and test are used in full.
# NOTE(review): slicing a split with [:200] presumably yields a plain dict of column lists
# (it is indexed by column name below) rather than a Dataset object -- confirm.
short_dataset = dataset["train"][:200]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

In [None]:
from collections import Counter

tokens = short_dataset["tokens"]
ner_tags = short_dataset["ner_tags"]

# One row per sentence: its word list next to its tag list.
df = pd.DataFrame({"tokens": tokens, "ner_tags": ner_tags})

# Count every tag occurrence across all sentences in one pass
# (Counter keeps first-seen order, so the bar order is unchanged).
ner_counter = Counter(tag for tags in ner_tags for tag in tags)

# Materialise the dict views as lists so matplotlib receives plain sequences.
plt.bar(list(ner_counter.keys()), list(ner_counter.values()))
plt.xlabel("NER Type")
plt.ylabel("Occurrences")  # was misspelled "Occurances"
plt.title("NER Distribution")
plt.show()


In [None]:
# Tokenize the pre-split training sentences (word lists) into BERT sub-word tokens.
# NOTE(review): `tokenized_input` is not referenced by any later visible cell; the training
# inputs are produced by tokenize_and_align_labels instead.
tokenized_input = tokenizer(short_dataset["tokens"], is_split_into_words=True)

In [None]:
# Integer class id for each tag string found in the dataset's "ner_tags" column.
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

# Per-sentence lists of class ids for the 200-example training slice.
label_list = [
    [label_encoding[tag_name] for tag_name in sentence_tags]
    for sentence_tags in short_dataset["ner_tags"]
]

# Same encoding applied to the validation split.
val_label_list = [
    [label_encoding[tag_name] for tag_name in sentence_tags]
    for sentence_tags in val_dataset["ner_tags"]
]

# Same encoding applied to the test split.
test_label_list = [
    [label_encoding[tag_name] for tag_name in sentence_tags]
    for sentence_tags in test_dataset["ner_tags"]
]

In [None]:
def tokenize_and_align_labels(short_dataset, list_name, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level labels to sub-word tokens.

    Args:
        short_dataset: Mapping with a "tokens" entry holding one word list per sentence.
        list_name: One list of integer labels per sentence, indexed by word position.
        label_all_tokens: When True (the default, matching the previous behaviour),
            every sub-word token of a word receives that word's label. When False,
            only the first sub-word token keeps the label and the remaining ones
            get -100 so the loss ignores them.

    Returns:
        The tokenizer output with an added "labels" entry containing one aligned
        label list per sentence.
    """
    # For some models, you may need to set max_length to approximately 500.
    tokenized_inputs = tokenizer(short_dataset["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have no word id; -100 makes the loss function ignore them.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-word token of a word, or a continuation token when every
                # token is to be labelled.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-word token that should not contribute to the loss.
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Tokenize each split and attach its aligned label ids (train slice, validation, test).
tokenized_datasets = tokenize_and_align_labels(short_dataset, label_list)
tokenized_val_datasets = tokenize_and_align_labels(val_dataset, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_dataset, test_label_list)

In [None]:
def turn_dict_to_list_of_dict(d):
    """Convert a column-oriented batch into a list of per-example records.

    Reads the "labels" and "input_ids" columns of ``d`` in parallel and returns
    one ``{"input_ids": ..., "labels": ...}`` dict per example. Any other
    columns in ``d`` are not carried over.
    """
    return [
        {"input_ids": ids, "labels": lab}
        for lab, ids in zip(d["labels"], d["input_ids"])
    ]

In [None]:
# Reshape each tokenized split into a list of {"input_ids", "labels"} records for the Trainer.
tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

In [None]:
from transformers import DataCollatorForTokenClassification
# Batch collator built from the tokenizer; it is handed to every Trainer below.
# NOTE(review): presumably pads both inputs and labels per batch -- confirm against the library docs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
import numpy as np

# seqeval scorer; its overall_precision / overall_recall / overall_f1 / overall_accuracy are reported below.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package -- confirm against the installed version.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class scores
            (argmax is taken over axis 2, so presumably shaped
            ``(batch, seq_len, num_classes)``); ``labels`` holds the gold class ids
            with -100 marking positions to ignore.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy" values
        reported by the seqeval metric.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # `label_list` stores the encoded labels of each training sentence, so indexing it
    # with a class id would return a whole sentence instead of a tag name. Invert
    # `label_encoding` to get the class-id -> tag-string mapping the scorer needs.
    id_to_tag = {class_id: tag for tag, class_id in label_encoding.items()}

    # Drop ignored positions (label -100) and convert the remaining ids to tag strings.
    true_predictions = [
        [id_to_tag[int(pred_id)] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_tag[int(gold_id)] for gold_id in label if gold_id != -100]
        for label in labels
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Baseline run. Training arguments (feel free to play around with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

# Always start from the pretrained checkpoint so that re-running this cell does not
# continue training weights left behind by an earlier run.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

args = TrainingArguments(
    "BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    # NOTE(review): the training slice has only 200 sentences, so this run ends well before
    # step 7000 (and step 35000); no mid-training evaluation or checkpoint happens, which
    # leaves early stopping and best-model loading with nothing to act on. Consider
    # evaluating and saving once per epoch instead.
    evaluation_strategy='steps',
    eval_steps=7000,
    save_total_limit=3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model='f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Fine-tune using the Trainer configured in the previous cell (baseline settings).
trainer.train()

In [None]:
# Run the trained model on the test split and take the highest-scoring class per token.

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# `label_list` stores the encoded labels of each training sentence, so indexing it with a
# class id would return a whole sentence instead of a tag name. Invert `label_encoding`
# to get the class-id -> tag-string mapping the scorer needs.
id_to_tag = {class_id: tag for tag, class_id in label_encoding.items()}

# Drop every position labelled -100 (special tokens such as [CLS]/[SEP]; padding is
# presumably marked the same way by the collator) and convert ids to tag strings.
true_predictions = [
    [id_to_tag[int(pred_id)] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_tag[int(gold_id)] for gold_id in label if gold_id != -100]
    for label in labels
]

# Compute multiple metrics on the test results
results = metric.compute(predictions=true_predictions, references=true_labels)
results



In [None]:

def sparse_metrics(true_predictions, true_labels):
    """Score predictions against references and pack the overall scores into a sparse row.

    Returns a 1x4 ``csr_matrix`` laid out as [precision, recall, f1, accuracy],
    taken from the "overall_*" entries of the metric result.
    """
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    wanted_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    return csr_matrix([[scores[key] for key in wanted_keys]])

# Score the test predictions; the result is a 1x4 sparse row [precision, recall, f1, accuracy].
test_results = sparse_metrics(true_predictions, true_labels)

precision = test_results[0, 0]
recall = test_results[0, 1]
f1 = test_results[0, 2]
accuracy = test_results[0, 3]

metrics = ["Precision", "Recall", "F1 Score", "Accuracy"]

# Dense copy of the scores for plotting.
dense_results = test_results.toarray()

# Bar chart: one bar per metric on a fixed 0-1 scale.
plt.figure(figsize=(10, 6))
plt.bar(metrics, dense_results[0], color=['blue', 'green', 'orange', 'red'])
plt.title('Performance Metrics')
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

metrics_df = pd.DataFrame(dense_results, columns=metrics)

# Heatmap: the same four scores as a single annotated row.
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_df, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
plt.title("Performance Metrics")
plt.xlabel("Metrics")
plt.ylabel("Model")
# Heatmap cells are centred on half-integers, so 0.5 puts the row label mid-cell
# (a tick at 0 sits on the cell's edge).
plt.yticks([0.5], ['Model 1'])
plt.show()

**Hyper-Parameter optimization**

Run 2: lower learning rate (1e-5), smaller batch size (2) and fewer epochs (3) than the baseline run (2e-5, 4, 6).

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play around with these values)
model_name = "bert-base-uncased"
epochs = 3
batch_size = 2
learning_rate = 1e-5

# Start this run from the pretrained checkpoint. Reusing the existing `model` object would
# continue training the weights already fine-tuned by the previous run, so the results of
# different hyperparameter settings would not be comparable.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

args = TrainingArguments(
    "BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    # NOTE(review): the training slice has only 200 sentences, so this run ends well before
    # step 7000 (and step 35000); no mid-training evaluation or checkpoint happens, which
    # leaves early stopping and best-model loading with nothing to act on. Consider
    # evaluating and saving once per epoch instead.
    evaluation_strategy='steps',
    eval_steps=7000,
    save_total_limit=3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model='f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Fine-tune using the Trainer configured in the previous cell (lr 1e-5, batch size 2, 3 epochs).
trainer.train()

In [None]:
# Run the trained model on the test split and take the highest-scoring class per token.

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# `label_list` stores the encoded labels of each training sentence, so indexing it with a
# class id would return a whole sentence instead of a tag name. Invert `label_encoding`
# to get the class-id -> tag-string mapping the scorer needs.
id_to_tag = {class_id: tag for tag, class_id in label_encoding.items()}

# Drop every position labelled -100 (special tokens such as [CLS]/[SEP]; padding is
# presumably marked the same way by the collator) and convert ids to tag strings.
true_predictions = [
    [id_to_tag[int(pred_id)] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_tag[int(gold_id)] for gold_id in label if gold_id != -100]
    for label in labels
]

# Compute multiple metrics on the test results
results = metric.compute(predictions=true_predictions, references=true_labels)
results


In [None]:
def sparse_metrics(true_predictions, true_labels):
    """Score predictions against references and pack the overall scores into a sparse row.

    Returns a 1x4 ``csr_matrix`` laid out as [precision, recall, f1, accuracy],
    taken from the "overall_*" entries of the metric result.
    """
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    wanted_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    return csr_matrix([[scores[key] for key in wanted_keys]])

# Score the test predictions; the result is a 1x4 sparse row [precision, recall, f1, accuracy].
test_results = sparse_metrics(true_predictions, true_labels)

precision = test_results[0, 0]
recall = test_results[0, 1]
f1 = test_results[0, 2]
accuracy = test_results[0, 3]

metrics = ["Precision", "Recall", "F1 Score", "Accuracy"]

# Dense copy of the scores for plotting.
dense_results = test_results.toarray()

# Bar chart: one bar per metric on a fixed 0-1 scale.
plt.figure(figsize=(10, 6))
plt.bar(metrics, dense_results[0], color=['blue', 'green', 'orange', 'red'])
plt.title('Performance Metrics')
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

metrics_df = pd.DataFrame(dense_results, columns=metrics)

# Heatmap: the same four scores as a single annotated row.
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_df, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
plt.title("Performance Metrics")
plt.xlabel("Metrics")
plt.ylabel("Model")
# Heatmap cells are centred on half-integers, so 0.5 puts the row label mid-cell
# (a tick at 0 sits on the cell's edge).
plt.yticks([0.5], ['Model 1'])
plt.show()

Run 3: higher learning rate (4e-5), smaller batch size (2) and fewer epochs (3) than the baseline run (2e-5, 4, 6).

In [None]:
# Training arguments (feel free to play around with these values)
model_name = "bert-base-uncased"
epochs = 3
batch_size = 2
learning_rate = 4e-5

# Start this run from the pretrained checkpoint. Reusing the existing `model` object would
# continue training the weights already fine-tuned by the previous run, so the results of
# different hyperparameter settings would not be comparable.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

args = TrainingArguments(
    "BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    # NOTE(review): the training slice has only 200 sentences, so this run ends well before
    # step 7000 (and step 35000); no mid-training evaluation or checkpoint happens, which
    # leaves early stopping and best-model loading with nothing to act on. Consider
    # evaluating and saving once per epoch instead.
    evaluation_strategy='steps',
    eval_steps=7000,
    save_total_limit=3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model='f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Fine-tune using the Trainer configured in the previous cell (lr 4e-5, batch size 2, 3 epochs).
trainer.train()

In [None]:
# Run the trained model on the test split and take the highest-scoring class per token.

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# `label_list` stores the encoded labels of each training sentence, so indexing it with a
# class id would return a whole sentence instead of a tag name. Invert `label_encoding`
# to get the class-id -> tag-string mapping the scorer needs.
id_to_tag = {class_id: tag for tag, class_id in label_encoding.items()}

# Drop every position labelled -100 (special tokens such as [CLS]/[SEP]; padding is
# presumably marked the same way by the collator) and convert ids to tag strings.
true_predictions = [
    [id_to_tag[int(pred_id)] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_tag[int(gold_id)] for gold_id in label if gold_id != -100]
    for label in labels
]

# Compute multiple metrics on the test results
results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
def sparse_metrics(true_predictions, true_labels):
    """Score predictions against references and pack the overall scores into a sparse row.

    Returns a 1x4 ``csr_matrix`` laid out as [precision, recall, f1, accuracy],
    taken from the "overall_*" entries of the metric result.
    """
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    wanted_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    return csr_matrix([[scores[key] for key in wanted_keys]])

# Score the test predictions; the result is a 1x4 sparse row [precision, recall, f1, accuracy].
test_results = sparse_metrics(true_predictions, true_labels)

precision = test_results[0, 0]
recall = test_results[0, 1]
f1 = test_results[0, 2]
accuracy = test_results[0, 3]

metrics = ["Precision", "Recall", "F1 Score", "Accuracy"]

# Dense copy of the scores for plotting.
dense_results = test_results.toarray()

# Bar chart: one bar per metric on a fixed 0-1 scale.
plt.figure(figsize=(10, 6))
plt.bar(metrics, dense_results[0], color=['blue', 'green', 'orange', 'red'])
plt.title('Performance Metrics')
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

metrics_df = pd.DataFrame(dense_results, columns=metrics)

# Heatmap: the same four scores as a single annotated row.
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_df, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
plt.title("Performance Metrics")
plt.xlabel("Metrics")
plt.ylabel("Model")
# Heatmap cells are centred on half-integers, so 0.5 puts the row label mid-cell
# (a tick at 0 sits on the cell's edge).
plt.yticks([0.5], ['Model 1'])
plt.show()

Run 4: higher learning rate (4e-5), smaller batch size (2) and more epochs (12) than the baseline run (2e-5, 4, 6).

In [None]:
# Training arguments (feel free to play around with these values)
model_name = "bert-base-uncased"
epochs = 12
batch_size = 2
learning_rate = 4e-5

# Start this run from the pretrained checkpoint. Reusing the existing `model` object would
# continue training the weights already fine-tuned by the previous run, so the results of
# different hyperparameter settings would not be comparable.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

args = TrainingArguments(
    "BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    # NOTE(review): the training slice has only 200 sentences, so this run ends well before
    # step 7000 (and step 35000); no mid-training evaluation or checkpoint happens, which
    # leaves early stopping and best-model loading with nothing to act on. Consider
    # evaluating and saving once per epoch instead.
    evaluation_strategy='steps',
    eval_steps=7000,
    save_total_limit=3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model='f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Fine-tune using the Trainer configured in the previous cell (lr 4e-5, batch size 2, 12 epochs).
trainer.train()

In [None]:
# Run the trained model on the test split and take the highest-scoring class per token.

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# `label_list` stores the encoded labels of each training sentence, so indexing it with a
# class id would return a whole sentence instead of a tag name. Invert `label_encoding`
# to get the class-id -> tag-string mapping the scorer needs.
id_to_tag = {class_id: tag for tag, class_id in label_encoding.items()}

# Drop every position labelled -100 (special tokens such as [CLS]/[SEP]; padding is
# presumably marked the same way by the collator) and convert ids to tag strings.
true_predictions = [
    [id_to_tag[int(pred_id)] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_tag[int(gold_id)] for gold_id in label if gold_id != -100]
    for label in labels
]

# Compute multiple metrics on the test results
results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
def sparse_metrics(true_predictions, true_labels):
    """Score predictions against references and pack the overall scores into a sparse row.

    Returns a 1x4 ``csr_matrix`` laid out as [precision, recall, f1, accuracy],
    taken from the "overall_*" entries of the metric result.
    """
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    wanted_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    return csr_matrix([[scores[key] for key in wanted_keys]])

# Score the test predictions; the result is a 1x4 sparse row [precision, recall, f1, accuracy].
test_results = sparse_metrics(true_predictions, true_labels)

precision = test_results[0, 0]
recall = test_results[0, 1]
f1 = test_results[0, 2]
accuracy = test_results[0, 3]

metrics = ["Precision", "Recall", "F1 Score", "Accuracy"]

# Dense copy of the scores for plotting.
dense_results = test_results.toarray()

# Bar chart: one bar per metric on a fixed 0-1 scale.
plt.figure(figsize=(10, 6))
plt.bar(metrics, dense_results[0], color=['blue', 'green', 'orange', 'red'])
plt.title('Performance Metrics')
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

metrics_df = pd.DataFrame(dense_results, columns=metrics)

# Heatmap: the same four scores as a single annotated row.
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_df, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
plt.title("Performance Metrics")
plt.xlabel("Metrics")
plt.ylabel("Model")
# Heatmap cells are centred on half-integers, so 0.5 puts the row label mid-cell
# (a tick at 0 sits on the cell's edge).
plt.yticks([0.5], ['Model 1'])
plt.show()

Run 5: higher learning rate (4e-5), larger batch size (8) and more epochs (12) than the baseline run (2e-5, 4, 6).

In [None]:
# Training arguments (feel free to play around with these values)
model_name = "bert-base-uncased"
epochs = 12
batch_size = 8
learning_rate = 4e-5

# Start this run from the pretrained checkpoint. Reusing the existing `model` object would
# continue training the weights already fine-tuned by the previous run, so the results of
# different hyperparameter settings would not be comparable.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)

args = TrainingArguments(
    "BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    # NOTE(review): the training slice has only 200 sentences, so this run ends well before
    # step 7000 (and step 35000); no mid-training evaluation or checkpoint happens, which
    # leaves early stopping and best-model loading with nothing to act on. Consider
    # evaluating and saving once per epoch instead.
    evaluation_strategy='steps',
    eval_steps=7000,
    save_total_limit=3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model='f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Fine-tune using the Trainer configured in the previous cell (lr 4e-5, batch size 8, 12 epochs).
trainer.train()

In [None]:
# Run the trained model on the test split and take the highest-scoring class per token.

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# `label_list` stores the encoded labels of each training sentence, so indexing it with a
# class id would return a whole sentence instead of a tag name. Invert `label_encoding`
# to get the class-id -> tag-string mapping the scorer needs.
id_to_tag = {class_id: tag for tag, class_id in label_encoding.items()}

# Drop every position labelled -100 (special tokens such as [CLS]/[SEP]; padding is
# presumably marked the same way by the collator) and convert ids to tag strings.
true_predictions = [
    [id_to_tag[int(pred_id)] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id_to_tag[int(gold_id)] for gold_id in label if gold_id != -100]
    for label in labels
]

# Compute multiple metrics on the test results
results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
def sparse_metrics(true_predictions, true_labels):
    """Score predictions against references and pack the overall scores into a sparse row.

    Returns a 1x4 ``csr_matrix`` laid out as [precision, recall, f1, accuracy],
    taken from the "overall_*" entries of the metric result.
    """
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    wanted_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    return csr_matrix([[scores[key] for key in wanted_keys]])

# Score the test predictions; the result is a 1x4 sparse row [precision, recall, f1, accuracy].
test_results = sparse_metrics(true_predictions, true_labels)

precision = test_results[0, 0]
recall = test_results[0, 1]
f1 = test_results[0, 2]
accuracy = test_results[0, 3]

metrics = ["Precision", "Recall", "F1 Score", "Accuracy"]

# Dense copy of the scores for plotting.
dense_results = test_results.toarray()

# Bar chart: one bar per metric on a fixed 0-1 scale.
plt.figure(figsize=(10, 6))
plt.bar(metrics, dense_results[0], color=['blue', 'green', 'orange', 'red'])
plt.title('Performance Metrics')
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

metrics_df = pd.DataFrame(dense_results, columns=metrics)

# Heatmap: the same four scores as a single annotated row.
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_df, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
plt.title("Performance Metrics")
plt.xlabel("Metrics")
plt.ylabel("Model")
# Heatmap cells are centred on half-integers, so 0.5 puts the row label mid-cell
# (a tick at 0 sits on the cell's edge).
plt.yticks([0.5], ['Model 1'])
plt.show()