In [None]:
import argparse
import json

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, Dataset
from transformers import BertTokenizerFast, AutoTokenizer
from transformers import pipeline
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer

from bert_utils import tokenize_special_tokens, compute_metrics, load_dataset

In [2]:
# Label schema handed to `load_dataset` in the next cell.
classmap = ClassLabel(names=['O', 'I-TECT', 'B-TECT'], num_classes=3)

# Tokenizer of the base checkpoint, plus the collator that batches examples with it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Raw string so every backslash of the Windows path is taken literally; no escape
# processing is applied to sequences such as "\D" or "\B".
# NOTE(review): this is a machine-specific absolute path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
ANNOTATIONS_DIR = r"C:\Users\trewe\Desktop\UniTingz\Bachelorarbeit\NER\Bachelor-Thesis\data\annotations"

# `load_dataset` is the project helper from `bert_utils`; it returns the train and
# eval splits used by the following cells.
train_dataset, eval_dataset = load_dataset(ANNOTATIONS_DIR, tokenizer, classmap, overfit=True)

In [None]:
print(train_dataset)

# Show the first example of each column together with its length.
for column in ('tokens', 'ner_tags', 'input_ids'):
    first_entry = train_dataset[column][0]
    print(first_entry, len(first_entry))

# Run the tokenizer on the first example again and display the resulting tokens.
example = train_dataset[0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

In [4]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word labels to tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one word list per example)
            and a "ner_tags" entry (one label id per word).

    Returns:
        The tokenizer output with an added "labels" entry. Per example it holds one
        id per token: the word's label on the first token of each word and -100 on
        special tokens and on the remaining tokens of a word.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        # Word index behind every token position (None for special tokens).
        word_ids = list(encoded.word_ids(batch_index=row))
        # The same sequence shifted right by one, so each position sees its predecessor.
        preceding_ids = [None, *word_ids[:-1]]
        aligned_labels.append([
            word_labels[current] if current is not None and current != before else -100
            for before, current in zip(preceding_ids, word_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Apply the label alignment batch-wise to the train split, then to the eval split.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [6]:
# Label names; the position in this list is the class id.
label_list = ["O", "I-TECT", "B-TECT"]

# Both lookup tables are derived from the list so the three cannot drift apart.
id2label = dict(enumerate(label_list))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Base checkpoint with a token-classification head sized to the label maps above.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    finetuning_task="ner",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [9]:
# Cached seqeval metric; filled on the first call of `_get_seqeval`.
_seqeval_metric = None


def _get_seqeval():
    """Return the seqeval metric, loading it only on first use."""
    global _seqeval_metric
    if _seqeval_metric is None:
        _seqeval_metric = evaluate.load("seqeval")
    return _seqeval_metric


def compute_metrics(p):
    """Compute overall seqeval scores for a batch of token-level predictions.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class scores
            and is reduced with an argmax over axis 2; ``labels`` holds the
            reference class ids, with -100 marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken from
        the corresponding ``overall_*`` entries of the seqeval result.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label (-100 marks ignored tokens).
        kept = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    # The metric is loaded once and reused, not reloaded on every evaluation call.
    results = _get_seqeval().compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [10]:
# Settings for a single-epoch fine-tuning run with an epoch-based save strategy.
# The following options are currently switched off:
#    eval_strategy="epoch",
#    load_best_model_at_end=True,
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    num_train_epochs=1,
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Bundle the model, both tokenized splits, the collator and the metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning; as the last expression, the return value of `train()` becomes
# the cell's output.
print("Training is now starting...")
trainer.train()

In [None]:
# Sample sentence to tag; `text` is also used by the pipeline cell further down.
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."

# NOTE(review): this loads a checkpoint from the Hugging Face Hub, not the model
# fine-tuned by `trainer` above; confirm that this is intended. The assignments also
# rebind the `model` and `tokenizer` names used by earlier cells.
model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model")
tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model")

# Encode the sentence as PyTorch tensors for the forward pass in the next cell.
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# Forward pass without gradient tracking; this cell only runs inference.
# `torch` comes from the notebook's import cell.
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring class id per position, mapped to its label name for the first
# (and only) sequence of the batch.
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

print(predicted_token_class)

In [None]:
# Alternative: run the same checkpoint through the high-level `pipeline` helper.
classifier = pipeline(task="ner", model="stevhliu/my_awesome_wnut_model")
classifier(text)