In [22]:
import evaluate
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import Trainer
from transformers import TrainingArguments

## Helper functions and declarations

In [2]:
# BIO tag inventory: the outside tag 'O' first, then a B-/I- pair per entity
# code. With this ordering every B- tag sits at an odd index and its matching
# I- tag at the next (even) index.
label_names = ['O'] + [
    f'{prefix}-{entity}'
    for entity in ('ART', 'CON', 'LOC', 'MAT', 'PER', 'SPE')
    for prefix in ('B', 'I')
]
# Lookup tables in both directions between integer ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

In [14]:
def align_labels_with_tokens(labels, word_ids):
    """Expand per-word label ids to one label id per token.

    Args:
        labels: Sequence of integer label ids, indexed by word position.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` where the
        word id is ``None``, the word's own label for the first token of a
        word, and for following tokens of the same word the word's label with
        odd ids bumped by one (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        # Tokens without a word are always masked out with -100.
        if word_id is None:
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First token of a new word keeps the word's label untouched.
            previous_word = word_id
            aligned.append(word_label)
        else:
            # Continuation token: an odd id is a B- tag, so move to its I- tag.
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)

    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``'tokens'`` entry (passed to the
            tokenizer as already-split words) and a ``'ner_tags'`` entry
            (one sequence of per-word label ids per sentence).

    Returns:
        The tokenizer output for the batch with an added ``'labels'`` entry
        holding, for each sentence, the result of
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids(i) gives the token -> word mapping for the i-th sentence.
    encoded['labels'] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(sentence_idx))
        for sentence_idx, word_labels in enumerate(examples['ner_tags'])
    ]
    return encoded


def compute_metrics(eval_preds):
    """Score predictions against references using the module-level ``metric``.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax over the last axis of ``logits``; positions where
            ``labels`` equals ``-100`` are excluded from scoring.
            Assumes ``logits`` is (batch, seq_len, num_labels) and ``labels``
            is (batch, seq_len) -- TODO confirm against the caller.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, read
        from the ``overall_*`` entries of the metric result.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions (-100) and map the remaining ids to tag names.
    true_labels = [
        [label_names[gold] for gold in label_row if gold != -100]
        for label_row in labels
    ]
    true_predictions = [
        [
            label_names[pred]
            for pred, gold in zip(prediction_row, label_row)
            if gold != -100
        ]
        for prediction_row, label_row in zip(predictions, labels)
    ]

    all_metrics = metric.compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


def parse(sentence, label_map=None):
    """Turn the lines of one sentence into parallel token and tag-id lists.

    Every non-blank line is split on single spaces. Field 0 is taken as the
    token and field 2 as the tag name; field 1 is ignored. Blank or
    whitespace-only lines (for example the remnant of a trailing newline in
    the data file) are skipped instead of raising.

    Args:
        sentence: Iterable of strings, one per token line.
        label_map: Optional mapping from tag name to integer id. Defaults to
            the module-level ``label2id``.

    Returns:
        ``{'tokens': [...], 'ner_tags': [...]}`` with one entry per non-blank
        line, in input order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        KeyError: If a tag name is not present in the label mapping.
    """
    if label_map is None:
        label_map = label2id

    tokens, ner_tags = [], []
    for phrase in sentence:
        if not phrase.strip():
            # Nothing to parse on an empty line.
            continue
        features = phrase.split(' ')
        tokens.append(features[0].strip())
        ner_tags.append(label_map[features[2].strip()])
    return {'tokens': tokens, 'ner_tags': ner_tags}

## Data loading

In [5]:
# Read every split from '<split>.txt'. Sentences are separated by an empty
# line; each remaining line describes one token and is handed to `parse`.
# The result per split is a column-oriented dict: key -> list with one entry
# per sentence.
raw_datasets = {}
for split in ('train', 'val', 'test'):
    # Explicit encoding so the file is decoded the same way on every platform.
    with open(f'{split}.txt', 'r', encoding='utf-8') as infile:
        raw_data = infile.read()

    split_columns = {}
    for chunk in raw_data.split('\n\n'):
        # Drop blank lines (e.g. from a trailing newline) before parsing, and
        # skip chunks that contain no token lines at all.
        token_lines = [line for line in chunk.split('\n') if line.strip()]
        if not token_lines:
            continue
        for key, value in parse(token_lines).items():
            split_columns.setdefault(key, []).append(value)
    raw_datasets[split] = split_columns

# Wrap each split's columns in a Dataset and bundle them in a DatasetDict.
datasets = {
    split_name: Dataset.from_dict(columns)
    for split_name, columns in raw_datasets.items()
}
data = DatasetDict(datasets)

## Preprocessing

### Tokenization and alignment

In [29]:
# Checkpoint name shared by the tokenizer here and the model further down.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize all splits batch-wise and drop the original columns, keeping only
# what `tokenize_and_align_labels` returns.
# NOTE: `data` is overwritten in place, so this cell is not re-runnable on its
# own: a second run no longer finds the 'tokens' column. Re-run the data
# loading cell first.
data = data.map(
    tokenize_and_align_labels,
    remove_columns=data['train'].column_names,
    batched=True,
)

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

## Training

### Training set-up

In [30]:
# Seed the torch RNG before the model is built: the classification head is
# newly initialised (not loaded from the checkpoint), so without a seed every
# run starts from different weights.
SEED = 42
torch.manual_seed(SEED)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)
# Metric object used by `compute_metrics`.
metric = evaluate.load('seqeval')

# Pretrained encoder with a token-classification head sized by the label maps.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and checkpoint once per epoch; the same seed is passed explicitly
# so the training run is tied to the constant above.
args = TrainingArguments(
    "bert-finetuned-archeology",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    seed=SEED,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data["train"],
    eval_dataset=data["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training loop

In [None]:
# Run the training configured above and keep the returned summary around.
train_output = trainer.train()
# Push the result with an explicit commit message once training has finished.
trainer.push_to_hub(commit_message="Training complete")

  0%|          | 0/747 [00:00<?, ?it/s]