In [23]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, pipeline, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from datasets import Dataset

In [24]:
from datasets import load_dataset

ds = load_dataset("telord/mountains-ner-dataset")

In [25]:
train_ds = ds['train'].shuffle(seed=42).select([i for i in range(int(len(ds['train']) * 0.15))])
val_ds = ds['val'].shuffle(seed=42).select([i for i in range(int(len(ds['val']) * 0.15))])
test_ds = ds['test'].shuffle(seed=42).select([i for i in range(int(len(ds['test']) * 0.15))])

In [26]:
tag2id = {"O": 0, "B-MNTN": 1, "I-MNTN": 2}
id2tag = {0: "O", 1: "B-MNTN", 2: "I-MNTN"}

In [27]:
# Перетворення числових міток на текстові для train_ds, val_ds та test_ds
train_ds = train_ds.map(lambda x: {'labels': [id2tag.get(id) for id in x['labels']]})
val_ds = val_ds.map(lambda x: {'labels': [id2tag.get(id) for id in x['labels']]})
test_ds = test_ds.map(lambda x: {'labels': [id2tag.get(id) for id in x['labels']]})

In [28]:
# Завантаження токенізатора і моделі
tokenizer = BertTokenizerFast.from_pretrained("distilbert-base-uncased")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.outp

In [29]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [30]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        is_split_into_words=True, 
        padding=True, 
        truncation=True,
        max_length=256
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [31]:
tokenized_datasets = ds.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=ds['train'].column_names,
)

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

In [32]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3827
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 478
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 479
    })
})

In [33]:
tokenized_datasets.save_to_disk("../data/tokenized_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/3827 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/478 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]