In [2]:
import torch

# Select the first CUDA GPU when one is visible; the CPU branch only leaves
# `device` defined if the availability check below is skipped.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Fail fast when no GPU is present. An explicit raise is used instead of
# `assert False` because assert statements are removed when Python runs with -O.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available.")

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Read every record from the local JSON file; the loader returns them under the "train" key.
dataset = load_dataset('json', data_files='./dataset.json')['train']

# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle
# deterministic, so a Restart & Run All reproduces the same train/test partition.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Tag inventory: a tag's position in this list is its integer class id.
label_list = ["O", "U-SHAPE", "B-VALUE", "I-UNIT", "L-DIMENSION", "B-SHAPE_TYPE", "L-SHAPE", "B-TEETH-COUNT", "L-TEETH"]
label_to_id = {label: i for i, label in enumerate(label_list)}

def tokenize_and_align_labels(examples):
    """Tokenize a batch of sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with a "text" column (one string per example)
            and a "labels" column (one sequence of tag names per example,
            indexed by word position after splitting the text on single spaces).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list of integer ids per example, with one id per produced token.

    Only the first sub-token of each word receives the word's class id; special
    tokens, padding, continuation sub-tokens, and tags missing from
    ``label_to_id`` receive -100 (the default ``ignore_index`` of PyTorch's
    cross-entropy loss). The input batch is left unmodified.
    """
    # Keep the split words in a local list instead of overwriting examples["text"],
    # so the caller's batch (and the stored "text" column) is not retyped as a side effect.
    words_per_example = [text.split(" ") for text in examples["text"]]

    tokenized_inputs = tokenizer(
        words_per_example,
        padding="max_length",
        truncation=True,
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for batch_index in range(len(words_per_example)):
        example_labels = examples["labels"][batch_index]
        aligned_labels = []
        previous_word_idx = None
        # word_ids() yields, per token, the index of the source word (None for
        # tokens that do not come from any word).
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is None or word_idx == previous_word_idx:
                # Non-word token, or a continuation piece of the previous word.
                aligned_labels.append(-100)
            else:
                # First piece of a new word carries that word's class id.
                aligned_labels.append(label_to_id.get(example_labels[word_idx], -100))
            previous_word_idx = word_idx
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the tokenization/alignment function over both splits, one batch at a time.
dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

Generating train split: 1000 examples [00:00, 13784.67 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 6618.17 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 4867.56 examples/s]


In [4]:
from transformers import AutoModelForTokenClassification

# Build both directions of the tag <-> class-id mapping once, then hand them
# to from_pretrained together with the number of classes.
id_to_tag = dict(enumerate(label_list))
tag_to_id = {tag: idx for idx, tag in id_to_tag.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=len(id_to_tag),
    id2label=id_to_tag,
    label2id=tag_to_id,
)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [5]:
from transformers import TrainingArguments, Trainer

# Evaluate, checkpoint, and log once per epoch for five epochs.
# `logging_steps` is intentionally not set: it only applies to the "steps"
# logging strategy and was ignored alongside logging_strategy="epoch".
training_arguments = TrainingArguments(
    output_dir="../model",
    eval_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="epoch"
)

# The tokenizer is passed as `processing_class`; the `tokenizer` keyword of
# Trainer.__init__ is deprecated and emits a FutureWarning on this call.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

  trainer = Trainer(


In [6]:
# Fine-tune, then score the final weights on the held-out split and persist the model.
trainer.train()
eval_metrics = trainer.evaluate()
trainer.save_model()

# Show the final evaluation metrics as the cell output instead of discarding them.
eval_metrics

Epoch,Training Loss,Validation Loss
1,0.1935,0.002045
2,0.0035,0.00105
3,0.0022,0.000767
4,0.0018,0.00065
5,0.0016,0.000617
