In [3]:
import torch

def load_data(file_path):
    """Read a token-per-line tagged corpus into parallel token/tag lists.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``token tag``. Blank (or whitespace-only) lines separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is a list of tag lists; ``labels[i][j]`` is the
        tag of ``sentences[i][j]``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()

            if not fields:
                # Blank line: close the sentence collected so far (if any).
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue

            if len(fields) != 2:
                # Same exception type as a failed tuple unpacking, but the
                # message points at the offending line.
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'token tag', got {line!r}"
                )

            token, tag = fields
            sentence.append(token)
            label.append(tag)

    # A file that does not end with a blank line still has a pending
    # sentence here; without this flush it would be silently dropped.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Parse the training corpus into parallel lists: one token list and one tag list per sentence.
train_sentences, train_labels = load_data('wlina_bd.txt')

In [4]:
from transformers import BertTokenizerFast

# Module-level tokenizer shared by tokenize_and_align_labels, which relies on
# the `word_ids()` mapping exposed by the fast tokenizer's output.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def tokenize_and_align_labels(sentences, labels):
    """Tokenize pre-split sentences and spread word tags over sub-word tokens.

    Uses the module-level ``tokenizer``. For every position of the tokenized
    output:

    * positions with no source word (``word_ids`` entry is ``None``) get
      ``-100``;
    * the first sub-token of a word gets that word's tag unchanged;
    * following sub-tokens of the same word get the tag with ``'B-'``
      replaced by ``'I-'``.

    Args:
        sentences: List of token lists.
        labels: List of tag lists, parallel to ``sentences``.

    Returns:
        ``(tokenized_inputs, labels_aligned)``: the tokenizer output and a
        list with one aligned tag list per sentence.
    """
    encodings = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding=True,
    )

    aligned_batch = []
    for row, word_tags in enumerate(labels):
        aligned = []
        last_seen = None

        for current in encodings.word_ids(batch_index=row):
            if current is None:
                tag = -100
            elif current == last_seen:
                # Continuation piece of the word handled on the previous step.
                tag = word_tags[current].replace('B-', 'I-')
            else:
                tag = word_tags[current]
            aligned.append(tag)
            last_seen = current

        aligned_batch.append(aligned)

    return encodings, aligned_batch

# Tokenize the full training set in one call and align the word-level tags to the sub-word tokens.
train_tokenized_inputs, train_labels_aligned = tokenize_and_align_labels(train_sentences, train_labels)



In [5]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-token label ids.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (each ``encodings[key][idx]`` is turned into a tensor).
        labels: Per-example sequences whose entries are either a tag present
            in ``label_map`` or the value ``-100``.
        label_map: Mapping from tag to integer id.
    """

    def __init__(self, encodings, labels, label_map):
        self.encodings, self.labels, self.label_map = encodings, labels, label_map

    def __len__(self):
        """Return the number of examples (one per label sequence)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])

        label_ids = []
        for label in self.labels[idx]:
            # -100 entries pass through untouched; everything else is looked up.
            label_ids.append(-100 if label == -100 else self.label_map[label])
        item['labels'] = torch.tensor(label_ids)
        return item

# Tag inventory: the outside tag 'O' followed by a B-/I- pair per entity type.
# The position of a tag in label_list is its integer id.
label_list = ['O'] + [
    f'{prefix}-{entity}'
    for entity in ('PER', 'LOC', 'ORG', 'DATE', 'MISC')
    for prefix in ('B', 'I')
]
label_map = dict(zip(label_list, range(len(label_list))))
num_labels = len(label_map)


# Wrap the encodings and aligned tags; NERDataset maps tags to ids through label_map on item access.
train_dataset = NERDataset(train_tokenized_inputs, train_labels_aligned, label_map)

In [None]:
# from transformers import BertForTokenClassification, Trainer, TrainingArguments

# model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)

# training_args = TrainingArguments(
#     output_dir='./results',
#     eval_strategy="epoch",  # Updated to avoid the deprecation warning
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
# )

# trainer.train()

from transformers import BertForTokenClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split

# Per-epoch evaluation and load_best_model_at_end both need an evaluation set,
# so hold out a fixed fraction of the labelled data. The split is seeded so a
# kernel restart reproduces it. `train_dataset` itself is left untouched so
# this cell can be re-run safely.
EVAL_FRACTION = 0.1
SPLIT_SEED = 42
eval_size = max(1, int(len(train_dataset) * EVAL_FRACTION))
train_split, eval_split = random_split(
    train_dataset,
    [len(train_dataset) - eval_size, eval_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Pretrained multilingual BERT with a token-classification head sized to the
# tag inventory.
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)

# Training configuration.
training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; switch if the installed version rejects this keyword.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
)

# The evaluation split is passed explicitly; without it the per-epoch
# evaluation requested above has no data to run on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Start training
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [1]:
import platform
import psutil
import torch

def get_system_info():
    """Print a short hardware summary to stdout.

    Reports the processor string, total RAM in GB, the logical CPU core
    count, and, when CUDA is available, the name and total memory of
    device 0. Otherwise prints that no GPU was found.
    """
    bytes_per_gb = 1024**3

    print(f"Processor: {platform.processor()}")

    ram_gb = round(psutil.virtual_memory().total / bytes_per_gb, 2)
    print(f"RAM: {ram_gb} GB")

    logical_cores = psutil.cpu_count(logical=True)
    print(f"CPU Cores: {logical_cores}")

    if not torch.cuda.is_available():
        print("No GPU found.")
        return

    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_mem_gb = round(torch.cuda.get_device_properties(0).total_memory / bytes_per_gb, 2)
    print(f"GPU Memory: {gpu_mem_gb} GB")

# Print the hardware summary for the machine running this kernel.
get_system_info()

Processor: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
RAM: 15.78 GB
CPU Cores: 16
No GPU found.
