Fine-tune bert-base-multilingual-cased from Hugging Face

- We download all the files we need in advance, because this notebook runs on AutoDL and there is no VPN to access the Hugging Face API from https://huggingface.co/tugstugi/bert-base-mongolian-cased

#1 Download and upgrade the required libraries (make sure they are compatible with your environment) before running any other cell

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [2]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -U bitsandbytes

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[33mDEPRECATION: The HTML index page being used (http://mirrors.aliyun.com/pypi/simple/bitsandbytes/) is not a proper HTML 5 document. This is in violation of PEP 503 which requires these pages to be well-formed HTML 5 documents. Please reach out to the owners of this index page, and ask them to update this index page to a valid HTML 5 document. pip 22.2 will enforce this behaviour change. Discussion can be found at https://github.com/pypa/pip/issues/10825[0m[33m
[0m

In [3]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install --upgrade protobuf sentencepiece transformers

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[33mDEPRECATION: The HTML index page being used (http://mirrors.aliyun.com/pypi/simple/protobuf/) is not a proper HTML 5 document. This is in violation of PEP 503 which requires these pages to be well-formed HTML 5 documents. Please reach out to the owners of this index page, and ask them to update this index page to a valid HTML 5 document. pip 22.2 will enforce this behaviour change. Discussion can be found at https://github.com/pypa/pip/issues/10825[0m[33m
[33mDEPRECATION: The HTML index page being used (http://mirrors.aliyun.com/pypi/simple/sentencepiece/) is not a proper HTML 5 document. This is in violation of PEP 503 which requires these pages to be well-formed HTML 5 documents. Please reach out to the owners of this index page, and ask them to update this index page to a valid HTML 5 document. pip 22.2 will enforce this behaviour change. Discussion can be found at https://github.com/pypa/pip/issues/10825[0m[33m
[3

#2 (Skip this step if you use the Hugging Face API.) Load the downloaded bert-base-multilingual-cased files and run a quick test to check that they work

In [4]:
from transformers import BertTokenizer, BertModel

# Smoke test: confirm the locally downloaded files can be loaded without network access.
# Load the tokenizer from the local directory.
tokenizer = BertTokenizer.from_pretrained(
    "/dev/shm/bert-base-multiligual-cased",  # absolute path, no trailing slash
    local_files_only=True
)

# Load the base model (no task head) from the same directory.
model = BertModel.from_pretrained(
    "/dev/shm/bert-base-multiligual-cased",
    local_files_only=True
)

# Tokenize a short Mongolian sentence to check the vocabulary.
text = "Сүүлийн таван жил дараалан"
tokens = tokenizer.tokenize(text)
print(tokens)  # prints WordPiece pieces, e.g. ['С', '##үү', '##лийн', 'тав', '##ан', 'жил', 'дараа', '##лан']

  from .autonotebook import tqdm as notebook_tqdm


['С', '##үү', '##лийн', 'тав', '##ан', 'жил', 'дараа', '##лан']


#3 Configuration

In [5]:
import json
import os
import time
import gc
import torch
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support

# Local directory passed to from_pretrained(..., local_files_only=True) for both
# the tokenizer and the model.
# NOTE(review): the folder name is spelled "multiligual" (sic); it has to match
# the directory on disk, so confirm before correcting the spelling.
BERT_MODEL_PATH = "/dev/shm/bert-base-multiligual-cased"
# JSON Lines files for the training and validation splits; each record is read
# later through its "tokens" and "pos_tags" fields.
TRAIN_DATASET_PATH = "/dev/shm/train.jsonl"
VALID_DATASET_PATH = "/dev/shm/validation.jsonl"

#4 Memory Management Function

In [6]:
def clear_cuda_memory():
    """Run Python garbage collection, then empty PyTorch's CUDA cache.

    Intended to be called between training attempts to lower the chance of a
    CUDA out-of-memory error. Returns None.
    """
    # Collect first: tensors owned by unreachable objects must be dropped
    # before the cached blocks behind them can be given back.
    gc.collect()
    torch.cuda.empty_cache()
    return None

#5 Load Tokenizer

In [7]:
# Load the fast tokenizer from the local model directory (local_files_only=True,
# so no download is attempted). This rebinds `tokenizer`, replacing the
# BertTokenizer instance created in the earlier smoke-test cell.
tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_PATH, local_files_only=True)

#6 Load Dataset

In [8]:
# Map each split name to its JSON Lines file and load both in one call.
data_files = {
    "train": TRAIN_DATASET_PATH,
    "validation": VALID_DATASET_PATH,
}
dataset = load_dataset("json", data_files=data_files)

# Keep a direct handle on each split for the following cells.
train_dataset, val_dataset = dataset["train"], dataset["validation"]

#7 Determine Number of Labels (Mapping)

In [None]:
def get_unique_labels(dataset):
    """Return the sorted list of distinct tags found across all examples.

    Args:
        dataset: Iterable of examples, each exposing a ``"pos_tags"`` sequence.

    Returns:
        A sorted list with every tag that occurs at least once.
    """
    return sorted({tag for example in dataset for tag in example["pos_tags"]})

# The tag vocabulary is taken from the training split only; ids follow the
# sorted order returned by get_unique_labels.
unique_labels = get_unique_labels(train_dataset)
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))
num_labels = len(label2id)

#8 Load Model

In [None]:
# Build the token-classification model from the local checkpoint, sized to the
# tag vocabulary and carrying both label mappings in its config.
model = BertForTokenClassification.from_pretrained(
    BERT_MODEL_PATH,
    local_files_only=True,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
# Move the weights to the GPU.
model = model.to("cuda")

#9 Tokenization & Formatting

In [None]:
def tokenize_and_align_labels(example, max_length=128):
    """Tokenize one pre-split sentence and give every sub-token its word's tag id.

    Args:
        example: A dataset row with a ``"tokens"`` list (the words) and a
            parallel ``"pos_tags"`` list (one tag per word).
        max_length: Fixed sequence length; the input is truncated and padded
            to exactly this many positions.

    Returns:
        A dict of plain Python lists: the tokenizer outputs plus ``"labels"``.
        Positions that belong to no word (special tokens, padding) are
        labelled -100, the value ``compute_metrics`` filters out and the
        default ``ignore_index`` of PyTorch's cross-entropy loss. A word whose
        tag is absent from ``label2id`` is also labelled -100 instead of being
        silently scored as class 0.
    """
    words = example["tokens"]
    pos_tags = example["pos_tags"]

    # No return_tensors here: the result is stored by Dataset.map as lists
    # anyway, so building tensors only to unwrap them again is wasted work
    # (and torch.tensor(<tensor>) emits a copy-construct warning per example).
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # word_ids() yields, per sub-token, the index of its source word or None.
    labels = [
        -100 if word_id is None else label2id.get(pos_tags[word_id], -100)
        for word_id in encoding.word_ids()
    ]

    features = dict(encoding)
    features["labels"] = labels
    return features

train_dataset = train_dataset.map(tokenize_and_align_labels, remove_columns=["tokens", "pos_tags"])
val_dataset = val_dataset.map(tokenize_and_align_labels, remove_columns=["tokens", "pos_tags"])

#10 Define Metrics

In [None]:
def compute_metrics(p):
    """Compute weighted precision, recall and F1 over the labelled positions.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the classes on axis 2; ``labels`` holds the gold class
            ids, with -100 marking positions to ignore.

    Returns:
        dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, labels = p
    predictions = np.argmax(scores, axis=2)
    labels = np.asarray(labels)

    # Boolean mask instead of a per-token Python loop: keeps only the
    # positions that carry a real label.
    keep = labels != -100
    y_true = labels[keep]
    y_pred = predictions[keep]

    # Weighted averaging depends only on which positions share a class, so the
    # integer ids can be scored directly without mapping them to tag names.
    # zero_division=0 returns the same 0.0 as the default but without a
    # warning for every tag that is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    return {"precision": precision, "recall": recall, "f1": f1}

#11 Training Arguments

In [None]:
def get_training_args(batch_size=8, grad_accum=8):
    """Create the TrainingArguments used for this fine-tuning run.

    Args:
        batch_size: Per-device batch size, applied to both training and
            evaluation.
        grad_accum: Number of gradient accumulation steps.

    Returns:
        A TrainingArguments instance; every other setting is fixed below.
    """
    # Where checkpoints and logs are written, and where metrics are reported.
    locations = {
        "output_dir": "/dev/shm/bert_finetuned",
        "logging_dir": "/dev/shm/logs",
        "report_to": "none",
    }
    # Epoch count plus the step intervals for evaluation, logging and saving.
    schedule = {
        "num_train_epochs": 3,
        "eval_strategy": "steps",
        "eval_steps": 500,
        "logging_steps": 100,
        "save_steps": 500,
        "save_total_limit": 1,
    }
    # The only two values callers can vary.
    batching = {
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "gradient_accumulation_steps": grad_accum,
    }
    # Precision, optimizer and throughput/memory switches.
    runtime = {
        "fp16": True,
        "bf16": False,
        "optim": "adamw_torch_fused",
        "gradient_checkpointing": True,
        "dataloader_num_workers": 4,
        "torch_compile": True,
    }
    return TrainingArguments(**locations, **schedule, **batching, **runtime)

training_args = get_training_args()

#12 Initialize Trainer

In [None]:
# Wire the model, arguments, tokenized splits and metric function into a Trainer.
# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=` — check against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

#13 Run Training with Memory Handling

In [None]:
print("Fine-tuning started at:", time.strftime("%Y-%m-%d %H:%M:%S"))

# Train, and on a CUDA out-of-memory error retry with half the per-device batch
# size and twice the gradient accumulation steps (their product is unchanged).
# NOTE: every retry reuses the same `model` object, so weight updates made
# before the failure are kept; only the Trainer (optimizer, scheduler, step
# counter) starts over.
while True:
    hit_oom = False
    try:
        trainer.train()
    except RuntimeError as e:
        # Only a CUDA OOM is recoverable here; any other RuntimeError must
        # surface instead of being swallowed and followed by "finished".
        if "out of memory" not in str(e).lower():
            raise
        # Nothing left to shrink: retrying with batch size 1 would fail again.
        if training_args.per_device_train_batch_size <= 1:
            raise
        hit_oom = True

    if not hit_oom:
        break

    # Recovery runs outside the `except` block on purpose: while the handler is
    # active the exception's traceback keeps the failing frames, and the
    # tensors they reference, alive, so the cache could not be released there.
    print("CUDA Out of Memory! Reducing batch size...")

    new_batch_size = max(1, training_args.per_device_train_batch_size // 2)
    new_grad_accum = training_args.gradient_accumulation_steps * 2

    # Drop the failed trainer first so the state it holds can be freed.
    del trainer
    clear_cuda_memory()

    training_args = get_training_args(batch_size=new_batch_size, grad_accum=new_grad_accum)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

print("Fine-tuning finished at:", time.strftime("%Y-%m-%d %H:%M:%S"))

#14 Save Model

In [None]:
# Write the final model to the same directory used as the Trainer's output_dir.
# NOTE(review): /dev/shm is normally a RAM-backed tmpfs — copy the saved model
# to persistent storage if it has to survive an instance restart; confirm for
# this machine.
trainer.save_model("/dev/shm/bert_finetuned")
print("Final model saved.")