# Fine-Tuning DeBERTa v3


In [1]:
import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType,
)
from transformers import (
    DataCollatorForTokenClassification,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline,
)

amdgpu.ids: No such file or directory


## Load Dataset

The dataset has already been preprocessed by a script and saved to disk, so we only need to load it.


In [2]:
# Read the preprocessed train and validation splits back from disk.
train_ds, val_ds = (
    load_from_disk(split_path)
    for split_path in ("./data/processed/train", "./data/processed/val")
)

## Data Augmentation

In [3]:
# TODO: synonym replacement

## Train Model


In [4]:
model_name = "microsoft/deberta-v3-base"

# BIO tag inventory; a tag's position in the list is its class id.
ner_tags = ["O", "B-TAR", "I-TAR", "B-TOOL", "I-TOOL", "B-DIR", "I-DIR"]
label2id = {tag: idx for idx, tag in enumerate(ner_tags)}
id2label = dict(enumerate(ner_tags))
num_labels = len(ner_tags)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the backbone with 8-bit weights and a token-classification head
# sized for the tag inventory above.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

# NOTE(review): presumably stops Trainer from treating the quantized model as
# model-parallel -- confirm this flag is still required.
model.model_parallel = False

# seqeval metric used by compute_metrics; the collator batches the
# tokenized examples for the Trainer.
metric = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


def compute_metrics(eval_preds):
    """Score token-classification predictions with the seqeval metric.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted class id
    is the argmax over the last axis of ``logits``. Positions whose label is
    -100 are skipped, and the remaining ids are translated to tag strings
    through ``id2label`` before being handed to ``metric``.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by the metric.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tags: drop every position marked with the ignore index.
    gold_tags = []
    for gold_row in labels:
        gold_tags.append(
            [id2label[tag_id] for tag_id in gold_row if tag_id != -100]
        )

    # Predicted tags: keep only the positions that have a real reference label.
    pred_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept = [
            id2label[pred_id]
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        pred_tags.append(kept)

    scores = metric.compute(predictions=pred_tags, references=gold_tags)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# The base model was loaded with load_in_8bit=True, so pass it through PEFT's
# k-bit training preparation before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

In [6]:
# Show the module tree; the repr reveals which projections are Linear8bitLt.
model

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (key_proj): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (value_proj): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear8bitLt(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and a ``requires_grad`` flag.

    A parameter counts as trainable when ``requires_grad`` is truthy. For a
    model with no parameters the percentage is reported as 0.0 instead of
    raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division so an empty model still produces a summary line.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


# LoRA adapters are attached to the attention query/value projections only.
# The token-classification head is newly initialised when the checkpoint is
# loaded, so it is trained in full through `modules_to_save` instead of being
# listed as a low-rank target (listing it in both places is redundant at best
# and can double-wrap the head, depending on the PEFT version).
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_proj", "value_proj"],
    modules_to_save=["classifier"],
    use_rslora=True,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.TOKEN_CLS,
)
# Alternative to try: target_modules="all-linear" (optionally with use_dora=True).

# Wrap the prepared base model with the adapters and report how much of it trains.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 595207 || all params: 184432142 || trainable%: 0.32272411605998697


In [8]:
# Training hyper-parameters: checkpoints land in the output directory every
# 100 steps (at most 8 are kept) and evaluation runs once per epoch.
training_kwargs = dict(
    output_dir="output/deberta-v3-ner-v1",
    num_train_epochs=1,
    learning_rate=1e-3,
    gradient_accumulation_steps=1,
    auto_find_batch_size=True,
    evaluation_strategy="epoch",
    save_steps=100,
    save_total_limit=8,
)
args = TrainingArguments(**training_kwargs)

In [9]:
# Bundle the LoRA-wrapped model, both dataset splits, the batching collator
# and the seqeval-based metric function into one Trainer.
trainer_kwargs = dict(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [10]:
# Run fine-tuning; evaluation_strategy="epoch" triggers an evaluation pass on
# val_ds at the end of each epoch.
trainer.train()

  0%|          | 0/350 [00:00<?, ?it/s]



  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 1.2801204919815063, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6192226655309993, 'eval_runtime': 6.7879, 'eval_samples_per_second': 103.125, 'eval_steps_per_second': 12.964, 'epoch': 1.0}
{'train_runtime': 77.8243, 'train_samples_per_second': 35.978, 'train_steps_per_second': 4.497, 'train_loss': 1.304200439453125, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=350, training_loss=1.304200439453125, metrics={'train_runtime': 77.8243, 'train_samples_per_second': 35.978, 'train_steps_per_second': 4.497, 'total_flos': 72861487240608.0, 'train_loss': 1.304200439453125, 'epoch': 1.0})

In [11]:
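# TODO: once the metrics are acceptable, merge the LoRA weights and save the result.
# merge_and_unload() returns the merged model, so keep its return value:
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained("./output/deberta-v3-ner-v1/merged")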

In [12]:
# Smoke-test the fine-tuned model on one hand-written instruction.
model.eval()  # inference mode (disables dropout)
input_text = "Control here. Deploy anti-air artillery to target a silver, blue, and red helicopter heading one three zero. Engage and neutralize the threat."

# `pipeline` is imported with the other transformers names in the import cell.
# The pipeline logs that the PEFT wrapper class is not in its list of supported
# token-classification models; it still runs and returns grouped entities.
cls = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)
cls(input_text)

The model 'PeftModelForTokenClassification' is not supported for token-classification. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForTokenClassification', 'GPTNeoForTokenClassification', 'GPTNeoXForTokenClassification', 'IBertForTokenClassification', 'LayoutLMForTok

[{'entity_group': 'TOOL',
  'score': 0.22237173,
  'word': 'to',
  'start': 39,
  'end': 42}]