REF： \
https://huggingface.co/docs/peft/task_guides/token-classification-lora \
https://github.com/gyr66/privacy_detection/blob/master/lora.ipynb

In [None]:
# !pip install -q peft transformers datasets evaluate seqeval

# # clean output
# from IPython.display import clear_output
# clear_output()

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

# Shared configuration for the run.
# Checkpoint name handed to the tokenizer and model loaders further down.
model_checkpoint = "bert-base-chinese"
# NOTE(review): the TrainingArguments cell passes per_device_train_batch_size=4
# rather than `batch_size`, and the tokenization function passes max_length=512
# rather than `MAX_LEN` -- confirm which values are intended.
lr = 1e-3
batch_size = 16
num_epochs = 10
# Presumably 256 minus the two special tokens added by the tokenizer -- TODO confirm.
MAX_LEN = 256 - 2

# Train dataset

In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
import json

# Training corpus, read as JSON Lines (one JSON record per line).
file_path = "/data/train.json"
data = pd.read_json(path_or_buf=file_path, lines=True)

In [None]:
# Keep only the two columns used for training: the input sequence and its tags.
df_traindata = data.loc[:, ["character", "character_label"]]
traindataset = Dataset.from_pandas(df_traindata)

# Reproducible 80/20 split; the held-out part is exposed under the key
# "validation" instead of the default "test".
dataset = traindataset.train_test_split(seed=42, train_size=0.8)
dataset["validation"] = dataset.pop("test")

In [None]:
# Sanity check: show the splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['character', 'character_label'],
        num_rows: 22528
    })
    validation: Dataset({
        features: ['character', 'character_label'],
        num_rows: 5633
    })
})

In [None]:
# Load the fast tokenizer that matches the checkpoint.
# `add_prefix_space` is intentionally not passed: it is an option of byte-level
# BPE tokenizers (RoBERTa/GPT-2 style) and is not a parameter of the WordPiece
# tokenizer used by BERT checkpoints.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [None]:
from functools import partial
# Assuming you have a dataset 'traindataset' and a tokenizer 'tokenizer'
# First, define your label dictionary
# BIO tag -> integer id. Ids follow the order of the list below: 'O' is 0 and
# every entity type then takes two consecutive ids (B- first, I- second).
label_dict = {
    tag: tag_id
    for tag_id, tag in enumerate([
        'O',
        'B-BODY', 'I-BODY',
        'B-SYMP', 'I-SYMP',
        'B-INST', 'I-INST',
        'B-EXAM', 'I-EXAM',
        'B-CHEM', 'I-CHEM',
        'B-DISE', 'I-DISE',
        'B-DRUG', 'I-DRUG',
        'B-SUPP', 'I-SUPP',
        'B-TREAT', 'I-TREAT',
        'B-TIME', 'I-TIME',
    ])
}

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: sequence of integer label ids, indexed by word position.
        word_ids: for every token, the index of the word it came from, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``. The first token of
        each word receives that word's label. Special tokens and any further
        tokens of the same word receive -100, so a word that is split into
        several pieces is labelled once instead of repeating its tag
        (which would duplicate ``B-`` tags inside one entity).
    """
    aligned = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_word_id:
            # Special token, or a continuation piece of the previous word.
            aligned.append(-100)
        else:
            aligned.append(labels[word_id])
        previous_word_id = word_id
    return aligned

def tokenize_and_align_labels(examples, label_dict):
    """Tokenize a batch and attach token-level label ids.

    Args:
        examples: batch mapping with a "character" column (pre-split input
            sequences) and a "character_label" column (string tags per item).
        label_dict: mapping from string tag to integer label id.

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding, per example, the label ids aligned to the produced tokens
        by ``align_labels_with_tokens``.

    Uses the module-level ``tokenizer``; inputs are truncated at 512 tokens.
    """
    encoded = tokenizer(
        examples["character"],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
    )

    def _labels_for(row, tags):
        # String tags -> integer ids, then spread over the tokens of this row.
        tag_ids = [label_dict[tag] for tag in tags]
        return align_labels_with_tokens(tag_ids, encoded.word_ids(batch_index=row))

    encoded["labels"] = [
        _labels_for(row, tags)
        for row, tags in enumerate(examples["character_label"])
    ]
    return encoded

In [None]:
# Pre-bind the label mapping so that `map` only has to supply the batch.
tokenize_and_align_labels_with_dict = partial(
    tokenize_and_align_labels,
    label_dict=label_dict,
)

# Tokenize every split in batches across 16 worker processes; the original
# columns are removed from the result.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels_with_dict,
    num_proc=16,
    batched=True,
    remove_columns=traindataset.column_names,
)

Map (num_proc=16):   0%|          | 0/22528 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/5633 [00:00<?, ? examples/s]

In [None]:
# Collator used by the Trainer to assemble token-classification batches;
# it is built around the same tokenizer that produced the encoded dataset.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Load the seqeval metric; `compute_metrics` calls `metric.compute` on the
# decoded tag sequences.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

# Train

In [None]:
# Tag names indexed by label id. The order must match the ids assigned in
# `label_dict`, because `compute_metrics` decodes ids with `label_names[id]`.
label_names = [
    'O',
    'B-BODY', 'I-BODY',
    'B-SYMP', 'I-SYMP',
    'B-INST', 'I-INST',
    'B-EXAM', 'I-EXAM',
    'B-CHEM', 'I-CHEM',
    'B-DISE', 'I-DISE',
    'B-DRUG', 'I-DRUG',
    'B-SUPP', 'I-SUPP',
    'B-TREAT', 'I-TREAT',
    'B-TIME', 'I-TIME'
]

# Model config mappings. Ids are integers on both sides: string keys such as
# "0" would make `id2label[predicted_id]` fail for integer predictions and
# would leave `label2id` pointing at strings instead of class indices.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Score predictions with the seqeval metric.

    Args:
        eval_preds: pair ``(logits, labels)``. The predicted class of each
            position is the argmax of ``logits`` over the last axis;
            ``labels`` holds the reference ids, with -100 marking positions
            to ignore.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's ``overall_*`` entries.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    def _decode(id_row, reference_row):
        # Keep only positions whose reference label is not the ignore index,
        # and turn the surviving ids into tag names.
        return [
            label_names[tag_id]
            for tag_id, reference_id in zip(id_row, reference_row)
            if reference_id != -100
        ]

    true_labels = [_decode(reference_row, reference_row) for reference_row in labels]
    true_predictions = [
        _decode(predicted_row, reference_row)
        for predicted_row, reference_row in zip(predictions, labels)
    ]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    scores = {}
    scores["precision"] = all_metrics["overall_precision"]
    scores["recall"] = all_metrics["overall_recall"]
    scores["f1"] = all_metrics["overall_f1"]
    scores["accuracy"] = all_metrics["overall_accuracy"]
    return scores

In [None]:
# Load the pretrained encoder with a token-classification head sized and
# named by the label mappings (the head's weights are newly initialised,
# see the loader's message in the cell output).
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
)
model

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

# Add LoRA

In [None]:
# LoRA settings for token classification, configured for training
# (inference_mode=False).
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
)

In [None]:
# Attach the LoRA adapters to the base model.
# This cell rebinds `model` to its own output, so running it a second time
# would wrap an already-wrapped model; the guard keeps the cell idempotent.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)
model

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): BertForTokenClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(21128, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(

In [None]:
# Report how many parameters are trainable versus the total parameter count.
model.print_trainable_parameters()

trainable params: 708,117 || all params: 102,299,178 || trainable%: 0.6922020429137759


# Train model

In [None]:
# Training schedule: evaluate, log and checkpoint once per epoch, keep a
# single checkpoint on disk, and reload the best one (by "f1") when training ends.
# The learning rate and epoch count come from the configuration cell (`lr`,
# `num_epochs`) instead of being repeated here as literals.
# NOTE(review): the per-device train batch size is kept at 4, while the
# configuration cell defines batch_size = 16 -- confirm which one is intended.
training_args = TrainingArguments(
    output_dir="Bert_LoRA",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    logging_strategy="epoch",
    dataloader_num_workers=16,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    save_total_limit=1
)

In [None]:
# Assemble the trainer from the pieces built above, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

Epoch,Training Loss,Validation Loss


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [None]:
# model.push_to_hub("UJForSchool/Bert_base_chinese_LoRA", private=True)
# tokenizer.push_to_hub("UJForSchool/Bert_base_chinese_LoRA", private=True)