In [1]:
import os
# os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, pipeline

# 数据处理

In [2]:
# Load the dataset previously saved at data/alpaca_data_zh.
ds = Dataset.load_from_disk('data/alpaca_data_zh')
# Hold out 20% for evaluation; the fixed seed keeps the train/test partition
# identical across kernel restarts so eval losses are comparable between runs.
datasets = ds.train_test_split(test_size=0.2, seed=42)

In [3]:
# Checkpoint identifier handed to `from_pretrained` for the tokenizer here
# and for the model weights in a later cell.
ckpt = 'Langboat/bloom-1b4-zh'
# Tokenizer for the checkpoint; the preprocessing function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(ckpt)

def process_function(example, max_length=256, tok=None):
    r"""Convert one instruction record into causal-LM training features.

    The prompt is rendered as ``Human: <instruction>\n<input>`` (stripped),
    followed by ``\n\nAssistant: ``. This is the same template the inference
    cells use, so the model is tuned on the prompt it is later queried with.
    The response is ``<output>`` plus the tokenizer's EOS token.

    Args:
        example: Mapping with string fields 'instruction', 'input', 'output'.
        max_length: Maximum number of tokens kept per sequence; anything
            beyond it is cut off (which may drop the trailing EOS token).
        tok: Callable tokenizer returning 'input_ids' and 'attention_mask'
            and exposing ``eos_token``. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels', all of equal
        length. Prompt positions in 'labels' are -100 so that only response
        tokens contribute to the loss.
    """
    if tok is None:
        # Fall back to the tokenizer created at notebook level.
        tok = tokenizer

    prompt = "\n".join(["Human: " + example['instruction'], example['input']]).strip() + "\n\nAssistant: "
    response = example['output'] + tok.eos_token

    # Prompt and response are tokenized separately so the prompt length is
    # known exactly when building the label mask.
    prompt_enc = tok(prompt)
    response_enc = tok(response)

    input_ids = prompt_enc['input_ids'] + response_enc['input_ids']
    attention_mask = prompt_enc['attention_mask'] + response_enc['attention_mask']
    labels = [-100] * len(prompt_enc['input_ids']) + response_enc['input_ids']

    # Slicing is a no-op for sequences already within the limit.
    return {
        'input_ids': input_ids[:max_length],
        'attention_mask': attention_mask[:max_length],
        'labels': labels[:max_length],
    }

# Apply the preprocessing to every split and drop the original columns so
# that only the fields returned by process_function remain.
tokenized_ds = datasets.map(
    process_function,
    remove_columns=ds.column_names,
)

Map:   0%|          | 0/21486 [00:00<?, ? examples/s]

Map:   0%|          | 0/5372 [00:00<?, ? examples/s]

# 模型加载

In [4]:
# Load the causal language model weights for the checkpoint named by `ckpt`.
model = AutoModelForCausalLM.from_pretrained(ckpt)

In [5]:
# Back-of-the-envelope memory needed for full fine-tuning.
# Sizes are in decimal gigabytes (unit = 1000) at 4 bytes per parameter.
unit = 1000
num_params = sum(weight.numel() for weight in model.parameters())

model_size = num_params * 4 / unit**3          # the weights themselves
gradient_size = model_size                     # one gradient value per weight
optimizer_state = model_size * 2               # two extra values per weight
total_size = model_size + optimizer_state + gradient_size

print(f'Number of parameters: {num_params/unit**3:.2f} B')
for label, size_gb in (
    ('Model size', model_size),
    ('Gradient size', gradient_size),
    ('Optimizer state', optimizer_state),
    ('Total size', total_size),
):
    # Blank line before each entry mirrors the cell's original output layout.
    print(f'{label}: {size_gb:.2f} GB')

Number of parameters: 1.30 B
Model size: 5.21 GB
Gradient size: 5.21 GB
Optimizer state: 10.42 GB
Total size: 20.85 GB


# BitFit
BitFit updates only the bias terms of the model; all other parameters are frozen.

[BitFit: Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models](https://arxiv.org/abs/2106.10199)

In [6]:
# BitFit: leave parameters whose name contains 'bias' untouched and counted,
# and switch off gradients for everything else.
num_params = 0
for name, params in model.named_parameters():
    if 'bias' in name:
        num_params += params.numel()
        continue
    params.requires_grad = False
print(f'Number of trainable parameters: {num_params/unit**3:.6f} B')

Number of trainable parameters: 0.000545 B


In [7]:
# Performance before training: generate a reply with the untuned model so
# there is a baseline to compare against after fine-tuning.
# `pipeline` is imported with the other transformers names at the top of the notebook.
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
ipt = "Human: {}\n{}".format('考试有什么技巧', '').strip() + "\n\nAssistant: "
result = pipe(ipt, max_length=100, num_beams=5)
print(result[0]['generated_text'])

Human: 考试有什么技巧

Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 考试有什么技巧
Assistant: 


# 训练

In [8]:
# Collator that pads each batch (padding=True) using the notebook tokenizer.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# One epoch; 8 samples per device with 4 accumulation steps between updates.
# Evaluation and checkpointing both happen once per epoch, only one
# checkpoint is kept, and the best one is reloaded when training ends.
args = TrainingArguments(
    output_dir='./bitfit/',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    data_collator=collator,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

# 推理

In [None]:
# Ask the fine-tuned model the same question that was used for the baseline.
question, extra_input = '考试有什么技巧', ''
prompt_head = "Human: {}\n{}".format(question, extra_input).strip()
ipt = prompt_head + "\n\nAssistant: "

pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
result = pipe(ipt, max_length=100, num_beams=5)
print(result[0]['generated_text'])