# introdution

1. 默认精度 `单精度: fp32`: 32bits = 4Bytes
2. 常见低精度: `fp16 (半精度), bfloat16, int16, int8, fp4, nf4`.

<div>
<img src="figs/14_fp16.jpg" width="1000"/>
</div>

In [1]:
import torch.nn as nn
import torch
net = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)
net = net.half()
net.state_dict()['0.weight'].dtype

torch.float16

# LLAMA2: LORA + fp16

https://medium.com/@ogbanugot/notes-on-fine-tuning-llama-2-using-qlora-a-detailed-breakdown-370be42ccca1

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
hf_token = os.environ.get('HF_TOKEN')
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

In [3]:
ds = Dataset.load_from_disk('data/alpaca_data_zh')
datasets = ds.train_test_split(test_size=0.2, seed=42)

ckpt = 'meta-llama/Llama-2-7b-hf'#'/data/llama2/llama-2-7b-chat-hf'
tokenizer = AutoTokenizer.from_pretrained(ckpt, token=hf_token)

# some details about the tokenizer
if tokenizer.pad_token is None:
    print('the tokenizer has no pad token!')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
print('add [PAD] to the tokenizer')
# Note from blog: I read several tutorials for fine-tuning Llama 2 that use the EOS token for left padding. 
# If you do that, you will have a 0.0 loss and the training will diverge. Try it! It’s interesting to observe it.
# left padding for generation and right padding for fine-tuning
tokenizer.padding_side = 'right' # otherwise it will create weird loss (0.0 or 33.33 ish train loss, vanish or explode as suggested in video tutorial)




the tokenizer has no pad token!
add [PAD] to the tokenizer


In [4]:
tokenizer.eos_token, tokenizer.pad_token

('</s>', '[PAD]')

In [5]:
print(tokenizer.vocab_size), print(len(tokenizer))
print('tokenizer.vocab_size does not change as it is an attribute!')

32000
32001
tokenizer.vocab_size does not change as it is an attribute!


In [6]:
def process_function(example):
    MAX_LENGTH = 384 # increase the length to 384 to accommodate the Chinese characters
    input_ids, attention_mask, labels = [], [], []
    instruction = example['instruction']
    input_str = example['input']
    instruction_input_seq = "\n".join(["Human: " + instruction, input_str]).strip() + "\n\n Assistant:"
    tokenized_instruction_input = tokenizer(instruction_input_seq, add_special_tokens=False)
    response_str = example['output'] + tokenizer.eos_token
    tokenized_response = tokenizer(response_str, add_special_tokens=False)
    input_ids = tokenized_instruction_input['input_ids'] + tokenized_response['input_ids']
    attention_mask = tokenized_instruction_input['attention_mask'] + tokenized_response['attention_mask']
    labels = [-100] * len(tokenized_instruction_input['input_ids']) + tokenized_response['input_ids']
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

tokenized_ds = datasets.map(process_function, remove_columns=ds.column_names)   

In [7]:
tokenizer.decode(tokenized_ds['train']['input_ids'][0])

'Human: 亲爱的客户，我很自豪地向您推荐一款产品，我相信您一定会喜欢。\n\n Assistant: 谢谢您的信任！请您告诉我产品的详细信息以及特点，我可以更好地了解并向其他潜在客户推荐它。</s>'

## discussion padding issues in llama2

In [8]:
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
train_dataloader = DataLoader(tokenized_ds['train'], collate_fn=collator, shuffle=False, batch_size=4)

In [9]:
batch = next(iter(train_dataloader))

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
[len(seq) for seq in tokenized_ds['train']['input_ids'][:4]]

[130, 43, 33, 82]

In [11]:
[len(seq) for seq in batch['input_ids']]

[130, 130, 130, 130]

In [12]:
[sum(mask == 1).item() for mask in batch['attention_mask']]

[130, 43, 33, 82]

## load model

In [13]:

model = AutoModelForCausalLM.from_pretrained(ckpt, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map='auto', token=hf_token)
# model = AutoModelForCausalLM.from_pretrained(ckpt, low_cpu_mem_usage=True, device_map='auto', token=hf_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
model.device

device(type='cuda', index=0)

In [15]:
model.dtype

torch.bfloat16

In [16]:
model.resize_token_embeddings(len(tokenizer)) 

Embedding(32001, 4096)

In [17]:
print(model.config.vocab_size)

32001


In [18]:
# does not train the embedding layer
for name, params in model.model.embed_tokens.named_parameters():
    params.requires_grad = False
    print(name, params.dtype, params.device, params.shape, params.requires_grad)
mean_embedding = torch.mean(model.model.embed_tokens.weight[:-1], dim=0)
model.model.embed_tokens.weight[-1] = mean_embedding

weight torch.bfloat16 cuda:0 torch.Size([32001, 4096]) False


In [19]:
# if use fp16 will get nan in logits
# model(**batch.to('cuda'))

## lora config

In [20]:
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(task_type=TaskType.CAUSAL_LM)
lora_model = get_peft_model(model, config)

In [21]:
lora_model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,618,112 || trainable%: 0.06220586618327525


In [27]:
count = 0
for name, param in lora_model.named_parameters():
    print(name, param.dtype)
    count += 1
    if count > 10:
        break

base_model.model.model.embed_tokens.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.k_proj.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.bfloat16
base_model.model.model.layers.0.self_attn.o_proj.weight torch.bfloat16
base_model.model.model.layers.0.mlp.gate_proj.weight torch.bfloat16
base_model.model.model.layers.0.mlp.up_proj.weight torch.bfloat16


In [24]:
# lora_model(**batch.to('cuda'))

## training

In [25]:
args = TrainingArguments(output_dir='./llama2_lora/',
                        num_train_epochs=1,
                        per_device_train_batch_size=2,
                        gradient_accumulation_steps=8,
                        per_device_eval_batch_size=4,
                        logging_steps=10,
                        load_best_model_at_end=True,
                        evaluation_strategy='epoch',
                        save_strategy='epoch',
                        save_total_limit=0,
                        report_to='wandb',
                        # adam_epsilon=1e-4, # default is 1e-8 but lead to underflow issue when use fp16 training.
                        )
trainer = Trainer(model=lora_model, args=args, train_dataset=tokenized_ds['train'].select(range(6000)), eval_dataset=tokenized_ds['test'],
                data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True))

In [26]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myutongdai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.0094,0.99685


TrainOutput(global_step=375, training_loss=1.023783432006836, metrics={'train_runtime': 1085.7184, 'train_samples_per_second': 5.526, 'train_steps_per_second': 0.345, 'total_flos': 5.093896926245683e+16, 'train_loss': 1.023783432006836, 'epoch': 1.0})

1. when use `gradient_checkpoint` with `lora` add `lora_model.enable_input_require_parameters()` beofre setting up `TrainingArguments`.

2. when use `fp16` with `adam` needs to adjust `adam_epsilon`, otheriwse you get zero loss. [can be buggy]

# inference

In [30]:
from peft import PeftModel
# model = AutoModelForCausalLM.from_pretrained(ckpt, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map='auto', token=hf_token)
p_model = PeftModel.from_pretrained(model, model_id="./llama2_lora/checkpoint-375/")

In [35]:
ipt = tokenizer("Human: {}\n{}".format("考试有哪些技巧？", "").strip() + "\n\nAssistant: ", return_tensors="pt")
ipt = {k: v.to(p_model.device) for k, v in ipt.items()}
print(tokenizer.decode(p_model.generate(**ipt, num_beams=3, 
                    max_length=100, repetition_penalty=1.5)[0], skip_special_tokens=True))

Human: 考试有哪些技巧？

Assistant: 考试是一种重要的学习和评估方法，它能够评估学生的知识水平和学习能力。考试的技巧可以帮助学生提高考


In [38]:
ipt = tokenizer("Human: {}\n{}".format("如何做西红柿炒鸡蛋？", "").strip() + "\n\nAssistant: ", return_tensors="pt")
ipt = {k: v.to(p_model.device) for k, v in ipt.items()}
print(tokenizer.decode(p_model.generate(**ipt, num_beams=3, 
                    max_length=300, repetition_penalty=1.5)[0], skip_special_tokens=True))

Human: 如何做西红柿炒鸡蛋？

Assistant: 1. 把西红柿切成小块。

2. 把鸡蛋炒熟。

3. 将西红柿和鸡蛋放在一起，加上酱油、胡椒粉、辣椒粉等调料，炒熟。


# 8-bit

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
hf_token = os.environ.get('HF_TOKEN')
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import torch

In [5]:
ckpt = 'meta-llama/Llama-2-7b-hf'
model = AutoModelForCausalLM.from_pretrained(ckpt, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map='auto', token=hf_token, load_in_8bit=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
for name, params in model.named_parameters():
    if params.dtype == torch.int8:
        print(name, params.dtype)

model.layers.0.self_attn.q_proj.weight torch.int8
model.layers.0.self_attn.k_proj.weight torch.int8
model.layers.0.self_attn.v_proj.weight torch.int8
model.layers.0.self_attn.o_proj.weight torch.int8
model.layers.0.mlp.gate_proj.weight torch.int8
model.layers.0.mlp.up_proj.weight torch.int8
model.layers.0.mlp.down_proj.weight torch.int8
model.layers.1.self_attn.q_proj.weight torch.int8
model.layers.1.self_attn.k_proj.weight torch.int8
model.layers.1.self_attn.v_proj.weight torch.int8
model.layers.1.self_attn.o_proj.weight torch.int8
model.layers.1.mlp.gate_proj.weight torch.int8
model.layers.1.mlp.up_proj.weight torch.int8
model.layers.1.mlp.down_proj.weight torch.int8
model.layers.2.self_attn.q_proj.weight torch.int8
model.layers.2.self_attn.k_proj.weight torch.int8
model.layers.2.self_attn.v_proj.weight torch.int8
model.layers.2.self_attn.o_proj.weight torch.int8
model.layers.2.mlp.gate_proj.weight torch.int8
model.layers.2.mlp.up_proj.weight torch.int8
model.layers.2.mlp.down_proj.w

In [9]:
model.config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": false,
    "load_in_8bit": true,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version":