In [1]:
!python --version

Python 3.9.19


In [1]:
# 载入模型
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('/ssd/xthu/ChatGLM2-6B/chatglm2-6b', trust_remote_code=True)

prompt = tokenizer.build_prompt("AI是什么？", None)
print(" ===Outcome of build_prompt=== ")
print(prompt)

print(" ===Special Tokens=== ")
print("BOS token ID:", tokenizer.get_command("<bos>"))
print("EOS token ID:", tokenizer.get_command("<eos>"))
print("PAD token ID:", tokenizer.get_command("<pad>"))
print("SOP token ID:", tokenizer.get_command("sop"))
print("EOP token ID:", tokenizer.get_command("eop"))
print("MASK token ID:", tokenizer.get_command("[MASK]"))
print("GMASK token ID:", tokenizer.get_command("[gMASK]"))

max_source_length = 100
max_target_length = 200
q_ids = tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                      max_length=max_source_length)
print(" ===Question ids using encode with special tokens=== ")
print(q_ids)
a_ids = tokenizer.encode(text="AI是人工智能(Artificial Intelligence)的缩写", add_special_tokens=False, truncation=True,
                                      max_length=max_target_length)
print(" ===Answer ids===")
print(a_ids)

# print(" ===Question ids using tokenizer() without argument=== ")
# q_ids2 = tokenizer([prompt])
# print(q_ids2)
# 结果是q_ids2的input_ids和q_ids一致，但是多了attention_mask, position_ids之类的

 ===Outcome of build_prompt=== 
[Round 1]

问：AI是什么？

答：
 ===Special Tokens=== 
BOS token ID: 1
EOS token ID: 2
PAD token ID: 0
SOP token ID: 64792
EOP token ID: 64793
MASK token ID: 64789
GMASK token ID: 64790
 ===Question ids using encode with special tokens=== 
[64790, 64792, 790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211, 23833, 32664, 31514, 13, 13, 55437, 31211]
 ===Answer ids===
[11265, 54532, 34797, 30946, 11868, 9596, 13067, 30945, 54530, 56068, 55172]


In [None]:
!pip install peft

In [2]:
print(" !!!Loading Model!!! ")
import torch
model = AutoModel.from_pretrained("/ssd/xthu/ChatGLM2-6B/chatglm2-6b", trust_remote_code=True)
max_source_length = 128
max_target_length = 128
epochs=5
batch_size=10
lr = 1e-6
lora_r=8
device=torch.device("cuda")

from peft import LoraConfig, get_peft_model, TaskType
peft_config=LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_r,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, peft_config).to(device)
model.print_trainable_parameters()
# model=model.half()

 !!!Loading Model!!! 


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

trainable params: 1,949,696 || all params: 6,245,533,696 || trainable%: 0.0312


In [None]:
# 用来清除内存（NON-USE）
for param in model.parameters():
    del param
del model
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# 用来检测清除内存（NON-USE）
print(torch.cuda.memory_allocated())

In [3]:
print(" !!!Start Training!!! ")
import time
print(" ===Query=== ")
query = "学猫叫"
print(query)
print(" ===Prompt=== ")
prompt = tokenizer.build_prompt(query, None)
print(prompt)
print(" ===Inputs=== ")
inputs = tokenizer([prompt], return_tensors="pt")
from pprint import pprint
pprint(inputs)


 !!!Start Training!!! 
 ===Query=== 
学猫叫
 ===Prompt=== 
[Round 1]

问：学猫叫

答：
 ===Inputs=== 
{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[64790, 64792,   790, 30951,   517, 30910, 30939, 30996,    13,    13,
         54761, 31211, 54545, 56267, 55483,    13,    13, 55437, 31211]]),
 'position_ids': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18]])}


In [None]:
# 试验：history长啥样（NON-USE）
print("This is for DEBUGing")
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])
print(" === Response === ")
print(response)
print(" === History === ")
print(history)

In [None]:
# 实验：如果有history（NON-USE）
query = "学猫叫"
history = [("学狗叫","汪汪汪")]
prompt = tokenizer.build_prompt(query, history)
print(prompt)

In [None]:
# 测试device（NON-USE）
print(f"model:{model.device}")
print(f"inputs.input_ids:{inputs['input_ids'].device}")
print(f"inputs.attention_mask:{inputs['attention_mask'].device}")
print(f"inputs.position_ids:{inputs['position_ids'].device}")
print(f"labels:{labels.device}")

In [None]:
# model.generate()测试（NON-USE）
# Default args of generating 
max_length=8192
num_beams=1
do_sample=True
top_p=0.8
temperature=0.8
logits_processor=None
gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                      "temperature": temperature, "logits_processor": logits_processor}
# Put inputs into cuda
inputs['input_ids'] = inputs['input_ids'].to(device)
inputs['attention_mask'] = inputs['attention_mask'].to(device)
inputs['position_ids'] = inputs['position_ids'].to(device)

print(" ===Outputs=== ")
outputs=model.generate(**inputs, **gen_kwargs)
print(outputs)

In [None]:
# 试验：output decode（NON-USE）
outputs_list = outputs.tolist()[0]
response = tokenizer.decode(outputs_list)
print(response)

In [None]:
# 试验：one token decode（NON-USE）
str_item = tokenizer.decode([34211])
print(str_item)

In [4]:
print(" ===Labels=== ")
query_ids = inputs["input_ids"]
answer_ids = tokenizer.encode("喵喵喵", add_special_tokens=False)
label1 = query_ids.tolist()+answer_ids+[2]
print(f"label1:{label1}")
label2 = query_ids.tolist()+[answer_ids]+[[2]]
print(f"label2:{label2}")
label_list = [query_ids.tolist()[0]+answer_ids+[2]]
print(f"label:{label_list}")

labels_list = [[-100]*len(query_ids.tolist()[0]) + answer_ids + [2]]
print(f"labels:{labels_list}")
print(f"labels.len:{len(labels_list[0])}")
labels = torch.tensor(labels_list).to(device)

 ===Labels=== 
label1:[[64790, 64792, 790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211, 54545, 56267, 55483, 13, 13, 55437, 31211], 30910, 59000, 59000, 59000, 2]
label2:[[64790, 64792, 790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211, 54545, 56267, 55483, 13, 13, 55437, 31211], [30910, 59000, 59000, 59000], [2]]
label:[[64790, 64792, 790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211, 54545, 56267, 55483, 13, 13, 55437, 31211, 30910, 59000, 59000, 59000, 2]]
labels:[[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 30910, 59000, 59000, 59000, 2]]
labels.len:24


In [None]:
# 试验：input_ids和label的维度（NON-USE）
print(" ===Size for input_ids and labels=== ")
print(inputs["input_ids"].size())
print(label.size())

In [5]:
input_ids_list=[query_ids.tolist()[0]+answer_ids+[2]]
print(" ===input_ids_list=== ")
print(input_ids_list)
print(f"len:{len(input_ids_list[0])}")
input_ids = torch.tensor(input_ids_list).to(device)

print(" ===attention_mask_list=== ")
attention_mask_list = inputs['attention_mask'].tolist()
attention_mask_list = [attention_mask_list[0]+[0]*(len(answer_ids)+1)]
print(attention_mask_list)
print(f"len:{len(attention_mask_list[0])}")
attention_mask = torch.tensor(attention_mask_list).to(device)

print(" ===position_ids_list=== ")
position_ids_len = len(input_ids_list[0])
position_ids1 = torch.arange(position_ids_len, dtype=torch.long).unsqueeze(0)

position_ids2_answer = torch.arange(1, len(answer_ids)+2, dtype=torch.long).unsqueeze(0)
print(position_ids2_answer)
position_ids2_padding = torch.zeros(len(query_ids.tolist()[0]), dtype=torch.long).unsqueeze(0)
position_ids2 = torch.cat((position_ids2_padding, position_ids2_answer), dim=1)

# position_ids = torch.cat((position_ids1, position_ids2), dim=0)
# print(position_ids.tolist())
# print(len(position_ids[0].tolist()))
# print(len(position_ids[1].tolist()))
# position_ids = position_ids.to(device)
position_ids = position_ids1
print(position_ids.tolist())
print(len(position_ids[0].tolist()))
position_ids = position_ids.to(device)

 ===input_ids_list=== 
[[64790, 64792, 790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211, 54545, 56267, 55483, 13, 13, 55437, 31211, 30910, 59000, 59000, 59000, 2]]
len:24
 ===attention_mask_list=== 
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]
len:24
 ===position_ids_list=== 
tensor([[1, 2, 3, 4, 5]])
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]
24


In [11]:
# 检查device（NON-USE）
print(f"model:{model.device}")
print(f"input_ids:{input_ids.device}")
print(f"attention_mask:{attention_mask.device}")
print(f"position_ids:{position_ids.device}")
print(f"labels:{labels.device}")

model:cuda:0
input_ids:cuda:0
attention_mask:cuda:0
position_ids:cuda:0
labels:cuda:0


In [6]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
optimizer.zero_grad()
model = model.train()

In [12]:
start_time = time.time()
train_output = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    position_ids=position_ids,
    labels=labels,
)
loss = train_output.loss
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
optimizer.zero_grad()
end_time = time.time()
print(" ===First step of training=== ")
print(f"time used:{end_time-start_time}")
print(f"loss:{loss}")

model_output_dir = "chatglm2-6b"
print(" ===Save the model=== ")
print(f"the output dir is {model_output_dir}")
model.save_pretrained(model_output_dir)
# LoRA will automatically save

print(" ===Save the optimizer=== ")
import os
optimizer_state_path = os.path.join(model_output_dir, "optimizer.model")
print(f"the output path is {optimizer_state_path}")
torch.save(optimizer.state_dict(), optimizer_state_path)

 ===First step of training=== 
time used:1.208711862564087
loss:1.7470703125
 ===Save the model=== 
the output dir is chatglm2-6b
 ===Save the optimizer=== 
the output path is chatglm2-6b/optimizer.model




In [7]:
# 检查输入中是否有nan
print("Checking input data for NaNs...")
print(torch.isnan(input_ids).any())
print(torch.isnan(attention_mask).any())
print(torch.isnan(position_ids).any())
print(torch.isnan(labels).any())

Checking input data for NaNs...
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
tensor(False, device='cuda:0')


In [None]:
# 检查模型结构（NON-USE）
print(model)

In [8]:
# 测试：到底为什么有nan（NON-USE）
start_time = time.time()
train_output = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    position_ids=position_ids,
    labels=labels,
)
loss = train_output.loss
loss.backward()

In [11]:
# 测试：到底为什么有nan（NON-USE）
print(loss)

params = list(model.parameters())
import random
random_param = random.choice(params)
print(f'Random parameter data type: {random_param.dtype}')

tensor(1.7471, device='cuda:0', dtype=torch.float16, grad_fn=<ToCopyBackward0>)
Random parameter data type: torch.float16


In [12]:
# 测试：到底为什么有nan（NON-USE）
def print_parameter_updates(model):
    for name, param in model.named_parameters():
        if param.grad is not None:
            # 打印参数的梯度
            print(f"Gradient of parameter {name}: {param.grad}")
            # 打印参数更新量
            update = param.grad * optimizer.param_groups[0]['lr']
            print(f"Update for parameter {name}: {update}")

print_parameter_updates(model)

Gradient of parameter base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_A.default.weight: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float16)
Update for parameter base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_A.default.weight: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float16)
Gradient of parameter base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_B.default.weight: tensor([[ 3.2783e-05, -4.7970e-04,  9.1505e-04,  ..

In [None]:
# 测试：到底为什么有nan（NON-USE）
optimizer.step()

In [13]:
# 检查nan并找出哪个参数（NON-USE）
import torch

def check_for_nan(model):
    nan_found = False
    for name, param in model.named_parameters():
        if torch.isnan(param).any():
            print(f"NaN detected in parameter: {name}")
            nan_found = True
    if not nan_found:
        print("No NaN values found in model parameters.")
    return nan_found

check_for_nan(model)


No NaN values found in model parameters.


False

In [None]:
step_count = 1
total_epoch = 50
while (step_count <= total_epoch):
    print(f" ===Training Epoch {step_count}=== ")
    start_time = time.time()
    train_output = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        labels=labels,
    )
    loss = train_output.loss
    print(f"loss:{loss}")
    if torch.isnan(loss):
        print("nan detected")
        break
        
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    end_time = time.time()
    print(f"time used:{end_time-start_time}")

    model_output_dir = "LoRA_pretrained"
    print(" ===Save the model=== ")
    print(f"the output dir is {model_output_dir}")
    model.save_pretrained(model_output_dir)
    # LoRA will automatically save
    
    print(" ===Save the optimizer=== ")
    import os
    optimizer_state_path = os.path.join(model_output_dir, "optimizer.model")
    print(f"the output path is {optimizer_state_path}")
    torch.save(optimizer.state_dict(), optimizer_state_path)
    
    step_count += 1