In [2]:
# Load the ChatGLM2-6B tokenizer and base model from a local checkpoint directory.
print(" ===Import Tokenizer=== ")
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('/ssd/xthu/ChatGLM2-6B/chatglm2-6b', trust_remote_code=True)

# The base model is kept as `model_origin`; later cells wrap it with LoRA adapters.
print(" ===Import Model=== ")
model_origin = AutoModel.from_pretrained("/ssd/xthu/ChatGLM2-6B/chatglm2-6b", trust_remote_code=True)

 ===Import Tokenizer=== 
 ===Import Model=== 


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [1]:
import torch
# Hyperparameters and paths shared by the cells below.
print(" ===Set Global Parameters=== ")
lr = 1e-6  # learning rate passed to the AdamW optimizer
lora_r=8  # LoRA rank (the `r` argument of LoraConfig)
device=torch.device("cuda")  # device the model and the batches are moved to
batch_size=2  # samples per DataLoader batch
total_epochs=500  # number of passes over the training set
model_output_dir="LoRA_pretrained"  # directory for the saved adapter and the optimizer checkpoint
gradient_accumulation_steps=2 # number of batches whose gradients are accumulated before each optimizer step

 ===Set Global Parameters=== 


In [3]:
# Look up and print the token ids of the end-of-sequence and padding tokens.
print(" ===Special Token=== ")
eos = tokenizer.get_command("<eos>")
pad = tokenizer.get_command("<pad>")
print("eos: ", eos)
print("pad: ", pad)

 ===Special Token=== 
eos:  2
pad:  0


In [4]:
# Wrap the base model with LoRA adapters (rank `lora_r`) for causal-LM fine-tuning.
print(" ===Import Peft=== ")
from peft import LoraConfig, get_peft_model, TaskType
peft_config=LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_r,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model_origin, peft_config)
model.print_trainable_parameters()
model = model.to(device)
# NOTE: device_ids is only referenced by the commented-out DataParallel line below.
device_ids = [0, 1, 2, 3]
# model = torch.nn.DataParallel(model, device_ids=device_ids)
# Cast only the LoRA weights to float32; every other parameter keeps its loaded dtype.
for name, param in model.named_parameters():
    if 'lora' in name.lower():  # match parameters whose name contains 'lora'
        param.data = param.data.float()  # cast to float32

# Spot-check the dtype of one randomly chosen parameter (it may or may not be a LoRA weight).
params = list(model.parameters())
import random
random_param = random.choice(params)
print(f'Random parameter data type: {random_param.dtype}')

 ===Import Peft=== 
trainable params: 1,949,696 || all params: 6,245,533,696 || trainable%: 0.0312
Random parameter data type: torch.float16


In [None]:
# DEBUG: helper cell for investigating LoRA NaN issues (not used)
import torch
import torch.nn as nn

# Assumes `model` has already been created by an earlier cell.
# Print the dtype of every parameter in the model.
for name, param in model.named_parameters():
    print(f"{name} - DataType: {param.dtype}")

In [6]:
# DEBUG: helper cell for investigating LoRA NaN issues (not used).
# Prints the module tree so the injected LoRA layers can be inspected.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=4608, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
          

In [7]:
print(" ===Build Dataset=== ")
from torch.utils.data import Dataset
import torch
import json
import numpy as np
import pandas as pd
import random

class QADataset(Dataset):
    def __init__(self, data_path, tokenizer, max_source_length, max_target_length) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.max_seq_length = self.max_source_length + self.max_target_length

        df = pd.read_csv(data_path, delimiter=',', encoding='gbk')
        self.datas = [
            {'index': idx, 'question': row['Question'], 'answer': row['Answer']}
            for idx, row in df.iterrows()
        ]

        print("Data loaded, size:", len(self.datas))
        if len(self.datas) > 0:
            print("A random data example:", self.datas[random.randint(0, len(self.datas)-1)])

    def preprocess(self, question, answer):
        prompt = self.tokenizer.build_prompt(question, None)

        q_ids = self.tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                      max_length=self.max_source_length)

        a_ids = self.tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                      max_length=self.max_target_length)

        q_len = len(q_ids)
        input_ids = q_ids + a_ids + [eos]
        labels = [-100] * q_len + a_ids + [eos]
        # padding after is needed because of dataloader requires same length
        pad_len = self.max_seq_length - len(input_ids)
        input_ids = input_ids + [pad] * pad_len
        labels = labels + [-100] * pad_len
        return input_ids, labels

    def __getitem__(self, index):
        item_data = self.datas[index]

        input_ids, labels = self.preprocess(item_data['question'],item_data['answer'])

        return {
            "input_ids": torch.LongTensor(np.array(input_ids)),
            "labels": torch.LongTensor(np.array(labels))
        }

    def __len__(self):
        return len(self.datas)


# Build the training set; the two numeric arguments are max_source_length and max_target_length.
train_dataset = QADataset("../角色Dataset/喵喵喵.csv", tokenizer, 100, 100)

# Sanity check: show one randomly chosen preprocessed sample and its tensor lengths.
from pprint import pprint
random_data = train_dataset[random.randint(0, len(train_dataset)-1)]
print(f"A random preprocessed data example: {random_data}")
print(f"The length of input ids is {len(random_data['input_ids'])}")
print(f"The length of labels is {len(random_data['labels'])}")

 ===Build Dataset=== 
Data loaded, size: 21
A random data example: {'index': 13, 'question': '学马叫', 'answer': '喵喵喵'}
A random preprocessed data example: {'input_ids': tensor([64790, 64792,   790, 30951,   517, 30910, 30939, 30996,    13,    13,
        54761, 31211, 39701,    13,    13, 55437, 31211, 36474, 54591, 59000,
            2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    

In [8]:
import time
from tqdm import tqdm
import sys
import os
def train_epoch(epoch, model, device, loader, optimizer, gradient_accumulation_steps):
    """Run one training epoch with gradient accumulation.

    Args:
        epoch: Epoch number, used for the progress bar and log messages.
        model: Model called as ``model(input_ids=..., labels=...)``; the result
            must expose a ``loss`` attribute.
        device: Device the batches are moved to.
        loader: Iterable of batches, each a dict with ``input_ids`` and ``labels``.
        optimizer: Optimizer stepped once per accumulation window.
        gradient_accumulation_steps: Number of batches accumulated per optimizer step.

    Raises:
        Exception: When the loss is NaN. A checkpoint flagged with ``error=True``
            is written first, using the module-level ``save_checkpoint`` and
            ``model_output_dir``.
    """
    model.train()
    # Start from clean gradients in case an earlier run was interrupted mid-window.
    optimizer.zero_grad()
    num_batches = len(loader)
    previous_time = time.time()
    for index, data in enumerate(tqdm(loader, file=sys.stdout, desc="Train Epoch: " + str(epoch))):
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        labels = data['labels'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            labels=labels,
        )
        loss = outputs.loss
        if torch.isnan(loss):
            tqdm.write(" !!!NaN encountered!!! ")
            save_checkpoint(epoch, optimizer, model, model_output_dir, True)
            raise Exception("NaN error")

        # Scale each micro-batch so the accumulated gradient is an average over
        # the window rather than a sum. A short final window is still divided by
        # the full window size.
        (loss / gradient_accumulation_steps).backward()

        # Step after every full window (batches are counted from 1 so the first
        # window is not one batch too long) and after the last batch of the epoch.
        window_complete = (index + 1) % gradient_accumulation_steps == 0
        is_last_batch = index == num_batches - 1
        if window_complete or is_last_batch:
            # Clip once, on the fully accumulated gradient, right before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()
            current_time = time.time()
            tqdm.write(f"In epoch {epoch}, the {index} times of loading, the current loss is {loss.item()}")
            tqdm.write(f"For {gradient_accumulation_steps} steps, the time used: {current_time-previous_time}")
            previous_time = current_time

def train(total_epochs, model, device, train_loader, optimizer, gradient_accumulation_steps, model_output_dir):
    """Train for ``total_epochs`` epochs, writing a checkpoint after each one.

    Each epoch is delegated to ``train_epoch``; afterwards ``save_checkpoint``
    is called with ``error=False`` so the newest adapter and optimizer state are
    stored in ``model_output_dir``.
    """
    for current_epoch in range(total_epochs):
        print(f" ===Epoch {current_epoch}=== ")
        train_epoch(
            epoch=current_epoch,
            model=model,
            device=device,
            loader=train_loader,
            optimizer=optimizer,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )
        save_checkpoint(
            epoch=current_epoch,
            optimizer=optimizer,
            model=model,
            model_output_dir=model_output_dir,
            error=False,
        )

def save_checkpoint(epoch, optimizer, model, model_output_dir, error):
    """Persist the model and the training state to ``model_output_dir``.

    The model is written via ``model.save_pretrained``. The epoch number, the
    ``error`` flag and the optimizer state dict are stored together in a file
    named ``latest_checkpoint`` inside the same directory.
    """
    print(" ===Save model and Checkpoint=== ")
    print("Save To ", model_output_dir)
    model.save_pretrained(model_output_dir)

    checkpoint_path = os.path.join(model_output_dir, "latest_checkpoint")
    training_state = dict(
        epoch=epoch,
        error=error,
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(training_state, checkpoint_path)


In [None]:
from torch.utils.data import DataLoader
# Batch the training set; samples are reshuffled and loaded in the main process (num_workers=0).
print(" ===prepare dataloader=== ")
dataloader_params = {
    "batch_size": batch_size,
    "shuffle": True,
    "num_workers": 0,
}
print("the batch size: ", batch_size)
train_loader = DataLoader(train_dataset, **dataloader_params)

# AdamW receives every model parameter, with the learning rate from the global-parameters cell.
print(" ===prepare optimizer=== ")
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

# Run the full training loop; a checkpoint is written to model_output_dir after every epoch.
print(" !!!Start Training!!! ")
train(total_epochs, model, device, train_loader, optimizer, gradient_accumulation_steps, model_output_dir)

In [35]:
# Check the model parameters for NaN values and report which ones contain them (NON-USE)
import torch

def check_for_nan(model):
    """Report which parameters of ``model`` contain NaN values.

    Prints one line per affected parameter, or a single all-clear line when
    none is affected.

    Returns:
        bool: ``True`` if at least one parameter contains a NaN, else ``False``.
    """
    affected = [
        param_name
        for param_name, tensor in model.named_parameters()
        if torch.isnan(tensor).any()
    ]
    for param_name in affected:
        print(f"NaN detected in parameter: {param_name}")
    if not affected:
        print("No NaN values found in model parameters.")
    return bool(affected)

check_for_nan(model)

No NaN values found in model parameters.


False

In [5]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftConfig, PeftModel, LoraConfig, get_peft_model, TaskType
import torch
import os

# Remember to run the global-parameters cell first (lr, device, model_output_dir).

# Remember to load the base model (`model_origin`) first.

# Re-attach the saved LoRA adapter to the base model.
# NOTE(review): PeftModel.from_pretrained is called without is_trainable, so the
# adapter is presumably loaded frozen (inference only) - confirm before using
# this cell to resume training.
lora_config=PeftConfig.from_pretrained(model_output_dir)
model = PeftModel.from_pretrained(model_origin, model_output_dir)

# Move the model to the target device BEFORE building the optimizer and
# restoring its state: load_state_dict places the restored optimizer state on
# the device each parameter is on at that moment, so restoring first and moving
# afterwards would leave the optimizer state on the old device.
model = model.to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
checkpoint = torch.load(os.path.join(model_output_dir, "latest_checkpoint"))
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Switch to evaluation mode; as the last expression, the module repr is displayed.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=4608, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
          

In [6]:
# Smoke test: ask the fine-tuned model one question with an empty conversation history.
response, history = model.chat(tokenizer, "学猫叫", history=[])
print("Chat喵喵喵：", response)

Chat喵喵喵： 喵喵喵


In [5]:
# Print the module tree of the restored model to inspect the LoRA layers.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=4608, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
          

In [6]:
# DEBUG: helper cell for investigating LoRA NaN issues (not used)
import torch
import torch.nn as nn

# Assumes `model` has already been created by an earlier cell.
# Print the dtype of every parameter in the model.
for name, param in model.named_parameters():
    print(f"{name} - DataType: {param.dtype}")

base_model.model.transformer.embedding.word_embeddings.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.input_layernorm.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.base_layer.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.base_layer.bias - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_A.default.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.self_attention.query_key_value.lora_B.default.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.self_attention.dense.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.post_attention_layernorm.weight - DataType: torch.float16
base_model.model.transformer.encoder.layers.0.mlp.dense_h_to_4h.weight - DataType: torch.float16
base_model.model.transform

In [None]:
!ls -l 