In [1]:
import torch
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Load Model

In [2]:
# Location of the base checkpoint on local storage.
model_path = "/root/autodl-fs/Llama-3-12B"
# device_map="cuda" already places the weights on the GPU while loading, so the
# extra model.cuda() transfer that used to follow this call was redundant.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda", torch_dtype=torch.bfloat16)

# model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled

# Display the dtype the weights were loaded in.
model.dtype

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

torch.bfloat16

# Load Dataset

In [3]:
# Read the conversation/response pairs and keep only the first 20,000 rows.
dataset_file = "pairs/train_pairs.json"
df_train = pd.read_json(dataset_file).iloc[:20000]

# Wrap the frame in a Hugging Face Dataset for tokenization.
ds_train = Dataset.from_pandas(df_train)

# Peek at the first three records.
ds_train[:3]

{'ID': [1, 2, 3],
 'conv': ['hello',
  'hello there, I have not seen this movie so im going to take a minute to look it over :)',
  'Alright that is fine. What is the movie?'],
 'response': ['hello there, I have not seen this movie so im going to take a minute to look it over :)',
  'Alright that is fine. What is the movie?',
  'The movie is The Social Network']}

# Processing the Dataset

In [4]:
# Load the slow (non-fast) tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Show the padding token together with the pad and EOS token ids.
tokenizer.pad_token, tokenizer.pad_token_id, tokenizer.eos_token_id

('<|end_of_text|>', 128001, 128001)

In [6]:
def process_func(example, max_length=66):
    """Tokenize one conversation pair into causal-LM training features.

    The prompt is ``example['conv']`` wrapped in the chat header markup spelled
    out in the f-string below; the target is ``example['response']`` followed by
    ``<|eot_id|>`` and one trailing ``tokenizer.pad_token_id``. Prompt positions
    are labelled -100 so that only the response part carries real labels.

    Args:
        example: Mapping with the string fields ``'conv'`` and ``'response'``.
        max_length: Maximum number of tokens to keep. Longer sequences are cut
            from the right; ``None`` disables truncation. The default of 66 is
            the value that used to be hard-coded. The Llama tokenizer can split
            one Chinese character into several tokens, so leave some slack here
            to keep samples intact.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` lists of
        equal length.

    Note:
        Truncation is applied after concatenation, so a prompt that alone
        reaches ``max_length`` leaves a sample whose labels are all -100.
    """
    prompt_text = (
        f"<|start_header_id|>user<|end_header_id|>\n\n{example['conv']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    # add_special_tokens=False: do not prepend special tokens to either segment.
    instruction = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(f"{example['response']}<|eot_id|>", add_special_tokens=False)

    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    # The trailing token must be attended to as well, hence the extra 1.
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]

    # Slicing is a no-op for sequences that already fit (and for max_length=None).
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length]
    }

In [7]:
# Tokenize every record with process_func and drop the original columns so that
# only the features it returns remain.
train_tokenized_id = ds_train.map(process_func, remove_columns=ds_train.column_names)
train_tokenized_id

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [8]:
# Decode the first sample's input ids back to text to check the prompt layout.
print(tokenizer.decode(train_tokenized_id[0]['input_ids']))

<|start_header_id|>user<|end_header_id|>

hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>

hello there, I have not seen this movie so im going to take a minute to look it over :)<|eot_id|><|end_of_text|>


In [9]:
# Decode the second sample's labels after dropping the -100 entries.
tokenizer.decode(list(filter(lambda x: x != -100, train_tokenized_id[1]["labels"])))

'Alright that is fine. What is the movie?<|eot_id|><|end_of_text|>'

# Config of LoRA

In [10]:
# NOTE(review): newer PEFT releases replace prepare_model_for_int8_training with
# prepare_model_for_kbit_training - confirm against the installed version.
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training,TaskType
import wandb
import contextlib
import io
import sys

# LoRA hyperparameters, kept in a plain dict so the same values can be logged
# as run configuration later.
config1 = {"r": 8,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    "lora_dropout": 0.05,
    "inference_mode":False,  # train mode
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM
         }


# define LoRA Config (every key of config1 is a LoraConfig argument)
lora_config = LoraConfig(**config1)


# # prepare int8 model for training
# model = prepare_model_for_int8_training(model)

# add LoRA adapter
model = get_peft_model(model, lora_config)

# print_trainable_parameters() only prints its summary, so capture stdout to
# obtain it as a string. redirect_stdout restores sys.stdout even if the call
# raises, and the StringIO buffer is closed when the block exits.
with io.StringIO() as output, contextlib.redirect_stdout(output):
    model.print_trainable_parameters()
    output_str = output.getvalue()

# Echo the captured summary.
print("1:",output_str)

# wandb.init(
#     project = "LLM_LoRA_FineTuning",
    
#     config = {
#         "config": config1,
#         "Dataset": "qa_Tools_and_Home_Improvement",
#         "Tuning-method": "LoRA",
#         "Trainable params": str(output_str)
#     }
# )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


1: trainable params: 31,457,280 || all params: 11,551,510,528 || trainable%: 0.27232178790600503



# Config of Training Params

In [11]:
from transformers import DataCollatorForSeq2Seq

# Padded label positions must not contribute to the loss. In Transformers,
# labels that should be ignored are conventionally set to -100, and the loss
# skips the predictions at those positions.
label_pad_token_id = -100

# Collator that pads each batch; labels are padded with the ignore value above
# and lengths are padded to a multiple of 8.
collator_options = {
    "model": model,
    "label_pad_token_id": label_pad_token_id,
    "pad_to_multiple_of": 8,
}
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)

In [12]:
from transformers import TrainerCallback, TrainerState, TrainerControl

class TrainLogCallback(TrainerCallback):
    """Trainer callback that records training losses and echoes them to stdout and wandb."""

    def __init__(self):
        # Every 'loss' value seen in a log event, in arrival order.
        self.train_losses = []

    def on_log(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Record the newest logged loss and report it on logging-step boundaries.

        Log entries without a 'loss' key are ignored. The loss is always
        appended to ``self.train_losses``; it is printed and sent to wandb only
        when ``state.global_step`` is a multiple of ``args.logging_steps``.
        """
        latest_entry = state.log_history[-1]
        if 'loss' not in latest_entry:
            return

        loss_value = latest_entry['loss']
        self.train_losses.append(loss_value)

        if state.global_step % args.logging_steps != 0:
            return

        print(f"Logging step {state.global_step} at epoch {state.epoch}, train_loss: {loss_value}")
        wandb.log({"logging_step": state.global_step, "train_loss": loss_value})

In [13]:
from transformers import Seq2SeqTrainer, TrainingArguments, Trainer


output_dir="llama-3"

# Single source of truth for the training hyperparameters: this dict is passed
# to TrainingArguments unchanged AND logged to wandb, so the recorded config
# cannot drift from what the run actually used. Every key is a
# TrainingArguments argument.
config2 = {
    "output_dir": output_dir,
    "auto_find_batch_size": True,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-5,
    "num_train_epochs":5,
    "logging_dir":f"{output_dir}/logs",
    "logging_strategy":"steps",
    "logging_steps":50,
    "save_strategy": "no",
    "report_to": "tensorboard"
}

# Initialise the custom logging callback.
log_step_callback = TrainLogCallback()

# Define training args. Previously the keys were forwarded one by one with
# per_device_train_batch_size left out, so the batch size logged below was not
# the one handed to the Trainer.
training_args = TrainingArguments(**config2)

# Start the wandb run and record both hyperparameter dicts plus the captured
# trainable-parameter summary.
wandb.init(
    project = "Llama3_Conversation",

    config = {
        "config1": config1,
        "config2": config2,
        "Dataset": "CMU_DoG",
        "Tuning-method": "LoRA",
        "Trainable params": str(output_str)
    }
)

# api = wandb.Api()

# run = api.run("llm_learner/LLM_LoRA_FineTuning/623waqcp")
# run.config["config2"] = config2
# run.update()

# Create Trainer instance
# Gather the pieces built in the previous cells and hand them to the Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_id,
    callbacks=[log_step_callback],
)
trainer = Trainer(**trainer_kwargs)

# Turned off to silence the warnings during training. Please re-enable for inference!
model.config.use_cache = False

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mzuolihanstudy[0m ([33mllm_learner[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112723996241887, max=1.0…

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


# Train

In [14]:
try:
    trainer.train()
except Exception as e:
    print(f"An error occurred: {e}")
    import traceback
    traceback.print_exc()
    # Re-raise so that a "Run All" stops here; otherwise the following cells
    # would save an adapter from a run that did not finish.
    raise

Step,Training Loss
50,6.6751
100,5.3925
150,4.7017
200,4.6044
250,4.338
300,3.9936
350,4.0023
400,3.8209
450,3.9288
500,3.9274


Logging step 50 at epoch 0.01, train_loss: 6.6751
Logging step 100 at epoch 0.02, train_loss: 5.3925
Logging step 150 at epoch 0.03, train_loss: 4.7017
Logging step 200 at epoch 0.04, train_loss: 4.6044
Logging step 250 at epoch 0.05, train_loss: 4.338
Logging step 300 at epoch 0.06, train_loss: 3.9936
Logging step 350 at epoch 0.07, train_loss: 4.0023
Logging step 400 at epoch 0.08, train_loss: 3.8209
Logging step 450 at epoch 0.09, train_loss: 3.9288
Logging step 500 at epoch 0.1, train_loss: 3.9274
Logging step 550 at epoch 0.11, train_loss: 3.9422
Logging step 600 at epoch 0.12, train_loss: 3.7858
Logging step 650 at epoch 0.13, train_loss: 3.7781
Logging step 700 at epoch 0.14, train_loss: 3.6423
Logging step 750 at epoch 0.15, train_loss: 3.7174
Logging step 800 at epoch 0.16, train_loss: 3.6538
Logging step 850 at epoch 0.17, train_loss: 3.6016
Logging step 900 at epoch 0.18, train_loss: 3.8522
Logging step 950 at epoch 0.19, train_loss: 3.7766
Logging step 1000 at epoch 0.2, tr

# Save model

In [16]:
# Write the trained model held by the trainer and the tokenizer files into the
# same output directory.
peft_model_id = "Llama3_LoRA_1"
trained_model = trainer.model
trained_model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)



('Llama3_LoRA_1/tokenizer_config.json',
 'Llama3_LoRA_1/special_tokens_map.json',
 'Llama3_LoRA_1/tokenizer.json')

# <center><font color=red>Load the LoRA model and run a single-turn chat test</font></center>

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory holding the saved LoRA adapter and its PEFT config.
peft_model_id = "Llama3_LoRA_1"

# Load peft config for pre-trained checkpoint etc.
config = PeftConfig.from_pretrained(peft_model_id)

# Fall back to the local base checkpoint only when the adapter config does not
# record one. (The previous `if ...: ;` form used a bare semicolon as the
# branch body, which is not valid Python.)
if not config.base_model_name_or_path:
    config.base_model_name_or_path = "/root/autodl-fs/Llama-3-12B"

# load base LLM model and tokenizer
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, torch_dtype=torch.bfloat16,  device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model on top of the base model and switch to evaluation mode
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()

print("Peft model loaded")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Peft model loaded


In [6]:
# Fall back to the EOS token id for padding when the tokenizer defines no pad token id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Copy the tokenizer's pad token id onto the model config; the generation cell
# below reads it from model.config.pad_token_id.
model.config.pad_token_id = tokenizer.pad_token_id

In [7]:
from datasets import Dataset
import pandas as pd

# Read the held-out conversation/response pairs; the full file is used.
dataset_file = "pairs/test_pairs.json"
df_test = pd.read_json(dataset_file)

# Wrap the frame in a Hugging Face Dataset.
ds_test = Dataset.from_pandas(df_test)

# Peek at the first record.
ds_test[0]

{'ID': 1,
 'conv': 'Hey there hows it going! You like catch me if you can as much as i do?',
 'response': 'Opps I meant means girls!'}

In [26]:
from random import randrange

# Pick a random test prompt and wrap it as a single user turn.
prompt = ds_test[randrange(len(ds_test))]['conv']
print("prompt:",prompt)
messages = [
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Send the inputs to wherever the model's weights actually live instead of
# guessing a device independently of the model.
device = next(model.parameters()).device

# A single prompt needs no padding (padding='max_length' without a max_length
# has no sensible target). add_special_tokens=False because the templated text
# already contains the special tokens the chat template emits.
model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(device)

# Look the end-of-turn token up directly: encode() may prepend special tokens,
# in which case element [0] is not the id of '<|eot_id|>'.
eot_token_id = tokenizer.convert_tokens_to_ids('<|eot_id|>')

generated_ids = model.generate(
    input_ids=model_inputs.input_ids,
    # Pass the mask explicitly so generate() does not have to infer it.
    attention_mask=model_inputs.attention_mask,
    max_new_tokens=60,
    eos_token_id=eot_token_id,
    pad_token_id=model.config.pad_token_id,
    do_sample = True,
    temperature=0.9,
    top_k=100,
    top_p=0.92,
    repetition_penalty=1.2,
)
# Strip the prompt tokens so only the newly generated continuation is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("response:\n",response)

# generated_ids = model.generate(
#     model_inputs.input_ids,
#     max_new_tokens=512,
#     pad_token_id = model.config.pad_token_id,
#     eos_token_id=tokenizer.encode('<|eot_id|>')[0]
# )
# generated_ids = [
#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
# ]

# response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# print(response)

prompt: The main character, named Chrissie, was at the party. She went skinny dipping in the ocean and was pulled under the water. The next day, her remains washed ashore. 
response:
 Her best friend Amy started to wonder what happened.

When she found out that Chrissie's body had been discovered onshore, it made things even more complicated.canfriends

It is determined by the police officers that there were no witnesses who saw anything related to Chrissie or any other person
