In [None]:
# You will learn how to prepare data, train the model, run it, and save it (e.g. for llama.cpp).
# Reference docs: https://docs.unsloth.ai/tutorials/how-to-finetune-llama-3-and-export-to-ollama

In [ ]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers==0.0.27" trl peft accelerate bitsandbytes

我们支持 16 位 LoRA 或 4 位 QLoRA。两者都快 2 倍。
max_seq_length 可以设置为任意值，因为我们通过 kaiokendev 的方法进行自动 RoPE 缩放。

In [ ]:
from unsloth import FastLanguageModel
import torch

# Loading options. These three names are read again by later cells
# (the trainer setup and the optional adapter-reload cell).
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: the from_pretrained call below names its model directly.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    # We also uploaded 4bit for 405b!
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v3 2x faster!
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi-3.5 2x faster!
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2x faster!
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model and its tokenizer with the options chosen above.
# Pass token="hf_..." as an extra argument when using gated models
# like meta-llama/Llama-2-7b-hf.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3.5-mini-instruct",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

我们现在添加 LoRA 适配器，因此只需要更新全部参数中的 1% 到 10%！

In [ ]:
# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Both settings support any value, but 0 / "none" are the optimized ones.
    lora_dropout=0,
    bias="none",
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # Use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    # Rank stabilized LoRA and LoftQ are supported as well; both are off here.
    use_rslora=False,
    loftq_config=None,
)

# Data Prep 数据准备

我们现在使用 Phi-3 格式进行对话风格微调。我们使用 ShareGPT 风格的 Open Assistant 对话。Phi-3 呈现多轮对话，如下所示：
<|user|>
Hi!<|end|>
<|assistant|>
Hello! How are you?<|end|>
<|user|>
I'm doing great! And you?<|end|>

我们使用我们的get_chat_template函数来获取正确的聊天模板。
We support zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old and our own optimized unsloth template.

注意：ShareGPT 使用 {"from": "human", "value": "Hi"} 而不是 {"role": "user", "content": "Hi"}，
所以我们使用 mapping 来映射它。

In [ ]:
from unsloth.chat_templates import get_chat_template

# Attach the Phi-3 chat template to the tokenizer.
# Supported names: zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
tokenizer = get_chat_template(
    tokenizer,
    chat_template="phi-3",
    # ShareGPT style: translate its field and speaker names to role/content ones.
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
)


def formatting_prompts_func(examples):
    """Render every conversation of a batch with the tokenizer's chat template.

    Args:
        examples: Batch mapping whose "conversations" entry holds one
            conversation (a sequence of messages) per row.

    Returns:
        A dict with the single key "text", mapped to the rendered
        conversations in the same order as the input.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # Training text: request untokenized output and no generation prompt.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

from datasets import load_dataset

# Fetch the train split and add a "text" column holding each conversation
# rendered by formatting_prompts_func (applied in batches).
dataset = load_dataset(
    "philschmid/guanaco-sharegpt-style",
    split="train",
).map(formatting_prompts_func, batched=True)

让我们通过打印索引为 5 的样本（即第 6 个元素）来了解 Phi-3 格式的工作原理

In [ ]:
# Raw conversation messages of the sample at index 5 (shown as the cell output).
dataset[5]["conversations"]

In [ ]:
# The same sample after formatting_prompts_func rendered it into the "text" column.
print(dataset[5]["text"])

如果您想制作自己的聊天模板，这也是可能的！您必须使用 Jinja 模板语法。我们提供了我们自己的精简版的 Unsloth 模板，
我们发现该模板效率更高，并利用了 ChatML、Zephyr 和 Alpaca 样式。

In [ ]:
# Custom Jinja chat template, built from adjacent string literals.
# It emits the BOS token and a fixed system line, then one ">>> User: ..." or
# ">>> Assistant: ..." line per message (assistant lines end with the EOS
# token), and finally an open ">>> Assistant: " when a generation prompt is
# requested.
unsloth_template = (
    "{{ bos_token }}"
    "{{ 'You are a helpful assistant to the user\n' }}"
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ '>>> User: ' + message['content'] + '\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '>>> Assistant: ' }}"
    "{% endif %}"
)
# NOTE(review): the literal string "eos_token" is passed as the EOS entry -
# presumably a placeholder that get_chat_template resolves to the tokenizer's
# own EOS token; confirm against the Unsloth documentation.
unsloth_eos_token = "eos_token"

# Flip to True to use the custom template above instead of a built-in one.
if False:
    tokenizer = get_chat_template(
        tokenizer,
        # You must provide a template and EOS token
        chat_template=(unsloth_template, unsloth_eos_token),
        # ShareGPT style
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },
        # Maps <|im_end|> to </s> instead
        map_eos_token=True,
    )

# Train the model 训练模型

现在让我们使用 Huggingface TRL 的 SFTTrainer！更多文档请见：TRL SFT 文档。我们执行 60 个步骤来加快速度，
但您可以设置 num_train_epochs=1 进行完整运行，并将 max_steps 设置为 None 以取消步数限制。我们还支持 TRL 的 DPOTrainer！

In [ ]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # Data
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    # Can make training 5x faster for short sequences.
    packing=False,
    args=TrainingArguments(
        output_dir="outputs",
        seed=3407,
        # Schedule: a short 60-step run.
        max_steps=60,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimizer
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        # Precision: bf16 where supported, otherwise fp16.
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        logging_steps=1,
    ),
)

Show current memory stats

In [ ]:
#@title Show current memory stats
# Baseline taken before training. `start_gpu_memory` and `max_memory` are read
# again by the final-stats cell. Values are GB (bytes / 1024^3), 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [ ]:
# Run training; the returned stats (metrics['train_runtime']) are read by the final-stats cell.
trainer_stats = trainer.train()

Show final memory and time stats

In [ ]:
#@title Show final memory and time stats
# Peak reserved memory after training, in GB, and how much of it was added on
# top of the pre-training baseline (`start_gpu_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a share of the device's total memory (`max_memory`).
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Timing
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")

# Memory
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Inference 推理

让我们运行模型！由于我们使用的是 Phi-3，因此请使用 apply_chat_template 并将 add_generation_prompt设置为 True 进行推理。

In [ ]:
from unsloth.chat_templates import get_chat_template

# Re-apply the Phi-3 chat template with the ShareGPT field mapping.
# Supported names: zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
tokenizer = get_chat_template(
    tokenizer,
    chat_template="phi-3",
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

messages = [{"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"}]

# add_generation_prompt=True is required for generation.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)

# Last expression of the cell: the decoded sequences are shown as its output.
tokenizer.batch_decode(outputs)

您还可以使用 TextStreamer 进行流式推理 - 这样您就可以逐个 token 地查看生成结果，而不必等到全部生成完成！

In [ ]:
from transformers import TextStreamer

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

messages = [{"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"}]

# add_generation_prompt=True is required for generation.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to("cuda")

# Stream the generated text as it is produced; skip_prompt=True leaves the prompt out.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)

# 保存、加载微调模型
Saving, loading finetuned models

要将最终模型保存为 LoRA 适配器，请使用 Huggingface 的 push_to_hub 进行在线保存，或使用 save_pretrained 进行本地保存。

这只会保存 LoRA 适配器，而不是完整模型。要保存为 16 位或 GGUF，请向下滚动！

In [ ]:
import os

# Local saving: writes the LoRA adapters and the tokenizer to ./lora_model.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Online saving. The access token is read from the HF_TOKEN environment
# variable instead of being hard-coded in the notebook: a blank placeholder
# token makes the upload fail, and a real one would leak when the notebook is
# shared. Without a token the upload is skipped so Run All still completes.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    model.push_to_hub("coderjuzi/lora_model", token=hf_token)
    tokenizer.push_to_hub("coderjuzi/lora_model", token=hf_token)
else:
    print("HF_TOKEN is not set - skipping push_to_hub; the adapters were saved locally to 'lora_model'.")

现在，如果您想加载我们刚刚保存用于推理的 LoRA 适配器，请将 False 设置为 True：

In [ ]:
# Flip to True to reload the adapters saved in "lora_model" instead of reusing
# the model and tokenizer that are already in memory.
if False:
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
        dtype=dtype,
        max_seq_length=max_seq_length,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

from transformers import TextStreamer

messages = [{"from": "human", "value": "What is a famous tall tower in Paris?"}]

# add_generation_prompt=True is required for generation.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to("cuda")

# Stream the answer as it is generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)

您还可以使用 Hugging Face 的 AutoPeftModelForCausalLM（来自 peft 库）。仅当您未安装 unsloth 时才使用此功能。
它可能会慢得无可救药，因为不支持 4 位模型下载，而 Unsloth 的推理速度快 2 倍。

In [ ]:
# Fallback loader using plain PEFT / Transformers.
# Disabled by default: I highly do NOT suggest it - use Unsloth if possible.
if False:
    from transformers import AutoTokenizer
    from peft import AutoPeftModelForCausalLM

    # "lora_model" is the directory written by the local-saving cell
    # (YOUR MODEL YOU USED FOR TRAINING).
    tokenizer = AutoTokenizer.from_pretrained("lora_model")
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",
        load_in_4bit=load_in_4bit,
    )

# 为 VLLM 保存到 float16
Saving to float16 for VLLM

我们也支持直接保存到 float16。为 float16 选择 merged_16bit，为 int4 选择 merged_4bit。我们还允许使用 lora（仅保存 LoRA 适配器）作为后备。使用 push_to_hub_merged 上传到您的 Hugging Face 帐户！您可以去 https://huggingface.co/settings/tokens 获取您的个人访问令牌（token）。

In [ ]:
import os

# Hugging Face access token, read from the HF_TOKEN environment variable
# instead of being hard-coded: a blank placeholder token makes every upload
# fail, and a real one would leak when the notebook is shared.
hf_token = os.environ.get("HF_TOKEN", "").strip()

# Each `if False:` / `if True:` line below is a manual toggle for one export.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit", )
if True:
    if hf_token:
        model.push_to_hub_merged("coderjuzi/Phi-3.5-16bit-vllm", tokenizer, save_method="merged_16bit",
                                 token=hf_token)
    else:
        # Skip instead of failing so that Run All completes without credentials.
        print("HF_TOKEN is not set - skipping push_to_hub_merged for the 16bit model.")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method="merged_4bit", )
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_4bit",
                                   token=hf_token)

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method="lora", )
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method="lora",
                                   token=hf_token)

# GGUF / llama.cpp 转换

要保存到 GGUF / llama.cpp，我们现在原生支持它！我们克隆llama.cpp，并默认将其保存为 q8_0。
我们允许所有方法，例如q4_k_m。使用 save_pretrained_gguf 进行本地保存，使用 push_to_hub_gguf 上传到 HF。

一些受支持的量化方法（完整列表在我们的 Wiki 页面上）：

q8_0 - Fast conversion. High resource use, but generally acceptable.
- 快速转换。资源使用率高，但总体上可以接受。
q4_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
- 推荐。对 attention.wv 和 feed_forward.w2 张量的一半使用 Q6_K，其余使用 Q4_K。
q5_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
- 推荐。对 attention.wv 和 feed_forward.w2 张量的一半使用 Q6_K，其余使用 Q5_K。

In [ ]:
# GGUF export toggles: change a `False` to `True` to run that export.
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!

# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf("model", tokenizer)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model",  # Change hf to your username!
        tokenizer,
        quantization_method=["q4_k_m", "q8_0", "q5_k_m"],
        token="",  # Get a token at https://huggingface.co/settings/tokens
    )

现在，在 llama.cpp 或基于 UI 的系统（如 GPT4All）中使用 model-unsloth.gguf 文件或 model-unsloth-Q4_K_M.gguf 文件。您可以前往 GPT4All 官网下载并安装 GPT4All。