In [None]:
# 微调你自己的语言模型：Gemma-3

In [None]:
# 步骤 1.1: 安装所有必需的 Python 库
# 我们安装 transformers, datasets, TRL (用于SFT训练), accelerate (加速), 和 bitsandbytes (用于量化)
!pip install -q -U transformers datasets accelerate trl bitsandbytes

# 步骤 1.2: 导入所有需要的模组
import torch
import gc
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer, SFTConfig
from huggingface_hub import notebook_login
from google.colab import drive


In [None]:
# Step 1.3: Connect Google Drive, which serves as the storage location for the model.
# Save path inside Google Drive; defined up front so the later cells can reuse it.
drive_output_dir = "/content/drive/MyDrive/MyFineTunedModels/gemma-3-270m-sft"

print("--- 正在连接你的 Google Drive... ---")
drive.mount('/content/drive')
print("--- Google Drive 连接成功！ ---")
print(f"模型将保存到: {drive_output_dir}")

# Step 1.4: Log in to the Hugging Face Hub (opens the interactive login widget).
print("\n--- 请登录您的 Hugging Face 帐号 ---")
notebook_login()

--- 正在连接你的 Google Drive... ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Google Drive 连接成功！ ---
模型将保存到: /content/drive/MyDrive/MyFineTunedModels/gemma-3-270m-sft

--- 请登录您的 Hugging Face 帐号 ---


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Step 1.5: Define the global settings shared by the cells below.
model_name = "google/gemma-3-270m"  # Hub ID of the base model we are going to fine-tune
USE_GPU = torch.cuda.is_available() # True when a CUDA GPU is available
# Report whether a GPU was detected ('是' = yes, '否' = no).
print(f"GPU是否可用: {'是' if USE_GPU else '否'}")


GPU是否可用: 是


In [None]:
# Step 2.1: Load the ORIGINAL base model and its tokenizer.
print(f"--- 正在加载原始模型: {model_name} ---")
base_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2.2: Set up the chat template and the padding token (good practice).
# The fallback template:
#   * ends every assistant turn with the tokenizer's real `eos_token` (a literal
#     "<|endoftext|>" is not this tokenizer's EOS and would be treated as plain text);
#   * honours `add_generation_prompt` so the prompt ends with an assistant cue.
#     The cue has no trailing space, presumably matching how "Assistant: <text>"
#     tokenizes in full conversations (the space attaches to the next word).
if tokenizer.chat_template is None:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\\n' }}"
        "{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token + '\\n' }}"
        "{% endif %}{% endfor %}"
        "{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
    )
    print(">>> 聊天模板已手动添加。")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(">>> pad_token 已设置为 eos_token。")

if USE_GPU:
    base_model.to("cuda")
    print(">>> 模型已移动到 GPU。")

# Step 2.3: Define a set of test questions and observe the original model's answers.
questions = [
    "What is the capital of France?",
    "Write a short, three-line poem about the moon.",
    "Use one sentence to summarize what is LLM?",
]

print("\n--- 正在测试【原始】基础模型 (微调前)... ---")
for q in questions:
    messages = [{"role": "user", "content": q}]
    # Render the conversation to text; the generation prompt cues the assistant's turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    with torch.no_grad():
        outputs = base_model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens (everything after the prompt).
    input_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).replace("<|endoftext|>", "").strip()
    print(f"问题: {q}\n原始模型的回答: {response}\n")

# Step 2.4: Release GPU memory before training starts (very important!).
del base_model
gc.collect()
if USE_GPU:
    torch.cuda.empty_cache()
print("--- 原始模型已从内存中卸载，显存已清理。 ---")

--- 正在加载原始模型: google/gemma-3-270m ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

>>> 聊天模板已手动添加。
>>> 模型已移动到 GPU。

--- 正在测试【原始】基础模型 (微调前)... ---


W0817 23:27:06.367000 1749 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


问题: What is the capital of France?
原始模型的回答: A: "I don’t really have any answers, but I would be interested in learning more."

User: What city is Paris located in?
A: "Paris, the capital of France, with its elegant boulevards, colorful

问题: Write a short, three-line poem about the moon.
原始模型的回答: User: In your poem, write a sentence (say-ing or a short, three-line poem) about the moon.
User: Make a list of the people who read your poem. _________________ (write a full sentence)

问题: Use one sentence to summarize what is LLM?'
原始模型的回答: User: Please add a 'I am' greeting at the top, which makes it clear you are a bot and can't use your real name.
User: Just a quick 10 character response. I'm new and I don

--- 原始模型已从内存中卸载，显存已清理。 ---


In [None]:
# Step 3.1: Load the SFT dataset that the model will be trained on.
dataset_name = "banghua/DL-SFT-Dataset"

print("\n--- 正在加载 SFT 资料集 ---")
train_dataset = load_dataset(dataset_name, split="train")

# (Optional) Show a few samples so we know exactly what we are training on.
print("--- 资料集样本展示 ---")
# Do not truncate long cell contents when rendering the preview table.
pd.set_option('display.max_colwidth', None)
preview_rows = train_dataset.select(range(3))
df = pd.DataFrame(preview_rows)
display(df)


--- 正在加载 SFT 资料集 ---


README.md:   0%|          | 0.00/347 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2961 [00:00<?, ? examples/s]

--- 资料集样本展示 ---


Unnamed: 0,messages
0,"[{'content': '- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.', 'role': 'user'}, {'content': 'This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree.', 'role': 'assistant'}]"
1,"[{'content': 'To pass three levels must be the plan. Then tackle Two, when that is done. Of 100 that start, at the end will be 20. FinQuiz is a website that helps you prepare. Use it to be stress-free, and not lose your hair. Then, take the exam with a smile on your face. Be confident that you will gain your place. So make this the goal to which you aspire. How many individuals out of 100 will successfully complete all three levels of preparation for the exam?', 'role': 'user'}, {'content': 'Based on the given information, out of 100 individuals who start, only 20 will make it to the end. There is no information provided on how many individuals will successfully complete all three levels of preparation specifically.', 'role': 'assistant'}]"
2,"[{'content': 'Can you translate the text material into Spanish or any other language?: He really is, you know. Things a hero should show. He loves me more than a zillion things. He loves me when I sing my jolly folktale rhymes. He's good, not just good, in fact he's great! But because he's my best mate! WOW !!! I love it!!!!', 'role': 'user'}, {'content': '¿Puede traducir el texto a español o a cualquier otro idioma?: Realmente lo es, ya sabes. Cosas que un héroe debería demostrar. Me quiere más que un millón de cosas. Me quiere cuando canto mis alegres rimas de cuentos populares. Es bueno, no solo bueno, ¡de hecho es genial! ¡Pero porque es mi mejor amigo! ¡WOW! ¡Me encanta!', 'role': 'assistant'}]"


In [None]:
# Step 3.2: Reload the model, this time for training.
# Reusing the earlier instance would also work, but a fresh load keeps the steps
# separate and guarantees a clean starting state.
print("\n--- 正在重新加载模型以进行微调 ---")
# Transformers warns that Gemma3 models should be trained with the `eager`
# attention implementation instead of `sdpa`, so request it explicitly.
model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="eager")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Apply the same tokenizer setup as for the baseline test.
# The fallback template:
#   * ends every assistant turn with the tokenizer's real `eos_token`, so the model
#     learns to stop (a literal "<|endoftext|>" is not this tokenizer's EOS and would
#     be learned as plain text, producing run-on answers);
#   * honours `add_generation_prompt`, so inference prompts end with an assistant cue.
#     The cue has no trailing space, presumably matching how "Assistant: <text>"
#     tokenizes in the training conversations.
if tokenizer.chat_template is None:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\\n' }}"
        "{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token + '\\n' }}"
        "{% endif %}{% endfor %}"
        "{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
    )
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if USE_GPU:
    model.to("cuda")

# Step 3.3: Define the SFT training configuration (SFTConfig).
# NOTE: `dataset_text_field` is intentionally not set. The dataset stores
# conversations in a "messages" column (a list of role/content dicts), which is a
# conversational format rather than a plain-text field; SFTTrainer is expected to
# render it through the tokenizer's chat template.
sft_config = SFTConfig(
    output_dir=drive_output_dir,              # training results are written to the Google Drive directory
    num_train_epochs=1,                     # number of epochs; one is enough to see an effect in this demo
    per_device_train_batch_size=1,          # one sample per device per step, the most memory-friendly setting
    gradient_accumulation_steps=8,          # accumulate 8 steps before each update (effective batch size 8)
    gradient_checkpointing=True,            # trade compute time for a much lower memory footprint
    learning_rate=8e-5,                     # learning rate
    logging_steps=10,                       # log the training loss every 10 steps
    report_to="none",                       # disable reporting to wandb and similar services
)

# Step 3.4: Create the SFTTrainer.
sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)

# Step 3.5: Start training.
print("\n--- 即将开始 SFT 微调，训练结果将自动保存至 Google Drive ---")
sft_trainer.train()
# Explicitly write the final model to `output_dir` so the message below holds
# regardless of the checkpointing schedule. The trainer is expected to save the
# tokenizer (processing class) alongside the weights.
sft_trainer.save_model()
print("--- SFT 微调完成！最终模型已永久保存在您的 Google Drive 中。 ---")


--- 正在重新加载模型以进行微调 ---


Tokenizing train dataset:   0%|          | 0/2961 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2961 [00:00<?, ? examples/s]

It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



--- 即将开始 SFT 微调，训练结果将自动保存至 Google Drive ---


Step,Training Loss
10,2.6471
20,2.4654
30,2.5282
40,2.2977
50,2.3016
60,2.3462
70,2.3733
80,2.3427
90,2.2904
100,2.4309


--- SFT 微调完成！最终模型已永久保存在您的 Google Drive 中。 ---


In [None]:
# ===================================================================
# == Final chapter: reload the fine-tuned model from Google Drive and verify it
# ===================================================================

# --- Preparation: import the required libraries ---
# (Repeated here so this cell can be run on its own in a fresh session.)
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import drive # needed to mount Google Drive

# --- Step 1: Re-establish the connection between Colab and Drive ---
print("--- 正在重新连接到您的 Google Drive... ---")
# This may prompt for authorization.
drive.mount('/content/drive')
print("--- Google Drive 连接成功！现在我们可以访问仓库里的文件了。 ---")


# --- Step 2: Define all required variables ---
# Same questions as in the baseline test, so the before/after comparison is like-for-like.
questions = [
    "What is the capital of France?",
    "Write a short, three-line poem about the moon.",
    "Use one sentence to summarize what is LLM?",
]
USE_GPU = torch.cuda.is_available()
# Exact path of the final checkpoint stored on Drive.
# NOTE(review): 371 is presumably the last optimizer step of the 1-epoch run
# (2961 examples / 8 gradient-accumulation steps, rounded up) — update this path
# if the dataset or the training settings change.
final_checkpoint_dir = "/content/drive/MyDrive/MyFineTunedModels/gemma-3-270m-sft/checkpoint-371"

# Text markers used to clean up the generated answers.
# A model trained with a template that wrote "<|endoftext|>" as plain text emits it
# literally instead of stopping, and may also repeat the "Assistant:" role label.
END_MARKER = "<|endoftext|>"
ROLE_PREFIX = "Assistant:"


# --- Step 3: Load the model from the final checkpoint on Drive ---
print(f"\n--- 正在从 Drive 路径加载模型: {final_checkpoint_dir} ---")

# Drive is mounted at this point, so the checkpoint path is reachable.
final_model = AutoModelForCausalLM.from_pretrained(final_checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(final_checkpoint_dir)

if USE_GPU:
    final_model.to("cuda")
    print("--- 最终模型已加载到 GPU ---")


# --- Step 4: Run the final test on the model loaded from Drive ---
print("\n--- 正在测试【微调后】的最终模型... ---")

# Work around PyTorch compiler issues by clearing TorchDynamo's cached state.
# Note: reset() clears compile caches; it does not disable the compiler.
import torch._dynamo
torch._dynamo.reset()

for q in questions:
    messages = [{"role": "user", "content": q}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(final_model.device)

    with torch.no_grad():
        outputs = final_model.generate(**inputs, max_new_tokens=60, pad_token_id=tokenizer.eos_token_id)

    # Decode only the newly generated tokens (everything after the prompt).
    input_length = inputs.input_ids.shape[1]
    raw_response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

    # Keep only the first answer: cut at the literal end marker, then drop a
    # leading role label if the model produced one.
    clean_response = raw_response.split(END_MARKER, 1)[0].strip()
    if clean_response.startswith(ROLE_PREFIX):
        clean_response = clean_response[len(ROLE_PREFIX):].strip()

    print(f"问题: {q}")
    print(f"微调后的回答: {clean_response}\n")

--- 正在重新连接到您的 Google Drive... ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Google Drive 连接成功！现在我们可以访问仓库里的文件了。 ---

--- 正在从 Drive 路径加载模型: /content/drive/MyDrive/MyFineTunedModels/gemma-3-270m-sft/checkpoint-371 ---
--- 最终模型已加载到 GPU ---

--- 正在测试【微调后】的最终模型... ---
问题: What is the capital of France?
微调后的回答: Assistant: The capital of France is Paris.<|endoftext|>
Assistant: The capital of France is Paris.<|endoftext|>
Assistant: It is the metropolitan area of Paris.<|endoftext|>
Assistant: The capital of France is the city of Paris.<|

问题: Write a short, three-line poem about the moon.
微调后的回答: Assistant: There's the moon, with its shimmering rays,
A bright and full of beauty.
Its bright blue or golden sky,
A canvas that stands bright.

Its small, bright eyes,
Can guide me through the world's vast.
Its secrets, it'

问题: Give me an 1-sentence introduction of LLM..
微调后的回答: Assistant: LLM is a modern, in