In [None]:
# 确保在Colab中运行此脚本

# 1. 安装必要的库
!pip install transformers datasets accelerate bitsandbytes

# 2. 导入所需的库
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# 3. Authenticate with Hugging Face.
# google/gemma-2b is a gated repo: a placeholder token is rejected by the Hub
# with a 401, and the account behind the token must have been granted access on
# the model page. Reuse a token that is already in the environment, otherwise
# prompt for it so the secret never appears in the saved cell source.
from getpass import getpass

if not os.environ.get("HUGGINGFACE_TOKEN"):
    os.environ["HUGGINGFACE_TOKEN"] = getpass("Hugging Face access token: ")
hf_token = os.environ["HUGGINGFACE_TOKEN"]

# 4. Load the pretrained model and tokenizer
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

# 5. Load the dataset
# Expects the JSON file to have been uploaded to the Colab runtime at this path.
dataset = load_dataset('json', data_files='/content/alpaca_chinese_part_0.json')

# 6. Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of instruction/input/output records as single texts.

    Args:
        examples: Batched mapping with "instruction", "input" and "output"
            keys, each holding a list of strings of equal length (the shape
            ``dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined texts,
        called with truncation and ``max_length`` padding at 512 tokens.
    """
    combined_texts = [
        # Join only the non-empty fields: records without an "input" would
        # otherwise get a doubled space, and a None field would raise
        # TypeError on string concatenation.
        " ".join(part for part in (instruction, inp, output) if part)
        for instruction, inp, output in zip(examples["instruction"], examples["input"], examples["output"])
    ]
    return tokenizer(combined_texts, truncation=True, padding="max_length", max_length=512)



# 7. 对数据集应用预处理
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 8. 设置训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,  # 啟用 mixed precision
    gradient_accumulation_steps=4,
)

# 9. 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# 10. 开始训练
trainer.train()

# 11. 保存微调后的模型到本地
model_dir = "./finetuned_gemma_2b_lora"
trainer.save_model(model_dir)

# 12. 壓縮並下載模型
!zip -r finetuned_gemma_2b_lora.zip {model_dir}

from google.colab import files
files.download("finetuned_gemma_2b_lora.zip")


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2b.
401 Client Error. (Request ID: Root=1-66f944ae-1436219a28a8bd712e8af348;4e4a1dc8-ace0-42d6-bd34-5650e14e00a3)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
# 确保在Colab中运行此脚本

# 1. 安装必要的库
!pip install transformers datasets accelerate bitsandbytes peft

# 2. 导入所需的库
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model  # 引入 LoRA 庫

# 3. Authenticate with Hugging Face.
# Never paste an access token into the notebook source: a token that was
# committed here must be treated as leaked and revoked. Reuse a token that is
# already in the environment, otherwise prompt for it so the secret is not
# saved with the cell.
from getpass import getpass

if not os.environ.get("HUGGINGFACE_TOKEN"):
    os.environ["HUGGINGFACE_TOKEN"] = getpass("Hugging Face access token: ")
hf_token = os.environ["HUGGINGFACE_TOKEN"]

# 4. Load the pretrained model and tokenizer
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

# 5. Configure LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank; a smaller rank uses less GPU memory (4 or 8 recommended)
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adapt only these projections to limit memory use
    lora_dropout=0.1,
    # Declare the task so get_peft_model returns the causal-LM wrapper, whose
    # forward() exposes input_ids/attention_mask/labels explicitly instead of
    # the generic wrapper's *args/**kwargs.
    # NOTE(review): presumably this is what Trainer's column matching needs; confirm.
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, lora_config)

# 6. Load the dataset
# Expects the JSON file to have been uploaded to the Colab runtime at this path.
dataset = load_dataset('json', data_files='/content/alpaca_chinese_part_0.json')

# 7. Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of instruction/input/output records as single texts.

    Args:
        examples: Batched mapping with "instruction", "input" and "output"
            keys, each holding a list of strings of equal length (the shape
            ``dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined texts,
        called with truncation and ``max_length`` padding at 512 tokens.
    """
    combined_texts = [
        # Join only the non-empty fields: records without an "input" would
        # otherwise get a doubled space, and a None field would raise
        # TypeError on string concatenation.
        " ".join(part for part in (instruction, inp, output) if part)
        for instruction, inp, output in zip(examples["instruction"], examples["input"], examples["output"])
    ]
    return tokenizer(combined_texts, truncation=True, padding="max_length", max_length=512)

# 8. 对数据集应用预处理
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 9. 设置训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # 保持小批次大小以減少內存需求
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,  # 啟用混合精度訓練，減少內存占用
    gradient_accumulation_steps=4,  # 使用梯度累加來模擬更大批次
)

# 10. 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# 11. 开始训练
trainer.train()
# 12. 保存微调后的模型到本地
model_dir = "./finetuned_gemma_2b_lora"
trainer.save_model(model_dir)

# 13. 壓縮並下載模型
!zip -r finetuned_gemma_2b_lora.zip {model_dir}

from google.colab import files
files.download("finetuned_gemma_2b_lora.zip")


Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.0-py3-none-any.whl (322 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.5/322.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.0


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [en_input, input_ids, en_output, en_instruction, instruction, output, input, attention_mask]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
# 确保在Colab中运行此脚本

# 1. 安装必要的库
!pip install transformers datasets accelerate bitsandbytes peft

# 2. 导入所需的库
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model  # 引入 LoRA 庫

# 3. Authenticate with Hugging Face.
# Never paste an access token into the notebook source: a token that was
# committed here must be treated as leaked and revoked. Reuse a token that is
# already in the environment, otherwise prompt for it so the secret is not
# saved with the cell.
from getpass import getpass

if not os.environ.get("HUGGINGFACE_TOKEN"):
    os.environ["HUGGINGFACE_TOKEN"] = getpass("Hugging Face access token: ")
hf_token = os.environ["HUGGINGFACE_TOKEN"]

# 4. Load the pretrained model and tokenizer
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

# 5. Configure LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank; a smaller rank uses less GPU memory (4 or 8 recommended)
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adapt only these projections to limit memory use
    lora_dropout=0.1,
    # Declare the task so get_peft_model returns the causal-LM wrapper, whose
    # forward() exposes input_ids/attention_mask/labels explicitly instead of
    # the generic wrapper's *args/**kwargs.
    # NOTE(review): presumably this is what Trainer's column matching needs; confirm.
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, lora_config)

# 6. Load the dataset
# Expects the JSON file to have been uploaded to the Colab runtime at this path.
dataset = load_dataset('json', data_files='/content/alpaca_chinese_part_0.json')

# 7. Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of instruction/input/output records as single texts.

    Args:
        examples: Batched mapping with "instruction", "input" and "output"
            keys, each holding a list of strings of equal length (the shape
            ``dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined texts,
        called with truncation and ``max_length`` padding at 512 tokens.
    """
    combined_texts = [
        # Join only the non-empty fields: records without an "input" would
        # otherwise get a doubled space, and a None field would raise
        # TypeError on string concatenation.
        " ".join(part for part in (instruction, inp, output) if part)
        for instruction, inp, output in zip(examples["instruction"], examples["input"], examples["output"])
    ]
    return tokenizer(combined_texts, truncation=True, padding="max_length", max_length=512)

# 8. 对数据集应用预处理
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 9. 设置训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # 保持小批次大小以減少內存需求
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,  # 啟用混合精度訓練，減少內存占用
    gradient_accumulation_steps=4,  # 使用梯度累加來模擬更大批次
    remove_unused_columns=False  # 不移除數據集中未使用的欄位
)

# 10. 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# 11. 开始训练
trainer.train()

# 12. 保存微调后的模型到本地
model_dir = "./finetuned_gemma_2b_lora"
trainer.save_model(model_dir)

# 13. 壓縮並下載模型
!zip -r finetuned_gemma_2b_lora.zip {model_dir}

from google.colab import files
files.download("finetuned_gemma_2b_lora.zip")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`en_instruction` in this case) have excessive nesting (inputs type `list` where type `int` is expected).