In [1]:
import os

In [7]:
import json

input_txt = "captions.txt"     # 你的原始txt文件名
output_jsonl = "captions.jsonl"  # 输出的jsonl文件名

with open(input_txt, "r", encoding="utf-8") as fin, open(output_jsonl, "w", encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        # 拆分为文件名和描述
        if ':' in line:
            image_name, caption = line.split(':', 1)
            image_name = image_name.strip()
            caption = caption.strip()
            # 组装成json并写入
            json.dump({"image": image_name, "caption": caption}, fout, ensure_ascii=False)
            fout.write("\n")


In [None]:
pip install torch torchvision peft

In [3]:
import torch
import transformers
# import peft
print(torch.__version__)
print(transformers.__version__)
# print(peft.__version__)


2.7.1+cpu
4.40.2


## 划分训练集验证集

In [8]:
import json
import random

input_file = "C:/Users/taste/Documents/0_sis/captions.jsonl"
train_file = "C:/Users/taste/Documents/0_sis/captions_train.jsonl"
val_file = "C:/Users/taste/Documents/0_sis/captions_val.jsonl"

# 读入所有数据
with open(input_file, "r", encoding="utf-8") as fin:
    data = [json.loads(line) for line in fin if line.strip()]

# 打乱顺序
random.shuffle(data)

# 按8:2分
split_idx = int(len(data) * 0.8)
train_data = data[:split_idx]
val_data = data[split_idx:]

# 写入新文件
with open(train_file, "w", encoding="utf-8") as ftrain:
    for item in train_data:
        ftrain.write(json.dumps(item, ensure_ascii=False) + "\n")

with open(val_file, "w", encoding="utf-8") as fval:
    for item in val_data:
        fval.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"训练集写入 {train_file}，共 {len(train_data)} 条")
print(f"验证集写入 {val_file}，共 {len(val_data)} 条")


训练集写入 C:/Users/taste/Documents/0_sis/captions_train.jsonl，共 160 条
验证集写入 C:/Users/taste/Documents/0_sis/captions_val.jsonl，共 40 条


## 开始微调

### 第一步，加载模型

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# 加载假flash_attn
import sys, types, importlib.machinery

def make_fake_mod(name):
    mod = types.ModuleType(name)
    mod.__spec__ = importlib.machinery.ModuleSpec(name, None)
    return mod

sys.modules["flash_attn"] = make_fake_mod("flash_attn")
sys.modules["flash_attn.ops"] = make_fake_mod("flash_attn.ops")
sys.modules["flash_attn.bert_flash_attention"] = make_fake_mod("flash_attn.bert_flash_attention")

In [3]:
import json
import os
from torch.utils.data import Dataset
from PIL import Image
import csv
from peft import LoraConfig, get_peft_model

In [4]:
class Florence2CaptionDataset(Dataset):
    def __init__(self, jsonl_path, image_dir):
        self.samples = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    item = json.loads(line)
                    self.samples.append(item)
        self.image_dir = image_dir

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        image_path = os.path.join(self.image_dir, item["image"])
        image = Image.open(image_path)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        prompt = "<DETAILED_CAPTION>"
        caption = item["caption"]
        return {"prompt": prompt, "caption": caption, "image": image}

训练从这开始

In [5]:
# 加载florence-2-base模型
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

CKPT = "microsoft/Florence-2-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    CKPT, trust_remote_code=True
).to(DEVICE)

processor = AutoProcessor.from_pretrained(
    CKPT, trust_remote_code=True
)



In [6]:
# 简化版本
def collate_fn(batch):
    prompts = [b["prompt"] for b in batch]
    captions = [b["caption"] for b in batch]
    images = [b["image"] for b in batch]
    model_inputs = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        padding=True
    )
    labels = processor(
        text=captions,
        images=images,
        return_tensors="pt",
        padding=True
    ).input_ids
    model_inputs["labels"] = labels
    return {k: v.to(model.device) for k, v in model_inputs.items()}


In [7]:
# 加载数据
train_jsonl = r"C:\Users\taste\Documents\0_sis\captions_train.jsonl"
val_jsonl = r"C:\Users\taste\Documents\0_sis\captions_val.jsonl"
img_dir = r"C:\Users\taste\Documents\0_sis\processed"

train_ds = Florence2CaptionDataset(train_jsonl, img_dir)
val_ds = Florence2CaptionDataset(val_jsonl, img_dir)

In [8]:
# ==== 1. 参数网格 ====
param_grid = [
    #{"BATCH_SIZE": 4, "LEARNING_RATE": 5e-6, "EPOCHS": 15},
    #{"BATCH_SIZE": 4, "LEARNING_RATE": 2e-5, "EPOCHS": 15},
    #{"BATCH_SIZE": 8, "LEARNING_RATE": 5e-6, "EPOCHS": 15},
    #{"BATCH_SIZE": 2, "LEARNING_RATE": 1e-5, "EPOCHS": 15},
    {"BATCH_SIZE": 4, "LEARNING_RATE": 1e-5, "EPOCHS": 30},
]

# ==== 2. LoRA 配置 ====
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "linear", "Conv2d", "lm_head", "fc2"],
    task_type="CAUSAL_LM",
    lora_dropout=0.1,
    bias="none"
)

# ==== 3. 推理函数（每组参数后可以用来评估效果）====
def infer_caption(model, processor, img_path, device):
    from PIL import Image
    img = Image.open(img_path).convert("RGB")
    prompt = "<DETAILED_CAPTION>"
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs.input_ids,
            pixel_values=inputs.pixel_values,
            max_new_tokens=80,
            num_beams=3
        )
    output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"图片 {img_path} 生成描述：{output}")

In [9]:
# ==== 4. 初始化CSV ====
loss_csv_path = "train_val_loss_log_5.csv"
with open(loss_csv_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["param_group", "epoch", "train_loss", "val_loss"])

### 正式开始循环训练

In [10]:
for params in param_grid:
    BATCH_SIZE = params["BATCH_SIZE"]
    LEARNING_RATE = params["LEARNING_RATE"]
    EPOCHS = params["EPOCHS"]
    LOG_STEP = 10

    print(f"\n===== 实验参数: BATCH_SIZE={BATCH_SIZE}, LR={LEARNING_RATE}, EPOCHS={EPOCHS} =====")
    # 参数组唯一目录
    param_name = f"bs{BATCH_SIZE}_lr{LEARNING_RATE:.0e}_ep{EPOCHS}"
    # 每组参数专属文件夹
    param_dir = f"./florence2-lora-{param_name}"
    os.makedirs(param_dir, exist_ok=True)

    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    # 重新加载基础模型与 LoRA
    base_model = AutoModelForCausalLM.from_pretrained(CKPT, trust_remote_code=True).to(DEVICE)
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    num_training_steps = EPOCHS * len(train_dl)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    best_val_loss = float("inf")
    best_ckpt_dir = ""
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        batch_losses = []
        progress_bar = tqdm(enumerate(train_dl), total=len(train_dl), desc=f"Exp[{BATCH_SIZE},{LEARNING_RATE}] Epoch {epoch+1} Train")
        for step, batch in progress_bar:
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            batch_losses.append(loss.item())
            if (step + 1) % LOG_STEP == 0 or (step + 1) == len(train_dl):
                avg_loss = sum(batch_losses[-LOG_STEP:]) / min(LOG_STEP, len(batch_losses[-LOG_STEP:]))
                progress_bar.set_postfix({"avg_loss": avg_loss})
        avg_epoch_loss = total_loss / len(train_dl)
        print(f"Epoch {epoch+1} - Train avg loss: {avg_epoch_loss:.4f}")

        # 验证
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_dl, desc=f"Epoch {epoch+1} Val"):
                outputs = model(**batch)
                val_loss += outputs.loss.item()
        avg_val_loss = val_loss / len(val_dl)
        print(f"Epoch {epoch+1} - Val avg loss: {avg_val_loss:.4f}")

        # ==== 记录到 CSV ====
        with open(loss_csv_path, "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([param_name, epoch+1, avg_epoch_loss, avg_val_loss])

        # === 保存每一轮 epoch checkpoint（子文件夹）===
        epoch_dir = os.path.join(param_dir, f"epoch{epoch+1:02d}")
        os.makedirs(epoch_dir, exist_ok=True)
        print(f"保存本轮模型: {epoch_dir}")
        model.save_pretrained(epoch_dir)
        processor.save_pretrained(epoch_dir)

        # === 保存 best checkpoint（每组 best 子文件夹）===
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_ckpt_dir = os.path.join(param_dir, "best")
            model.save_pretrained(best_ckpt_dir)
            processor.save_pretrained(best_ckpt_dir)
            print(f"保存最优模型: {best_ckpt_dir}")

    print(f"\n>>> 参数组 [{param_name}] 训练完成,最佳验证集loss: {best_val_loss:.4f}，最优模型保存在: {best_ckpt_dir}")

    # ==== 推理评估（可选）====
    infer_caption(model, processor, r"C:\Users\taste\Documents\0_sis\processed\image_145.jpg", DEVICE)



===== 实验参数: BATCH_SIZE=4, LR=1e-05, EPOCHS=30 =====




trainable params: 1,929,928 || all params: 233,343,944 || trainable%: 0.8270743893829102


Exp[4,1e-05] Epoch 1 Train: 100%|██████████| 40/40 [10:16<00:00, 15.40s/it, avg_loss=5.51]


Epoch 1 - Train avg loss: 5.6013


Epoch 1 Val: 100%|██████████| 10/10 [00:33<00:00,  3.36s/it]


Epoch 1 - Val avg loss: 5.5625
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch01
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 2 Train: 100%|██████████| 40/40 [10:18<00:00, 15.46s/it, avg_loss=5.32]


Epoch 2 - Train avg loss: 5.3160


Epoch 2 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 2 - Val avg loss: 5.2323
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch02
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 3 Train: 100%|██████████| 40/40 [09:56<00:00, 14.91s/it, avg_loss=4.49]


Epoch 3 - Train avg loss: 5.0171


Epoch 3 Val: 100%|██████████| 10/10 [00:31<00:00,  3.15s/it]


Epoch 3 - Val avg loss: 4.7568
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch03
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 4 Train: 100%|██████████| 40/40 [09:45<00:00, 14.63s/it, avg_loss=4.49]


Epoch 4 - Train avg loss: 4.5821


Epoch 4 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 4 - Val avg loss: 4.2796
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch04
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 5 Train: 100%|██████████| 40/40 [10:04<00:00, 15.10s/it, avg_loss=3.89]


Epoch 5 - Train avg loss: 4.1129


Epoch 5 Val: 100%|██████████| 10/10 [00:33<00:00,  3.36s/it]


Epoch 5 - Val avg loss: 3.8380
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch05
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 6 Train: 100%|██████████| 40/40 [10:12<00:00, 15.32s/it, avg_loss=3.65]


Epoch 6 - Train avg loss: 3.7389


Epoch 6 Val: 100%|██████████| 10/10 [00:31<00:00,  3.19s/it]


Epoch 6 - Val avg loss: 3.5352
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch06
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 7 Train: 100%|██████████| 40/40 [09:12<00:00, 13.82s/it, avg_loss=3.45]


Epoch 7 - Train avg loss: 3.4814


Epoch 7 Val: 100%|██████████| 10/10 [00:32<00:00,  3.29s/it]


Epoch 7 - Val avg loss: 3.3356
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch07
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 8 Train: 100%|██████████| 40/40 [10:05<00:00, 15.15s/it, avg_loss=3.09]


Epoch 8 - Train avg loss: 3.2656


Epoch 8 Val: 100%|██████████| 10/10 [00:32<00:00,  3.21s/it]


Epoch 8 - Val avg loss: 3.1635
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch08
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 9 Train: 100%|██████████| 40/40 [10:08<00:00, 15.21s/it, avg_loss=2.97]


Epoch 9 - Train avg loss: 3.0979


Epoch 9 Val: 100%|██████████| 10/10 [00:33<00:00,  3.36s/it]


Epoch 9 - Val avg loss: 3.0044
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch09
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 10 Train: 100%|██████████| 40/40 [09:34<00:00, 14.37s/it, avg_loss=2.88]


Epoch 10 - Train avg loss: 2.9508


Epoch 10 Val: 100%|██████████| 10/10 [00:32<00:00,  3.21s/it]


Epoch 10 - Val avg loss: 2.8584
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch10
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 11 Train: 100%|██████████| 40/40 [09:46<00:00, 14.65s/it, avg_loss=2.84]


Epoch 11 - Train avg loss: 2.8235


Epoch 11 Val: 100%|██████████| 10/10 [00:33<00:00,  3.39s/it]


Epoch 11 - Val avg loss: 2.7054
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch11
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 12 Train: 100%|██████████| 40/40 [10:21<00:00, 15.54s/it, avg_loss=2.68]


Epoch 12 - Train avg loss: 2.7040


Epoch 12 Val: 100%|██████████| 10/10 [00:33<00:00,  3.37s/it]


Epoch 12 - Val avg loss: 2.5726
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch12
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 13 Train: 100%|██████████| 40/40 [10:23<00:00, 15.58s/it, avg_loss=2.55]


Epoch 13 - Train avg loss: 2.5813


Epoch 13 Val: 100%|██████████| 10/10 [00:33<00:00,  3.37s/it]


Epoch 13 - Val avg loss: 2.4395
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch13
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 14 Train: 100%|██████████| 40/40 [10:19<00:00, 15.48s/it, avg_loss=2.5] 


Epoch 14 - Train avg loss: 2.4465


Epoch 14 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 14 - Val avg loss: 2.3307
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch14
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 15 Train: 100%|██████████| 40/40 [10:10<00:00, 15.27s/it, avg_loss=2.32]


Epoch 15 - Train avg loss: 2.3590


Epoch 15 Val: 100%|██████████| 10/10 [00:31<00:00,  3.15s/it]


Epoch 15 - Val avg loss: 2.2377
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch15
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 16 Train: 100%|██████████| 40/40 [10:03<00:00, 15.10s/it, avg_loss=2.32]


Epoch 16 - Train avg loss: 2.2483


Epoch 16 Val: 100%|██████████| 10/10 [00:32<00:00,  3.21s/it]


Epoch 16 - Val avg loss: 2.1664
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch16
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 17 Train: 100%|██████████| 40/40 [09:45<00:00, 14.64s/it, avg_loss=2.23]


Epoch 17 - Train avg loss: 2.1964


Epoch 17 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 17 - Val avg loss: 2.1149
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch17
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 18 Train: 100%|██████████| 40/40 [10:11<00:00, 15.28s/it, avg_loss=2.19]


Epoch 18 - Train avg loss: 2.1422


Epoch 18 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 18 - Val avg loss: 2.0747
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch18
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 19 Train: 100%|██████████| 40/40 [10:06<00:00, 15.17s/it, avg_loss=1.96]


Epoch 19 - Train avg loss: 2.0999


Epoch 19 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 19 - Val avg loss: 2.0451
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch19
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 20 Train: 100%|██████████| 40/40 [10:14<00:00, 15.36s/it, avg_loss=2.07]


Epoch 20 - Train avg loss: 2.0405


Epoch 20 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 20 - Val avg loss: 2.0220
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch20
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 21 Train: 100%|██████████| 40/40 [09:36<00:00, 14.41s/it, avg_loss=1.97]


Epoch 21 - Train avg loss: 2.0572


Epoch 21 Val: 100%|██████████| 10/10 [00:32<00:00,  3.22s/it]


Epoch 21 - Val avg loss: 2.0037
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch21
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 22 Train: 100%|██████████| 40/40 [09:34<00:00, 14.36s/it, avg_loss=1.98]


Epoch 22 - Train avg loss: 1.9957


Epoch 22 Val: 100%|██████████| 10/10 [00:32<00:00,  3.29s/it]


Epoch 22 - Val avg loss: 1.9892
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch22
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 23 Train: 100%|██████████| 40/40 [09:36<00:00, 14.42s/it, avg_loss=2]   


Epoch 23 - Train avg loss: 1.9923


Epoch 23 Val: 100%|██████████| 10/10 [00:32<00:00,  3.22s/it]


Epoch 23 - Val avg loss: 1.9771
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch23
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 24 Train: 100%|██████████| 40/40 [09:22<00:00, 14.07s/it, avg_loss=1.88]


Epoch 24 - Train avg loss: 1.9715


Epoch 24 Val: 100%|██████████| 10/10 [00:32<00:00,  3.23s/it]


Epoch 24 - Val avg loss: 1.9677
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch24
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 25 Train: 100%|██████████| 40/40 [09:23<00:00, 14.08s/it, avg_loss=1.85]


Epoch 25 - Train avg loss: 1.9441


Epoch 25 Val: 100%|██████████| 10/10 [00:32<00:00,  3.23s/it]


Epoch 25 - Val avg loss: 1.9595
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch25
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 26 Train: 100%|██████████| 40/40 [09:50<00:00, 14.77s/it, avg_loss=2.03]


Epoch 26 - Train avg loss: 1.9500


Epoch 26 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 26 - Val avg loss: 1.9536
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch26
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 27 Train: 100%|██████████| 40/40 [10:04<00:00, 15.11s/it, avg_loss=1.91]


Epoch 27 - Train avg loss: 1.9573


Epoch 27 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 27 - Val avg loss: 1.9496
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch27
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 28 Train: 100%|██████████| 40/40 [09:53<00:00, 14.84s/it, avg_loss=1.93]


Epoch 28 - Train avg loss: 1.9470


Epoch 28 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 28 - Val avg loss: 1.9461
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch28
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 29 Train: 100%|██████████| 40/40 [09:36<00:00, 14.42s/it, avg_loss=1.86]


Epoch 29 - Train avg loss: 1.9436


Epoch 29 Val: 100%|██████████| 10/10 [00:32<00:00,  3.21s/it]


Epoch 29 - Val avg loss: 1.9443
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch29
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best


Exp[4,1e-05] Epoch 30 Train: 100%|██████████| 40/40 [09:33<00:00, 14.35s/it, avg_loss=1.79]


Epoch 30 - Train avg loss: 1.9323


Epoch 30 Val: 100%|██████████| 10/10 [00:33<00:00,  3.33s/it]


Epoch 30 - Val avg loss: 1.9437
保存本轮模型: ./florence2-lora-bs4_lr1e-05_ep30\epoch30
保存最优模型: ./florence2-lora-bs4_lr1e-05_ep30\best

>>> 参数组 [bs4_lr1e-05_ep30] 训练完成,最佳验证集loss: 1.9437，最优模型保存在: ./florence2-lora-bs4_lr1e-05_ep30\best
图片 C:\Users\taste\Documents\0_sis\processed\image_145.jpg 生成描述：A store front with a large sign reading "Pelquuería Murano." The glass facade reflects the modern design of the store.


## 加训

In [None]:
# 加载假flash_attn
import sys, types, importlib.machinery

def make_fake_mod(name):
    mod = types.ModuleType(name)
    mod.__spec__ = importlib.machinery.ModuleSpec(name, None)
    return mod

sys.modules["flash_attn"] = make_fake_mod("flash_attn")
sys.modules["flash_attn.ops"] = make_fake_mod("flash_attn.ops")
sys.modules["flash_attn.bert_flash_attention"] = make_fake_mod("flash_attn.bert_flash_attention")

In [2]:
import os
import json
import csv
from PIL import Image
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, AutoModelForCausalLM, AdamW, get_scheduler
from peft import PeftModel

# === 数据类 ===
class Florence2CaptionDataset(Dataset):
    def __init__(self, jsonl_path, image_dir):
        self.samples = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    item = json.loads(line)
                    self.samples.append(item)
        self.image_dir = image_dir

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        image_path = os.path.join(self.image_dir, item["image"])
        image = Image.open(image_path)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        prompt = "<DETAILED_CAPTION>"
        caption = item["caption"]
        return {"prompt": prompt, "caption": caption, "image": image}

# === collate_fn ===
def collate_fn(batch):
    prompts = [b["prompt"] for b in batch]
    captions = [b["caption"] for b in batch]
    images = [b["image"] for b in batch]
    model_inputs = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        padding=True
    )
    labels = processor(
        text=captions,
        images=images,
        return_tensors="pt",
        padding=True
    ).input_ids
    model_inputs["labels"] = labels
    return {k: v.to(model.device) for k, v in model_inputs.items()}

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === 推理函数（可选） ===
def infer_caption(model, processor, img_path, device):
    img = Image.open(img_path).convert("RGB")
    prompt = "<DETAILED_CAPTION>"
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs.input_ids,
            pixel_values=inputs.pixel_values,
            max_new_tokens=80,
            num_beams=3
        )
    output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"图片 {img_path} 生成描述：{output}")


In [4]:
# === 加载数据 ===
train_jsonl = r"C:\Users\taste\Documents\0_sis\captions_train.jsonl"
val_jsonl = r"C:\Users\taste\Documents\0_sis\captions_val.jsonl"
img_dir = r"C:\Users\taste\Documents\0_sis\processed"

train_ds = Florence2CaptionDataset(train_jsonl, img_dir)
val_ds = Florence2CaptionDataset(val_jsonl, img_dir)

In [5]:
# === 训练配置 ===
CKPT = "microsoft/Florence-2-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
resume_ckpt_dir = "./florence2-lora-bs4_lr1e-05_ep30/best"  # 你的最佳模型路径

BATCH_SIZE = 4
LEARNING_RATE = 5e-6
EPOCHS = 15  # 加训轮数
LOG_STEP = 10

param_name = "bs4_lr1e-5_ep30_plus15"
param_dir = f"./florence2-lora-{param_name}"
os.makedirs(param_dir, exist_ok=True)

In [6]:
# === 加载模型 ===
#base_model = AutoModelForCausalLM.from_pretrained(CKPT, trust_remote_code=True).to(DEVICE)
#model = PeftModel.from_pretrained(base_model, resume_ckpt_dir).to(DEVICE)
#model.print_trainable_parameters()
from peft import PeftModel, PeftConfig

# === 正确加载 LoRA 的方式 ===
print("📥 正在加载 LoRA 配置...")
peft_config = PeftConfig.from_pretrained(resume_ckpt_dir)

print("📥 加载 base 模型...")
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)

print("📥 注入 LoRA...")
model = PeftModel.from_pretrained(base_model, resume_ckpt_dir)

# === 转为 GPU 并设置训练模式 ===
model = model.to(DEVICE)
model.train()

# === 确保所有参数是可训练的（尤其是 LoRA 注入后）===
for param in model.parameters():
    param.requires_grad = True

# === 检查 trainable 参数数量 ===
trainable_params = sum(p.numel() for p in model.parameters() if param.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"✅ Trainable params: {trainable_params} / {total_params} ({100*trainable_params/total_params:.2f}%)")


# === 加载 Processor ===
processor = AutoProcessor.from_pretrained(resume_ckpt_dir, trust_remote_code=True)

# === DataLoader ===
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# === 优化器 & Scheduler ===
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = EPOCHS * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# === 初始化日志文件 ===
loss_csv_path = f"train_val_loss_log_{param_name}.csv"
with open(loss_csv_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["param_group", "epoch", "train_loss", "val_loss"])

📥 正在加载 LoRA 配置...
📥 加载 base 模型...




📥 注入 LoRA...
✅ Trainable params: 233343944 / 233343944 (100.00%)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# === 开始训练 ===
best_val_loss = float("inf")
best_ckpt_dir = ""
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    batch_losses = []
    progress_bar = tqdm(enumerate(train_dl), total=len(train_dl), desc=f"ContinueTrain Epoch {epoch+1}")
    for step, batch in progress_bar:
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        batch_losses.append(loss.item())
        if (step + 1) % LOG_STEP == 0 or (step + 1) == len(train_dl):
            avg_loss = sum(batch_losses[-LOG_STEP:]) / min(LOG_STEP, len(batch_losses[-LOG_STEP:]))
            progress_bar.set_postfix({"avg_loss": avg_loss})
    avg_epoch_loss = total_loss / len(train_dl)
    print(f"Epoch {epoch+1} - Train avg loss: {avg_epoch_loss:.4f}")

    # === 验证 ===
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dl, desc=f"Epoch {epoch+1} Val"):
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    avg_val_loss = val_loss / len(val_dl)
    print(f"Epoch {epoch+1} - Val avg loss: {avg_val_loss:.4f}")

    # === 记录 loss ===
    with open(loss_csv_path, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([param_name, epoch + 1, avg_epoch_loss, avg_val_loss])

    # === 保存每轮 checkpoint ===
    epoch_dir = os.path.join(param_dir, f"epoch{epoch+1:02d}")
    os.makedirs(epoch_dir, exist_ok=True)
    print(f"保存本轮模型: {epoch_dir}")
    model.save_pretrained(epoch_dir)
    processor.save_pretrained(epoch_dir)

    # === 保存最优 checkpoint ===
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_ckpt_dir = os.path.join(param_dir, "best")
        model.save_pretrained(best_ckpt_dir)
        processor.save_pretrained(best_ckpt_dir)
        print(f"保存最优模型: {best_ckpt_dir}")

print(f"\n>>> 加训完成, 最佳验证集 loss: {best_val_loss:.4f}，模型保存在: {best_ckpt_dir}")

# === 选做：生成一个样本进行推理 ===
infer_caption(model, processor, r"C:\Users\taste\Documents\0_sis\processed\image_145.jpg", DEVICE)

ContinueTrain Epoch 1: 100%|██████████| 40/40 [12:27<00:00, 18.69s/it, avg_loss=1.93]


Epoch 1 - Train avg loss: 1.8489


Epoch 1 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 1 - Val avg loss: 1.7784
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch01
保存最优模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\best


ContinueTrain Epoch 2: 100%|██████████| 40/40 [12:29<00:00, 18.73s/it, avg_loss=1.53]


Epoch 2 - Train avg loss: 1.5142


Epoch 2 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 2 - Val avg loss: 1.6940
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch02
保存最优模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\best


ContinueTrain Epoch 3: 100%|██████████| 40/40 [12:21<00:00, 18.54s/it, avg_loss=1.23]


Epoch 3 - Train avg loss: 1.3015


Epoch 3 Val: 100%|██████████| 10/10 [00:32<00:00,  3.28s/it]


Epoch 3 - Val avg loss: 1.6607
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch03
保存最优模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\best


ContinueTrain Epoch 4: 100%|██████████| 40/40 [12:22<00:00, 18.55s/it, avg_loss=1.1] 


Epoch 4 - Train avg loss: 1.1606


Epoch 4 Val: 100%|██████████| 10/10 [00:33<00:00,  3.31s/it]


Epoch 4 - Val avg loss: 1.6491
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch04
保存最优模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\best


ContinueTrain Epoch 5: 100%|██████████| 40/40 [12:23<00:00, 18.59s/it, avg_loss=1.02]


Epoch 5 - Train avg loss: 1.0312


Epoch 5 Val: 100%|██████████| 10/10 [00:32<00:00,  3.30s/it]


Epoch 5 - Val avg loss: 1.6626
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch05


ContinueTrain Epoch 6: 100%|██████████| 40/40 [12:17<00:00, 18.44s/it, avg_loss=0.886]


Epoch 6 - Train avg loss: 0.9302


Epoch 6 Val: 100%|██████████| 10/10 [00:32<00:00,  3.27s/it]


Epoch 6 - Val avg loss: 1.6546
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch06


ContinueTrain Epoch 7: 100%|██████████| 40/40 [12:16<00:00, 18.42s/it, avg_loss=0.843]


Epoch 7 - Train avg loss: 0.8546


Epoch 7 Val: 100%|██████████| 10/10 [00:33<00:00,  3.31s/it]


Epoch 7 - Val avg loss: 1.6757
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch07


ContinueTrain Epoch 8: 100%|██████████| 40/40 [12:19<00:00, 18.50s/it, avg_loss=0.802]


Epoch 8 - Train avg loss: 0.7681


Epoch 8 Val: 100%|██████████| 10/10 [00:32<00:00,  3.28s/it]


Epoch 8 - Val avg loss: 1.6796
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch08


ContinueTrain Epoch 9: 100%|██████████| 40/40 [12:22<00:00, 18.55s/it, avg_loss=0.652]


Epoch 9 - Train avg loss: 0.7120


Epoch 9 Val: 100%|██████████| 10/10 [00:32<00:00,  3.27s/it]


Epoch 9 - Val avg loss: 1.6966
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch09


ContinueTrain Epoch 10: 100%|██████████| 40/40 [12:21<00:00, 18.53s/it, avg_loss=0.646]


Epoch 10 - Train avg loss: 0.6565


Epoch 10 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 10 - Val avg loss: 1.7137
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch10


ContinueTrain Epoch 11: 100%|██████████| 40/40 [12:43<00:00, 19.10s/it, avg_loss=0.661]


Epoch 11 - Train avg loss: 0.6189


Epoch 11 Val: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it]


Epoch 11 - Val avg loss: 1.7103
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch11


ContinueTrain Epoch 12: 100%|██████████| 40/40 [12:36<00:00, 18.92s/it, avg_loss=0.596]


Epoch 12 - Train avg loss: 0.5931


Epoch 12 Val: 100%|██████████| 10/10 [00:33<00:00,  3.38s/it]


Epoch 12 - Val avg loss: 1.7217
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch12


ContinueTrain Epoch 13: 100%|██████████| 40/40 [12:34<00:00, 18.86s/it, avg_loss=0.553]


Epoch 13 - Train avg loss: 0.5660


Epoch 13 Val: 100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Epoch 13 - Val avg loss: 1.7261
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch13


ContinueTrain Epoch 14: 100%|██████████| 40/40 [12:39<00:00, 18.99s/it, avg_loss=0.558]


Epoch 14 - Train avg loss: 0.5410


Epoch 14 Val: 100%|██████████| 10/10 [00:33<00:00,  3.38s/it]


Epoch 14 - Val avg loss: 1.7283
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch14


ContinueTrain Epoch 15: 100%|██████████| 40/40 [12:42<00:00, 19.06s/it, avg_loss=0.537]


Epoch 15 - Train avg loss: 0.5249


Epoch 15 Val: 100%|██████████| 10/10 [00:33<00:00,  3.39s/it]


Epoch 15 - Val avg loss: 1.7299
保存本轮模型: ./florence2-lora-bs4_lr1e-5_ep30_plus15\epoch15

>>> 加训完成, 最佳验证集 loss: 1.6491，模型保存在: ./florence2-lora-bs4_lr1e-5_ep30_plus15\best
图片 C:\Users\taste\Documents\0_sis\processed\image_145.jpg 生成描述：A modern shopfront with large glass windows and a white sign reading "Pelquería Murano." The signage and branding indicate a motorcycle repair or replacement shop.


In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM

# 路径改为你的最新微调目录
ckpt_dir = "./florence2-lora-epoch5"

model = AutoModelForCausalLM.from_pretrained(ckpt_dir, trust_remote_code=True).to(DEVICE)
processor = AutoProcessor.from_pretrained(ckpt_dir, trust_remote_code=True)
model.eval()

In [9]:
from PIL import Image

img_path = r"C:\Users\taste\Documents\0_sis\processed\image_181.jpg"  # 换成你想测试的图片名
img = Image.open(img_path).convert("RGB")
prompt = "<DETAILED_CAPTION>"

inputs = processor(text=prompt, images=img, return_tensors="pt").to(model.device)
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs.input_ids,
        pixel_values=inputs.pixel_values,
        max_new_tokens=80,
        num_beams=3  # 可以试试改为5、8，结果可能更好
    )
output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("生成的描述：", output)

生成的描述： The image shows a person walking past a Louis Barton store in London. The store has glass doors and a board with text on it.


In [10]:
img_path = r"C:\Users\taste\Documents\0_sis\processed\image_145.jpg"  # 换成你想测试的图片名
img = Image.open(img_path).convert("RGB")
prompt = "<DETAILED_CAPTION>"

inputs = processor(text=prompt, images=img, return_tensors="pt").to(model.device)
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs.input_ids,
        pixel_values=inputs.pixel_values,
        max_new_tokens=80,
        num_beams=3  # 可以试试改为5、8，结果可能更好
    )
output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("生成的描述：", output)

生成的描述： The image shows a store front with a sign that reads "Murano" and a bicycle parked in front of it. There is a person sitting on a chair in the foreground, and a few plants in the background. The image is slightly blurred, giving it a dreamy quality.
