In [1]:
from functools import partial
import numpy as np
import time
import os 
import copy
import json
import random
from tqdm import tqdm 
import gc
import paddle
from paddlenlp.datasets import load_dataset
import paddle.nn.functional as F
import paddle.nn as nn
import paddlenlp as ppnlp
from paddlenlp.transformers import LinearDecayWithWarmup
import pandas as pd
from paddle.vision import transforms as T
from paddle.io import Dataset
import json
from urllib.parse import urlparse
from PIL import Image
import os
import imghdr
import pickle
from sklearn.metrics import f1_score
from paddle.optimizer.lr import CosineAnnealingDecay
from paddle.optimizer import AdamW
from paddle import nn


# Resolve the project root in a way that works both as a script and inside
# Jupyter, where __file__ is not defined.
try:
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
except NameError:
    base_dir = os.getcwd()

dataset_dir = os.path.join(base_dir, 'autodl-tmp/queries_dataset_merge')

# Use the GPU when this Paddle build supports CUDA, otherwise fall back to CPU
# instead of failing on a CPU-only installation.
paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')

# Load the per-split item dictionaries.
import json


def _load_json(path):
    """Parse a UTF-8 encoded JSON file and close the handle before returning."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


data_items_train = _load_json(os.path.join(dataset_dir, 'dataset_items_train.json'))
data_items_val = _load_json(os.path.join(dataset_dir, 'dataset_items_val.json'))
data_items_test = _load_json(os.path.join(dataset_dir, 'dataset_items_test.json'))


# Each sample consists of an image (img) and a text caption;
# the matching img_html_news and inverse_search entries are the evidence material supporting that image and caption.
def process_string(input_str):
    """Strip the HTML artefacts found in scraped titles/captions.

    The entity ``&#39;`` becomes a single space and the ``<b>`` / ``</b>``
    tags are removed; everything else is returned unchanged.
    (A ``unidecode`` transliteration step is commented out in the original
    and stays disabled here.)
    """
    substitutions = (
        ('&#39;', ' '),
        ('<b>', ''),
        ('</b>', ''),
    )
    for needle, replacement in substitutions:
        input_str = input_str.replace(needle, replacement)
    return input_str


class FeatureCachedNewsContextDataset(Dataset):
    """Dataset of pre-extracted image features plus raw text for one split.

    On first construction every sample's query image and evidence images are
    passed through ``resnet_model``; the resulting NumPy feature vectors, the
    query caption, the de-duplicated evidence captions and (for non-test
    splits) the integer label are pickled to
    ``<cache_dir>/<split>_features_cached.pkl``. Later constructions load that
    pickle and never touch the images or the CNN.

    Args:
        context_data_items_dict: Mapping of sample key -> item dict. Items are
            read for ``image_path``, ``direct_path``, ``inv_path``,
            ``caption`` and, unless ``split == 'test'``, ``label``.
        queries_root_dir: Directory that the item paths are joined onto.
        split: Split name; part of the cache file name. ``'test'`` disables
            label extraction.
        resnet_model: Callable applied to a ``[1, C, H, W]`` image tensor to
            obtain its feature (the caller is responsible for ``eval()``).
        cache_dir: Directory holding the pickle caches (created if missing).
    """

    def __init__(self, context_data_items_dict, queries_root_dir, split, resnet_model, cache_dir='cache_features'):
        self.cache_path = os.path.join(cache_dir, f'{split}_features_cached.pkl')
        self.split = split
        self.resnet = resnet_model
        os.makedirs(cache_dir, exist_ok=True)

        if os.path.exists(self.cache_path):
            print(f"[INFO] Loading cached features from {self.cache_path}")
            # NOTE(security): pickle.load executes arbitrary code embedded in
            # the file; only load caches that this notebook produced itself.
            with open(self.cache_path, 'rb') as f:
                self.samples = pickle.load(f)
        else:
            print(f"[INFO] Creating cache with CNN features for {split} set...")
            self.samples = self.preprocess_and_cache(context_data_items_dict, queries_root_dir)
            # Write to a temporary file and rename, so that an interrupted
            # dump never leaves a truncated pickle at the real cache path.
            tmp_path = self.cache_path + '.tmp'
            with open(tmp_path, 'wb') as f:
                pickle.dump(self.samples, f)
            os.replace(tmp_path, self.cache_path)
            print(f"[INFO] Cached features saved to {self.cache_path}")

    def preprocess_and_cache(self, data_dict, queries_root_dir):
        """Extract CNN features and captions for every usable item.

        An item is skipped when its query image cannot be loaded, when either
        annotation JSON cannot be read, or when none of its evidence images
        can be loaded.

        Args:
            data_dict: Mapping of sample key -> item dict (see class docstring).
            queries_root_dir: Directory that the item paths are joined onto.

        Returns:
            List of sample dicts with keys ``qImg_feature``, ``qCap``,
            ``imgs_features``, ``caption`` and, for non-test splits, ``label``.
        """
        from PIL import Image
        from paddle.vision import transforms as T

        transform = T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        def load_image(image_path):
            """Return the transformed image tensor, or None if it is unreadable."""
            try:
                with open(image_path, 'rb') as f:
                    img = Image.open(f).convert('RGB')
                return transform(img)
            except Exception:
                # Best effort: a missing or corrupt image is skipped by the
                # caller. Unlike a bare ``except``, Ctrl-C still interrupts.
                return None

        def extract_feature(img_tensor):
            """Run the CNN on one image without recording a gradient graph."""
            with paddle.no_grad():
                return self.resnet(img_tensor.unsqueeze(0)).detach().cpu().squeeze(0).numpy()

        def process_string(text):
            return text.replace('&#39;', ' ').replace('<b>', '').replace('</b>', '')

        def extract_captions(inv_dict, direct_dict):
            """Collect unique titles/captions from both annotation dicts."""
            captions = []
            for key in ['all_fully_matched_captions', 'all_partially_matched_captions']:
                for page in inv_dict.get(key, []):
                    if 'title' in page:
                        captions.append(process_string(page['title']))
                    if 'caption' in page:
                        for val in page['caption'].values():
                            captions.append(process_string(val))
            for key in ['images_with_captions', 'images_with_caption_matched_tags', 'images_with_no_captions']:
                for page in direct_dict.get(key, []):
                    if 'page_title' in page:
                        captions.append(process_string(page['page_title']))
                    if 'caption' in page:
                        for val in page['caption'].values():
                            captions.append(process_string(val))
            # De-duplicate while keeping first-seen order, so that the cache
            # content is reproducible (set() order varies between runs).
            return list(dict.fromkeys(captions))

        # Cap on the pages read from EACH evidence list below (so a sample can
        # hold up to three times this many evidence features).
        MAX_IMG_PER_EVIDENCE_LIST = 100
        samples = []

        for key in tqdm(data_dict, desc=f"Processing {self.split} with features"):
            item = data_dict[key]
            try:
                qimg_path = os.path.join(queries_root_dir, item['image_path'])
                qimg_tensor = load_image(qimg_path)
                if qimg_tensor is None:
                    continue
                qImg_feature = extract_feature(qimg_tensor)

                direct_path = os.path.join(queries_root_dir, item['direct_path'])
                inverse_path = os.path.join(queries_root_dir, item['inv_path'])

                with open(os.path.join(direct_path, 'direct_annotation.json'), encoding='utf-8') as f:
                    direct_dict = json.load(f)
                with open(os.path.join(inverse_path, 'inverse_annotation.json'), encoding='utf-8') as f:
                    inv_dict = json.load(f)
            except Exception as e:
                print(f"[ERROR] {e}, skipping {key}")
                continue

            evidence_features = []
            for list_name in ['images_with_captions', 'images_with_no_captions', 'images_with_caption_matched_tags']:
                for page in direct_dict.get(list_name, [])[:MAX_IMG_PER_EVIDENCE_LIST]:
                    image_ref = page.get('image_path')
                    if not image_ref:
                        # A page without an image reference has nothing to load.
                        continue
                    # Evidence images sit directly inside direct_path; only the
                    # file name of the annotated path is used.
                    img_path = os.path.join(direct_path, image_ref.split('/')[-1])
                    img_tensor = load_image(img_path)
                    if img_tensor is not None:
                        evidence_features.append(extract_feature(img_tensor))

            if len(evidence_features) == 0:
                continue

            captions = extract_captions(inv_dict, direct_dict)
            sample = {
                'qImg_feature': qImg_feature,
                'qCap': item['caption'],
                'imgs_features': evidence_features,
                'caption': captions
            }
            if self.split != 'test':
                sample['label'] = int(item['label'])

            samples.append(sample)

        # Release cached device memory once; the features themselves are kept
        # alive by ``samples`` and no autograd graph is recorded above.
        gc.collect()
        paddle.device.cuda.empty_cache()

        print(f"[INFO] Cached {len(samples)} samples for {self.split}")
        return samples

    def __len__(self):
        """Number of cached samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(sample_dict, n_captions, n_evidence_images)`` for ``idx``.

        The two counts let a collate function pad a batch to its longest
        sample. The tuple layout is the same for every split.
        """
        sample = self.samples[idx]
        return sample, len(sample['caption']), len(sample['imgs_features'])

from paddle.vision import models
from paddle import nn
import paddle
class EncoderCNN(nn.Layer):
    """Pretrained ResNet backbone used as an image feature extractor.

    The last two child layers of the torchvision-style ResNet are dropped
    (presumably the global pooling and the classification head - confirm
    against paddle.vision.models.ResNet), and an adaptive 1x1 average pool is
    applied on request.

    Args:
        resnet_arch: ``'resnet101'`` or ``'resnet50'``.

    Raises:
        ValueError: If ``resnet_arch`` is not one of the supported names.
    """

    def __init__(self, resnet_arch='resnet101'):
        super().__init__()
        if resnet_arch not in ('resnet101', 'resnet50'):
            raise ValueError(f"Unsupported ResNet arch: {resnet_arch}")
        backbone = getattr(models, resnet_arch)(pretrained=True)

        kept_layers = list(backbone.children())[:-2]
        self.resnet = nn.Sequential(*kept_layers)
        self.adaptive_pool = nn.AdaptiveAvgPool2D((1, 1))

    def forward(self, images, features='pool'):
        """Return pooled ``[batch, channels]`` features, or the raw map.

        With ``features='pool'`` the backbone output is average-pooled to
        1x1 and reshaped to two dimensions; any other value returns the
        backbone output unchanged.
        """
        feature_map = self.resnet(images)
        if features != 'pool':
            return feature_map
        pooled = self.adaptive_pool(feature_map)
        return paddle.reshape(pooled, (pooled.shape[0], pooled.shape[1]))

#### Build the cached-feature datasets ####
# The ResNet-50 encoder is switched to eval mode and handed to every dataset
# as its feature extractor.
resnet = EncoderCNN(resnet_arch='resnet50')
resnet.eval()

train_dataset = FeatureCachedNewsContextDataset(
    data_items_train, dataset_dir, 'train', resnet)
val_dataset = FeatureCachedNewsContextDataset(
    data_items_val, dataset_dir, 'val', resnet)
test_dataset = FeatureCachedNewsContextDataset(
    data_items_test, dataset_dir, 'test', resnet)

for _dataset_name, _dataset in (
    ('train_dataset', train_dataset),
    ('val_dataset', val_dataset),
    ('test_dataset', test_dataset),
):
    print(f"{_dataset_name} total samples: {len(_dataset)}")

# Show the first test item as a sanity check.
for batch in test_dataset:
    print(batch)
    break

def _collate_context_cached(batch, with_labels):
    """Pad a list of dataset items to a common size and stack the features.

    Args:
        batch: List of ``(sample_dict, n_captions, n_evidence_images)`` tuples
            as returned by ``FeatureCachedNewsContextDataset.__getitem__``.
        with_labels: Whether each sample carries a ``'label'`` entry.

    Returns:
        ``(labels, caps_batch, imgs_feature_batch, qCap_batch,
        qImg_feature_batch)`` where ``labels`` is ``None`` when
        ``with_labels`` is false.
    """
    samples = [item[0] for item in batch]
    # Keep at least one caption slot so that a batch in which no sample has
    # any evidence caption still yields non-empty caption lists.
    max_caps = max(1, max(item[1] for item in batch))
    max_imgs = max(item[2] for item in batch)

    qCap_batch, qImg_feature_batch, caps_batch, imgs_feature_batch, labels = [], [], [], [], []

    for sample in samples:
        # Captions are padded with empty strings up to the batch maximum.
        caps_batch.append(sample['caption'] + [""] * (max_caps - len(sample['caption'])))

        # Evidence features are padded with all-zero vectors of the same shape.
        imgs = [paddle.to_tensor(img, dtype='float32') for img in sample['imgs_features']]
        imgs += [paddle.zeros_like(imgs[0]) for _ in range(max_imgs - len(imgs))]
        imgs_feature_batch.append(paddle.stack(imgs))

        qCap_batch.append(sample['qCap'])
        qImg_feature_batch.append(paddle.to_tensor(sample['qImg_feature'], dtype='float32'))

        if with_labels:
            labels.append(paddle.to_tensor(sample['label']))

    qImg_feature_batch = paddle.stack(qImg_feature_batch, axis=0)
    imgs_feature_batch = paddle.stack(imgs_feature_batch, axis=0)
    labels = paddle.stack(labels) if with_labels else None

    return labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch


def collate_context_cached_train(batch):
    """Collate labelled items (train / validation splits).

    Returns:
        ``(labels, caps_batch, imgs_feature_batch, qCap_batch,
        qImg_feature_batch)``.
    """
    return _collate_context_cached(batch, with_labels=True)


def collate_context_cached_test(batch):
    """Collate unlabelled items (test split).

    Returns:
        ``(caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch)``.
    """
    _, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = \
        _collate_context_cached(batch, with_labels=False)
    return caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch



# Build the DataLoaders
from paddle.io import DataLoader

train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_context_cached_train,
    return_list=True,
    num_workers=4,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=collate_context_cached_train,
    return_list=True,
    num_workers=4,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_context_cached_test,
    return_list=True,
    num_workers=2,
)

# Print one training batch as a sanity check.
for batch in train_dataloader:
    print(batch)
    break

#模型构建
from paddle.vision import models
import paddle
from paddlenlp.transformers import ErnieModel, ErnieTokenizer
from paddle.nn import functional as F
from paddle import nn
import matplotlib.pyplot as plt
import numpy as np
class EncoderCNN(nn.Layer):
    """Pretrained ResNet backbone used as an image feature extractor.

    NOTE(review): this re-declares (and shadows) the EncoderCNN defined
    earlier in the notebook; it is kept behaviourally aligned with that
    definition so either one can be used.

    Args:
        resnet_arch: ``'resnet101'`` or ``'resnet50'``.

    Raises:
        ValueError: If ``resnet_arch`` is not one of the supported names.
    """
    def __init__(self, resnet_arch = 'resnet101'):
        super(EncoderCNN, self).__init__()
        if resnet_arch == 'resnet101':
            resnet = models.resnet101(pretrained=True)
        elif resnet_arch == 'resnet50':
            resnet = models.resnet50(pretrained=True)
        else:
            # Previously any other value fell through and crashed with an
            # UnboundLocalError on ``resnet`` below.
            raise ValueError(f"Unsupported ResNet arch: {resnet_arch}")
        # Keep every child layer except the last two.
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        self.adaptive_pool = nn.AdaptiveAvgPool2D((1, 1))
    def forward(self, images, features='pool'):
        """Return pooled ``[batch, channels]`` features, or the raw backbone output."""
        out = self.resnet(images)
        if features == 'pool':
            out = self.adaptive_pool(out)
            out = paddle.reshape(out, (out.shape[0],out.shape[1]))
        return out


class NetWork(nn.Layer):
    """Three-way classifier over a query caption, a query image and their evidence.

    In ``'text'`` mode only the query caption is encoded with ERNIE and
    classified. In any other mode the query caption attends over the evidence
    captions, the query image feature attends over the evidence image
    features, and the four resulting vectors are concatenated and classified.
    """
    def __init__(self, mode):
        """Build the encoders and heads; ``mode == 'text'`` selects the text-only head."""
        super(NetWork, self).__init__()
        self.mode = mode
        self.ernie = ErnieModel.from_pretrained('ernie-3.0-base-zh')
        self.tokenizer = ErnieTokenizer.from_pretrained('ernie-3.0-base-zh')
        # Both attention layers are created regardless of mode.
        self.attention_text = nn.MultiHeadAttention(embed_dim=768, num_heads=16)
        self.attention_image = nn.MultiHeadAttention(embed_dim=2048, num_heads=16)

        if self.mode == 'text':
            self.classifier = nn.Linear(768, 3)
        else:
            # Input width matches the concatenation in forward():
            # two 768-wide text vectors plus two 2048-wide image vectors.
            self.classifier1 = nn.Linear(2 * (768 + 2048), 1024)
            self.classifier2 = nn.Linear(1024, 3)

    def forward(self, qCap, qImg_feature, caps, imgs_features):
        """Compute class logits for a batch.

        Args:
            qCap: Query captions passed to the tokenizer as ``text``
                (presumably a list of strings, one per sample - TODO confirm).
            qImg_feature: Query image features; assumes shape [B, 2048] - TODO confirm.
            caps: Iterable with one list of evidence captions per sample; the
                lists must all have the same length for the stack below.
            imgs_features: Evidence image features; assumes shape [B, N, 2048] - TODO confirm.

        Returns:
            Logits from the final linear layer (3 outputs per sample).
        """
        # Encode the query caption (padded/truncated to 128 tokens).
        encode_dict_qcap = self.tokenizer(text=qCap, max_length=128, truncation=True, padding='max_length')
        input_ids_qcap = paddle.to_tensor(encode_dict_qcap['input_ids'])
        qcap_feature, _ = self.ernie(input_ids_qcap)

        if self.mode == 'text':
            # Classify from the first token position only.
            logits = self.classifier(qcap_feature[:, 0, :])
            return logits

        # Encode the evidence captions, one ERNIE call per sample.
        caps_feature = []
        for caption_list in caps:
            encode_dict_cap = self.tokenizer(text=caption_list, max_length=128, truncation=True, padding='max_length')
            input_ids_caps = paddle.to_tensor(encode_dict_cap['input_ids'])
            cap_feature, _ = self.ernie(input_ids_caps)
            # Mean over axis 1 of the ERNIE output, i.e. one vector per caption
            # (averaged over token positions, not over captions).
            cap_feature = cap_feature.mean(axis=1)
            caps_feature.append(cap_feature)
        caps_feature = paddle.stack(caps_feature, axis=0)

        # Query-caption tokens attend over the per-caption vectors.
        # NOTE(review): no attn_mask is passed, so any padded caption entries
        # take part in the attention - confirm this is intended.
        caps_feature = self.attention_text(qcap_feature, caps_feature, caps_feature)

        # Query image attends over the evidence image features.
        # (Stacking is not needed here; imgs_features already arrives as one tensor.)
        qImg_feature = qImg_feature.unsqueeze(1)  # adds a length-1 query axis
        # NOTE(review): no attn_mask here either, so padded evidence rows are attended to.
        imgs_features = self.attention_image(qImg_feature, imgs_features, imgs_features)

        # Concatenate the four vectors and classify.
        feature = paddle.concat(
            [qcap_feature[:, 0, :], caps_feature[:, 0, :], qImg_feature.squeeze(1), imgs_features.squeeze(1)],
            axis=-1)
        # NOTE(review): classifier1 feeds classifier2 with no activation in
        # between, so the two layers compose into a single linear map.
        logits = self.classifier1(feature)
        logits = self.classifier2(logits)
        return logits

@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Evaluate ``model`` on ``data_loader`` and report loss, accuracy and macro-F1.

    The model is put in eval mode for the pass and switched back to train
    mode before returning; ``metric`` is reset both before and after.

    Returns:
        ``(mean_batch_loss, accuracy, macro_f1)``.
    """
    model.eval()
    metric.reset()
    batch_losses, predicted, expected = [], [], []

    for labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch in data_loader:
        logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                       caps=caps_batch, imgs_features=imgs_feature_batch)
        batch_losses.append(criterion(logits, labels).numpy())

        class_probs = F.softmax(logits, axis=-1)
        predicted += paddle.argmax(class_probs, axis=1).numpy().tolist()
        expected += labels.numpy().tolist()

        metric.update(metric.compute(logits, labels))

    acc = metric.accumulate()
    # Macro averaging; 'weighted' is the alternative if class sizes should count.
    f1 = f1_score(expected, predicted, average='macro')
    mean_loss = np.mean(batch_losses)
    print(f"Eval loss: {mean_loss:.5f}, acc: {acc:.5f}, f1: {f1:.5f}")
    model.train()
    metric.reset()
    return mean_loss, acc, f1


# Seed every RNG involved (Python, NumPy, Paddle) before the model is built so
# that weight initialisation and shuffling are repeatable between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
paddle.seed(SEED)

# Instantiate the model (any mode other than "text" uses the multimodal head)
model = NetWork("image")
#print(model)



# AMP loss scaler
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

# Basic hyper-parameters
epochs = 10
num_training_steps = len(train_dataloader) * epochs
print(f"Total steps: {num_training_steps}")

# Cosine decay scheduler, stepped once per training batch
base_lr = 5e-5
lr_scheduler = CosineAnnealingDecay(learning_rate=base_lr, T_max=num_training_steps)

# Output directories
save_dir = "checkpoint/"
best_dir = "best_model"
os.makedirs(save_dir, exist_ok=True)
os.makedirs(best_dir, exist_ok=True)

# Parameters that receive weight decay: everything whose structured name
# contains neither "bias" nor "norm". Stored as a set because the optimizer
# performs one membership test per parameter.
decay_params = {
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
}

# Optimizer with global-norm gradient clipping
optimizer = AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=1.2e-4,
    grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
    apply_decay_param_fun=lambda x: x in decay_params
)

# Loss and metric
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# Select the GPU when this Paddle build supports CUDA
if paddle.is_compiled_with_cuda():
    paddle.set_device('gpu')
else:
    paddle.set_device('cpu')

# Training loop
def do_train(model, criterion, metric, val_dataloader, train_dataloader, optimizer, lr_scheduler,
             save_dir="checkpoint", best_dir="best_model", epochs=10, scaler=None):
    """Train ``model`` with mixed precision and keep the best-F1 weights.

    After every epoch the model is evaluated on ``val_dataloader``; whenever
    the macro-F1 improves, the state dict is written to
    ``<best_dir>/model_bestpro.pdparams``.

    Args:
        model: Network called with ``qCap``, ``qImg_feature``, ``caps`` and
            ``imgs_features`` keyword arguments.
        criterion: Loss taking ``(logits, labels)``.
        metric: Accuracy-style metric with ``compute``/``update``/``accumulate``.
            It is shared with ``evaluate``, which resets it, so the running
            training accuracy restarts after each evaluation.
        val_dataloader: Loader used for the per-epoch evaluation.
        train_dataloader: Loader yielding ``(labels, caps, imgs_features,
            qCap, qImg_feature)`` batches.
        optimizer: Optimizer stepped through the loss scaler.
        lr_scheduler: Scheduler stepped once per batch.
        save_dir: Created if missing; no files are written to it here.
        best_dir: Directory receiving the best checkpoint.
        epochs: Number of passes over ``train_dataloader``.
        scaler: Optional ``paddle.amp.GradScaler``. A fresh one with
            ``init_loss_scaling=1024`` is created when omitted, so the
            function no longer depends on a module-level ``scaler``.

    Returns:
        The best validation macro-F1 seen (``-inf`` if no epoch ran).
    """
    if scaler is None:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    global_step = 0
    # Start below any reachable F1 so that the first evaluation always
    # produces a checkpoint, even when its F1 is exactly 0.
    best_f1 = float("-inf")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(best_dir, exist_ok=True)
    best_model_path = os.path.join(best_dir, 'model_bestpro.pdparams')

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")
        model.train()
        train_loader_progress = tqdm(train_dataloader, desc=f"Training Epoch {epoch}", leave=True)

        for batch in train_loader_progress:
            labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = batch

            # Forward pass and loss under automatic mixed precision.
            with paddle.amp.auto_cast():
                logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                               caps=caps_batch, imgs_features=imgs_feature_batch)

                loss = criterion(logits, labels)

            # Running training accuracy
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1

            # Scaled backward pass, optimizer step, scale update.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.clear_grad()

            lr_scheduler.step()

            train_loader_progress.set_postfix({
                "loss": f"{loss.item():.4f}",
                "acc": f"{acc:.4f}",
                "step": global_step
            })

        # End-of-epoch evaluation, run once the progress bar has finished
        # (equivalent to the former ``global_step % len(loader) == 0`` test).
        eval_loss, eval_acc, eval_f1 = evaluate(model, criterion, metric, val_dataloader)

        if eval_f1 > best_f1:
            best_f1 = eval_f1
            paddle.save(model.state_dict(), best_model_path)
            print(f"[BEST] Step {global_step} | F1: {eval_f1:.4f} (updated)")

    return best_f1

# Launch training
do_train(
    model, criterion, metric, val_dataloader, train_dataloader,
    optimizer, lr_scheduler,
    save_dir="checkpoint", best_dir="best_model", epochs=10
)

# Load the best checkpoint and predict on the test split
import os
import paddle

params_path = os.path.join("best_model", "model_bestpro.pdparams")
if os.path.exists(params_path):
    model.set_dict(paddle.load(params_path))
    print("Loaded best model.")
else:
    # Make the fallback visible instead of silently using other weights.
    print("Best model file not found! Predicting with the current in-memory weights.")

model.eval()
results = []
# Inference only: no autograd graph is needed.
with paddle.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting"):
        caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = batch
        logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                       caps=caps_batch, imgs_features=imgs_feature_batch)
        preds = paddle.argmax(F.softmax(logits, axis=-1), axis=1).numpy()
        results.extend(preds.tolist())

# Save the predictions.
# NOTE(review): "id" is the position within test_dataset. The dataset skips
# items whose images or annotations cannot be loaded, so these ids need not
# line up with the entries of data_items_test - confirm the expected format.
pd.DataFrame({"id": range(len(results)), "label": results}).to_csv("result.csv", index=False)


PLEASE USE OMP_NUM_THREADS WISELY.
  from .autonotebook import tqdm as notebook_tqdm
  import imghdr
W0613 03:48:46.348315  1525 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 8.9, Driver API Version: 12.4, Runtime API Version: 11.8
W0613 03:48:46.349671  1525 gpu_resources.cc:164] device: 0, cuDNN Version: 8.9.


[INFO] Loading cached features from cache_features/train_features_cached.pkl
[INFO] Loading cached features from cache_features/val_features_cached.pkl
[INFO] Loading cached features from cache_features/test_features_cached.pkl
train_dataset total samples: 11184
val_dataset total samples: 1309
test_dataset total samples: 1129
({'qImg_feature': array([5.0892222e-01, 1.0130705e-01, 4.1554770e-01, ..., 1.5136348e-01,
       7.5677846e-05, 2.2541411e-01], dtype=float32), 'qCap': '看到有人说 这老头说了句话 不是我退休了 要是没退休 你早就在牢里了 说是某地政法系统的前领导 正局级干部退休的 我想问这种人敢说出这种话 在职间到底', 'imgs_features': [array([0.6166205 , 0.32743877, 0.23644671, ..., 0.6098976 , 0.32837993,
       0.09397861], dtype=float32), array([0.23688133, 0.73640347, 0.12396187, ..., 0.24109195, 0.10670266,
       0.16786775], dtype=float32), array([0.30910894, 0.14095385, 0.31212616, ..., 0.0553519 , 0.00824548,
       0.04955674], dtype=float32)], 'caption': ['Boston Orange  波士頓菊子: 朱学渊  - 為中國史學的實證化而努力', '新华每日电讯-微报纸-2021年11月19日', '新华每日电讯-微报纸-202

(…)ers/ernie_3.0/ernie_3.0_base_zh.pdparams: 100%|██████████| 474M/474M [00:39<00:00, 12.1MB/s] 
[32m[2025-06-13 03:49:30,586] [    INFO][0m - Loading weights file from cache at /root/.paddlenlp/models/ernie-3.0-base-zh/model_state.pdparams[0m
[32m[2025-06-13 03:49:31,057] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[0m
[32m[2025-06-13 03:49:31,684] [    INFO][0m - All the weights of ErnieModel were initialized from the model checkpoint at ernie-3.0-base-zh.
If your task is similar to the task the mo

Total steps: 55920

Epoch 1/10


Training Epoch 1:   6%|▋         | 359/5592 [00:35<07:49, 11.15it/s, loss=6.8661, acc=0.6100, step=359] 

Found inf or nan, current scale is: 1024.0, decrease to: 1024.0*0.5


Training Epoch 1: 100%|█████████▉| 5591/5592 [08:59<00:00, 11.40it/s, loss=0.1545, acc=0.6824, step=5592] 

Eval loss: 1.25958, acc: 0.69901, f1: 0.63306


Training Epoch 1: 100%|██████████| 5592/5592 [09:01<00:00, 10.33it/s, loss=0.1545, acc=0.6824, step=5592]


[BEST] Step 5592 | F1: 0.6331 (updated)

Epoch 2/10


Training Epoch 2: 100%|█████████▉| 5590/5592 [08:48<00:00, 11.44it/s, loss=0.0003, acc=0.7390, step=11184]

Eval loss: 0.89711, acc: 0.75248, f1: 0.72887


Training Epoch 2: 100%|██████████| 5592/5592 [08:57<00:00, 10.41it/s, loss=0.0003, acc=0.7390, step=11184]


[BEST] Step 11184 | F1: 0.7289 (updated)

Epoch 3/10


Training Epoch 3:  22%|██▏       | 1240/5592 [01:56<06:29, 11.18it/s, loss=2.3641, acc=0.7901, step=12425]

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 3:  58%|█████▊    | 3256/5592 [05:03<03:30, 11.11it/s, loss=0.4195, acc=0.7849, step=14440]

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 3:  94%|█████████▍| 5257/5592 [08:10<00:30, 11.15it/s, loss=0.0523, acc=0.7820, step=16441] 

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 3: 100%|██████████| 5592/5592 [08:58<00:00,  2.72s/it, loss=2.6002, acc=0.7808, step=16776]

Eval loss: 0.91749, acc: 0.75248, f1: 0.72737


Training Epoch 3: 100%|██████████| 5592/5592 [08:58<00:00, 10.38it/s, loss=2.6002, acc=0.7808, step=16776]



Epoch 4/10


Training Epoch 4:  31%|███▏      | 1760/5592 [02:42<06:03, 10.54it/s, loss=0.0004, acc=0.8165, step=18536]

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 4:  57%|█████▋    | 3211/5592 [04:57<03:45, 10.54it/s, loss=1.8035, acc=0.8107, step=2e+4] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 4: 100%|█████████▉| 5589/5592 [08:38<00:00, 10.72it/s, loss=1.9540, acc=0.8138, step=22365] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 4: 100%|██████████| 5592/5592 [08:56<00:00,  3.55s/it, loss=2.0364, acc=0.8138, step=22368]

Eval loss: 1.40730, acc: 0.72040, f1: 0.65938


Training Epoch 4: 100%|██████████| 5592/5592 [08:56<00:00, 10.42it/s, loss=2.0364, acc=0.8138, step=22368]



Epoch 5/10


Training Epoch 5:  43%|████▎     | 2426/5592 [03:45<04:53, 10.78it/s, loss=0.0000, acc=0.8482, step=24795]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 5:  80%|███████▉  | 4466/5592 [06:55<01:37, 11.60it/s, loss=2.3904, acc=0.8479, step=26835] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 5: 100%|█████████▉| 5590/5592 [08:55<00:00, 11.64it/s, loss=0.0000, acc=0.8486, step=27960] 

Eval loss: 1.14575, acc: 0.76929, f1: 0.75322


Training Epoch 5: 100%|██████████| 5592/5592 [09:00<00:00,  2.98s/it, loss=0.0000, acc=0.8486, step=27960]

[BEST] Step 27960 | F1: 0.7532 (updated)


Training Epoch 5: 100%|██████████| 5592/5592 [09:00<00:00, 10.35it/s, loss=0.0000, acc=0.8486, step=27960]



Epoch 6/10


Training Epoch 6:  18%|█▊        | 1008/5592 [01:33<07:51,  9.72it/s, loss=0.0004, acc=0.8686, step=28968]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 6:  54%|█████▍    | 3047/5592 [04:42<03:57, 10.72it/s, loss=0.1296, acc=0.8748, step=31007] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 6:  90%|█████████ | 5050/5592 [07:49<00:46, 11.59it/s, loss=0.0001, acc=0.8784, step=33010] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 6: 100%|██████████| 5592/5592 [08:56<00:00,  2.68s/it, loss=0.0517, acc=0.8789, step=33552] 

Eval loss: 1.37997, acc: 0.75477, f1: 0.72397


Training Epoch 6: 100%|██████████| 5592/5592 [08:57<00:00, 10.40it/s, loss=0.0517, acc=0.8789, step=33552]



Epoch 7/10


Training Epoch 7:  27%|██▋       | 1483/5592 [02:18<06:06, 11.22it/s, loss=1.7269, acc=0.9103, step=35035]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 7:  62%|██████▏   | 3489/5592 [05:25<03:07, 11.23it/s, loss=0.0130, acc=0.9111, step=37041] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 7:  91%|█████████ | 5063/5592 [07:52<00:45, 11.65it/s, loss=0.0000, acc=0.9080, step=38615] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 7: 100%|██████████| 5592/5592 [08:57<00:00,  3.53s/it, loss=0.0000, acc=0.9078, step=39144]

Eval loss: 1.53150, acc: 0.75095, f1: 0.73543


Training Epoch 7: 100%|██████████| 5592/5592 [08:58<00:00, 10.39it/s, loss=0.0000, acc=0.9078, step=39144]



Epoch 8/10


Training Epoch 8:  42%|████▏     | 2359/5592 [03:40<05:25,  9.94it/s, loss=0.0007, acc=0.9328, step=41503] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 8:  82%|████████▏ | 4591/5592 [07:07<01:24, 11.91it/s, loss=0.0000, acc=0.9356, step=43736] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 8: 100%|██████████| 5592/5592 [08:57<00:00,  3.21s/it, loss=0.0003, acc=0.9359, step=44736]

Eval loss: 1.68262, acc: 0.76471, f1: 0.74671


Training Epoch 8: 100%|██████████| 5592/5592 [08:57<00:00, 10.39it/s, loss=0.0003, acc=0.9359, step=44736]



Epoch 9/10


Training Epoch 9:  20%|█▉        | 1117/5592 [01:44<06:25, 11.62it/s, loss=0.0002, acc=0.9512, step=45853]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 9:  61%|██████    | 3391/5592 [05:16<03:22, 10.87it/s, loss=0.0000, acc=0.9500, step=48128] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 9:  97%|█████████▋| 5398/5592 [08:20<00:17, 10.94it/s, loss=0.0001, acc=0.9511, step=50134] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 9: 100%|██████████| 5592/5592 [08:57<00:00,  3.60s/it, loss=0.2084, acc=0.9514, step=50328]

Eval loss: 1.75636, acc: 0.76776, f1: 0.74875


Training Epoch 9: 100%|██████████| 5592/5592 [08:58<00:00, 10.39it/s, loss=0.2084, acc=0.9514, step=50328]



Epoch 10/10


Training Epoch 10:  32%|███▏      | 1811/5592 [02:48<05:29, 11.49it/s, loss=3.3296, acc=0.9602, step=52139] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 10:  69%|██████▉   | 3874/5592 [05:59<02:55,  9.79it/s, loss=0.0000, acc=0.9604, step=54203]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 10: 100%|██████████| 5592/5592 [08:58<00:00,  5.40s/it, loss=0.0014, acc=0.9616, step=55920] 

Eval loss: 1.77541, acc: 0.76776, f1: 0.74729


Training Epoch 10: 100%|██████████| 5592/5592 [08:59<00:00, 10.37it/s, loss=0.0014, acc=0.9616, step=55920]


Loaded best model.


Predicting: 100%|██████████| 1129/1129 [00:26<00:00, 42.87it/s]


In [2]:
import os
import numpy as np
import paddle
from paddle.nn import functional as F
from tqdm import tqdm
import gc
from sklearn.metrics import f1_score

# Validation pass reporting loss, accuracy and macro-F1
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Evaluate ``model`` on ``data_loader`` with a progress bar.

    The model is switched to eval mode and left there; ``metric`` is reset
    before the pass.

    Args:
        model: Network called with ``qCap``, ``qImg_feature``, ``caps`` and
            ``imgs_features`` keyword arguments.
        criterion: Loss taking ``(logits, labels)``.
        metric: Accuracy-style metric with ``reset``/``compute``/``update``/``accumulate``.
        data_loader: Loader yielding ``(labels, caps, imgs_features, qCap,
            qImg_feature)`` batches.

    Returns:
        ``(mean_batch_loss, accuracy, macro_f1)``.
    """
    model.eval()
    metric.reset()
    losses = []
    all_preds = []
    all_labels = []

    for batch in tqdm(data_loader, desc="Evaluating"):
        labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = batch
        logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                       caps=caps_batch, imgs_features=imgs_feature_batch)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())

        preds = paddle.argmax(F.softmax(logits, axis=-1), axis=1)
        all_preds.extend(preds.numpy().tolist())
        all_labels.extend(labels.numpy().tolist())

        correct = metric.compute(logits, labels)
        metric.update(correct)

    # Release cached memory once after the pass. Doing a full gc.collect()
    # and cache flush on every batch made the loop far slower for no benefit,
    # since no autograd graph is kept under no_grad.
    gc.collect()
    paddle.device.cuda.empty_cache()

    acc = metric.accumulate()
    f1 = f1_score(all_labels, all_preds, average='macro')
    mean_loss = np.mean(losses)
    print("Eval Loss: {:.5f}, Accuracy: {:.5f}, F1 Score (macro): {:.5f}".format(mean_loss, acc, f1))
    return mean_loss, acc, f1

# Restore the best checkpoint if one exists
params_path = os.path.join("best_model", "model_bestpro.pdparams")
if not os.path.exists(params_path):
    print("Best model file not found!")
else:
    model.set_dict(paddle.load(params_path))
    print("Loaded best model.")

# Fresh loss function and accuracy metric for this evaluation
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# Run the evaluation on the validation split
eval_loss, eval_acc, eval_f1 = evaluate(
    model, criterion, metric, val_dataloader)


Loaded best model.


Evaluating: 100%|██████████| 655/655 [01:38<00:00,  6.63it/s]

Eval Loss: 1.14575, Accuracy: 0.76929, F1 Score (macro): 0.75322



