In [1]:
from functools import partial
import numpy as np
import time
import os 
import copy
import json
import random
from tqdm import tqdm 
import gc
import paddle
from paddlenlp.datasets import load_dataset
import paddle.nn.functional as F
import paddle.nn as nn
import paddlenlp as ppnlp
from paddlenlp.transformers import LinearDecayWithWarmup
import pandas as pd
from paddle.vision import transforms as T
from paddle.io import Dataset
import json
from urllib.parse import urlparse
from PIL import Image
import os
import imghdr
import pickle
from sklearn.metrics import f1_score
from paddle.optimizer.lr import CosineAnnealingDecay
from paddle.optimizer import AdamW
from paddle import nn


# Resolve the project root so the code works both as a script and in Jupyter:
# a script has __file__, a notebook kernel does not and falls back to the CWD.
try:
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
except NameError:
    base_dir = os.getcwd()

dataset_dir = os.path.join(base_dir, 'autodl-tmp/queries_dataset_merge')

# Use the GPU when Paddle was built with CUDA, otherwise fall back to the CPU
# (same rule the notebook applies again before training).
paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')

# Load the per-split index files
import json


def _load_json(path):
    """Parse a UTF-8 JSON file, closing the file handle as soon as it is read."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


data_items_train = _load_json(os.path.join(dataset_dir, 'dataset_items_train.json'))
data_items_val = _load_json(os.path.join(dataset_dir, 'dataset_items_val.json'))
data_items_test = _load_json(os.path.join(dataset_dir, 'dataset_items_test.json'))


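# Each sample consists of an image (img) and its text caption (caption).
# The associated img_html_news and inverse_search entries are the evidence material supporting the image and the caption.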
def process_string(input_str):
    """Strip search-result markup from a string.

    The HTML entity ``&#39;`` is replaced by a single space and the ``<b>`` /
    ``</b>`` tags are removed; the substitutions are applied in that order.

    Args:
        input_str: text to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (('&#39;', ' '), ('<b>', ''), ('</b>', ''))
    for needle, replacement in substitutions:
        input_str = input_str.replace(needle, replacement)
    return input_str


class FeatureCachedNewsContextDataset(Dataset):
    """Dataset of pre-extracted CNN features for query images and their evidence.

    Every sample is a dict with the keys:
      * ``qImg_feature``  - feature vector of the query image (numpy array),
      * ``qCap``          - query caption text,
      * ``imgs_features`` - list of feature vectors of the evidence images,
      * ``caption``       - list of de-duplicated evidence captions / titles,
      * ``label``         - integer label (only when ``split != 'test'``).

    Features are computed once with ``resnet_model`` and pickled to
    ``<cache_dir>/<split>_features_cached.pkl``; later instantiations load the
    pickle instead of recomputing.
    """

    def __init__(self, context_data_items_dict, queries_root_dir, split, resnet_model, cache_dir='cache_features'):
        """Load the cached samples for ``split`` or build and cache them.

        Args:
            context_data_items_dict: mapping of item key -> item description
                (needs ``image_path``, ``direct_path``, ``inv_path``,
                ``caption`` and, for non-test splits, ``label``).
            queries_root_dir: directory the item paths are relative to.
            split: split name; ``'test'`` samples carry no label.
            resnet_model: callable mapping an image batch to feature vectors.
            cache_dir: directory holding the pickled feature cache.
        """
        self.cache_path = os.path.join(cache_dir, f'{split}_features_cached.pkl')
        self.split = split
        self.resnet = resnet_model
        os.makedirs(cache_dir, exist_ok=True)

        if os.path.exists(self.cache_path):
            print(f"[INFO] Loading cached features from {self.cache_path}")
            # SECURITY NOTE: pickle.load can execute arbitrary code. Only load
            # cache files this notebook produced itself.
            with open(self.cache_path, 'rb') as f:
                self.samples = pickle.load(f)
        else:
            print(f"[INFO] Creating cache with CNN features for {split} set...")
            self.samples = self.preprocess_and_cache(context_data_items_dict, queries_root_dir)
            with open(self.cache_path, 'wb') as f:
                pickle.dump(self.samples, f)
            print(f"[INFO] Cached features saved to {self.cache_path}")

    def preprocess_and_cache(self, data_dict, queries_root_dir):
        """Extract features for every item of ``data_dict``.

        An item is skipped (never raised) when its query image cannot be
        decoded, when one of its annotation files cannot be read, or when none
        of its evidence images yields a feature.

        Args:
            data_dict: mapping of item key -> item description.
            queries_root_dir: directory the item paths are relative to.

        Returns:
            List of sample dicts (see the class docstring).
        """
        from PIL import Image
        from paddle.vision import transforms as T

        transform = T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        def load_image(image_path):
            """Return the transformed RGB tensor, or None if the file is unreadable."""
            try:
                # GIFs and every other format go through the same path:
                # PIL decodes the (first) frame and converts it to RGB.
                with open(image_path, 'rb') as f:
                    img = Image.open(f).convert('RGB')
                return transform(img)
            except Exception:
                # Best effort: missing or corrupt images are simply skipped.
                return None

        def extract_feature(img_tensor):
            """Run the CNN on one image tensor and return a numpy feature vector."""
            # Feature extraction only: do not record an autograd graph.
            with paddle.no_grad():
                feature = self.resnet(img_tensor.unsqueeze(0))
            return feature.detach().cpu().squeeze(0).numpy()

        def process_string(text):
            """Replace ``&#39;`` by a space and drop ``<b>`` / ``</b>`` tags."""
            return text.replace('&#39;', ' ').replace('<b>', '').replace('</b>', '')

        def extract_captions(inv_dict, direct_dict):
            """Collect titles and captions from both annotation dicts, de-duplicated."""
            captions = []
            for key in ['all_fully_matched_captions', 'all_partially_matched_captions']:
                for page in inv_dict.get(key, []):
                    if 'title' in page:
                        captions.append(process_string(page['title']))
                    if 'caption' in page:
                        for val in page['caption'].values():
                            captions.append(process_string(val))
            for key in ['images_with_captions', 'images_with_caption_matched_tags', 'images_with_no_captions']:
                for page in direct_dict.get(key, []):
                    if 'page_title' in page:
                        captions.append(process_string(page['page_title']))
                    if 'caption' in page:
                        for val in page['caption'].values():
                            captions.append(process_string(val))
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # cached caption order is reproducible (a set's order is not).
            return list(dict.fromkeys(captions))

        # Cap applied to each annotation group separately (not to the sample total).
        MAX_IMG_PER_SAMPLE = 100
        samples = []

        for key in tqdm(data_dict, desc=f"Processing {self.split} with features"):
            item = data_dict[key]
            try:
                qimg_path = os.path.join(queries_root_dir, item['image_path'])
                qimg_tensor = load_image(qimg_path)
                if qimg_tensor is None:
                    continue
                qImg_feature = extract_feature(qimg_tensor)

                direct_path = os.path.join(queries_root_dir, item['direct_path'])
                inverse_path = os.path.join(queries_root_dir, item['inv_path'])

                with open(os.path.join(direct_path, 'direct_annotation.json'), encoding='utf-8') as f:
                    direct_dict = json.load(f)
                with open(os.path.join(inverse_path, 'inverse_annotation.json'), encoding='utf-8') as f:
                    inv_dict = json.load(f)
            except Exception as e:
                print(f"[ERROR] {e}, skipping {key}")
                continue

            evidence_features = []
            for key1 in ['images_with_captions', 'images_with_no_captions', 'images_with_caption_matched_tags']:
                pages = direct_dict.get(key1, [])
                for i, page in enumerate(pages):
                    if i >= MAX_IMG_PER_SAMPLE:
                        break
                    image_ref = page.get('image_path')
                    if not image_ref:
                        # An annotation entry without an image reference is
                        # skipped instead of aborting the whole caching run.
                        continue
                    # Evidence images are stored flat inside the direct-search folder.
                    img_path = os.path.join(direct_path, image_ref.split('/')[-1])
                    img_tensor = load_image(img_path)
                    if img_tensor is not None:
                        evidence_features.append(extract_feature(img_tensor))

            if len(evidence_features) == 0:
                continue

            captions = extract_captions(inv_dict, direct_dict)
            sample = {
                'qImg_feature': qImg_feature,
                'qCap': item['caption'],
                'imgs_features': evidence_features,
                'caption': captions
            }
            if self.split != 'test':
                sample['label'] = int(item['label'])

            samples.append(sample)

            # Release cached device memory between items.
            gc.collect()
            paddle.device.cuda.empty_cache()

        print(f"[INFO] Cached {len(samples)} samples for {self.split}")
        return samples

    def __len__(self):
        """Number of cached samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(sample, number of captions, number of evidence images)``.

        The two counts let the collate function pad every sample of a batch to
        the batch maximum. The tuple layout is the same for all splits.
        """
        sample = self.samples[idx]
        return sample, len(sample['caption']), len(sample['imgs_features'])

from paddle.vision import models
from paddle import nn
import paddle
class EncoderCNN(nn.Layer):
    """Pretrained ResNet trunk used as an image feature extractor.

    The last two child layers of the torchvision-style ResNet are dropped and
    an adaptive average pool to 1x1 is kept as a separate layer.
    """

    def __init__(self, resnet_arch='resnet101'):
        """Build the trunk for ``'resnet101'`` or ``'resnet50'``.

        Raises:
            ValueError: for any other ``resnet_arch``.
        """
        super(EncoderCNN, self).__init__()
        builders = {'resnet101': models.resnet101, 'resnet50': models.resnet50}
        if resnet_arch not in builders:
            raise ValueError(f"Unsupported ResNet arch: {resnet_arch}")
        backbone = builders[resnet_arch](pretrained=True)

        trunk_layers = list(backbone.children())[:-2]
        self.resnet = nn.Sequential(*trunk_layers)
        self.adaptive_pool = nn.AdaptiveAvgPool2D((1, 1))

    def forward(self, images, features='pool'):
        """Return pooled vectors when ``features == 'pool'``, else the raw trunk output."""
        feature_map = self.resnet(images)
        if features != 'pool':
            return feature_map
        pooled = self.adaptive_pool(feature_map)
        # Drop the two trailing 1x1 spatial axes: keep (dim 0, dim 1) only.
        return paddle.reshape(pooled, (pooled.shape[0], pooled.shape[1]))

#### load Datasets ####
# One shared ResNet-50 encoder, switched to inference mode, feeds all three splits.
resnet = EncoderCNN(resnet_arch='resnet50')
resnet.eval()
train_dataset, val_dataset, test_dataset = [
    FeatureCachedNewsContextDataset(split_items, dataset_dir, split_name, resnet)
    for split_items, split_name in (
        (data_items_train, 'train'),
        (data_items_val, 'val'),
        (data_items_test, 'test'),
    )
]
print(f"train_dataset total samples: {len(train_dataset)}")
print(f"val_dataset total samples: {len(val_dataset)}")
print(f"test_dataset total samples: {len(test_dataset)}")

# Show the first test sample as a sanity check
for step, batch in enumerate(test_dataset, start=1):
    print(batch)
    break

def _collate_context_features(batch):
    """Pad and batch the label-independent parts of a list of dataset items.

    Args:
        batch: list of ``(sample, n_captions, n_images)`` tuples as returned by
            the dataset's ``__getitem__``.

    Returns:
        ``(samples, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch)``
        where ``caps_batch`` is a list of caption lists padded with ``""`` to
        the batch maximum, ``imgs_feature_batch`` is a float32 tensor
        ``[batch, max_imgs, ...feature shape]`` padded with zeros,
        ``qCap_batch`` is the list of query captions and
        ``qImg_feature_batch`` is a float32 tensor ``[batch, ...feature shape]``.
    """
    samples = [item[0] for item in batch]
    max_caps = max(item[1] for item in batch)
    max_imgs = max(item[2] for item in batch)

    caps_batch = [
        sample['caption'] + [""] * (max_caps - len(sample['caption']))
        for sample in samples
    ]
    qCap_batch = [sample['qCap'] for sample in samples]

    query_features = np.stack(
        [np.asarray(sample['qImg_feature'], dtype='float32') for sample in samples], axis=0)

    # Shape of one evidence feature; when the whole batch has no evidence image
    # fall back to the query feature shape so padding still has a template.
    feature_shape = next(
        (np.shape(feat) for sample in samples for feat in sample['imgs_features']),
        query_features.shape[1:],
    )

    # Pad in NumPy and convert once, instead of building one tensor per image.
    evidence = np.zeros((len(samples), max_imgs) + tuple(feature_shape), dtype='float32')
    for row, sample in enumerate(samples):
        feats = sample['imgs_features']
        if len(feats) > 0:
            evidence[row, :len(feats)] = np.stack(feats, axis=0)

    # NOTE: padded captions ("") and zero image rows are not marked by any mask;
    # the consumer sees them as regular entries.
    return (samples, caps_batch, paddle.to_tensor(evidence),
            qCap_batch, paddle.to_tensor(query_features))


def collate_context_cached_train(batch):
    """Collate labelled items (train / val).

    Returns:
        ``(labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch)``
        with ``labels`` an int64 tensor of shape ``[batch]``.
    """
    samples, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = \
        _collate_context_features(batch)
    labels = paddle.to_tensor([int(sample['label']) for sample in samples], dtype='int64')
    return labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch


def collate_context_cached_test(batch):
    """Collate unlabelled items (test).

    Returns:
        ``(caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch)``.
    """
    _, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = \
        _collate_context_features(batch)
    return caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch



# load DataLoader
from paddle.io import DataLoader

# Every loader returns its batch as a list; only the split-specific settings differ.
_make_loader = partial(DataLoader, return_list=True)

train_dataloader = _make_loader(
    train_dataset, batch_size=2, shuffle=True,
    collate_fn=collate_context_cached_train, num_workers=4)
val_dataloader = _make_loader(
    val_dataset, batch_size=2, shuffle=False,
    collate_fn=collate_context_cached_train, num_workers=4)
test_dataloader = _make_loader(
    test_dataset, batch_size=1, shuffle=False,
    collate_fn=collate_context_cached_test, num_workers=2)

# Show the first training batch as a sanity check
for step, batch in enumerate(train_dataloader, start=1):
    print(batch)
    break

#模型构建
from paddle.vision import models
import paddle
from paddlenlp.transformers import ErnieMModel,ErnieMTokenizer
from paddle.nn import functional as F
from paddle import nn
import matplotlib.pyplot as plt
import numpy as np
class EncoderCNN(nn.Layer):
    """Pretrained ResNet trunk used as an image feature extractor.

    This definition replaces the ``EncoderCNN`` declared earlier in the
    notebook, so it accepts the same architectures and rejects unknown ones in
    the same way.
    """

    def __init__(self, resnet_arch = 'resnet101'):
        """Build the trunk for ``'resnet101'`` or ``'resnet50'``.

        Raises:
            ValueError: for any other ``resnet_arch``. Without this check an
                unknown name would fail later with an unbound-variable error.
        """
        super(EncoderCNN, self).__init__()
        if resnet_arch == 'resnet101':
            resnet = models.resnet101(pretrained=True)
        elif resnet_arch == 'resnet50':
            resnet = models.resnet50(pretrained=True)
        else:
            raise ValueError(f"Unsupported ResNet arch: {resnet_arch}")
        # Drop the last two child layers of the ResNet and keep our own 1x1 pool.
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        self.adaptive_pool = nn.AdaptiveAvgPool2D((1, 1))

    def forward(self, images, features='pool'):
        """Return pooled vectors when ``features == 'pool'``, else the raw trunk output."""
        out = self.resnet(images)
        if features == 'pool':
            out = self.adaptive_pool(out)
            # Drop the two trailing 1x1 spatial axes.
            out = paddle.reshape(out, (out.shape[0],out.shape[1]))
        return out


class NetWork(nn.Layer):
    """Three-way classifier over a query caption/image and its retrieved evidence.

    In ``'text'`` mode only the query caption is used. In any other mode the
    query caption, the evidence captions, the query image feature and the
    evidence image features are fused and classified.
    """
    def __init__(self, mode):
        """Create the ERNIE-M encoder, its tokenizer, both attention layers and the head.

        Args:
            mode: ``'text'`` builds a single 768 -> 3 linear head; any other
                value builds the two-layer fusion head.
        """
        super(NetWork, self).__init__()
        self.mode = mode
        self.ernie = ErnieMModel.from_pretrained('ernie-m-base')
        self.tokenizer = ErnieMTokenizer.from_pretrained('ernie-m-base')
        # Both attention layers are created regardless of mode.
        self.attention_text = nn.MultiHeadAttention(embed_dim=768, num_heads=16)
        self.attention_image = nn.MultiHeadAttention(embed_dim=2048, num_heads=16)

        if self.mode == 'text':
            self.classifier = nn.Linear(768, 3)
        else:
            # Input is the concatenation of two 768-d text vectors and two 2048-d image vectors.
            self.classifier1 = nn.Linear(2 * (768 + 2048), 1024)
            self.classifier2 = nn.Linear(1024, 3)

    def forward(self, qCap, qImg_feature, caps, imgs_features):
        """Compute class logits for a batch.

        Args:
            qCap: query captions (tokenized here, padded/truncated to 128 tokens).
            qImg_feature: query image features; assumed [B, 2048] - TODO confirm against caller.
            caps: one list of evidence captions per batch element.
            imgs_features: evidence image features; assumed [B, N, 2048] - TODO confirm.

        Returns:
            Logits with 3 classes per batch element.
        """
        # Encode the query caption; only the first ERNIE output is used.
        encode_dict_qcap = self.tokenizer(text=qCap, max_length=128, truncation=True, padding='max_length')
        input_ids_qcap = paddle.to_tensor(encode_dict_qcap['input_ids'])
        qcap_feature, _ = self.ernie(input_ids_qcap)

        if self.mode == 'text':
            # Text-only mode: classify from position 0 of the query caption encoding.
            logits = self.classifier(qcap_feature[:, 0, :])
            return logits

        # Encode the evidence captions, one ERNIE pass per batch element.
        caps_feature = []
        for caption_list in caps:
            encode_dict_cap = self.tokenizer(text=caption_list, max_length=128, truncation=True, padding='max_length')
            input_ids_caps = paddle.to_tensor(encode_dict_cap['input_ids'])
            cap_feature, _ = self.ernie(input_ids_caps)
            cap_feature = cap_feature.mean(axis=1)  # averages axis 1 (presumably the token axis): one vector per caption
            caps_feature.append(cap_feature)
        # Stacking requires every element to hold the same number of captions.
        caps_feature = paddle.stack(caps_feature, axis=0)

        # Query-caption encoding attends over the per-caption vectors.
        # NOTE(review): no attention mask is passed, so any padding captions take part too.
        caps_feature = self.attention_text(qcap_feature, caps_feature, caps_feature)

        # Query image feature attends over the evidence image features
        # (imgs_features arrives already batched; no stacking is done here).
        qImg_feature = qImg_feature.unsqueeze(1)  # [B, 1, 2048]
        # NOTE(review): no attention mask here either; zero-padded image rows are attended.
        imgs_features = self.attention_image(qImg_feature, imgs_features, imgs_features)

        # Concatenate position 0 of both text tensors with both image vectors, then classify.
        feature = paddle.concat(
            [qcap_feature[:, 0, :], caps_feature[:, 0, :], qImg_feature.squeeze(1), imgs_features.squeeze(1)],
            axis=-1)
        # The two linear layers are applied back to back with no activation in between.
        logits = self.classifier1(feature)
        logits = self.classifier2(logits)
        return logits

@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Evaluate ``model`` on ``data_loader`` and print loss, accuracy and macro F1.

    The model is put in eval mode for the pass and switched back to train mode
    afterwards; ``metric`` is reset both before and after.

    Returns:
        ``(mean batch loss, accuracy, macro F1)``.
    """
    model.eval()
    metric.reset()
    batch_losses, predictions, targets = [], [], []

    for labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch in data_loader:
        logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                       caps=caps_batch, imgs_features=imgs_feature_batch)
        batch_losses.append(criterion(logits, labels).numpy())

        probabilities = F.softmax(logits, axis=-1)
        predictions += paddle.argmax(probabilities, axis=1).numpy().tolist()
        targets += labels.numpy().tolist()

        metric.update(metric.compute(logits, labels))

    acc = metric.accumulate()
    f1 = f1_score(targets, predictions, average='macro')  # 'weighted' is the alternative
    mean_loss = np.mean(batch_losses)
    print(f"Eval loss: {mean_loss:.5f}, acc: {acc:.5f}, f1: {f1:.5f}")
    # Hand the model and the shared metric back ready for training.
    model.train()
    metric.reset()
    return mean_loss, acc, f1


# Instantiate the model (any mode other than "text" enables the multimodal head)
model = NetWork("image")

# Loss scaler for mixed-precision (AMP) training
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

# Schedule length
epochs = 10
num_training_steps = epochs * len(train_dataloader)
print(f"Total steps: {num_training_steps}")

# Cosine learning-rate decay over the whole run
base_lr = 5e-5
lr_scheduler = CosineAnnealingDecay(learning_rate=base_lr, T_max=num_training_steps)

# Output folders
save_dir = "checkpoint/"
best_dir = "best_model"
for _out_dir in (save_dir, best_dir):
    os.makedirs(_out_dir, exist_ok=True)

# Weight decay applies to every parameter whose name contains neither "bias" nor "norm"
decay_params = []
for _param_key, _param in model.named_parameters():
    if "bias" in _param_key or "norm" in _param_key:
        continue
    decay_params.append(_param.name)


def _uses_weight_decay(param_name):
    """Tell AdamW whether the parameter called ``param_name`` is decayed."""
    return param_name in decay_params


# Optimizer with global-norm gradient clipping
optimizer = AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=1.2e-4,
    grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
    apply_decay_param_fun=_uses_weight_decay
)

# Loss and metric
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# Run on the GPU when Paddle was built with CUDA, otherwise on the CPU
paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')

# Training loop
def do_train(model, criterion, metric, val_dataloader, train_dataloader, optimizer, lr_scheduler,
             save_dir="checkpoint", best_dir="best_model", epochs=10, scaler=None):
    """Train with AMP, evaluate after every epoch and keep the best-F1 weights.

    Args:
        model: network called as ``model(qCap=, qImg_feature=, caps=, imgs_features=)``.
        criterion: loss called as ``criterion(logits, labels)``.
        metric: accuracy metric shared with ``evaluate``.
        val_dataloader: loader passed to ``evaluate`` after each epoch.
        train_dataloader: loader yielding
            ``(labels, caps, imgs_features, qCap, qImg_feature)`` batches.
        optimizer: optimizer stepped through the gradient scaler.
        lr_scheduler: scheduler stepped once per training batch.
        save_dir: created if missing; no checkpoint is written there by this function.
        best_dir: directory receiving ``model_bestpro.pdparams``.
        epochs: number of passes over ``train_dataloader``.
        scaler: ``paddle.amp.GradScaler`` to use. When omitted a new one with
            ``init_loss_scaling=1024`` is created, so the function does not
            depend on a module-level ``scaler`` variable.

    Returns:
        The best validation macro F1 seen (0.0 if no epoch improved on it).
    """
    if scaler is None:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    global_step = 0
    best_f1 = 0.0
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(best_dir, exist_ok=True)

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")
        model.train()
        # Start each epoch's running accuracy from zero, whatever state the
        # shared metric was left in by the last evaluation.
        metric.reset()
        train_loader_progress = tqdm(train_dataloader, desc=f"Training Epoch {epoch}", leave=True)

        for step, batch in enumerate(train_loader_progress, start=1):
            labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = batch

            with paddle.amp.auto_cast():  # mixed-precision forward pass
                logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                               caps=caps_batch, imgs_features=imgs_feature_batch)

                loss = criterion(logits, labels)

            # Running training accuracy
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1

            # Scaled backward pass, optimizer step, scale update
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.clear_grad()

            lr_scheduler.step()

            train_loader_progress.set_postfix({
                "loss": f"{loss.item():.4f}",
                "acc": f"{acc:.4f}",
                "step": global_step
            })

        # Evaluate once the epoch's progress bar has finished, so the evaluation
        # output no longer interleaves with the bar.
        eval_loss, eval_acc, eval_f1 = evaluate(model, criterion, metric, val_dataloader)

        if eval_f1 > best_f1:
            best_f1 = eval_f1
            best_model_path = os.path.join(best_dir, 'model_bestpro.pdparams')
            paddle.save(model.state_dict(), best_model_path)
            print(f"[BEST] Step {global_step} | F1: {eval_f1:.4f} (updated)")

    return best_f1

# Launch training
do_train(
    model=model,
    criterion=criterion,
    metric=metric,
    val_dataloader=val_dataloader,
    train_dataloader=train_dataloader,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    save_dir="checkpoint",
    best_dir="best_model",
    epochs=10,
)

# Load the best checkpoint and predict on the test split
import os
import paddle

params_path = os.path.join("best_model", "model_bestpro.pdparams")
if os.path.exists(params_path):
    model.set_dict(paddle.load(params_path))
    print("Loaded best model.")
else:
    # Make it visible that predictions come from the in-memory weights instead.
    print("Best model file not found!")

model.eval()
results = []
# Inference only: skip autograd bookkeeping.
with paddle.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting"):
        caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = batch
        logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                       caps=caps_batch, imgs_features=imgs_feature_batch)
        preds = paddle.argmax(F.softmax(logits, axis=-1), axis=1).numpy()
        results.extend(preds.tolist())

# Save predictions.
# NOTE(review): "id" is the position inside test_dataset. The dataset drops
# items it could not process, so positions may not match the original test
# item keys - confirm against the expected submission format.
pd.DataFrame({"id": range(len(results)), "label": results}).to_csv("result.csv", index=False)


PLEASE USE OMP_NUM_THREADS WISELY.
  from .autonotebook import tqdm as notebook_tqdm
W0613 03:31:17.285213  9151 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 8.9, Driver API Version: 12.8, Runtime API Version: 11.8
W0613 03:31:17.285980  9151 gpu_resources.cc:164] device: 0, cuDNN Version: 8.9.


[INFO] Loading cached features from cache_features/train_features_cached.pkl
[INFO] Loading cached features from cache_features/val_features_cached.pkl
[INFO] Loading cached features from cache_features/test_features_cached.pkl
train_dataset total samples: 11184
val_dataset total samples: 1309
test_dataset total samples: 1129
({'qImg_feature': array([5.0892222e-01, 1.0130705e-01, 4.1554770e-01, ..., 1.5136348e-01,
       7.5677846e-05, 2.2541411e-01], dtype=float32), 'qCap': '看到有人说 这老头说了句话 不是我退休了 要是没退休 你早就在牢里了 说是某地政法系统的前领导 正局级干部退休的 我想问这种人敢说出这种话 在职间到底', 'imgs_features': [array([0.6166205 , 0.32743877, 0.23644671, ..., 0.6098976 , 0.32837993,
       0.09397861], dtype=float32), array([0.23688133, 0.73640347, 0.12396187, ..., 0.24109195, 0.10670266,
       0.16786775], dtype=float32), array([0.30910894, 0.14095385, 0.31212616, ..., 0.0553519 , 0.00824548,
       0.04955674], dtype=float32)], 'caption': ['Boston Orange  波士頓菊子: 朱学渊  - 為中國史學的實證化而努力', '新华每日电讯-微报纸-2021年11月19日', '新华每日电讯-微报纸-202

[32m[2025-06-13 03:31:21,019] [    INFO][0m - Loading weights file from cache at /root/.paddlenlp/models/ernie-m-base/model_state.pdparams[0m
[32m[2025-06-13 03:31:22,367] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
[32m[2025-06-13 03:31:23,787] [    INFO][0m - All model checkpoint weights were used when initializing ErnieMModel.
[0m
[32m[2025-06-13 03:31:23,789] [    INFO][0m - All the weights of ErnieMModel were initialized from the model checkpoint at ernie-m-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ErnieMModel for predictions without further training.[0m
[32m[2025-06-13 03:31:24,482] [    INFO][0m - tokenizer config file saved in /root/.paddlenlp/models/ernie-m-base/tokenizer_config.json[0m
[32m[2025-06-13 03:31:24,484] [    INFO][0m - Special tokens file saved in /root/.paddlenlp/models/ernie-m-base/special_tokens_map.json[0m


Total steps: 55920

Epoch 1/10


Training Epoch 1: 100%|█████████▉| 5590/5592 [09:30<00:00, 10.38it/s, loss=0.0114, acc=0.6763, step=5592] 

Eval loss: 1.28057, acc: 0.70894, f1: 0.65595


Training Epoch 1: 100%|██████████| 5592/5592 [09:35<00:00,  9.71it/s, loss=0.0114, acc=0.6763, step=5592]


[BEST] Step 5592 | F1: 0.6559 (updated)

Epoch 2/10


Training Epoch 2:  80%|████████  | 4498/5592 [07:29<01:58,  9.26it/s, loss=0.0249, acc=0.7414, step=10090]

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 2: 100%|█████████▉| 5590/5592 [09:34<00:00, 10.57it/s, loss=0.0006, acc=0.7437, step=11184]

Eval loss: 1.16250, acc: 0.74484, f1: 0.71191


Training Epoch 2: 100%|██████████| 5592/5592 [09:39<00:00,  9.65it/s, loss=0.0006, acc=0.7437, step=11184]


[BEST] Step 11184 | F1: 0.7119 (updated)

Epoch 3/10


Training Epoch 3:  16%|█▋        | 912/5592 [01:31<07:25, 10.50it/s, loss=0.0029, acc=0.7862, step=12096]

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 3:  53%|█████▎    | 2985/5592 [04:57<04:00, 10.84it/s, loss=0.0315, acc=0.7769, step=14169] 

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 3:  75%|███████▍  | 4187/5592 [06:56<02:13, 10.55it/s, loss=0.0541, acc=0.7779, step=15372]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 3: 100%|██████████| 5592/5592 [09:32<00:00,  2.61s/it, loss=0.6921, acc=0.7798, step=16776]

Eval loss: 1.07662, acc: 0.73262, f1: 0.69506


Training Epoch 3: 100%|██████████| 5592/5592 [09:32<00:00,  9.77it/s, loss=0.6921, acc=0.7798, step=16776]



Epoch 4/10


Training Epoch 4:  26%|██▋       | 1476/5592 [02:28<06:48, 10.09it/s, loss=0.2578, acc=0.8226, step=18253]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 4:  98%|█████████▊| 5487/5592 [09:09<00:10, 10.47it/s, loss=4.2655, acc=0.8089, step=22264] 

Found inf or nan, current scale is: 32768.0, decrease to: 32768.0*0.5


Training Epoch 4: 100%|█████████▉| 5591/5592 [09:32<00:00, 10.21it/s, loss=0.0122, acc=0.8095, step=22368]

Eval loss: 1.02347, acc: 0.75630, f1: 0.73193


Training Epoch 4: 100%|██████████| 5592/5592 [09:40<00:00,  3.79s/it, loss=0.0122, acc=0.8095, step=22368]

[BEST] Step 22368 | F1: 0.7319 (updated)


Training Epoch 4: 100%|██████████| 5592/5592 [09:41<00:00,  9.62it/s, loss=0.0122, acc=0.8095, step=22368]



Epoch 5/10


Training Epoch 5:  10%|█         | 576/5592 [00:58<08:09, 10.24it/s, loss=0.0235, acc=0.8438, step=22944]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 5:  47%|████▋     | 2628/5592 [04:23<04:45, 10.37it/s, loss=0.0001, acc=0.8400, step=24996]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 5:  84%|████████▍ | 4687/5592 [07:49<01:26, 10.44it/s, loss=0.0475, acc=0.8435, step=27055] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 5: 100%|█████████▉| 5591/5592 [09:31<00:00, 10.06it/s, loss=0.3506, acc=0.8425, step=27960]

Eval loss: 1.21259, acc: 0.75936, f1: 0.74050


Training Epoch 5: 100%|██████████| 5592/5592 [09:40<00:00,  3.86s/it, loss=0.3506, acc=0.8425, step=27960]

[BEST] Step 27960 | F1: 0.7405 (updated)


Training Epoch 5: 100%|██████████| 5592/5592 [09:41<00:00,  9.62it/s, loss=0.3506, acc=0.8425, step=27960]



Epoch 6/10


Training Epoch 6:  20%|█▉        | 1102/5592 [01:50<07:36,  9.84it/s, loss=0.0000, acc=0.8866, step=29062]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 6:  26%|██▌       | 1446/5592 [02:25<06:27, 10.70it/s, loss=0.0001, acc=0.8808, step=29407] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 6:  97%|█████████▋| 5451/5592 [09:05<00:13, 10.39it/s, loss=0.0102, acc=0.8781, step=33412]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 6: 100%|█████████▉| 5591/5592 [09:20<00:00,  9.05it/s, loss=0.0133, acc=0.8785, step=33552]

Eval loss: 1.20564, acc: 0.77540, f1: 0.75805


Training Epoch 6: 100%|██████████| 5592/5592 [09:42<00:00,  6.48s/it, loss=0.0133, acc=0.8785, step=33552]

[BEST] Step 33552 | F1: 0.7580 (updated)


Training Epoch 6: 100%|██████████| 5592/5592 [09:42<00:00,  9.60it/s, loss=0.0133, acc=0.8785, step=33552]



Epoch 7/10


Training Epoch 7:  33%|███▎      | 1866/5592 [03:06<06:45,  9.19it/s, loss=0.0000, acc=0.9097, step=35418]

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 7:  42%|████▏     | 2352/5592 [03:53<05:25,  9.96it/s, loss=0.0000, acc=0.9077, step=35904]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 7: 100%|██████████| 5592/5592 [09:34<00:00,  3.24s/it, loss=3.0182, acc=0.9102, step=39144] 

Eval loss: 1.47559, acc: 0.76776, f1: 0.75210


Training Epoch 7: 100%|██████████| 5592/5592 [09:35<00:00,  9.72it/s, loss=3.0182, acc=0.9102, step=39144]



Epoch 8/10


Training Epoch 8:  14%|█▍        | 791/5592 [01:18<07:42, 10.39it/s, loss=0.0000, acc=0.9254, step=39935] 

Found inf or nan, current scale is: 16384.0, decrease to: 16384.0*0.5


Training Epoch 8:  18%|█▊        | 983/5592 [01:38<07:46,  9.89it/s, loss=0.0000, acc=0.9273, step=40127]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 8:  63%|██████▎   | 3499/5592 [05:49<03:21, 10.39it/s, loss=0.0000, acc=0.9320, step=42643] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 8: 100%|█████████▉| 5578/5592 [09:19<00:01, 10.61it/s, loss=0.0041, acc=0.9336, step=44722] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 8: 100%|██████████| 5592/5592 [09:38<00:00,  4.30s/it, loss=0.0000, acc=0.9336, step=44736]

Eval loss: 1.60233, acc: 0.77158, f1: 0.75148


Training Epoch 8: 100%|██████████| 5592/5592 [09:38<00:00,  9.66it/s, loss=0.0000, acc=0.9336, step=44736]



Epoch 9/10


Training Epoch 9:  39%|███▉      | 2189/5592 [03:40<05:22, 10.54it/s, loss=0.0000, acc=0.9559, step=46926]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 9:  82%|████████▏ | 4598/5592 [07:42<01:36, 10.35it/s, loss=1.3808, acc=0.9530, step=49335] 

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 9: 100%|█████████▉| 5591/5592 [09:32<00:00, 10.57it/s, loss=5.7431, acc=0.9534, step=50328] 

Eval loss: 1.61498, acc: 0.77998, f1: 0.75909


Training Epoch 9: 100%|██████████| 5592/5592 [09:42<00:00,  3.81s/it, loss=5.7431, acc=0.9534, step=50328]

[BEST] Step 50328 | F1: 0.7591 (updated)


Training Epoch 9: 100%|██████████| 5592/5592 [09:43<00:00,  9.59it/s, loss=5.7431, acc=0.9534, step=50328]



Epoch 10/10


Training Epoch 10:  24%|██▍       | 1330/5592 [02:12<06:49, 10.42it/s, loss=0.0051, acc=0.9617, step=51659]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 10:  60%|██████    | 3376/5592 [05:39<03:38, 10.16it/s, loss=0.0000, acc=0.9606, step=53704]

Found inf or nan, current scale is: 8192.0, decrease to: 8192.0*0.5


Training Epoch 10:  90%|█████████ | 5060/5592 [08:26<00:51, 10.24it/s, loss=0.0000, acc=0.9603, step=55388] 

Found inf or nan, current scale is: 4096.0, decrease to: 4096.0*0.5


Training Epoch 10: 100%|█████████▉| 5591/5592 [09:29<00:00, 10.06it/s, loss=0.0000, acc=0.9599, step=55920] 

Eval loss: 1.63478, acc: 0.77846, f1: 0.75975


Training Epoch 10: 100%|██████████| 5592/5592 [09:39<00:00,  3.80s/it, loss=0.0000, acc=0.9599, step=55920]

[BEST] Step 55920 | F1: 0.7597 (updated)


Training Epoch 10: 100%|██████████| 5592/5592 [09:40<00:00,  9.63it/s, loss=0.0000, acc=0.9599, step=55920]


Loaded best model.


Predicting: 100%|██████████| 1129/1129 [00:25<00:00, 44.17it/s]


In [2]:
import os
import numpy as np
import paddle
from paddle.nn import functional as F
from tqdm import tqdm
import gc
from sklearn.metrics import f1_score

# Validation pass (reports macro F1)
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """Evaluate ``model`` on ``data_loader`` and print loss, accuracy and macro F1.

    This redefines the ``evaluate`` declared earlier in the notebook, which the
    training loop also calls. To be safe in both roles it restores the model's
    previous train/eval mode and leaves ``metric`` reset when it returns.

    Returns:
        ``(mean batch loss, accuracy, macro F1)``.
    """
    was_training = model.training
    model.eval()
    metric.reset()
    losses = []
    all_preds = []
    all_labels = []

    for batch in tqdm(data_loader, desc="Evaluating"):
        labels, caps_batch, imgs_feature_batch, qCap_batch, qImg_feature_batch = batch
        logits = model(qCap=qCap_batch, qImg_feature=qImg_feature_batch,
                       caps=caps_batch, imgs_features=imgs_feature_batch)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())

        preds = paddle.argmax(F.softmax(logits, axis=-1), axis=1)
        all_preds.extend(preds.numpy().tolist())
        all_labels.extend(labels.numpy().tolist())

        correct = metric.compute(logits, labels)
        metric.update(correct)

    # One cleanup after the pass: collecting garbage and emptying the allocator
    # cache on every batch only slows the loop down.
    gc.collect()
    paddle.device.cuda.empty_cache()

    acc = metric.accumulate()
    f1 = f1_score(all_labels, all_preds, average='macro')
    mean_loss = np.mean(losses)
    print("Eval Loss: {:.5f}, Accuracy: {:.5f}, F1 Score (macro): {:.5f}".format(mean_loss, acc, f1))

    metric.reset()
    if was_training:
        model.train()
    return mean_loss, acc, f1

# Restore the best checkpoint, if one was saved
params_path = os.path.join("best_model", "model_bestpro.pdparams")
if not os.path.exists(params_path):
    print("Best model file not found!")
else:
    model.set_dict(paddle.load(params_path))
    print("Loaded best model.")

# Fresh loss function and metric for this evaluation
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# Run the evaluation on the validation loader
eval_loss, eval_acc, eval_f1 = evaluate(
    model=model, criterion=criterion, metric=metric, data_loader=val_dataloader)


Loaded best model.


Evaluating: 100%|██████████| 655/655 [01:34<00:00,  6.93it/s]

Eval Loss: 1.63478, Accuracy: 0.77846, F1 Score (macro): 0.75975



