## 任务 A：文本-图片标注任务

### A.1 数据预处理

In [10]:
import os
import fitz  # PyMuPDF
from PIL import Image
from tqdm import tqdm
import json

def extract_text_image_pairs(pdf_folder, output_image_folder, output_json="text_image_mapping.json"):
    os.makedirs(output_image_folder, exist_ok=True)
    result = []
    idx = 0

    for file_name in tqdm(os.listdir(pdf_folder)):
        if not file_name.endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder, file_name)
        doc = fitz.open(pdf_path)
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            text = page.get_text().strip()
            if not text:
                continue  # 跳过没有文本的页面

            # 将整个页面截图保存为图像
            pix = page.get_pixmap(dpi=200)
            image_path = os.path.join(output_image_folder, f"{os.path.splitext(file_name)[0]}_page_{page_number + 1}.png")
            pix.save(image_path)

            result.append({
                "text": text,
                "image_path": image_path
            })
            idx += 1
            if page_number == 3: # 为节省时间，只取前 n 张进行训练
                break

    # 保存 JSON 映射
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    
    print(f"共提取 {idx} 个文本-图像对，已保存至 {output_json}")
    return result

extract_text_image_pairs(
    pdf_folder="ori_data/The_combination_of_images_and_text",
    output_image_folder="mid_data/extracted_images",
    output_json="mid_data/text_image_mapping.json"
)


100%|██████████| 20/20 [00:21<00:00,  1.05s/it]

共提取 70 个文本-图像对，已保存至 mid_data/text_image_mapping.json





[{'text': '2 0 2 2\nL I V I N G  P R O D U C T S',
  'image_path': 'mid_data/extracted_images/Living_Products_2022_page_1.png'},
 {'text': 'Living Products\n2022\ncassina.com',
  'image_path': 'mid_data/extracted_images/Living_Products_2022_page_2.png'},
 {'text': '3 Fauteuil Grand Confort, \ngrand modèle, trois places, \ndurable\nLe Corbusier, P. Jeanneret, \nC. Perriand\n3 Fauteuil Grand Confort, \nméridienne, durable\nLe Corbusier, P. Jeanneret, \nC. Perriand\n4 Chaise longue à réglage \ncontinu\nLe Corbusier, P. Jeanneret, \nC. Perriand\n4 Chaise longue à réglage \ncontinu, noire\nLe Corbusier, P. Jeanneret, \nC. Perriand\nLady\nMarco Zanuso\nOmbra\nCharlotte Perriand\nP22\nPatrick Norguet\nSoriana\nAfra & Tobia Scarpa\nSoriana\nAfra & Tobia Scarpa\nTokyo Chaise Longue\nCharlotte Perriand\nTokyo Dormeuse\nCharlotte Perriand\nTre pezzi\nFranco Albini\nUtrecht\nGerrit Thomas Rietveld\nUtrecht Baby/XL\nGerrit Thomas Rietveld\nWink\nToshiyuki Kita\nWoodline\nMarco Zanuso\nCapitol Compl

In [31]:
import json
import pickle

def build_hashed_mapping(json_path, output_pickle_path="hashed_mapping.pkl", key="image_path"):
    """
    从 JSON 列表创建哈希表，并保存为 pickle
    :param json_path: 原始 JSON 路径（列表格式）
    :param output_pickle_path: 输出的 pickle 路径
    :param key: 使用哪个字段作为哈希表键（如 image_path 或 text）
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # 构建哈希字典：key => entire entry
    hashed_data = {entry[key]: entry for entry in data}

    # 保存为 pickle
    with open(output_pickle_path, "wb") as f:
        pickle.dump(hashed_data, f)

    print(f"✅ 已保存哈希结构数据，共计 {len(hashed_data)} 项，路径：{output_pickle_path}")
    return hashed_data
build_hashed_mapping(
    json_path="mid_data/text_image_mapping.json",
    output_pickle_path="mid_data/image_path_hash.pkl",
    key="image_path"
)

✅ 已保存哈希结构数据，共计 70 项，路径：mid_data/image_path_hash.pkl


{'mid_data/extracted_images/Living_Products_2022_page_1.png': {'text': '2 0 2 2\nL I V I N G  P R O D U C T S',
  'image_path': 'mid_data/extracted_images/Living_Products_2022_page_1.png'},
 'mid_data/extracted_images/Living_Products_2022_page_2.png': {'text': 'Living Products\n2022\ncassina.com',
  'image_path': 'mid_data/extracted_images/Living_Products_2022_page_2.png'},
 'mid_data/extracted_images/Living_Products_2022_page_3.png': {'text': '3 Fauteuil Grand Confort, \ngrand modèle, trois places, \ndurable\nLe Corbusier, P. Jeanneret, \nC. Perriand\n3 Fauteuil Grand Confort, \nméridienne, durable\nLe Corbusier, P. Jeanneret, \nC. Perriand\n4 Chaise longue à réglage \ncontinu\nLe Corbusier, P. Jeanneret, \nC. Perriand\n4 Chaise longue à réglage \ncontinu, noire\nLe Corbusier, P. Jeanneret, \nC. Perriand\nLady\nMarco Zanuso\nOmbra\nCharlotte Perriand\nP22\nPatrick Norguet\nSoriana\nAfra & Tobia Scarpa\nSoriana\nAfra & Tobia Scarpa\nTokyo Chaise Longue\nCharlotte Perriand\nTokyo Dormeu

In [33]:
import json
import pickle
import time
import random

# 加载原始 JSON 列表
with open("mid_data/text_image_mapping.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# 加载哈希表（以 image_path 为 key）
with open("mid_data/image_path_hash.pkl", "rb") as f:
    hash_data = pickle.load(f)

# 获取所有 image_path 用于采样
all_image_paths = [entry["image_path"] for entry in json_data]
sample_paths = random.sample(all_image_paths, min(100, len(all_image_paths)))  # 最多采样100个

# -----------------------
# 1️⃣ 列表查找测试
# -----------------------
json_start = time.time()
for path in sample_paths:
    _ = next((entry for entry in json_data if entry["image_path"] == path), None)
json_end = time.time()
json_total_time = json_end - json_start
json_avg_time = json_total_time / len(sample_paths)

# -----------------------
# 2️⃣ 哈希查找测试
# -----------------------
hash_start = time.time()
for path in sample_paths:
    _ = hash_data.get(path, None)
hash_end = time.time()
hash_total_time = hash_end - hash_start
hash_avg_time = hash_total_time / len(sample_paths)

# -----------------------
# ✅ 结果展示
# -----------------------
print(f"查找轮数：{len(sample_paths)}")
print(f"🔍 JSON 列表查找总耗时：{json_total_time:.6f} 秒，平均每次：{json_avg_time:.8f} 秒")
print(f"⚡ 哈希表查找总耗时：{hash_total_time:.6f} 秒，平均每次：{hash_avg_time:.8f} 秒")
print(f"🚀 哈希查找加速比：约 {json_avg_time / hash_avg_time:.1f} 倍")


查找轮数：70
🔍 JSON 列表查找总耗时：0.000800 秒，平均每次：0.00001143 秒
⚡ 哈希表查找总耗时：0.000187 秒，平均每次：0.00000267 秒
🚀 哈希查找加速比：约 4.3 倍


### A.2 Clip模型

In [12]:
import os
import json
import torch
import clip
from PIL import Image
from tqdm import tqdm

# 设置设备
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载模型
model, preprocess = clip.load("ViT-B/32", device=device)

# 加载图文对
with open("mid_data/text_image_mapping.json", "r", encoding="utf-8") as f:
    data_pairs = json.load(f)

# 结果保存
text_features_list = []
image_features_list = []
pair_ids = []

for i, item in enumerate(tqdm(data_pairs)):
    try:
        text = clip.tokenize(item["text"]).to(device)
        image = preprocess(Image.open(item["image_path"]).convert("RGB")).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text)

        # 归一化向量
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        image_features_list.append(image_features.cpu())
        text_features_list.append(text_features.cpu())
        pair_ids.append(i)

    except Exception as e:
        print(f"跳过第 {i} 个 pair")

# 拼接保存
image_tensor = torch.cat(image_features_list, dim=0)
text_tensor = torch.cat(text_features_list, dim=0)
torch.save({
    "image_features": image_tensor,
    "text_features": text_tensor,
    "pair_ids": pair_ids
}, "mid_data/clip_features.pt")

print("图文向量已提取并保存至 mid_data/clip_features.pt")


  9%|▊         | 6/70 [00:00<00:02, 21.36it/s]

跳过第 2 个 pair
跳过第 3 个 pair


 21%|██▏       | 15/70 [00:00<00:03, 16.74it/s]

跳过第 12 个 pair
跳过第 13 个 pair
跳过第 16 个 pair
跳过第 17 个 pair


 34%|███▍      | 24/70 [00:01<00:03, 14.84it/s]

跳过第 21 个 pair
跳过第 24 个 pair


 44%|████▍     | 31/70 [00:01<00:02, 17.74it/s]

跳过第 27 个 pair
跳过第 28 个 pair
跳过第 31 个 pair


 60%|██████    | 42/70 [00:02<00:01, 17.86it/s]

跳过第 37 个 pair
跳过第 38 个 pair
跳过第 41 个 pair


 69%|██████▊   | 48/70 [00:02<00:01, 18.05it/s]

跳过第 44 个 pair
跳过第 45 个 pair


 77%|███████▋  | 54/70 [00:03<00:01, 14.35it/s]

跳过第 51 个 pair
跳过第 54 个 pair
跳过第 55 个 pair


 86%|████████▌ | 60/70 [00:03<00:00, 18.90it/s]

跳过第 58 个 pair
跳过第 59 个 pair


 94%|█████████▍| 66/70 [00:04<00:00, 15.01it/s]

跳过第 63 个 pair
跳过第 66 个 pair


100%|██████████| 70/70 [00:04<00:00, 15.82it/s]

图文向量已提取并保存至 mid_data/clip_features.pt





### A.3 EfficientNet-B0 + TF-IDF

In [17]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# -------------------------------
# 1. 加载图文映射数据，并构建 TF‑IDF 向量器
# -------------------------------
mapping_path = "mid_data/text_image_mapping.json"
with open(mapping_path, "r", encoding="utf-8") as f:
    data_pairs = json.load(f)

# 提取所有文本，训练 TF-IDF
all_texts = [item["text"] for item in data_pairs]
vectorizer = TfidfVectorizer(max_features=2048)  # 限制向量维度
tfidf_matrix = vectorizer.fit_transform(all_texts).toarray()
# 用于 Dataset
for i, item in enumerate(data_pairs):
    item["tfidf_vector"] = tfidf_matrix[i]

# -------------------------------
# 2. 自定义 Dataset
# -------------------------------
class ImageTextTFIDFDataset(Dataset):
    def __init__(self, data_pairs, transform=None):
        self.data = data_pairs
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # 加载图片
        img = Image.open(item["image_path"]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        # 文本向量
        tfidf_vec = torch.tensor(item["tfidf_vector"], dtype=torch.float32)
        return img, tfidf_vec

# -------------------------------
# 3. 定义图像特征提取器（EfficientNet-B0） + 融合模型
# -------------------------------
class CrossModalRegressor(nn.Module):
    def __init__(self, tfidf_dim):
        super().__init__()
        # 载入预训练 EfficientNet-B0，去掉最后的分类头
        effnet = models.efficientnet_b0(pretrained=True)
        self.cnn = nn.Sequential(
            effnet.features,
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),           # 输出维度：1280
        )
        # 从 1280 维映射到隐藏，再到 TF‑IDF 维
        self.head = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, tfidf_dim)
        )

    def forward(self, x):
        feat = self.cnn(x)
        out = self.head(feat)
        return out

# -------------------------------
# 4. 训练准备
# -------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 图像预处理
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(  # EfficientNet 推荐的 ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    )
])

# Dataset & DataLoader
dataset = ImageTextTFIDFDataset(data_pairs, transform=transform)
loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)

# 模型、损失、优化器
model = CrossModalRegressor(tfidf_dim=tfidf_matrix.shape[1]).to(device)
criterion = nn.MSELoss()  # 回归预测 TF‑IDF 向量
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# -------------------------------
# 5. 训练循环
# -------------------------------
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for images, tfidf_vecs in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images = images.to(device)
        tfidf_vecs = tfidf_vecs.to(device)

        optimizer.zero_grad()
        preds = model(images)
        loss = criterion(preds, tfidf_vecs)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * images.size(0)

    avg_loss = total_loss / len(dataset)
    print(f"[Epoch {epoch+1}] Avg Loss: {avg_loss:.4f}")

# -------------------------------
# 6. 保存模型
# -------------------------------
os.makedirs("mid_data", exist_ok=True)
torch.save({
    "model_state": model.state_dict(),
    "vectorizer": vectorizer
}, "mid_data/cross_modal_effnet_tfidf.pth")
print("训练完成，模型已保存至 mid_data/cross_modal_effnet_tfidf.pth")


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /home/fintuser/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:04<00:00, 4.45MB/s]
Epoch 1/10: 100%|██████████| 5/5 [00:02<00:00,  1.97it/s]


[Epoch 1] Avg Loss: 0.0056


Epoch 2/10: 100%|██████████| 5/5 [00:02<00:00,  2.03it/s]


[Epoch 2] Avg Loss: 0.0025


Epoch 3/10: 100%|██████████| 5/5 [00:02<00:00,  1.94it/s]


[Epoch 3] Avg Loss: 0.0016


Epoch 4/10: 100%|██████████| 5/5 [00:02<00:00,  2.12it/s]


[Epoch 4] Avg Loss: 0.0012


Epoch 5/10: 100%|██████████| 5/5 [00:02<00:00,  2.33it/s]


[Epoch 5] Avg Loss: 0.0011


Epoch 6/10: 100%|██████████| 5/5 [00:02<00:00,  2.12it/s]


[Epoch 6] Avg Loss: 0.0010


Epoch 7/10: 100%|██████████| 5/5 [00:02<00:00,  2.33it/s]


[Epoch 7] Avg Loss: 0.0009


Epoch 8/10: 100%|██████████| 5/5 [00:01<00:00,  2.55it/s]


[Epoch 8] Avg Loss: 0.0009


Epoch 9/10: 100%|██████████| 5/5 [00:02<00:00,  2.03it/s]


[Epoch 9] Avg Loss: 0.0008


Epoch 10/10: 100%|██████████| 5/5 [00:02<00:00,  2.31it/s]

[Epoch 10] Avg Loss: 0.0008
训练完成，模型已保存至 mid_data/cross_modal_effnet_tfidf.pth





## 任务 B： 本地搜索引擎

In [21]:
import random
import json
from PIL import Image
import clip
import torch

# 路径配置
mapping_json = "mid_data/text_image_mapping.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载CLIP模型
model, preprocess = clip.load("ViT-B/32", device=device)

# 加载图文对映射
with open(mapping_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# 随机选择3个图像
samples = random.sample(data, k=3)

# 显示图像并输出文本
for idx, item in enumerate(samples, 1):
    image_path = item["image_path"]
    text = item["text"]
    
    print(f"🔹 图片 {idx}: {image_path}")
    print(f"📝 对应文本: {text}\n")

    try:
        img = Image.open(image_path).convert("RGB")
        img.show(title=f"Image {idx}")
    except Exception as e:
        print(f"❌ 无法打开图片: {e}")


🔹 图片 1: mid_data/extracted_images/Ghost-wall-mikal-harrsen_page_1.png
📝 对应文本: L I V I N G  C O L L E C T I O N
G h o s t  W a l l
2 0 2 3
Design Mikal Harrsen

🔹 图片 2: mid_data/extracted_images/Cassina-LB-DETAILS_page_3.png
📝 对应文本: D E TA I L S  C O L L E C T I O N

🔹 图片 3: mid_data/extracted_images/08 - OUTDOOR COLLECTION 2024_page_2.png
📝 对应文本: 3



In [28]:
import os
import json
import torch
import clip
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 参数
image_folder = "mid_data/extracted_images"
mapping_json = "mid_data/text_image_mapping.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. 加载模型
model, preprocess = clip.load("ViT-B/32", device=device)

# 2. 加载图像数据和路径
with open(mapping_json, "r", encoding="utf-8") as f:
    pairs = json.load(f)

image_paths = [item["image_path"] for item in pairs]
images = []
for p in tqdm(image_paths, desc="预处理图像"):
    try:
        img = Image.open(p).convert("RGB")
        img = preprocess(img)
        images.append(img)
    except:
        print(f"跳过损坏图像: {p}")

# 3. 批量嵌入图像向量
image_input = torch.stack(images).to(device)
with torch.no_grad():
    image_features = model.encode_image(image_input)
    image_features /= image_features.norm(dim=-1, keepdim=True)  # 归一化

# 4. 搜索函数
def search_images_by_text(query, top_k=1, threshold=0.1):
    model.eval()
    with torch.no_grad():
        text_tokens = clip.tokenize([query]).to(device)
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # 计算余弦相似度
        sims = (image_features @ text_features.T).squeeze().cpu().numpy()

    top_indices = np.argsort(sims)[::-1][:top_k]
    best_score = sims[top_indices[0]]

    if best_score < threshold:
        print("❌ 无浏览结果（得分低于阈值）")
    else:
        print(f"✅ Top-{top_k} 匹配图像（相似度：{best_score:.4f}）:")
        for i in top_indices:
            print(f" - 相似度: {sims[i]:.4f} -> 图像路径: {image_paths[i]}")
            # 显示图像（可选）
            Image.open(image_paths[i]).show()

# 5. 示例查询
query_text = "L I V I N G  C O L L E C T I O N G h o s t  W a l l 2 0 2 3 Design Mikal Harrsen"
search_images_by_text(query_text)


预处理图像: 100%|██████████| 70/70 [00:06<00:00, 10.64it/s]


✅ Top-1 匹配图像（相似度：0.2786）:
 - 相似度: 0.2786 -> 图像路径: mid_data/extracted_images/Cassina-PRO_2022_page_4.png
