In [2]:
# create_my_faiss_rag.py
import json
import os
import numpy as np
import faiss
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel

# -------------------------------
# 1. 配置参数
# -------------------------------
MODEL_NAME = "./clip_model_cache"  # CLIP 模型，输出 512 维向量
DIMENSION = 512  # 向量维度

# Faiss 索引保存路径
INDEX_DIR = "./multimodal_rag_system_output/data_storage/vector_indices"
os.makedirs(INDEX_DIR, exist_ok=True)

TEXT_INDEX_PATH = os.path.join(INDEX_DIR, "my_phone_text_vector_index.faiss")
IMAGE_INDEX_PATH = os.path.join(INDEX_DIR, "my_phone_image_vector_index.faiss")
MEAN_INDEX_PATH = os.path.join(INDEX_DIR, "my_phone_mean_vector_index.faiss")

# 示例数据：文本-图片对列表
# 请确保图片路径存在！可以替换成你自己的数据
RAG_DATA_FILE = "./RAG_data/mobilePhone.json"  # 可改为你的实际路径
# 从文件读取图文对
data_pairs = []
with open(RAG_DATA_FILE, 'r', encoding='utf-8') as f:
    data_pairs = json.load(f)

# -------------------------------
# 2. 加载 CLIP 模型和处理器
# -------------------------------
print("Loading CLIP model...")
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

# 使用 CPU，如有 GPU 可启用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

Loading CLIP model...


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [5]:
# -------------------------------
# 4. 处理每一对文本-图片，生成向量并添加到索引
# -------------------------------
ids = []  # 存储 ID（可扩展为元数据）
for i, pair in enumerate(data_pairs):
    text = pair["text"]
    img_path = pair["image_path"]

    print(f"\n 原始内容：{i+1}: {text}")

    # --- 文本向量化 ---
    inputs_text = processor(text=text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**inputs_text)
    text_vec = text_features.cpu().numpy().astype('float32')

    print(f"\n 文本的向量化：{i+1}: {text_vec}")

    # --- 图像向量化 ---
    if not os.path.exists(img_path):
        print(f"⚠️  Image not found: {img_path}, skipping...")
        continue

    image = Image.open(img_path).convert("RGB")
    inputs_image = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        image_features = model.get_image_features(**inputs_image)
    image_vec = image_features.cpu().numpy().astype('float32')

    print(f"\n 图片的向量化：{i+1}: {image_vec}")
    # --- 归一化（CLIP 向量通常已归一化，但 Faiss 中常使用内积搜索，这里保持 L2）---
    # 注意：CLIP 输出的是归一化向量，所以 L2 距离 ≈ 2 - 2*cosine，适合语义相似度

    # --- 计算平均向量 ---
    mean_vec = (text_vec + image_vec) / 2
    ids.append(i)
    print(f"✅ Added vector {i} to all indexes.")


 原始内容：1: iPhone 15 支持 USB-C 接口，兼容 USB 2.0 速度。可使用 PD 快充协议，最高支持 27W 快充。搭载 A17 仿生芯片，性能提升 20%，相机系统升级为 4800 万像素主摄，支持 2 倍光学变焦和智能 HDR 5，视频拍摄支持电影效果模式。

 文本的向量化：1: [[-7.10833728e-01  4.13684756e-01 -3.04710835e-01  1.26674533e-01
   3.36590528e-01 -5.39583743e-01 -4.35511351e-01  1.22007966e+00
   3.65388453e-01  4.47284728e-01 -3.31330061e-01  5.49824685e-02
   2.23936632e-01 -3.60731781e-03  2.69934505e-01 -9.09779966e-02
  -2.99257338e-01 -2.34562963e-01  2.42053479e-01 -3.75216514e-01
  -2.31918871e-01  3.30225825e-02  1.27430707e-01 -6.64402172e-02
  -2.64216155e-01  3.32242846e-02 -2.35044658e-02  6.73925161e-01
  -6.42967373e-02  1.16781756e-01  4.78903502e-02 -7.82896280e-02
  -8.93784389e-02  3.13221395e-01 -4.46781516e-02  2.76491463e-01
  -3.13840240e-01 -8.47999752e-02 -4.24524486e-01 -6.06208928e-02
  -4.57979664e-02  3.39888036e-01  3.09578210e-01 -1.87254131e-01
   1.87130645e-03  1.38902724e-01  2.43521407e-02  1.56123042e-01
   8.53850767e-02  2.45557427e-01 -2.90632427e-01  4

In [15]:
print(text_vec)
norm = np.linalg.norm(text_vec)
print(f"Text vector norm: {norm}")  # 应该 ≈ 1.0

[[-1.71712399e-01  1.12379298e-01  6.05907068e-02  1.40583962e-02
   1.07603595e-01 -2.15058088e-01 -2.36935988e-01  1.41358232e+00
   4.65357453e-02  2.63929158e-01 -4.60326374e-01  2.06304103e-01
   1.83927506e-01 -5.36693782e-02  2.37742096e-01 -2.30418026e-01
  -6.40672326e-01 -2.66485870e-01  9.73419845e-02 -1.57181889e-01
  -1.95746601e-01 -6.12706915e-02 -3.78227830e-02 -9.64311212e-02
  -9.08949673e-02  9.83569026e-02  2.93250650e-01  4.08545107e-01
   1.08656302e-01  1.46603078e-01 -1.19115084e-01 -2.01749057e-01
  -3.49930882e-01 -7.42176026e-02  7.28260100e-01 -1.68577760e-01
  -4.41318631e-01 -1.18485808e-01 -8.64150822e-02  3.90007734e-01
  -1.45814776e-01  6.08165622e-01  8.54009092e-02 -1.96315929e-01
  -3.87470573e-02 -6.81173503e-02 -1.64744064e-01  2.02223971e-01
  -9.36595276e-02 -2.65972197e-01  8.29633847e-02  3.37895662e-01
  -3.39652896e-02 -6.35824949e-02  1.43508613e-01 -1.64670408e-01
   3.53370130e-01  9.68486071e-03  2.32588917e-01 -7.63847232e-02
   1.29119

In [14]:
norm_text_vec = text_vec / np.linalg.norm(text_vec, axis=1, keepdims=True)
print(norm_text_vec)
norm2 = np.linalg.norm(norm_text_vec)
print(f"Text vector norm: {norm2}")  # 应该 ≈ 1.0

[[-1.84048023e-02  1.20452503e-02  6.49434747e-03  1.50683359e-03
   1.15333712e-02 -2.30507627e-02 -2.53957193e-02  1.51513249e-01
   4.98788198e-03  2.82889530e-02 -4.93395701e-02  2.21124757e-02
   1.97140649e-02 -5.75249270e-03  2.54821219e-02 -2.46970989e-02
  -6.86697513e-02 -2.85629909e-02  1.04334923e-02 -1.68473665e-02
  -2.09808815e-02 -6.56723091e-03 -4.05399268e-03 -1.03358626e-02
  -9.74247605e-03  1.05422754e-02  3.14317457e-02  4.37894538e-02
   1.16462046e-02  1.57134868e-02 -1.27672181e-02 -2.16242485e-02
  -3.75069529e-02 -7.95493089e-03  7.80577511e-02 -1.80688202e-02
  -4.73022461e-02 -1.26997698e-02 -9.26230475e-03  4.18025441e-02
  -1.56289954e-02  6.51855543e-02  9.15360171e-03 -2.10419036e-02
  -4.15306026e-03 -7.30108237e-03 -1.76579095e-02  2.16751508e-02
  -1.00387922e-02 -2.85079330e-02  8.89233779e-03  3.62169705e-02
  -3.64053180e-03 -6.81501906e-03  1.53818112e-02 -1.76500138e-02
   3.78755853e-02  1.03806099e-03  2.49297842e-02 -8.18721112e-03
   1.38395