In [1]:
# ────────────────────────────────────────────────
# ❶ 依存ライブラリのインストール（最初だけ）
# ────────────────────────────────────────────────
!pip -q install --upgrade faiss-cpu sentence_transformers


In [2]:
# ────────────────────────────────────────────────
# ❷ Mount Google Drive
# ────────────────────────────────────────────────
# The image folder and index file configured below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# ────────────────────────────────────────────────
# ❸ Imports
# ────────────────────────────────────────────────
import os, numpy as np, matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
from PIL import Image
import faiss, torch
from sentence_transformers import SentenceTransformer

# ★★ Edit only the values below to match your own environment ★★
IMAGES_PATH = '/content/drive/MyDrive/images'        # top-level folder that contains the images
INDEX_PATH  = '/content/drive/MyDrive/faiss_indexes/vector.index'
TOP_K       = 5   # how many of the most similar images to retrieve; defaults to 5 but can be changed freely


In [4]:
# ────────────────────────────────────────────────
# ❹ Load the CLIP model
# ────────────────────────────────────────────────
# Use the GPU when the runtime provides one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model  = SentenceTransformer('clip-ViT-B-32', device=device)
print("Model loaded on", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Model loaded on cpu


In [5]:
# ────────────────────────────────────────────────
# ❺ Core functions
# ────────────────────────────────────────────────
def generate_clip_embeddings(images_dir: str, model, batch_size: int = 32):
    """Embed every image file found under ``images_dir``.

    The folder is searched recursively for regular files whose extension is a
    known image type. Images are decoded and encoded one batch at a time, so at
    most ``batch_size`` decoded images are held in memory at once and every
    file handle is closed as soon as its pixels have been read.

    Args:
        images_dir: Root folder to scan.
        model: Object exposing ``encode(images, convert_to_numpy=...,
            normalize_embeddings=..., batch_size=..., show_progress_bar=...)``.
        batch_size: Number of images decoded and encoded per step.

    Returns:
        ``(embeddings, image_paths)``: a float32 array with one row per image
        (requested with ``normalize_embeddings=True``) and the sorted list of
        path strings in the same row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or no image file is
            found under ``images_dir``.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    exts = {'.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif'}
    # is_file() keeps directories that merely look like images (e.g. "x.jpg/") out.
    image_paths = sorted(
        str(p) for p in Path(images_dir).rglob('*')
        if p.suffix.lower() in exts and p.is_file()
    )
    if not image_paths:
        raise ValueError(f'No images found under {images_dir}')

    total = len(image_paths)
    chunks = []
    for start in range(0, total, batch_size):
        batch = []
        for p in image_paths[start:start + batch_size]:
            # convert() returns a fully loaded copy, so the file can be closed right away.
            with Image.open(p) as img:
                batch.append(img.convert('RGB'))
        chunks.append(model.encode(
            batch, convert_to_numpy=True, normalize_embeddings=True,
            batch_size=batch_size, show_progress_bar=False
        ))
        # One self-overwriting status line instead of a progress bar per batch.
        print(f'\rEncoded {min(start + batch_size, total)}/{total} images', end='', flush=True)
    print()

    embeddings = np.vstack(chunks).astype(np.float32)
    return embeddings, image_paths


def create_faiss_index(embeddings, paths, index_path):
    """Build an inner-product FAISS index and save it next to its path list.

    Row ``i`` of ``embeddings`` is stored under id ``i``, and ``paths[i]`` is
    written as line ``i`` of the side-car file ``index_path + '.paths'``, so a
    search result id can be mapped back to a file path.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        paths: Sequence of path strings, one per embedding row.
        index_path: Destination of the FAISS index file. Missing parent
            folders are created.

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its row count differs from
            ``len(paths)``.
    """
    # FAISS expects C-contiguous float32 vectors.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    if embeddings.ndim != 2 or embeddings.shape[0] != len(paths):
        raise ValueError(
            f'embeddings shape {embeddings.shape} does not match {len(paths)} paths'
        )

    index_path = str(index_path)
    # write_index does not create folders, so make sure the target folder exists.
    Path(index_path).parent.mkdir(parents=True, exist_ok=True)

    flat = faiss.IndexFlatIP(embeddings.shape[1])
    index = faiss.IndexIDMap(flat)
    # Ids must be int64 regardless of the platform's default integer width.
    index.add_with_ids(embeddings, np.arange(len(paths), dtype=np.int64))
    faiss.write_index(index, index_path)

    # Explicit UTF-8 keeps non-ASCII file names intact.
    with open(index_path + '.paths', 'w', encoding='utf-8') as f:
        f.writelines(p + '\n' for p in paths)
    print(f'✅ Index saved to {index_path}')
    return index


def load_faiss_index(index_path):
    """Load a saved FAISS index together with its side-car path list.

    Args:
        index_path: Location of the FAISS index file. The path list is read
            from ``index_path + '.paths'`` (one path per line).

    Returns:
        ``(index, paths)``: the FAISS index and the list of path strings, where
        line ``i`` of the side-car file is ``paths[i]``.

    Raises:
        ValueError: If the number of stored vectors differs from the number of
            paths, which would make result ids map to the wrong files.
    """
    index_path = str(index_path)
    index = faiss.read_index(index_path)
    with open(index_path + '.paths', encoding='utf-8') as f:
        # Remove only the line terminator; spaces can be part of a file name.
        paths = [line.rstrip('\r\n') for line in f]

    if index.ntotal != len(paths):
        raise ValueError(
            f'Index holds {index.ntotal} vectors but {index_path}.paths lists '
            f'{len(paths)} paths; delete both files and rebuild the index'
        )
    print(f'✅ Index loaded ({len(paths)} items)')
    return index, paths


def retrieve_similar(query, model, index, paths, top_k=TOP_K):
    """Return the indexed images most similar to ``query``.

    Args:
        query: One of
            * a ``str`` or ``os.PathLike`` ending in an image extension — the
              file is opened and embedded as an image;
            * a PIL image — embedded as given;
            * anything else (typically free text) — passed to ``model.encode``
              unchanged.
        model: Object exposing ``encode(x, convert_to_numpy=...,
            normalize_embeddings=...)``.
        index: FAISS index whose ids are positions in ``paths``.
        paths: Path strings aligned with the index ids.
        top_k: Maximum number of results to return.

    Returns:
        ``(query_img, retrieved_paths)``. ``query_img`` is the query as a PIL
        image, or ``None`` when the query was not an image. At most ``top_k``
        paths are returned, fewer if the index holds fewer entries.
    """
    image_exts = ('.png', '.jpg', '.jpeg', '.webp', '.bmp', '.gif')
    if isinstance(query, os.PathLike):
        query = os.fspath(query)

    query_img = None
    encode_input = query
    if isinstance(query, str):
        if query.lower().endswith(image_exts):
            # convert() yields a loaded copy, so the file can be closed immediately.
            with Image.open(query) as img:
                query_img = img.convert('RGB')
            encode_input = query_img
    elif isinstance(query, Image.Image):
        # Already an image: embed it as given and hand it back for display.
        query_img = query

    q_emb = model.encode(encode_input, convert_to_numpy=True, normalize_embeddings=True)
    q_emb = np.asarray(q_emb, dtype=np.float32).reshape(1, -1)

    # Never ask for more neighbours than exist: FAISS pads missing hits with id -1,
    # which would otherwise silently select paths[-1].
    k = min(int(top_k), index.ntotal, len(paths))
    if k < 1:
        return query_img, []

    _, ids = index.search(q_emb, k)
    hits = [int(i) for i in ids[0]]
    return query_img, [paths[i] for i in hits if 0 <= i < len(paths)]


def visualize(query_img, retrieved_paths):
    """Show the query image (if any) followed by the retrieved images in one row.

    Args:
        query_img: Image to show first under the title ``Query``, or ``None``
            to show only the results.
        retrieved_paths: Paths of the result images, shown left to right and
            titled with their rank and file name.

    Returns:
        None. Prints a short notice instead of drawing when there is neither a
        query image nor any result.
    """
    has_query = query_img is not None
    n = len(retrieved_paths) + (1 if has_query else 0)
    if n == 0:
        # A zero-width figure cannot be created; there is nothing to draw anyway.
        print('Nothing to display.')
        return

    # squeeze=False keeps `axes` two-dimensional even when n == 1.
    fig, axes = plt.subplots(1, n, figsize=(4 * n, 4), squeeze=False)
    panels = list(axes[0])

    if has_query:
        ax = panels.pop(0)
        ax.imshow(query_img)
        ax.axis('off')
        ax.set_title('Query')

    for rank, (ax, p) in enumerate(zip(panels, retrieved_paths), 1):
        # imshow copies the pixel data, so the file can be closed right after.
        with Image.open(p) as img:
            ax.imshow(img)
        ax.axis('off')
        ax.set_title(f'Result {rank}\n{Path(p).name}')

    plt.show()


In [6]:
# ────────────────────────────────────────────────
# ❻ Build the index (first run only) or load the saved one
# ────────────────────────────────────────────────
# Loading needs both the FAISS file and its '.paths' side-car; rebuild if either is missing.
if Path(INDEX_PATH).exists() and Path(INDEX_PATH + '.paths').exists():
    index, img_paths = load_faiss_index(INDEX_PATH)      # reuse the saved files
else:
    embeddings, img_paths = generate_clip_embeddings(IMAGES_PATH, model)
    index = create_faiss_index(embeddings, img_paths, INDEX_PATH)


✅ Index loaded (518 items)


In [None]:
# ────────────────────────────────────────────────
# ❼ Example: run a search and show the results with their file names
# ────────────────────────────────────────────────
query_path = '/content/drive/MyDrive/testdata/testdata.jpg'   # image to search with
query_img, results = retrieve_similar(query_path, model, index, img_paths)
print("Retrieved files:", [Path(p).name for p in results])
visualize(query_img, results)
