In [1]:
import os
import shutil
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoProcessor, AutoModel

from segment_anything import sam_model_registry, SamPredictor
import cv2


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path configuration: input image folder and the folder that receives the groups.
IMAGE_DIR = "./1차"
OUTPUT_DIR = IMAGE_DIR + "_dino_grouped_no_people"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Collect the paths of every image in IMAGE_DIR (no embeddings are computed here).
# - The suffixes carry their leading dot so that a name which merely ends in
#   "jpg"/"png" without being an extension is not picked up.
# - sorted() makes the order deterministic: os.listdir() returns entries in
#   arbitrary order, and downstream group numbering depends on this order.
image_paths = sorted(
    os.path.join(IMAGE_DIR, fname)
    for fname in os.listdir(IMAGE_DIR)
    if fname.lower().endswith((".jpg", ".jpeg", ".png"))
)

In [3]:

# Load the DINOv2 model and its matching image processor.
model_id = "facebook/dinov2-large"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = AutoProcessor.from_pretrained(model_id)

dino_model = AutoModel.from_pretrained(model_id)
dino_model = dino_model.to(device)
dino_model = dino_model.eval()  # evaluation mode; the model is only used for inference here

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
# Load SAM (requires a checkpoint file that was downloaded beforehand).
SAM_CHECKPOINT = "sam_vit_h_4b8939.pth"
model_type = "vit_h"

# Look up the builder for the chosen variant, build the model from the
# checkpoint, then move it to the same device as the DINOv2 model.
build_sam = sam_model_registry[model_type]
sam = build_sam(checkpoint=SAM_CHECKPOINT)
sam = sam.to(device)

predictor = SamPredictor(sam)

In [6]:
# ---------------------- Person mask + removal function ----------------------
def remove_person_with_sam(img_np, step=128, inpaint_radius=3):
    """Inpaint the region SAM segments from a regular grid of point prompts.

    A grid of points spaced ``step`` pixels apart (starting at ``step // 2``)
    is passed to the module-level ``predictor`` as foreground prompts in a
    single ``predict`` call. The returned mask(s) are OR-ed together and that
    area is filled in with ``cv2.inpaint`` (Telea method).

    NOTE(review): nothing in this function classifies a mask as "person";
    every grid point is labelled foreground for one prediction, so the mask is
    whatever SAM returns for that combined prompt. Confirm this matches the
    intended "remove people" behaviour before enabling it in the pipeline.

    Args:
        img_np: Image array with three dimensions (height, width, channels).
            Presumably 8-bit RGB, as produced by the caller -- TODO confirm.
        step: Grid spacing in pixels between prompt points. Must be >= 1.
        inpaint_radius: Neighbourhood radius forwarded to ``cv2.inpaint``.

    Returns:
        A new array with the masked area inpainted. If the image is too small
        for the grid to contain any point, an unchanged copy of ``img_np``.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    height, width, _ = img_np.shape

    # Grid-based prompts (to probe the whole image), in row-major order.
    offset = step // 2
    points = [
        [x, y]
        for y in range(offset, height, step)
        for x in range(offset, width, step)
    ]

    # An image smaller than half a step yields no prompt points at all.
    # Return early instead of handing the predictor an empty prompt array.
    if not points:
        return img_np.copy()

    predictor.set_image(img_np)
    input_points = np.array(points)
    input_labels = np.ones(len(points))  # 1 = foreground for every point

    masks, _, _ = predictor.predict(
        point_coords=input_points,
        point_labels=input_labels,
        multimask_output=False
    )

    # Merge all returned masks into one 8-bit mask (conservative approach);
    # cv2.inpaint fills the non-zero pixels of that mask.
    full_mask = np.any(masks, axis=0).astype(np.uint8)
    inpainted = cv2.inpaint(
        img_np, full_mask, inpaintRadius=inpaint_radius, flags=cv2.INPAINT_TELEA
    )
    return inpainted



In [9]:

# ---------------------- DINOv2 embedding extraction ----------------------
def extract_dino_embedding(img_np):
    """Compute a flat DINOv2 embedding for one image array.

    The array is wrapped in a PIL image, preprocessed by the module-level
    ``processor``, and run through ``dino_model`` without gradient tracking.

    Args:
        img_np: Image array accepted by ``Image.fromarray``.

    Returns:
        A 1-D NumPy array holding the first token (the CLS token) of the
        model's ``last_hidden_state``.
    """
    model_inputs = processor(images=Image.fromarray(img_np), return_tensors="pt")
    model_inputs = model_inputs.to(device)

    with torch.no_grad():
        hidden_states = dino_model(**model_inputs).last_hidden_state
        cls_token = hidden_states[:, 0, :]  # CLS token
        embedding = cls_token.cpu().numpy().flatten()

    return embedding

In [14]:
# ---------------------- Main processing ----------------------
# List the input images. Suffixes include the leading dot so only real
# extensions match, and the list is sorted because os.listdir() returns
# entries in arbitrary order while group numbering depends on this order.
image_paths = sorted(
    os.path.join(IMAGE_DIR, fname)
    for fname in os.listdir(IMAGE_DIR)
    if fname.lower().endswith((".jpg", ".jpeg", ".png"))
)
# Fail early with a clear message rather than with an obscure error later on
# when an empty array reaches the similarity computation.
if not image_paths:
    raise FileNotFoundError(f"No .jpg/.jpeg/.png images found in '{IMAGE_DIR}'")

print("[1] 사람 제거 + 임베딩 추출 중...")
embedding_list = []
for path in tqdm(image_paths):
    # The context manager closes the file handle once the pixels are copied
    # into the array, instead of leaving one open handle per image.
    with Image.open(path) as img:
        img_np = np.array(img.convert("RGB"))
    # NOTE: person removal is currently disabled; embeddings are computed on
    # the original image. To enable it, embed the output of the call below.
    # clean_img = remove_person_with_sam(img_np)
    embedding_list.append(extract_dino_embedding(img_np))
# One row per image, in the same order as image_paths.
embeddings = np.array(embedding_list)

[1] 사람 제거 + 임베딩 추출 중...


100%|██████████| 232/232 [01:04<00:00,  3.58it/s]


In [15]:
# ---------------------- Clustering ----------------------
def group_by_cosine_similarity(embeddings, threshold=0.9):
    """Greedily partition items into groups by cosine similarity.

    Items are visited in index order. Each item that is not yet assigned
    becomes the seed of a new group, which collects every *still unassigned*
    item whose cosine similarity to the seed is at least ``threshold``.
    Every index therefore ends up in exactly one group.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        threshold: Minimum cosine similarity to the seed for membership.

    Returns:
        A list of 1-D integer index arrays, one per group, each sorted in
        ascending order. An empty list when ``embeddings`` is empty.
    """
    n_items = len(embeddings)
    if n_items == 0:
        # Nothing to group; avoid passing an empty array to cosine_similarity.
        return []

    sim = cosine_similarity(embeddings)
    used = np.zeros(n_items, dtype=bool)
    groups = []
    for i in range(n_items):
        if used[i]:
            continue
        # Exclude items already claimed by an earlier group; otherwise groups
        # overlap and the same item is reported (and later copied) repeatedly.
        member_mask = (sim[i] >= threshold) & ~used
        # The seed always belongs to its own group, even if its
        # self-similarity falls below the threshold (e.g. an all-zero vector).
        member_mask[i] = True
        idxs = np.where(member_mask)[0]
        groups.append(idxs)
        used[idxs] = True
    return groups

# Group the embeddings with a threshold of 0.7 (looser than the function's
# default of 0.9).
groups = group_by_cosine_similarity(embeddings, threshold=0.7)

print("[2] 그룹 저장 중...")
# Each group gets its own "group_<n>" sub-directory under OUTPUT_DIR, and the
# original image files of its members are copied into it.
for group_no, member_idxs in enumerate(groups):
    destination = os.path.join(OUTPUT_DIR, f"group_{group_no}")
    os.makedirs(destination, exist_ok=True)
    for member in member_idxs:
        source_path = image_paths[member]
        shutil.copy(source_path, destination)

print(f"완료! 그룹화된 결과는 '{OUTPUT_DIR}'에 저장됨.")


[2] 그룹 저장 중...
완료! 그룹화된 결과는 './1차_dino_grouped_no_people'에 저장됨.
