In [1]:
# prepare_embeddings.py
import pandas as pd
import numpy as np
import torch
import clip
from PIL import Image
import requests
from io import BytesIO
from tqdm import tqdm

# Settings
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# CSV read by the script entry point; the embedding loop reads its
# `image_url` and `name_kor` columns.
CSV_FILE = "pokemon_data_master.csv"
# Destination passed to np.save for the stacked embedding matrix.
EMBEDDINGS_FILE = "embeddings.npy"

def load_clip_model():
    """Load the CLIP "ViT-B/32" checkpoint onto ``DEVICE``.

    Returns:
        A 2-tuple of the values produced by ``clip.load``: the model
        (whose ``encode_image`` is used to embed images) and the
        preprocessing callable applied to each PIL image beforehand.
    """
    clip_model, image_transform = clip.load("ViT-B/32", device=DEVICE)
    return clip_model, image_transform

def create_clip_embeddings(df, model, preprocess):
    """Build one L2-normalised CLIP image embedding per row of ``df``.

    For every row the image at ``row['image_url']`` is downloaded, converted
    to RGB, passed through ``preprocess`` and encoded with
    ``model.encode_image``. Rows that cannot be downloaded or encoded are
    reported on stdout and represented by an all-zero row, so the result
    always stays aligned with the row order of ``df``.

    Args:
        df: DataFrame providing an ``image_url`` column (fetched over HTTP)
            and a ``name_kor`` column (used only in failure messages).
        model: Object exposing ``encode_image(tensor)``.
        preprocess: Callable mapping a PIL image to a tensor; a batch
            dimension is added here with ``unsqueeze(0)``.

    Returns:
        A NumPy array with ``len(df)`` rows. Its width and dtype are taken
        from the first successfully encoded image. If no image succeeds
        (including an empty ``df``) a float64 array of shape
        ``(len(df), 512)`` filled with zeros is returned.
    """
    # Width used for placeholders only when no real embedding is available
    # to copy the shape from.
    fallback_dim = 512
    # Seconds; without a timeout a single stalled connection would block
    # the whole run indefinitely.
    request_timeout = 10

    # One entry per row: a (1, D) array on success, None on failure.
    per_row = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="임베딩 생성 중"):
        vector = None
        try:
            response = requests.get(row['image_url'], timeout=request_timeout)
            if response.status_code == 200:
                image = Image.open(BytesIO(response.content)).convert("RGB")
                image_input = preprocess(image).unsqueeze(0).to(DEVICE)

                with torch.no_grad():
                    embedding = model.encode_image(image_input)
                    # Scale to unit length.
                    embedding = embedding / embedding.norm(dim=-1, keepdim=True)
                vector = embedding.cpu().numpy()
            else:
                # Previously this case was replaced by zeros silently.
                print(
                    f"{row['name_kor']} 임베딩 생성 실패: "
                    f"HTTP {response.status_code}"
                )
        except Exception as e:
            # Deliberate best-effort: one bad image must not abort the run.
            print(f"{row['name_kor']} 임베딩 생성 실패: {e}")
        per_row.append(vector)

    template = next((v for v in per_row if v is not None), None)
    if template is None:
        # Nothing succeeded (or df was empty): np.vstack([]) would raise.
        return np.zeros((len(per_row), fallback_dim))

    # Placeholders copy the shape and dtype of a real embedding so that a
    # single failure does not upcast the whole matrix or break stacking.
    placeholder = np.zeros_like(template)
    return np.vstack([placeholder if v is None else v for v in per_row])

# Script entry point: read the CSV, load CLIP, embed every row's image and
# save the resulting matrix to EMBEDDINGS_FILE.
if __name__ == "__main__":
    # Progress message: "Loading CSV".
    print("CSV 불러오는 중")
    df = pd.read_csv(CSV_FILE)

    # Progress message: "Loading CLIP model".
    print("CLIP 모델 로드 중")
    model, preprocess = load_clip_model()

    # Progress message: "Starting embedding generation".
    print("임베딩 생성 시작")
    embeddings = create_clip_embeddings(df, model, preprocess)

    # Progress message: "Saving '<file>'".
    print(f"'{EMBEDDINGS_FILE}' 저장 중")
    np.save(EMBEDDINGS_FILE, embeddings)

    # Progress message: "Done".
    print("완료")


CSV 불러오는 중
CLIP 모델 로드 중
임베딩 생성 시작


임베딩 생성 중: 100%|██████████| 1023/1023 [09:33<00:00,  1.78it/s] 

'embeddings.npy' 저장 중
완료



