1. 데이터 준비

In [1]:
!pip install quickdraw transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# 동화에 들어가는 사물들
# objects=["shoe","crown","clock","dog", "apple","flower","horse","sword","bird","tree","snowflake","castle","lion","rainbow","lollipop"]
objects=["lion","rainbow","lollipop"]

from quickdraw import QuickDrawDataGroup as qddg

# 각 object에 대한 QuickDrawDataGroup들
qddgs=[]
test_size=10000

# recognized=True로 인식 성공한 데이터를 Test에 사용함.
for object in objects:
  qddgs.append(qddg(name=object,recognized=True,max_drawings=test_size))

# qddgs를 돌면서 각 qddg의 drawings를 가져와 사진으로 변환 후 'images'에 저장.
total_images=[]
for datagroup in qddgs:
  drawings = datagroup.drawings
  object_images=[]
  for drawing in drawings:
    object_images.append(drawing.image)
  total_images.append(object_images)

loading lion drawings
load complete
loading rainbow drawings
load complete
loading lollipop drawings
load complete


2. 모델, label 준비

In [3]:
# 동화에 들어가는 label 준비
labels=["shoe","crown","clock","dog", "apple",
        "flower","horse","sword","bird","tree",
        "snowflake","castle","lion","rainbow","lollipop"]

# 정확도 향상을 위한 a photo of a {object} 형식으로 변환.
clip_labels = [f"a photo of a {label}" for label in labels]
clip_labels

# 모델 가져오기.
from transformers import CLIPProcessor, CLIPModel
model_id = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

# 속도향상을 위해 GPU 사용할 수 있으면 사용하기.
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# label 토큰화
label_tokens = processor(
    text=clip_labels,
    padding=True,
    images=None,
    return_tensors='pt'
).to(device)

# encode tokens to sentence embeddings
label_emb = model.get_text_features(**label_tokens)

# detach from pytorch gradient computation
label_emb = label_emb.detach().cpu().numpy()

import numpy as np
# normalization
label_emb = label_emb / np.linalg.norm(label_emb, axis=0)

3. 테스트

In [4]:
from tqdm.auto import tqdm

preds_list=[]

for index, obj in enumerate(objects):
  preds = []
  batch_size = 32

  for i in tqdm(range(0, len(total_images[index]), batch_size)):
      i_end = min(i + batch_size, len(total_images[index]))
      images = processor(
          text=None,
          images=total_images[index][i:i_end],
          return_tensors='pt'
      )['pixel_values'].to(device)
      img_emb = model.get_image_features(images)
      img_emb = img_emb.detach().cpu().numpy()
      scores = np.dot(img_emb, label_emb.T)
      preds.extend(np.argmax(scores, axis=1))

  preds_list.append(preds)

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

4. 정확도 출력

In [5]:
for index, object in enumerate(objects):
  object_idx= labels.index(object)

  true_preds = []

  for pred in preds_list[index]:
    if pred == object_idx:
      true_preds.append(1)
    else:
      true_preds.append(0)

  print(f'{object} - prediction success {sum(true_preds)} / {len(true_preds)}, {sum(true_preds) / len(true_preds) * 100:.2f}%')

lion - prediction success 2665 / 10000, 26.65%
rainbow - prediction success 0 / 10000, 0.00%
lollipop - prediction success 9134 / 10000, 91.34%
