In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image
from tqdm import tqdm 
from concurrent.futures import ThreadPoolExecutor
import time
import requests

# PyTorch & Scikit-learn
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
from sklearn.metrics import f1_score
import torch.nn.functional as F
from sklearn.metrics import f1_score, classification_report

In [None]:
# --- 경로 설정 ---
data_dir = 'safebooru\data'
tmp_dir = os.path.join(data_dir, 'tmp')
image_dir = os.path.join(data_dir, 'images')
model_dir = os.path.join(data_dir, 'model')
test_csv_path = os.path.join(data_dir, 'test.csv')
model_save_path = os.path.join(model_dir, 'best_model.pth')

In [3]:
# --- 설정 ---
BATCH_SIZE = 64
NUM_WORKERS = 0
if NUM_WORKERS > 20: NUM_WORKERS = 20

MAX_RETRIES = 3
RETRY_DELAY = 3

# --- 장치 설정 ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ 설정 완료, 사용 장치: {device}, 데이터로더 워커: {NUM_WORKERS}")

✅ 설정 완료, 사용 장치: cuda, 데이터로더 워커: 0


In [4]:
class SafebooruDataset(Dataset):
    def __init__(self, csv_path, image_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.image_dir = image_dir
        self.transform = transform
        self.tag_columns = [col for col in self.df.columns if col not in ['id', 'created_at', 'rating', 'score', 'sample_url', 'sample_width', 'sample_height', 'preview_url']]
        self.labels = self.df[self.tag_columns].values.astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, f"{self.df.iloc[idx]['id']}.jpg")
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        labels = torch.from_numpy(self.labels[idx])
        return image, labels

# 테스트 데이터에는 데이터 증강을 적용하지 않음
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = SafebooruDataset(csv_path=test_csv_path, image_dir=image_dir, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

num_tags = len(test_dataset.tag_columns)
tag_names = test_dataset.tag_columns

print(f"✅ 테스트 데이터셋 로드 완료. 태그 수: {num_tags}")

✅ 테스트 데이터셋 로드 완료. 태그 수: 278


In [5]:
# 모델 구조 정의
model = models.resnet50()
model.fc = nn.Linear(model.fc.in_features, num_tags)

# 저장된 가중치 불러오기
model.load_state_dict(torch.load(model_save_path, map_location=device))
model = model.to(device)
model.eval() # 반드시 평가 모드로 설정

print(f"✅ '{os.path.basename(model_save_path)}' 모델 로드 완료")

✅ 'best_model.pth' 모델 로드 완료


In [6]:
all_preds = []
all_labels = []

with torch.no_grad(): # 그라디언트 계산 비활성화
    for images, labels in tqdm(test_loader, desc="테스트 진행"):
        images, labels = images.to(device), labels.to(device)
        
        outputs = model(images)
        
        preds = torch.sigmoid(outputs) > 0.5 # 임계값 0.5 기준으로 예측
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# 리스트를 하나의 numpy 배열로 변환
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

print("✅ 예측 완료")

테스트 진행: 100%|██████████| 245/245 [02:19<00:00,  1.75it/s]

✅ 예측 완료





In [7]:
# F1 스코어 계산
macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
micro_f1 = f1_score(all_labels, all_preds, average='micro', zero_division=0)
weighted_f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

print("--- 🧪 최종 모델 성능 평가 🧪 ---")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")
print("-" * 30)

# 주요 태그에 대한 상세 리포트 출력 (상위 15개)
report = classification_report(all_labels, all_preds, target_names=tag_names, zero_division=0, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# support(샘플 수)가 높은 순으로 정렬하여 상위 15개 태그의 성능 확인
print("주요 태그별 상세 성능 (상위 15개):")
print(report_df.sort_values(by='support', ascending=False).head(15))

--- 🧪 최종 모델 성능 평가 🧪 ---
Macro F1 Score: 0.3293
Micro F1 Score: 0.3897
Weighted F1 Score: 0.3587
------------------------------
주요 태그별 상세 성능 (상위 15개):
              precision    recall  f1-score   support
samples avg    0.452842  0.260115  0.293381  116848.0
macro avg      0.514417  0.258852  0.329255  116848.0
weighted avg   0.554353  0.286680  0.358694  116848.0
micro avg      0.608258  0.286680  0.389693  116848.0
long_hair      0.611396  0.485018  0.540923    3938.0
touhou         0.851340  0.800611  0.825197    3927.0
short_hair     0.601283  0.376903  0.463359    3481.0
solo           0.496333  0.255346  0.337209    3180.0
highres        0.545232  0.089775  0.154165    2484.0
hat            0.741538  0.436990  0.549914    2206.0
blonde_hair    0.608763  0.536331  0.570256    2202.0
blue_eyes      0.607399  0.246013  0.350189    2069.0
red_eyes       0.652657  0.412118  0.505218    1997.0
brown_hair     0.587983  0.301099  0.398256    1820.0
smile          0.405797  0.034022  0.062