In [1]:
import os
import numpy as np
import json
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import timm
import torch.nn as nn
import torch.optim as optim

##### 데이터

In [None]:
SAVE_PATH = '/workspace/yonghak/data/image_emotion_crop_dataset.npz'

In [3]:
# 전체 데이터 불러오기
data = np.load(SAVE_PATH)
images = data['images']     # (전체 개수, 224, 224, 3)
emotions = data['emotions'] # (전체 개수,)
sets = data['sets']         # (전체 개수,)

In [4]:
# train/val/test 인덱스 분리
train_idx = sets == 'train'
val_idx = sets == 'val'
test_idx = sets == 'test'

train_images = images[train_idx]
val_images = images[val_idx]
test_images = images[test_idx]
train_emotions = emotions[train_idx]
val_emotions = emotions[val_idx]
test_emotions = emotions[test_idx]

In [5]:
# 라벨 인코딩
emotion_le = LabelEncoder()
train_emotion_idx = emotion_le.fit_transform(train_emotions)
val_emotion_idx = emotion_le.transform(val_emotions)
test_emotion_idx = emotion_le.transform(test_emotions)
NUM_CLASSES = len(emotion_le.classes_)
print('감정 클래스:', emotion_le.classes_)

감정 클래스: ['기쁨' '당황' '분노' '슬픔']


In [6]:
print(f"Train 이미지 shape: {train_images.shape}, 감정 라벨 shape: {train_emotion_idx.shape}")
print(f"Validation 이미지 shape: {val_images.shape}, 감정 라벨 shape: {val_emotion_idx.shape}")
print(f"Test 이미지 shape: {test_images.shape}, 감정 라벨 shape: {test_emotion_idx.shape}")

Train 이미지 shape: (5994, 224, 224, 3), 감정 라벨 shape: (5994,)
Validation 이미지 shape: (1200, 224, 224, 3), 감정 라벨 shape: (1200,)
Test 이미지 shape: (1200, 224, 224, 3), 감정 라벨 shape: (1200,)


In [7]:
# Dataset 클래스
class EmotionDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform
    def __len__(self):
        return len(self.images)
    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        image = Image.fromarray(image.astype('uint8'))
        if self.transform:
            image = self.transform(image)
        return image, label

In [8]:
img_size = 224
common_transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

In [9]:
train_dataset = EmotionDataset(train_images, train_emotion_idx, transform=common_transform)
val_dataset = EmotionDataset(val_images, val_emotion_idx, transform=common_transform)
test_dataset = EmotionDataset(test_images, test_emotion_idx, transform=common_transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

##### 모델

In [None]:
# ViT 모델
class ViT_EmotionClassifier(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES):
        super(ViT_EmotionClassifier, self).__init__()
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=0)
        self.vit.head = nn.Linear(self.vit.num_features, num_classes)
    def forward(self, x):
        return self.vit(x)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ViT_EmotionClassifier(num_classes=NUM_CLASSES).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [11]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001, path='checkpoint.pt'):
        self.patience = patience              # 개선 안되는 최대 epoch 수
        self.min_delta = min_delta            # 최소 개선 폭
        self.counter = 0                      # 개선 안 된 epoch 수
        self.best_score = None                # 최고 성능
        self.early_stop = False               # 중단 여부
        self.path = path                      # 모델 저장 경로

    def __call__(self, val_score, model):
        if self.best_score is None:
            self.best_score = val_score
            self.save_checkpoint(model)
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            print(f"EarlyStopping 카운트: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.save_checkpoint(model)
            self.counter = 0

    def save_checkpoint(self, model):
        torch.save(model.state_dict(), self.path)
        print(f'>>> [Val] 모델 저장됨: {self.path} (Val Acc: {self.best_score:.4f})')

In [12]:
save_path_val = '/workspace/yonghak/vit_base_crop.pth'
early_stopping = EarlyStopping(patience=3, min_delta=0.001, path=save_path_val)

for epoch in range(20):
    
    # ---- 학습 ----
    model.train()
    train_loss, train_correct = 0, 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * imgs.size(0)
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
    train_acc = train_correct / len(train_dataset)
    train_loss_avg = train_loss / len(train_dataset)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss_avg:.4f} | Train Acc: {train_acc:.4f}")

    # ---- 검증 ----
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * imgs.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
    val_acc = val_correct / len(val_dataset)
    val_loss_avg = val_loss / len(val_dataset)
    print(f"Validation Loss: {val_loss_avg:.4f} | Validation Acc: {val_acc:.4f}")

    # ---- Early Stopping 호출 ----
    early_stopping(val_acc, model)

    if early_stopping.early_stop:
        print("EarlyStopping: validation 성능이 개선되지 않아 학습을 조기 종료합니다.")
        break

Epoch 1 | Train Loss: 1.4147 | Train Acc: 0.2840
Validation Loss: 1.2809 | Validation Acc: 0.3767
>>> [Val] 모델 저장됨: /workspace/yonghak/vit_base_crop.pth (Val Acc: 0.3767)
Epoch 2 | Train Loss: 0.9488 | Train Acc: 0.5906
Validation Loss: 0.9546 | Validation Acc: 0.6000
>>> [Val] 모델 저장됨: /workspace/yonghak/vit_base_crop.pth (Val Acc: 0.6000)
Epoch 3 | Train Loss: 0.6898 | Train Acc: 0.7191
Validation Loss: 0.8758 | Validation Acc: 0.6683
>>> [Val] 모델 저장됨: /workspace/yonghak/vit_base_crop.pth (Val Acc: 0.6683)
Epoch 4 | Train Loss: 0.5424 | Train Acc: 0.7875
Validation Loss: 0.8627 | Validation Acc: 0.6800
>>> [Val] 모델 저장됨: /workspace/yonghak/vit_base_crop.pth (Val Acc: 0.6800)
Epoch 5 | Train Loss: 0.4373 | Train Acc: 0.8302
Validation Loss: 0.8081 | Validation Acc: 0.7025
>>> [Val] 모델 저장됨: /workspace/yonghak/vit_base_crop.pth (Val Acc: 0.7025)
Epoch 6 | Train Loss: 0.3525 | Train Acc: 0.8629
Validation Loss: 0.9705 | Validation Acc: 0.6650
EarlyStopping 카운트: 1/3
Epoch 7 | Train Loss: 0.

In [13]:
# 평가
model.eval()
test_loss, test_correct = 0, 0
with torch.no_grad():
    for imgs, labels in test_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * imgs.size(0)
        preds = outputs.argmax(dim=1)
        test_correct += (preds == labels).sum().item()

print(f"Test Loss: {test_loss/len(test_dataset):.4f} "
      f"Test Acc: {test_correct/len(test_dataset):.4f}")

Test Loss: 1.2517 Test Acc: 0.6833


In [14]:
# 혼동행렬
from sklearn.metrics import confusion_matrix, classification_report
all_preds = []
all_labels = []
model.eval()
with torch.no_grad():
    for imgs, labels in test_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
cm = confusion_matrix(all_labels, all_preds)
print(cm)
acc = np.mean(np.array(all_labels) == np.array(all_preds))
print(f"\n 정확도: {acc:.4f} \n")
print(classification_report(all_labels, all_preds, target_names=emotion_le.classes_))

[[236  29  19  16]
 [ 20 242  26  12]
 [ 23  89 159  29]
 [ 20  50  48 182]]

 정확도: 0.6825 

              precision    recall  f1-score   support

          기쁨       0.79      0.79      0.79       300
          당황       0.59      0.81      0.68       300
          분노       0.63      0.53      0.58       300
          슬픔       0.76      0.61      0.68       300

    accuracy                           0.68      1200
   macro avg       0.69      0.68      0.68      1200
weighted avg       0.69      0.68      0.68      1200

