In [1]:
import os
import cv2
import torch
import numpy as np
import pandas as pd

import wandb
import datetime
from tqdm.auto import tqdm

from timm.models import resnet18, efficientnetv2_s, vit_tiny_r_s16_p8_224, vit_small_r26_s32_224
from torchsummary import summary

In [9]:
# Build the pretrained vit_tiny_r_s16_p8_224 network and move it to the GPU.
model = vit_tiny_r_s16_p8_224(pretrained=True)
model = model.cuda()

# Layer-by-layer table of output shapes and parameter counts for a 3x224x224 input.
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
     StdConv2dSame-1         [-1, 64, 112, 112]           9,408
          Identity-2         [-1, 64, 112, 112]               0
              ReLU-3         [-1, 64, 112, 112]               0
      GroupNormAct-4         [-1, 64, 112, 112]             128
     MaxPool2dSame-5           [-1, 64, 56, 56]               0
            Conv2d-6            [-1, 192, 7, 7]         786,624
       HybridEmbed-7              [-1, 49, 192]               0
           Dropout-8              [-1, 50, 192]               0
          Identity-9              [-1, 50, 192]               0
         Identity-10              [-1, 50, 192]               0
        LayerNorm-11              [-1, 50, 192]             384
           Linear-12              [-1, 50, 576]         111,168
         Identity-13            [-1, 3, 50, 64]               0
         Identity-14            [-1, 3,

In [2]:
# Build the pretrained vit_small_r26_s32_224 network (weights are downloaded on
# first use) and move it to the GPU.
model = vit_small_r26_s32_224(pretrained=True)
model = model.cuda()

# Layer-by-layer table of output shapes and parameter counts for a 3x224x224 input.
summary(model, (3, 224, 224))

HBox(children=(HTML(value='Downloading model.safetensors'), FloatProgress(value=0.0, max=145752166.0), HTML(va…


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
     StdConv2dSame-1         [-1, 64, 112, 112]           9,408
          Identity-2         [-1, 64, 112, 112]               0
              ReLU-3         [-1, 64, 112, 112]               0
      GroupNormAct-4         [-1, 64, 112, 112]             128
     MaxPool2dSame-5           [-1, 64, 56, 56]               0
     StdConv2dSame-6          [-1, 256, 56, 56]          16,384
          Identity-7          [-1, 256, 56, 56]               0
          Identity-8          [-1, 256, 56, 56]               0
      GroupNormAct-9          [-1, 256, 56, 56]             512
   DownsampleConv-10          [-1, 256, 56, 56]               0
    StdConv2dSame-11           [-1, 64, 56, 56]           4,096
         Identity-12           [-1, 64, 56, 56]               0
             ReLU-13           [-1, 64, 56, 56]               0
     GroupNormAct-14           [-1, 64

In [3]:
# Echo whichever network is currently bound to `model` so that its nested
# module tree is rendered as this cell's output.
model

VisionTransformer(
  (patch_embed): HybridEmbed(
    (backbone): ResNetV2(
      (stem): Sequential(
        (conv): StdConv2dSame(3, 64, kernel_size=(7, 7), stride=(2, 2), bias=False)
        (norm): GroupNormAct(
          32, 64, eps=1e-05, affine=True
          (drop): Identity()
          (act): ReLU(inplace=True)
        )
        (pool): MaxPool2dSame(kernel_size=(3, 3), stride=(2, 2), padding=(0, 0), dilation=(1, 1), ceil_mode=False)
      )
      (stages): Sequential(
        (0): ResNetStage(
          (blocks): Sequential(
            (0): Bottleneck(
              (downsample): DownsampleConv(
                (conv): StdConv2dSame(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (norm): GroupNormAct(
                  32, 256, eps=1e-05, affine=True
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (conv1): StdConv2dSame(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

In [10]:
# Build efficientnetv2_s with its default arguments (no `pretrained` flag is
# passed here) and move it to the GPU.
model = efficientnetv2_s()
model = model.cuda()

# Layer-by-layer table of output shapes and parameter counts for a 3x224x224 input.
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 112, 112]             648
          Identity-2         [-1, 24, 112, 112]               0
              SiLU-3         [-1, 24, 112, 112]               0
    BatchNormAct2d-4         [-1, 24, 112, 112]              48
            Conv2d-5         [-1, 24, 112, 112]           5,184
          Identity-6         [-1, 24, 112, 112]               0
              SiLU-7         [-1, 24, 112, 112]               0
    BatchNormAct2d-8         [-1, 24, 112, 112]              48
          Identity-9         [-1, 24, 112, 112]               0
        ConvBnAct-10         [-1, 24, 112, 112]               0
           Conv2d-11         [-1, 24, 112, 112]           5,184
         Identity-12         [-1, 24, 112, 112]               0
             SiLU-13         [-1, 24, 112, 112]               0
   BatchNormAct2d-14         [-1, 24, 1

In [5]:
# Echo whichever network is currently bound to `model` so that its nested
# module tree is rendered as this cell's output.
model

EfficientNet(
  (conv_stem): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNormAct2d(
    24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
      (1): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
    )
    (1): Seq

In [2]:
# Build the pretrained resnet18 network and move it to the GPU.
model = resnet18(pretrained=True)
model = model.cuda()

# Layer-by-layer table of output shapes and parameter counts; note that this
# cell probes a smaller 3x128x128 input than the other summary cells.
summary(model, (3, 128, 128))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 64, 64]           9,408
       BatchNorm2d-2           [-1, 64, 64, 64]             128
              ReLU-3           [-1, 64, 64, 64]               0
         MaxPool2d-4           [-1, 64, 32, 32]               0
            Conv2d-5           [-1, 64, 32, 32]          36,864
       BatchNorm2d-6           [-1, 64, 32, 32]             128
          Identity-7           [-1, 64, 32, 32]               0
              ReLU-8           [-1, 64, 32, 32]               0
          Identity-9           [-1, 64, 32, 32]               0
           Conv2d-10           [-1, 64, 32, 32]          36,864
      BatchNorm2d-11           [-1, 64, 32, 32]             128
             ReLU-12           [-1, 64, 32, 32]               0
       BasicBlock-13           [-1, 64, 32, 32]               0
           Conv2d-14           [-1, 64,

In [3]:
# Load the dataset index; later cells read its `class_id`, `img_path` and
# `label` columns.
df = pd.read_csv(
    "/opt/ml/level3_cv_finalproject-cv-01/model/data/data.csv"
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,class_id,label,img_path,json_path
0,0,1,baek_sook,./data/image/baek_sook/baek_sook_0001.jpg,./data/json/baek_sook/1_korea_baek_sook.json
1,1,2,baek_sook,./data/image/baek_sook/baek_sook_0002.jpg,./data/json/baek_sook/2_korea_baek_sook.json
2,2,3,baek_sook,./data/image/baek_sook/baek_sook_0003.jpg,./data/json/baek_sook/3_korea_baek_sook.json
3,3,4,baek_sook,./data/image/baek_sook/baek_sook_0004.jpg,./data/json/baek_sook/4_korea_baek_sook.json
4,4,5,baek_sook,./data/image/baek_sook/baek_sook_0005.jpg,./data/json/baek_sook/5_korea_baek_sook.json


In [4]:
# Root directory that the relative `img_path` entries of the CSV are joined onto.
base_path = "/opt/ml/level3_cv_finalproject-cv-01/model"

# Keep only the rows whose `class_id` is strictly greater than 200.
keep = df["class_id"] > 200
images_path = [os.path.join(base_path, rel_path) for rel_path in df.loc[keep, "img_path"]]
labels = df.loc[keep, "label"].tolist()

# Alphabetically ordered class names and their integer indices.
classes = list(set(labels))
classes.sort()
cls2idx = dict(zip(classes, range(len(classes))))
print(f"labels : {len(labels)}, classes : {len(classes)}")

labels : 76673, classes : 93


In [5]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Image-classification dataset driven by a CSV index file.

    The CSV must contain the columns ``class_id``, ``img_path`` and ``label``.
    Rows are assigned to a split by comparing ``class_id`` with
    ``split_threshold``: the training split keeps rows with
    ``class_id > split_threshold`` and the non-training split keeps the
    remaining rows.
    NOTE(review): the complementary non-training split is inferred from the
    original (non-functional) if/else filter -- confirm this is the intended
    hold-out rule.

    The class-name -> index mapping is built from every label in the CSV, so
    the training and non-training datasets always share the same indices.

    Args:
        csv_path: Path of the CSV index file.
        is_train: Select the training split when True, the other split otherwise.
        tf: Optional callable invoked as ``tf(image=array)`` that returns a
            mapping with an ``"image"`` entry (albumentations-style).
        root_dir: Directory that each ``img_path`` is joined onto. When None,
            the notebook-level global ``base_path`` is used.
        split_threshold: ``class_id`` boundary between the two splits.
    """

    def __init__(self, csv_path, is_train=False, tf=None, root_dir=None, split_threshold=200) -> None:
        df = pd.read_csv(csv_path)

        if root_dir is None:
            # Backward-compatible fallback to the notebook-level global.
            root_dir = base_path

        # The original if/else appended every row in both branches, so the
        # split flag had no effect; select the rows explicitly instead.
        if is_train:
            selected = df[df["class_id"] > split_threshold]
        else:
            selected = df[df["class_id"] <= split_threshold]

        # Use the dataset's own class list (not a notebook global) and derive
        # it from the full CSV so both splits agree on the label indices.
        self.classes = sorted(set(df["label"]))
        self.cls2idx = {c: i for i, c in enumerate(self.classes)}
        self.images_path = [os.path.join(root_dir, p) for p in selected["img_path"]]
        self.labels = selected["label"].tolist()
        self.is_train = is_train
        self.tf = tf

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.images_path)

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            A ``(image, class_index)`` pair where ``image`` is a float32 array
            in channel-first (C, H, W) layout, RGB order, scaled to [0, 1]
            before ``tf`` is applied.

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        image_path = self.images_path[index]
        label = self.labels[index]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"could not read image: {image_path}")

        # OpenCV decodes to BGR; the normalization statistics used with this
        # dataset are listed in RGB order, so convert before scaling.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # float32 keeps the batch dtype compatible with float32 model weights.
        image = image.astype(np.float32) / 255.0

        if self.tf is not None:
            image = self.tf(image=image)["image"]

        # HWC -> CHW, the layout expected by torch vision models.
        image = image.transpose(2, 0, 1)
        return image, self.cls2idx[label]

In [6]:
import albumentations as A

# CustomDataset divides the pixels by 255.0 before it calls the transform, so
# the images reaching Normalize are already in [0, 1]. Normalize computes
# (img - mean * max_pixel_value) / (std * max_pixel_value) and defaults to
# max_pixel_value=255.0, which would rescale the data a second time; set it to
# 1.0 so the mean/std below are applied to the [0, 1] range as intended.
tf = A.Compose([
    A.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
        max_pixel_value=1.0,
    )
])

# Training split of the dataset index, normalized with the transform above.
train_dataset = CustomDataset(
    "/opt/ml/level3_cv_finalproject-cv-01/model/data/data.csv",
    is_train=True,
    tf=tf,
)

In [7]:
from torch.utils.data import DataLoader

# Batches of 4 shuffled training samples, loaded by 8 worker processes; the
# final incomplete batch of each epoch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    num_workers=8,
    shuffle=True,
    drop_last=True,
)