In [None]:
# Replace the stock Pillow with pillow-simd, compiling it with AVX2 enabled through the CC flag.
!pip uninstall -y pillow
!CC="cc -mavx2" pip install -U --force-reinstall pillow-simd

In [None]:
# Download the competition data archive (data.tar.gz) into the working directory.
!wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000284/data/data.tar.gz

In [None]:
# Unpack the downloaded archive in place.
!tar -zxvf data.tar.gz

In [None]:
from glob import glob
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import random
import albumentations as A
from PIL import Image
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend
from tensorflow.keras import layers
from torch.utils.data import Dataset, DataLoader

In [None]:
# Allow TensorFlow to place an op on another device when the requested one
# cannot run it.
tf.config.set_soft_device_placement(True)

# Prefer a TPU when one can be resolved; otherwise fall back to the default
# distribution strategy. The fallback is deliberate best-effort, but the
# reason is printed so an unexpected failure is not silently hidden.
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  print('Device:', tpu.master())
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:  # not a bare except: Ctrl-C / SystemExit still propagate
  print('TPU not available, using the default strategy:', tpu_error)
  strategy = tf.distribute.get_strategy()
print('Number of replicas', strategy.num_replicas_in_sync)

In [None]:
# Random seed shared by every RNG seeded below
seed_value = 1337

# Seed Python's built-in random module
random.seed(seed_value)

# Seed NumPy's global RNG
np.random.seed(seed_value)

# Seed TensorFlow's global RNG
tf.random.set_seed(seed_value)

In [None]:
# Square input resolution (pixels): used when resizing images and as the model input size.
img_width = img_height = 384

In [None]:
# Albumentations augmentation pipeline.
# NOTE(review): `transform` is not referenced by any cell visible here - confirm
# whether it is meant to be applied to the training images.
transform = A.Compose([
    # Geometric jitter, applied to 70% of samples.
    A.ShiftScaleRotate(scale_limit=(0, 0.1), p=0.7),
    # Always applied (p=1); both limits are skewed towards negative values.
    A.RandomBrightnessContrast(brightness_limit=[-0.3, 0.1], contrast_limit=[-0.3, 0.1], p=1),
    A.HorizontalFlip(p=0.5),
    A.GaussNoise(var_limit=(10, 50), p=0.5),
    A.CoarseDropout(p=0.3, max_holes=15, max_height=15, max_width=15),
    # Exactly one of the three is picked every time (OneOf with p=1); the inner
    # p values act as relative selection weights.
    A.OneOf([
        A.CLAHE(p=0.7),
        A.ToGray(p=0.1),
        A.Blur(blur_limit=(5, 10), p=0.2)
    ], p=1)
])

In [None]:
# Root of the extracted dataset; train.csv is read from here.
ROOT_DIR = "/content/data"
# Directory that holds the training images referenced by train.csv.
TRAIN_DIR = "/content/data/train"

In [None]:
def get_train_file_path(id):
    """Return the path of the training image named ``id`` inside ``TRAIN_DIR``."""
    return f"{TRAIN_DIR}/{id}"
# Load the training table and add a column with the file path of every row's image.
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
df['file_path'] = df['ID'].apply(get_train_file_path)
df.head()

In [None]:
# Column aliases consumed by the loading cell: image paths and their labels.
file_path = df['file_path']
target = df['target']

In [None]:
from PIL import Image
import cv2
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [None]:
def load_dataset(dataset_paths, img_width, img_height):
    """Read every image, resize it and stack the results into a single array.

    Args:
        dataset_paths: Iterable of image file paths.
        img_width: Target width in pixels handed to ``cv2.resize``.
        img_height: Target height in pixels handed to ``cv2.resize``.

    Returns:
        ``np.ndarray`` with one preprocessed image per path, in input order.

    Raises:
        FileNotFoundError: If OpenCV cannot read one of the paths.
    """
    x_data = []
    for path in tqdm(dataset_paths):
        img = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising;
        # stop here with the offending path rather than deep inside cv2.resize.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        # NOTE(review): cv2.imread yields BGR channel order. If the pretrained
        # backbone expects RGB a cv2.cvtColor may be wanted - it must match
        # whatever the inference-time loader does, so it is left unchanged here.
        img = cv2.resize(img, (img_width, img_height))
        img = np.asarray(img)
        img = tf.keras.applications.efficientnet_v2.preprocess_input(img)
        x_data.append(img)

    x_data = np.array(x_data)
    return x_data

# Materialise the whole training set in memory, together with its labels.
x_data = load_dataset(file_path, img_width, img_height)
y_data = np.array(target)

In [None]:
# Hyperparameters of the Swin-Transformer branch.
image_dimension = 24   # side length of the square input handed to the transformer
patch_size = (2, 2)    # 2-by-2 patches, indexed as (x, y)
window_size = 2        # side length of an attention window
shift_size = 1         # size of the window shift
embed_dim = 64         # embedding dimension
num_heads = 8          # attention heads
num_mlp = 128          # hidden width of the MLP
qkv_bias = True        # add a bias term to the query/key/value projection
dropout_rate = 0.5     # dropout rate

# Number of patches along each axis of the transformer input.
num_patch_x, num_patch_y = (image_dimension // side for side in patch_size)

In [None]:
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: 4-D tensor whose static shape unpacks as
            (batch, height, width, channels).
        window_size: Side length of each window.

    Returns:
        Tensor of shape (-1, window_size, window_size, channels) holding all
        windows stacked along the first axis.
    """
    height, width, channels = x.shape[1:]
    rows = height // window_size
    cols = width // window_size
    grid = tf.reshape(
        x, shape=(-1, rows, window_size, cols, window_size, channels)
    )
    # Put the two window-grid axes next to each other, ahead of the in-window axes.
    grid = tf.transpose(grid, (0, 1, 3, 2, 4, 5))
    return tf.reshape(grid, shape=(-1, window_size, window_size, channels))


def window_reverse(windows, window_size, height, width, channels):
    """Stitch square windows back into a (height, width) feature map.

    Args:
        windows: Tensor reshapeable to
            (-1, height // window_size, width // window_size,
            window_size, window_size, channels).
        window_size: Side length of each window.
        height: Height of the feature map to rebuild.
        width: Width of the feature map to rebuild.
        channels: Size of the last axis.

    Returns:
        Tensor of shape (-1, height, width, channels).
    """
    rows, cols = height // window_size, width // window_size
    grid_shape = (-1, rows, cols, window_size, window_size, channels)
    grid = tf.reshape(windows, shape=grid_shape)
    # Interleave the window-grid axes with the in-window axes again.
    merged = tf.transpose(grid, perm=(0, 1, 3, 2, 4, 5))
    return tf.reshape(merged, shape=(-1, height, width, channels))


class DropPath(layers.Layer):
    """Randomly zero whole samples of a branch during training.

    Each sample in the batch is kept with probability ``1 - drop_prob``; kept
    samples are divided by that probability. Outside training, or when
    ``drop_prob`` is ``None``/0, the layer returns its input untouched.

    Args:
        drop_prob: Probability of dropping a sample. ``None`` (the default)
            and ``0.0`` both disable the layer.
    """

    def __init__(self, drop_prob=None, **kwargs):
        super(DropPath, self).__init__(**kwargs)
        self.drop_prob = drop_prob

    def call(self, inputs, training=None):
        # `not self.drop_prob` covers 0.0 and the default None; None used to
        # fail on `1 - None` as soon as the layer ran in training mode.
        if not self.drop_prob or not training:
            return inputs

        batch_size = tf.shape(inputs)[0]
        keep_prob = 1 - self.drop_prob
        # One keep/drop decision per sample, broadcast over all other axes.
        # The static rank is used so this does not depend on len() of a tensor.
        path_mask_shape = (batch_size,) + (1,) * (inputs.shape.rank - 1)
        path_mask = tf.floor(
            backend.random_bernoulli(path_mask_shape, p=keep_prob)
        )
        outputs = (
            tf.math.divide(tf.cast(inputs, dtype=tf.float32), keep_prob) * path_mask
        )
        return outputs

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "drop_prob": self.drop_prob,
            }
        )
        return config

In [None]:
class PatchExtract(layers.Layer):
    """Split an image-like tensor into non-overlapping patches.

    Args:
        patch_size: Pair ``(x, y)``; the first entry is used for the first
            spatial axis of the input and the second for the second one.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size_x = patch_size[0]
        # Was patch_size[0]: the second entry was silently ignored.
        self.patch_size_y = patch_size[1]

    def call(self, images):
        """Return the patches flattened to (batch, num_patches, patch_dim)."""
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=(1, self.patch_size_x, self.patch_size_y, 1),
            strides=(1, self.patch_size_x, self.patch_size_y, 1),
            rates=(1, 1, 1, 1),
            padding="VALID",
        )
        patch_dim = patches.shape[-1]
        # Count patches along both axes instead of assuming a square grid.
        patch_rows, patch_cols = patches.shape[1], patches.shape[2]
        return tf.reshape(patches, (batch_size, patch_rows * patch_cols, patch_dim))

    def get_config(self):
        config = super().get_config()
        # Emit the constructor argument so the config can rebuild the layer.
        config.update(
            {
                "patch_size": (self.patch_size_x, self.patch_size_y),
            }
        )
        return config


class PatchEmbedding(layers.Layer):
    """Project patches linearly and add a learned position embedding.

    Args:
        num_patch: Number of patch positions (size of the position table).
        embed_dim: Output dimension of both the projection and the
            position embedding.
    """

    def __init__(self, num_patch, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patch = num_patch
        # Kept so get_config can report every constructor argument.
        self.embed_dim = embed_dim
        self.proj = layers.Dense(embed_dim)
        self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)

    def call(self, patch):
        """Return ``proj(patch)`` plus the embedding of positions 0..num_patch-1."""
        pos = tf.range(start=0, limit=self.num_patch, delta=1)
        return self.proj(patch) + self.pos_embed(pos)

    def get_config(self):
        config = super().get_config()
        # embed_dim is a required constructor argument and was missing here.
        config.update(
            {
                "num_patch": self.num_patch,
                "embed_dim": self.embed_dim,
            }
        )
        return config


class PatchMerging(layers.Layer):
    """Merge each 2x2 neighbourhood of patches into one token.

    Args:
        num_patch: Pair ``(height, width)`` of the incoming patch grid.
        embed_dim: Embedding dimension; the output has ``2 * embed_dim`` channels.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepted like in the sibling layers, so the dict returned by
            ``get_config`` can be passed back to the constructor.
    """

    def __init__(self, num_patch, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patch = num_patch
        self.embed_dim = embed_dim
        self.linear_trans = layers.Dense(2 * embed_dim, use_bias=False)

    def call(self, x):
        """Return ``(merged_tokens, feat_maps)``.

        ``feat_maps`` is the input reshaped to (-1, height, width, C), taken
        before merging.
        """
        height, width = self.num_patch
        _, _, C = x.get_shape().as_list()
        x = tf.reshape(x, shape=(-1, height, width, C))
        feat_maps = x

        # The four strided views select the four positions of every 2x2 cell.
        x0 = x[:, 0::2, 0::2, :]
        x1 = x[:, 1::2, 0::2, :]
        x2 = x[:, 0::2, 1::2, :]
        x3 = x[:, 1::2, 1::2, :]
        x = tf.concat((x0, x1, x2, x3), axis=-1)
        x = tf.reshape(x, shape=(-1, (height // 2) * (width // 2), 4 * C))
        # 4*C concatenated channels are reduced to 2*embed_dim.
        return self.linear_trans(x), feat_maps

    def get_config(self):
        config = super().get_config()
        config.update({"num_patch": self.num_patch, "embed_dim": self.embed_dim})
        return config

In [None]:
class WindowAttention(layers.Layer):
    """Multi-head self-attention inside a window, with a learned relative position bias.

    Args:
        dim: Channel dimension of the input tokens.
        window_size: Pair ``(height, width)`` of the attention window.
        num_heads: Number of attention heads.
        qkv_bias: Whether the joint query/key/value projection uses a bias.
        dropout_rate: Rate of the dropout applied to the attention weights
            and to the projected output.
        return_attention_scores: When true, ``call`` also returns the
            attention weights.
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        dropout_rate=0.0,
        return_attention_scores=False,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        # Kept so get_config can report every constructor argument.
        self.qkv_bias = qkv_bias
        self.dropout_rate = dropout_rate
        self.scale = (dim // num_heads) ** -0.5
        self.return_attention_scores = return_attention_scores
        self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias)
        self.dropout = layers.Dropout(dropout_rate)
        self.proj = layers.Dense(dim)

    def build(self, input_shape):
        # One learnable bias per head for each of the
        # (2*Wh - 1) * (2*Ww - 1) possible relative offsets inside a window.
        self.relative_position_bias_table = self.add_weight(
            shape=(
                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1),
                self.num_heads,
            ),
            initializer="zeros",
            trainable=True,
            name="relative_position_bias_table",
        )

        self.relative_position_index = self.get_relative_position_index(
            self.window_size[0], self.window_size[1]
        )
        super().build(input_shape)

    def get_relative_position_index(self, window_height, window_width):
        """Return, for every pair of window positions, a row index into the bias table."""
        x_x, y_y = tf.meshgrid(range(window_height), range(window_width))
        coords = tf.stack([y_y, x_x], axis=0)
        coords_flatten = tf.reshape(coords, [2, -1])

        # Pairwise coordinate differences between all positions.
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = tf.transpose(relative_coords, perm=[1, 2, 0])

        # Shift both offsets to start at 0 and fold them into a single index.
        x_x = (relative_coords[:, :, 0] + window_height - 1) * (2 * window_width - 1)
        y_y = relative_coords[:, :, 1] + window_width - 1
        relative_coords = tf.stack([x_x, y_y], axis=-1)

        return tf.reduce_sum(relative_coords, axis=-1)

    def call(self, x, mask=None):
        """Apply window attention.

        Args:
            x: Tensor whose static shape unpacks as (windows, tokens, channels).
            mask: Optional additive mask whose first axis is the number of
                windows ``nW``; it is broadcast over batch and heads.

        Returns:
            The attended tokens, or ``(tokens, attention_weights)`` when
            ``return_attention_scores`` is set.
        """
        _, size, channels = x.shape
        head_dim = channels // self.num_heads
        x_qkv = self.qkv(x)
        x_qkv = tf.reshape(x_qkv, shape=(-1, size, 3, self.num_heads, head_dim))
        # -> (3, batch, heads, tokens, head_dim), so q/k/v can be sliced off axis 0.
        x_qkv = tf.transpose(x_qkv, perm=(2, 0, 3, 1, 4))
        q, k, v = x_qkv[0], x_qkv[1], x_qkv[2]
        q = q * self.scale
        k = tf.transpose(k, perm=(0, 1, 3, 2))
        attn = q @ k

        relative_position_bias = tf.gather(
            self.relative_position_bias_table,
            self.relative_position_index,
            axis=0,
        )
        # Move the head axis first so the bias lines up with attn's head axis.
        relative_position_bias = tf.transpose(relative_position_bias, [2, 0, 1])
        attn = attn + tf.expand_dims(relative_position_bias, axis=0)

        if mask is not None:
            nW = mask.get_shape()[0]
            mask_float = tf.cast(
                tf.expand_dims(tf.expand_dims(mask, axis=1), axis=0), tf.float32
            )
            # Expose the window axis so each window receives its own mask.
            attn = (
                tf.reshape(attn, shape=(-1, nW, self.num_heads, size, size))
                + mask_float
            )
            attn = tf.reshape(attn, shape=(-1, self.num_heads, size, size))
            attn = tf.nn.softmax(attn, axis=-1)
        else:
            attn = tf.nn.softmax(attn, axis=-1)
        attn = self.dropout(attn)

        x_qkv = attn @ v
        x_qkv = tf.transpose(x_qkv, perm=(0, 2, 1, 3))
        x_qkv = tf.reshape(x_qkv, shape=(-1, size, channels))
        x_qkv = self.proj(x_qkv)
        x_qkv = self.dropout(x_qkv)

        if self.return_attention_scores:
            return x_qkv, attn
        else:
            return x_qkv

    def get_config(self):
        config = super().get_config()
        # Only constructor arguments belong here: the derived `scale` is not
        # accepted by __init__, and qkv_bias / dropout_rate were missing.
        config.update(
            {
                "dim": self.dim,
                "window_size": self.window_size,
                "num_heads": self.num_heads,
                "qkv_bias": self.qkv_bias,
                "dropout_rate": self.dropout_rate,
                "return_attention_scores": self.return_attention_scores,
            }
        )
        return config

In [None]:
import jax
from jax import jit
from jax import random
from jax import numpy as jnp
from jax.experimental import jax2tf

class SwinTransformer(layers.Layer):
    """Swin block: (optionally shifted) window attention followed by an MLP.

    Both sub-blocks are wrapped in a residual connection with layer
    normalisation applied first.

    Args:
        dim: Channel dimension of the tokens.
        num_patch: Pair ``(height, width)`` of the patch grid.
        num_heads: Number of attention heads.
        window_size: Side length of the attention window. Clamped to the
            smaller grid side when the grid is smaller than the window.
        shift_size: Shift applied to the grid before windowing; 0 disables
            shifting and the attention mask.
        num_mlp: Hidden width of the MLP.
        qkv_bias: Forwarded to ``WindowAttention``.
        dropout_rate: Rate used for attention dropout, MLP dropout and the
            ``DropPath`` on both residual branches.
    """

    def __init__(
        self,
        dim,
        num_patch,
        num_heads,
        window_size=7,
        shift_size=0,
        num_mlp=1024,
        qkv_bias=True,
        dropout_rate=0.0,
        **kwargs,
    ):
        super(SwinTransformer, self).__init__(**kwargs)

        self.dim = dim
        self.num_patch = num_patch
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.num_mlp = num_mlp

        # Clamp BEFORE creating the attention layer: the window cannot exceed
        # the patch grid, and WindowAttention sizes its relative-position bias
        # from the window it is given. Clamping afterwards left the two out of
        # sync.
        if min(self.num_patch) < self.window_size:
            self.shift_size = 0
            self.window_size = min(self.num_patch)

        self.norm1 = layers.LayerNormalization(epsilon=1e-5)
        self.attn = WindowAttention(
            dim,
            window_size=(self.window_size, self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            dropout_rate=dropout_rate,
        )
        self.drop_path = (
            DropPath(dropout_rate) if dropout_rate > 0.0 else tf.identity
        )
        self.norm2 = layers.LayerNormalization(epsilon=1e-5)

        self.mlp = keras.Sequential(
            [
                layers.Dense(num_mlp),
                layers.Activation(keras.activations.gelu),
                layers.Dropout(dropout_rate),
                layers.Dense(dim),
                layers.Dropout(dropout_rate),
            ]
        )

    def build(self, input_shape):
        if self.shift_size == 0:
            self.attn_mask = None
        else:
            height, width = self.num_patch
            h_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            w_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            # Must be a NumPy array: the regions are labelled by in-place
            # slice assignment, which immutable JAX arrays reject.
            mask_array = np.zeros((1, height, width, 1), dtype=np.float32)
            count = 0
            for h in h_slices:
                for w in w_slices:
                    mask_array[:, h, w, :] = count
                    count += 1
            mask_array = tf.convert_to_tensor(mask_array)

            # mask array to windows
            mask_windows = window_partition(mask_array, self.window_size)
            mask_windows = tf.reshape(
                mask_windows, shape=[-1, self.window_size * self.window_size]
            )
            # Position pairs carrying different region labels get -100 so the
            # softmax suppresses them; pairs from the same region get 0.
            attn_mask = tf.expand_dims(mask_windows, axis=1) - tf.expand_dims(
                mask_windows, axis=2
            )
            attn_mask = tf.where(attn_mask != 0, -100.0, attn_mask)
            attn_mask = tf.where(attn_mask == 0, 0.0, attn_mask)
            self.attn_mask = tf.Variable(initial_value=attn_mask, trainable=False)

    def call(self, x):
        """Run the block on tokens whose static shape unpacks as (batch, patches, channels)."""
        height, width = self.num_patch
        _, num_patches_before, channels = x.shape
        x_skip = x
        x = self.norm1(x)
        x = tf.reshape(x, shape=(-1, height, width, channels))
        if self.shift_size > 0:
            shifted_x = tf.roll(
                x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2]
            )
        else:
            shifted_x = x

        x_windows = window_partition(shifted_x, self.window_size)
        x_windows = tf.reshape(
            x_windows, shape=(-1, self.window_size * self.window_size, channels)
        )
        attn_windows = self.attn(x_windows, mask=self.attn_mask)

        attn_windows = tf.reshape(
            attn_windows, shape=(-1, self.window_size, self.window_size, channels)
        )
        shifted_x = window_reverse(
            attn_windows, self.window_size, height, width, channels
        )
        # Undo the shift applied before windowing.
        if self.shift_size > 0:
            x = tf.roll(
                shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2]
            )
        else:
            x = shifted_x

        x = tf.reshape(x, shape=(-1, height * width, channels))
        x = self.drop_path(x)
        # First residual connection (attention branch).
        x = tf.cast(x_skip, dtype=tf.float32) + tf.cast(x, dtype=tf.float32)
        x_skip = x
        x = self.norm2(x)
        x = self.mlp(x)
        x = self.drop_path(x)
        # Second residual connection (MLP branch).
        x = tf.cast(x_skip, dtype=tf.float32) + tf.cast(x, dtype=tf.float32)
        return x

In [None]:
def train_step_sam(self, data, rho=0.05):
    """Sharpness-Aware Minimization replacement for ``Model.train_step``.

    The step runs two forward/backward passes: the first finds the
    perturbation ``eps_w`` that moves the weights towards higher loss inside
    a neighbourhood of size ``rho``; the second computes the gradient at that
    perturbed point, which is then applied to the original weights.

    Args:
        self: The Keras model being trained.
        data: ``(x, y)`` batch on which the model is trained.
        rho: Size of the neighbourhood searched for the perturbed point.

    Returns:
        Dict mapping metric names (including the loss) to their current value.
    """
    sample_weight = None
    x, y = data

    # First pass: gradient at the current weights.
    with tf.GradientTape() as tape:
        y_pred = self(x, training=True)
        loss = self.compiled_loss(y, y_pred, sample_weight=sample_weight, regularization_losses=self.losses)

    trainable_vars = self.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)

    # Step 1: move to the perturbed point w_adv = w + rho * g / ||g||.
    # global_norm ignores None entries; the small constant keeps the scale
    # finite when every gradient is exactly zero.
    grad_norm = tf.linalg.global_norm(gradients)
    scale = rho / (grad_norm + 1e-12)

    eps_w_ls = []  # perturbation applied to each variable (None = untouched)
    for var, grad in zip(trainable_vars, gradients):
        if grad is None:
            # The variable did not take part in the loss: nothing to perturb.
            eps_w_ls.append(None)
            continue
        eps_w = tf.math.multiply(grad, scale)
        # Temporarily move to the perturbed point.
        var.assign_add(eps_w)
        eps_w_ls.append(eps_w)

    # Second pass: gradient at the perturbed point w_adv.
    with tf.GradientTape() as tape:
        y_pred = self(x, training=True)
        loss = self.compiled_loss(y, y_pred, sample_weight=sample_weight, regularization_losses=self.losses)

    gradients = tape.gradient(loss, trainable_vars)

    # Step 2: go back to the original weights, then update them with the
    # gradient computed at w_adv.
    for var, eps_w in zip(trainable_vars, eps_w_ls):
        if eps_w is not None:
            var.assign_sub(eps_w)

    self.optimizer.apply_gradients(zip(gradients, trainable_vars))

    # The metrics see the predictions of the second (perturbed) forward pass.
    self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight)

    return {m.name: m.result() for m in self.metrics}

In [None]:
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization, LayerNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.applications.efficientnet_v2 import EfficientNetV2M
from sklearn.metrics import f1_score
from tensorflow.keras.losses import categorical_crossentropy

# Everything that creates variables (backbone, model, optimizer) is built
# inside the distribution strategy's scope.
with strategy.scope():
    base = EfficientNetV2M(weights='imagenet', include_top=False, input_shape=(img_height, img_width, 3))

    class HybridModel(keras.Model):
        """EfficientNetV2 backbone combined with a Swin-Transformer branch.

        The backbone exposes two outputs: an intermediate activation, which
        feeds the transformer branch, and its final feature map, which feeds
        the CNN head. Both pooled heads are concatenated and classified by a
        single softmax layer.

        Args:
            model_name: Name given to the Keras model.
        """

        def __init__(self, model_name, **kwargs):
            super().__init__(name=model_name, **kwargs)

            # base model with compatible output which will be an input of transformer model
            self.multi_output_cnn = keras.Model(
                [base.inputs],
                [base.get_layer("block6a_expand_activation").output, base.output],
                name="efficientnet",
            )

            # base model's (cnn model) head
            self.conv_head = keras.Sequential(
                [
                    layers.GlobalAveragePooling2D(),
                    layers.AlphaDropout(0.5),
                    layers.LayerNormalization()
                ],
                name="conv_head",
            )

            # stuff of swin transformers
            self.patch_extract = PatchExtract(patch_size)
            self.patch_embedds = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)
            self.patch_merging = PatchMerging(
                (num_patch_x, num_patch_y), embed_dim=embed_dim
            )

            # swin blocks containers: one block per value in range(shift_size),
            # each using its loop index as the shift.
            self.swin_sequences = keras.Sequential(name="swin_blocks")
            for i in range(shift_size):
                self.swin_sequences.add(
                    SwinTransformer(
                        dim=embed_dim,
                        num_patch=(num_patch_x, num_patch_y),
                        num_heads=num_heads,
                        window_size=window_size,
                        shift_size=i,
                        num_mlp=num_mlp,
                        qkv_bias=qkv_bias,
                        dropout_rate=dropout_rate,
                    )
                )

            # swin block's head
            self.swin_head = keras.Sequential(
                [
                    layers.GlobalAveragePooling1D(),
                    layers.AlphaDropout(0.5),
                    layers.LayerNormalization(),
                ],
                name="swin_head",
            )

            # classifier (17 output classes, softmax probabilities)
            self.classifier = layers.Dense(
                17, activation='softmax'
            )

            # build the graph
            self.build_graph()

        def forward_cnn(self, inputs):
            """Return the backbone's (intermediate activation, final feature map)."""
            return self.multi_output_cnn(inputs)

        def forward_transformer(self, inputs):
            """Run the Swin branch; returns (merged tokens, pre-merge feature map)."""
            x = self.patch_extract(inputs)
            x = self.patch_embedds(x)
            x = self.swin_sequences(tf.cast(x, dtype=tf.float32))
            x, swin_gcam_top = self.patch_merging(x)
            return x, swin_gcam_top

        def call(self, inputs, training=None, **kwargs):
            cnn_mid_layer, cnn_gcam_top = self.forward_cnn(inputs)
            transformer_output, transformer_gcam_top = self.forward_transformer(
                cnn_mid_layer
            )

            transformer_output = self.swin_head(transformer_output)
            cnn_output = self.conv_head(cnn_gcam_top)
            # Despite the name these are softmax probabilities, not raw logits.
            logits = self.classifier(tf.concat([transformer_output, cnn_output], axis=-1))

            return logits

        def build_graph(self):
            """Trace the model once on a symbolic input and wrap it as a functional model."""
            # (height, width, channels), matching the backbone's input_shape and
            # the shape passed to model.build(); the two axes used to be swapped.
            x = keras.Input(shape=(img_height, img_width, 3))
            return keras.Model(inputs=[x], outputs=self.call(x))

        # overriding the train_step method  with our custom train_step_sam created in earlier cell
        def train_step(self, data):
            return train_step_sam(self, data, rho=0.05) # using rho as 0.05 you can tune this hyperparameter

    # Focal loss definition
    def focal_loss(gamma=2.0, alpha=0.25):
        """Return a focal-loss function built on categorical cross-entropy.

        Args:
            gamma: Exponent applied to ``1 - pt``.
            alpha: Constant factor applied to the loss.
        """
        def loss(y_true, y_pred):
            ce_loss = categorical_crossentropy(y_true, y_pred)
            # pt = exp(-CE); the (1 - pt) ** gamma factor shrinks the loss as pt approaches 1.
            pt = tf.math.exp(-ce_loss)
            weighted_loss = alpha * tf.math.pow(1. - pt, gamma) * ce_loss
            return weighted_loss
        return loss

    model = HybridModel("efficientnet")
    model.build(input_shape=(None, img_height, img_width, 3))

    model.compile(
        optimizer = tf.keras.optimizers.AdamW(learning_rate=0.0001),
        loss = focal_loss(),
        metrics=['accuracy'])

In [None]:
# Print the model's layer and parameter summary.
model.summary()

In [None]:
from sklearn.model_selection import StratifiedKFold

# Callbacks
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta = 0.001,
    patience=8,
    restore_best_weights=True,
    verbose=0)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
checkpoint_dir = '/content/Checkpoints'
# -inf so the first fold is always recorded, even if its score is exactly 0.
best_f1_score = float('-inf')
best_weights = None

# Fix the one-hot width once so every fold produces labels of the same shape,
# whatever classes happen to land in it.
num_classes = int(y_data.max()) + 1
y_onehot = tf.keras.utils.to_categorical(y_data, num_classes=num_classes)

# Snapshot of the untrained state. Every fold restarts from it; otherwise the
# model evaluated on fold k has already been trained on fold k's validation
# images during earlier folds, which inflates the score and biases the
# "best fold" selection.
initial_weights = model.get_weights()
initial_lr = float(tf.keras.backend.get_value(model.optimizer.learning_rate))

for fold, (train_index, val_index) in enumerate(kfold.split(x_data, y_data)):

    # Restart from the initial weights and undo any learning-rate reduction
    # made by ReduceLROnPlateau in the previous fold.
    # NOTE(review): the optimizer's internal moment estimates are not reset
    # here; rebuild and recompile the model per fold if that matters.
    model.set_weights(initial_weights)
    tf.keras.backend.set_value(model.optimizer.learning_rate, initial_lr)

    # Split into training and validation data
    x_train, y_train = x_data[train_index], y_data[train_index]
    x_val, y_val = x_data[val_index], y_data[val_index]

    model.fit(x_train, y_onehot[train_index],
                batch_size=32,
                epochs=30,
                validation_data=(x_val, y_onehot[val_index]),
                callbacks=[earlystopping, reduce_lr])

    # Predict on the validation fold
    y_pred = model.predict(x_val)
    y_pred_label = np.argmax(y_pred, axis=1)

    # Evaluate with macro-averaged F1
    score_f1 = f1_score(y_val, y_pred_label, average='macro')
    print(f'Fold {fold+1} - f1_score:', score_f1)

    # Keep the weights of the fold with the best f1_score
    if score_f1 > best_f1_score:
        best_f1_score = score_f1
        best_weights = model.get_weights()

# Save the best fold's weights as a checkpoint
# NOTE(review): the prefix ends with '/', so the checkpoint files get an empty
# base name inside the directory - kept as-is because a loader may rely on it.
checkpoint_name = f'{checkpoint_dir}/'
model.set_weights(best_weights)
model.save_weights(checkpoint_name)