# 2025 데이터 크리에이터 캠프

@PHASE: Mission 2

@TEAM: 최후의 인공지능

## Check GPU Availability

In [None]:
# Notebook shell escape: print the GPUs visible on this machine and their current utilisation.
!nvidia-smi

In [None]:
# Choose which physical GPU indices this process is allowed to see.
# The variable has to be in the environment before CUDA is initialised.
DEVICE_NUM = 0      # index of the first GPU to expose
ADDITIONAL_GPU = 0  # number of further consecutive GPUs to expose after it

from os import environ
environ["CUDA_VISIBLE_DEVICES"] = ",".join(
    str(gpu_index)
    for gpu_index in range(DEVICE_NUM, DEVICE_NUM + ADDITIONAL_GPU + 1)
)
environ["CUDA_VISIBLE_DEVICES"]

## Imports

In [None]:
import os
# Display the current working directory before it is changed in the next cell.
os.getcwd()

In [None]:
# Switch to the project source directory so that the `creator_camp` package import
# and the relative "./data" path used below resolve from here.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("/home/ubuntu/test_trainer/src")

In [None]:
from os import path

from creator_camp.datasets import KompsatDatasetForHeightRegression, DatasetHolder

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import SwinModel, AutoImageProcessor, AutoConfig

from tqdm.auto import tqdm
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"INFO: Using device - {device}")

In [None]:
# Hugging Face Hub id of the pretrained checkpoint; it is used below to load the
# image processor, the model config and the backbone weights.
base_model_id = "microsoft/swin-base-patch4-window7-224"

## Define Dataset

In [None]:
# Dataset root, relative to the working directory set earlier in the notebook.
DATA_ROOT = path.join(".", "data")

# Bundle the two dataset objects; `train=True` / `train=False` presumably select the
# training and validation splits — confirm against creator_camp.datasets.
kompstats = DatasetHolder(
    train=KompsatDatasetForHeightRegression(root=DATA_ROOT, train=True),
    valid=KompsatDatasetForHeightRegression(root=DATA_ROOT, train=False)
)
# The test slot is an alias of the validation dataset (same object, not a copy),
# so any "test" numbers computed from it are validation numbers.
kompstats.test = kompstats.valid

In [None]:
# Display the first training sample as returned by the dataset before any target_transform is attached.
kompstats.train[0]

In [None]:
# Visual sanity check: draw the first training image with every annotated region overlaid.
rgb_image, annotation = kompstats.train[0]
fig, axes = plt.subplots(1, 1, figsize=(6, 6))

axes.imshow(rgb_image)
axes.set_title('Image')
axes.axis('off')

for region in annotation['regions']:
    # 'xywh' is unpacked as an anchor point plus width and height and drawn as a red box.
    x1, y1, w, h = region['xywh']
    rect = plt.Rectangle((x1, y1), w, h, fill=False, edgecolor='red', linewidth=2)
    axes.add_patch(rect)

    # 'polyline' is read as a flat (x_start, y_start, x_end, y_end) segment and drawn in blue.
    # NOTE(review): only the first four values are used — confirm it never holds more points.
    polyline = region['polyline']
    xs = [polyline[0], polyline[2]]
    ys = [polyline[1], polyline[3]]
    axes.plot(xs, ys, color='blue', linewidth=2)

plt.tight_layout()
plt.show()

In [None]:
# Coordinate scale of the raw annotations; target_transform divides by this value.
# NOTE(review): presumably the side length in pixels of the source images — confirm.
ORIGIN_SIZE = 512
# Image processor matching the pretrained checkpoint; it is applied to the images in collate_fn.
PROCESSOR = AutoImageProcessor.from_pretrained(base_model_id, use_fast=True)
# Height configured on the processor; annotation coordinates are rescaled to this size.
TARGET_SIZE = PROCESSOR.size['height']

# Display the resolved size.
TARGET_SIZE

In [None]:
def target_transform(data, *, origin_size=None, target_size=None):
    """Convert one raw annotation dict into regression targets.

    Args:
        data: Annotation dict with a ``'regions'`` list; every region supplies
            a 4-value ``'xywh'`` box and a ``'chi_height'`` value.
        origin_size: Scale the box coordinates are expressed in. Defaults to
            the module-level ``ORIGIN_SIZE``.
        target_size: Scale the boxes are converted to. Defaults to the
            module-level ``TARGET_SIZE``.

    Returns:
        A ``(boxes, heights)`` tuple. ``boxes`` is a float tensor of shape
        ``(num_regions, 4)`` rescaled from ``origin_size`` to ``target_size``;
        ``heights`` is a float tensor of shape ``(num_regions,)`` holding
        ``chi_height / 100``.
    """
    # Resolve the defaults at call time so that re-running the constants cell
    # is picked up without redefining this function.
    if origin_size is None:
        origin_size = ORIGIN_SIZE
    if target_size is None:
        target_size = TARGET_SIZE

    regions = data['regions']
    boxes = torch.tensor([region['xywh'] for region in regions], dtype=torch.float32)
    heights = torch.tensor([region['chi_height'] for region in regions], dtype=torch.float32)

    # reshape pins the box tensor to (N, 4): an image without any region then
    # yields shape (0, 4) instead of the (0,) that torch.tensor([]) produces,
    # which a 4-input linear layer cannot consume.
    return boxes.reshape(-1, 4) / origin_size * target_size, heights / 100

In [None]:
# Attach the transform to both datasets. kompstats.test is the same object as
# kompstats.valid, so it receives the transform as well.
kompstats.train.target_transform = target_transform
kompstats.valid.target_transform = target_transform

In [None]:
# Display the first training sample again, now that target_transform has been attached.
kompstats.train[0]

## DataLoader

In [None]:
# Set Batch Size
# Batch sizes as (train, valid, test) tuples, keyed by target hardware.
BATCH_SIZE_PRESETS = {
    "Local": (4, 8, 8),
    "A6000": (32, 64, 64),
    "A100": (300, 515, 515),
}
# Active preset; change the key to switch hardware.
BATCH_SIZE = BATCH_SIZE_PRESETS["A100"]

print(f"INFO: Set batch size - Train: {BATCH_SIZE[0]}, Valid: {BATCH_SIZE[1]}, Test: {BATCH_SIZE[2]}")

In [None]:
def collate_fn(batch):
    """Collate ``(image, (coords, heights))`` samples into one batch.

    Images are run through the module-level ``PROCESSOR`` and stacked into a
    single ``pixel_values`` tensor on ``device``. The per-image coordinate and
    height tensors are returned as tuples, one entry per image, and are not
    stacked.

    NOTE(review): the tensor is moved to ``device`` inside the collate
    function. That is fine for the default in-process loading used here, but
    returning CUDA tensors from DataLoader worker processes is discouraged, so
    revisit this before setting ``num_workers`` above zero.
    """
    images, labels = zip(*batch)
    encoded = PROCESSOR(images=list(images), return_tensors="pt").to(device)
    coords, heights = zip(*labels)
    return encoded['pixel_values'], coords, heights

In [None]:
# One loader per split; only the training loader shuffles. BATCH_SIZE is indexed as
# (train, valid, test). kompstats.test was assigned from kompstats.valid, so the test
# loader iterates the validation data.
train_loader = DataLoader(kompstats.train, batch_size=BATCH_SIZE[0], shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(kompstats.valid, batch_size=BATCH_SIZE[1], shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(kompstats.test, batch_size=BATCH_SIZE[2], shuffle=False, collate_fn=collate_fn)

In [None]:
# Pull a single batch from the training loader to sanity-check the collate output.
load_samples = next(iter(train_loader))
load_samples

In [None]:
# Shape of the batched pixel_values tensor (first element returned by collate_fn).
load_samples[0].shape

## Load Model

In [None]:
# Configuration of the pretrained checkpoint; it is used to build the backbone inside the model.
base_config = AutoConfig.from_pretrained(base_model_id)
# Display the hidden width, which also sizes the coordinate embedding, attention and head layers.
base_config.hidden_size

In [None]:
# Load the pretrained backbone on the CPU and keep only its weights; they are copied
# into the regression model's backbone after it has been constructed.
base_state_dict = SwinModel.from_pretrained(base_model_id, device_map="cpu").state_dict()

In [None]:
class SwinForHeightRegression(nn.Module):
    """Swin backbone with a cross-attention head that regresses one value per region.

    Each region's 4-value coordinate vector is linearly embedded and used as a
    query that attends over the backbone's final hidden states of the same
    image; a linear head then maps every attended query to a single scalar.

    Args:
        config: Configuration passed to ``SwinModel``; its ``hidden_size``
            sets the width of the embedding, attention and head layers.
        decoder_height: Number of stacked cross-attention layers.
        num_heads: Attention heads per cross-attention layer; must divide
            ``hidden_size``.
        dropout: Dropout probability inside each cross-attention layer.
    """

    def __init__(self, config, decoder_height=1, num_heads=8, dropout=0.1):
        super().__init__()
        self.model = SwinModel(config)
        hidden_size = self.model.config.hidden_size
        self.coords_embedding = nn.Linear(4, hidden_size)
        self.decoder_height = decoder_height
        self.cross_attentions = nn.ModuleList([
            nn.MultiheadAttention(
                embed_dim=hidden_size,
                num_heads=num_heads,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(self.decoder_height)
        ])
        self.head = nn.Linear(hidden_size, 1)

    def forward(self, pixel_values, polylines):
        """Predict one scalar per region for every image in the batch.

        Args:
            pixel_values: Batched image tensor fed to the backbone.
            polylines: Iterable with one tensor per image holding that image's
                region coordinates, four values per region.

        Returns:
            A list with one 1-D tensor per image containing one prediction per
            region; the tensor is empty for an image without regions.
        """
        # Follow the module's own placement and precision instead of relying
        # on a notebook-level `device` global being defined and in sync.
        module_device = self.model.device
        dtype = self.model.dtype
        outputs = self.model(pixel_values.to(device=module_device, dtype=dtype))
        results = []

        # Images carry different numbers of regions, so each one is decoded
        # separately as a batch of size 1.
        for hidden_states, polyline in zip(outputs.last_hidden_state, polylines):
            # reshape leaves (n, 4) input untouched and turns an empty tensor into (0, 4).
            queries = polyline.to(device=module_device, dtype=dtype).reshape(-1, 4)
            if queries.shape[0] == 0:
                # Nothing to predict for this image; keep the output aligned with the batch.
                results.append(hidden_states.new_zeros(0))
                continue

            coords_embed = self.coords_embedding(queries).unsqueeze(0)

            key_and_value = hidden_states.unsqueeze(0)
            for cross_attn in self.cross_attentions:
                coords_embed, _ = cross_attn(coords_embed, key_and_value, key_and_value)

            results.append(self.head(coords_embed).reshape(-1))
        return results

In [None]:
# Build the regression model and copy the pretrained weights into its backbone only;
# the coordinate embedding, cross-attention and head layers keep their fresh initialisation.
model = SwinForHeightRegression(base_config)
model.model.load_state_dict(base_state_dict)
# Cast every parameter to bfloat16 and move the module to the selected device.
# NOTE(review): parameters themselves are stored in bfloat16 (no float32 master copy) —
# confirm this precision is acceptable for the learning rates used below.
model.bfloat16().to(device)

## Train

In [None]:
# Set Epoch Count & Learning Rate
EPOCHS = 100
LEARNING_RATE = 1e-4, 1e-6

In [None]:
# Despite its name, `classifier` is the loss function: mean squared error between
# predicted and target values.
classifier = nn.MSELoss()
# The optimizer is given every model parameter, including backbone parameters that the
# training loop freezes at first and unfreezes later.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE[0], weight_decay=0.01)
# Cosine annealing with warm restarts: the first cycle lasts T_0 scheduler steps and each
# following cycle is T_mult times longer; the rate decays towards LEARNING_RATE[1].
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=LEARNING_RATE[1])

In [None]:
# Stage 1: train only the newly added layers while the pretrained backbone is frozen.
for name, param in model.model.named_parameters():
    param.requires_grad = False  # backbone freeze

# First epoch index at which the backbone is trained (same point as the former `epoch > 20`).
UNFREEZE_EPOCH = 21

# Real number of batches per epoch. The loaders also yield the final partial batch, so
# this is ceil(len(dataset) / batch_size); rounding to nearest under-counts whenever the
# remainder is smaller than half a batch and skews the epoch averages.
train_batches = len(train_loader)
valid_batches = len(valid_loader)

for epoch in tqdm(range(EPOCHS), desc="Running Epochs"):
    train_loss, valid_loss = 0, 0
    train_rmse, valid_rmse = 0, 0

    # Stage 2: unfreeze the backbone once, leaving parameters whose name contains
    # "norm" frozen. requires_grad stays set afterwards, so repeating this is unnecessary.
    if epoch == UNFREEZE_EPOCH:
        for name, param in model.model.named_parameters():
            if "norm" not in name:
                param.requires_grad = True

    model.train()
    train_bar = tqdm(total=train_batches, desc=f"Training for {epoch+1}/{EPOCHS}")
    for inputs, coords, heights in train_loader:
        optimizer.zero_grad()

        # The model returns one prediction tensor per image; flatten everything into a
        # single vector so the loss weights every region equally.
        preds = model(inputs, coords)
        all_preds = torch.cat([pred.flatten() for pred in preds])
        all_heights = torch.cat([height.flatten() for height in heights]).to(device).to(all_preds.dtype)

        losses = classifier(all_preds, all_heights)
        # Targets were divided by 100 in target_transform; scale back for the reported
        # RMSE. This is a metric only, so it is detached from the autograd graph.
        rmse = torch.sqrt(classifier(all_preds.detach()*100, all_heights*100))
        losses.backward()

        train_loss += losses.item()
        train_rmse += rmse.item()

        optimizer.step()
        train_bar.update(1)
        train_bar.set_postfix({"Loss": f"{losses.item():.6f}", "RMSE": rmse.item()})
    # Epoch summary: mean of the per-batch values (not a dataset-level RMSE).
    # max(..., 1) avoids a division by zero for an empty loader.
    train_bar.set_postfix({"Loss": f"{train_loss / max(train_batches, 1):.6f}", "RMSE": train_rmse / max(train_batches, 1)})
    train_bar.close()

    model.eval()
    valid_bar = tqdm(total=valid_batches, desc=f"Validating for {epoch+1}/{EPOCHS}")
    with torch.inference_mode():
        for inputs, coords, heights in valid_loader:
            preds = model(inputs, coords)
            all_preds = torch.cat([pred.flatten() for pred in preds])
            all_heights = torch.cat([height.flatten() for height in heights]).to(device).to(all_preds.dtype)

            losses = classifier(all_preds, all_heights)
            rmse = torch.sqrt(classifier(all_preds*100, all_heights*100))

            valid_loss += losses.item()
            valid_rmse += rmse.item()
            valid_bar.update(1)

    valid_bar.set_postfix({"Loss": f"{valid_loss / max(valid_batches, 1):.6f}", "RMSE": valid_rmse / max(valid_batches, 1)})
    valid_bar.close()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

 ## Evaluate