# TOC

1. [Import](#1-import)
2. [필요한 정보 입력](#2-필요한-정보-입력)
3. [클래스별 면적](#3-클래스별-면적)   
4. [Outlier](#4-outlier)   
    4.1. [Outlier 찾기](#41-outlier-찾기)   
    4.2. [Dice 점수가 가장 낮은 이미지 살펴보기](#42-dice-점수가-가장-낮은-이미지-살펴보기)      
    4.3. [Dice 점수가 가장 높은 이미지 살펴보기](#43-dice-점수가-가장-높은-이미지-살펴보기)   

# 1. Import

In [None]:
import os
os.chdir('/opt/ml/input/code/local')

import numpy as np
import albumentations as A
import matplotlib.pyplot as plt
import pandas as pd
import cv2
from tqdm import tqdm
import seaborn as sns
import json

import torch
import torch.nn.functional as F

from dataset import XRayDataset
from visualize import label2rgb
from metric import dice_coef

# 2. 필요한 정보 입력

In [None]:
# Dataset root: passed to XRayDataset and used to locate meta_data.xlsx.
data_root = "/opt/ml/input/data"
# Experiment directory from which best_model.pt is loaded in section 4.1.
save_dir = "/opt/ml/input/code/local/checkpoints/[test]Baseline1_1226"

In [None]:
# Class names in channel order: "finger-1" .. "finger-19" followed by ten
# individually named classes. Later cells use the position in this list as the
# class index.
CLASSES = [f"finger-{number}" for number in range(1, 20)] + [
    "Trapezium",
    "Trapezoid",
    "Capitate",
    "Hamate",
    "Scaphoid",
    "Lunate",
    "Triquetrum",
    "Pisiform",
    "Radius",
    "Ulna",
]

# 29 three-element tuples of 0-255 integers (presumably RGB colours) -- the
# same count as CLASSES.
# NOTE(review): looks like one colour per class in CLASSES order, but no
# visible cell reads PALETTE directly; confirm against visualize.label2rgb.
PALETTE = [
    (220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228),
    (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30),
    (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30), (165, 42, 42),
    (255, 77, 255), (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157),
    (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), (255, 179, 240),
    (0, 125, 92), (209, 0, 151), (188, 208, 182), (0, 220, 176),
]

# 3. 클래스별 면적

In [None]:
def calculate_polygon_area(points):
    """Return the area enclosed by a polygon, using the shoelace formula.

    Args:
        points: Sequence of vertices in order around the polygon; each vertex
            is indexable with x at position 0 and y at position 1.

    Returns:
        The non-negative area as a float, whatever the winding direction.
        Inputs with fewer than three vertices give 0.0.
    """
    vertex_count = len(points)
    twice_signed_area = 0.0
    for position, current in enumerate(points):
        # Wrap around so the last vertex is paired with the first one.
        following = points[(position + 1) % vertex_count]
        twice_signed_area += current[0] * following[1]
        twice_signed_area -= following[0] * current[1]
    return abs(twice_signed_area) / 2.0

In [None]:
# Build the untransformed "train_all" dataset and keep a handle on its file table.
dataset = XRayDataset(data_root, split="train_all", transforms=None)
df = dataset.df

# Load the metadata spreadsheet that sits in the data root.
meta = pd.read_excel(
    os.path.join(data_root, "meta_data.xlsx"),
    engine="openpyxl",
)

In [None]:
# Running total of polygon area per class, starting from zero.
areas = dict.fromkeys(CLASSES, 0)

In [None]:
# Walk the file table row by row and add every annotated polygon's area to the
# running total of its class.
for image_path, label_path in tqdm(df.itertuples(index=False, name=None), total=len(df)):
    with open(label_path) as label_file:
        annotations = json.load(label_file)["annotations"]
    for annotation in annotations:
        areas[annotation["label"]] += calculate_polygon_area(annotation["points"])

In [None]:
# Mean polygon area per class (total floor-divided by the number of rows in the
# file table). Stored under its own name instead of overwriting `areas`, so
# re-running this cell does not divide the accumulated totals a second time.
mean_areas = {key: areas[key] // len(df) for key in CLASSES}
labels = np.array(list(mean_areas.keys()))
values = np.array(list(mean_areas.values()))
# Highlight the classes whose mean area lies below the first quartile.
color = np.where(values < np.quantile(values, 0.25), "tomato", "royalblue")

In [None]:
# Inverse-area share of each class in percent: the smaller a class's average
# area, the larger its share. Left as the cell's last expression so it is displayed.
inverse_area = sum(values) / values
inverse_area / sum(inverse_area) * 100

In [None]:
# Bar chart of the average area per class, coloured by the `color` array.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
ax.bar(labels, values, color=color, edgecolor='black')
# Class names are long, so turn the x tick labels upright.
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_ylabel('Average Area')
ax.set_title('Average Area for each Class')
plt.show()

# 4. Outlier

## 4.1. Outlier 찾기

In [None]:
# Rebuild the dataset with a 512x512 resize transform for inference.
transform = A.Resize(512, 512)
dataset = XRayDataset(data_root, transforms=transform, split="train_all")
# torch.load unpickles the checkpoint, which can execute arbitrary code --
# only load files that come from a trusted source.
model = torch.load(os.path.join(save_dir, "best_model.pt"))
# Cut-off applied to the sigmoid outputs when binarising predictions.
thr = 0.5

In [None]:
# Run the model over every sample, keep each binarised prediction in `preds`,
# and collect the Dice scores into `df` with one column per class.
# (Assumes dice_coef returns a (batch, classes) tensor -- TODO confirm.)
dices = []
preds = []

# Inference only: put layers such as dropout / batch-norm into evaluation
# behaviour and disable autograd so no graph is built for each forward pass.
model.eval()
with torch.no_grad():
    for images, masks in tqdm(dataset):
        # Add the leading batch dimension used by the model call and dice_coef.
        images, masks = images.unsqueeze(0), masks.unsqueeze(0)

        outputs = model(images.cuda())["out"]
        output_h, output_w = outputs.size(-2), outputs.size(-1)
        mask_h, mask_w = masks.size(-2), masks.size(-1)

        # Bring the logits to the mask resolution when the two differ.
        if output_h != mask_h or output_w != mask_w:
            outputs = F.interpolate(outputs, size=(mask_h, mask_w), mode="bilinear")

        outputs = torch.sigmoid(outputs)
        outputs = (outputs > thr).detach().cpu()
        preds.append(outputs)
        masks = masks.detach().cpu()

        dice = dice_coef(outputs, masks)
        dices.append(dice)

dices = torch.cat(dices, 0)
# Hand pandas plain NumPy columns so the frame gets a numeric dtype no matter
# how the installed pandas version treats tensors.
dice = {c: dices[:, idx].cpu().numpy() for idx, c in enumerate(CLASSES)}
df = pd.DataFrame(dice)

In [None]:
# Box plot of the Dice distribution per class, with the mean marked on each box.
fig, ax = plt.subplots(figsize=(15, 8))
box = sns.boxplot(data=df, ax=ax, showmeans=True)
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, fontsize=20)
fig.tight_layout()
plt.show()

## 4.2. Dice 점수가 가장 낮은 이미지 살펴보기

In [None]:
def _padded_bbox(binary_mask, pad):
    """Return the foreground bounding box of a 2-D mask, grown by ``pad`` pixels.

    Args:
        binary_mask: 2-D array; non-zero entries count as foreground.
        pad: Margin in pixels added on every side.

    Returns:
        ``(top, bottom, left, right)`` usable directly as slice bounds and
        clamped to the mask extent, or ``None`` if the mask has no foreground.
    """
    rows, cols = np.nonzero(binary_mask)
    if rows.size == 0:
        return None
    height, width = binary_mask.shape
    # Clamp at 0: a negative slice start would wrap around to the far edge.
    top = max(int(rows.min()) - pad, 0)
    left = max(int(cols.min()) - pad, 0)
    # +1 makes the far edge include the last foreground pixel.
    bottom = min(int(rows.max()) + pad + 1, height)
    right = min(int(cols.max()) + pad + 1, width)
    return top, bottom, left, right


def show_detail(idx, c, interval = 10):
    """Plot a zoomed-in view of one class for one sample: image, GT mask, prediction.

    The view is cropped to the bounding box of the predicted mask of class
    ``c`` plus a margin. If nothing was predicted for that class, the
    ground-truth mask defines the box instead; if that is empty too, the whole
    frame is shown.

    Args:
        idx: Position of the sample in the global ``dataset`` and ``preds``.
        c: Class index into ``CLASSES``.
        interval: Margin in pixels around the bounding box.
    """
    images, masks = dataset[idx]
    filename = dataset.df["filenames"].iloc[idx].split("/")[-1]
    pred = preds[idx][0].cpu().numpy()
    gt = np.asarray(masks)
    mask_h, mask_w = pred.shape[-2:]

    # Crop box in mask coordinates: prediction first, then GT, then full frame.
    bbox = _padded_bbox(pred[c], interval)
    if bbox is None:
        bbox = _padded_bbox(gt[c], interval)
    if bbox is None:
        bbox = (0, mask_h, 0, mask_w)
    top, bottom, left, right = bbox

    # Image: channel-first -> channel-last, scaled by 255 and cast to uint8.
    img = np.transpose(images.cpu().numpy(), (1, 2, 0))
    # Scale out of place: this array shares memory with the dataset's tensor.
    img = (img * 255).astype(np.uint8)
    # The crop box is in mask coordinates, so the image must have the mask's size.
    if img.shape[:2] != (mask_h, mask_w):
        img = cv2.resize(img, (mask_w, mask_h))

    # GT mask with every class except `c` blanked out.
    gt_only = np.zeros_like(gt)
    gt_only[c] = gt[c]
    gt_mask = label2rgb(gt_only)

    # Predicted mask with every class except `c` blanked out.
    pred_only = np.zeros_like(pred)
    pred_only[c] = pred[c]
    pred_mask = label2rgb(pred_only)

    # Zoom all three views into the same box.
    img = img[top:bottom, left:right]
    gt_mask = gt_mask[top:bottom, left:right]
    pred_mask = pred_mask[top:bottom, left:right]

    h, w, _ = img.shape
    ratio_h, ratio_w = h/(h+w), w/(h+w)

    # Three panels side by side; the figure shape follows the crop's aspect ratio.
    fig, ax = plt.subplots(1, 3, figsize=(20*ratio_w, 20*ratio_h))
    show_imgs = [img, gt_mask, pred_mask]
    show_titles = [f"{CLASSES[c]}", "GT", "Pred"]
    for i, (show_img, show_title) in enumerate(zip(show_imgs, show_titles)):
        ax[i].imshow(show_img, cmap='gray')
        ax[i].set_title(show_title, fontsize=20)
        ax[i].set_xticks([])
        ax[i].set_yticks([])

    fig.suptitle(f"{idx}-{filename}", y = 0.7, fontsize=20)

    plt.tight_layout()
    plt.show()

In [None]:
# Choose the class to inspect (index into CLASSES).
cls_idx = 28

# Lower fence per class: Q1 - 1.5 * (Q3 - Q1), read from the describe() summary.
summary = df.describe()
q1, q3 = summary.loc["25%"], summary.loc["75%"]
min_val = q1 - 1.5 * (q3 - q1)

# For every class, collect (row position, distance below the fence) of each
# sample whose Dice score falls under the fence.
outlier = {c: [] for c in CLASSES}
for idx, (row_label, val) in enumerate(df.iterrows()):
    for c in CLASSES:
        if val[c] < min_val[c]:
            outlier[c].append((idx, min_val[c] - val[c]))

candidates = outlier[CLASSES[cls_idx]]
if not candidates:
    print("No outlier")
else:
    # Worst first: the largest distance below the fence comes first.
    ranked = sorted(candidates, key=lambda item: item[1], reverse=True)
    outlier_idx = np.array([row for row, _gap in ranked], dtype=np.int64)

    # Visualise the worst sample.
    show_detail(outlier_idx[0], cls_idx, interval=10)

## 4.3. Dice 점수가 가장 높은 이미지 살펴보기

In [None]:
def _padded_bbox(binary_mask, pad):
    """Return the foreground bounding box of a 2-D mask, grown by ``pad`` pixels.

    Args:
        binary_mask: 2-D array; non-zero entries count as foreground.
        pad: Margin in pixels added on every side.

    Returns:
        ``(top, bottom, left, right)`` usable directly as slice bounds and
        clamped to the mask extent, or ``None`` if the mask has no foreground.
    """
    rows, cols = np.nonzero(binary_mask)
    if rows.size == 0:
        return None
    height, width = binary_mask.shape
    # Clamp at 0: a negative slice start would wrap around to the far edge.
    top = max(int(rows.min()) - pad, 0)
    left = max(int(cols.min()) - pad, 0)
    # +1 makes the far edge include the last foreground pixel.
    bottom = min(int(rows.max()) + pad + 1, height)
    right = min(int(cols.max()) + pad + 1, width)
    return top, bottom, left, right


def show_detail(idx, c, interval = 10):
    """Plot a zoomed-in view of one class for one sample: image, GT mask, prediction.

    The view is cropped to the bounding box of the predicted mask of class
    ``c`` plus a margin. If nothing was predicted for that class, the
    ground-truth mask defines the box instead; if that is empty too, the whole
    frame is shown.

    Args:
        idx: Position of the sample in the global ``dataset`` and ``preds``.
        c: Class index into ``CLASSES``.
        interval: Margin in pixels around the bounding box.
    """
    images, masks = dataset[idx]
    filename = dataset.df["filenames"].iloc[idx].split("/")[-1]
    pred = preds[idx][0].cpu().numpy()
    gt = np.asarray(masks)
    mask_h, mask_w = pred.shape[-2:]

    # Crop box in mask coordinates: prediction first, then GT, then full frame.
    bbox = _padded_bbox(pred[c], interval)
    if bbox is None:
        bbox = _padded_bbox(gt[c], interval)
    if bbox is None:
        bbox = (0, mask_h, 0, mask_w)
    top, bottom, left, right = bbox

    # Image: channel-first -> channel-last, scaled by 255 and cast to uint8.
    img = np.transpose(images.cpu().numpy(), (1, 2, 0))
    # Scale out of place: this array shares memory with the dataset's tensor.
    img = (img * 255).astype(np.uint8)
    # The crop box is in mask coordinates, so the image must have the mask's size.
    if img.shape[:2] != (mask_h, mask_w):
        img = cv2.resize(img, (mask_w, mask_h))

    # GT mask with every class except `c` blanked out.
    gt_only = np.zeros_like(gt)
    gt_only[c] = gt[c]
    gt_mask = label2rgb(gt_only)

    # Predicted mask with every class except `c` blanked out.
    pred_only = np.zeros_like(pred)
    pred_only[c] = pred[c]
    pred_mask = label2rgb(pred_only)

    # Zoom all three views into the same box.
    img = img[top:bottom, left:right]
    gt_mask = gt_mask[top:bottom, left:right]
    pred_mask = pred_mask[top:bottom, left:right]

    h, w, _ = img.shape
    ratio_h, ratio_w = h/(h+w), w/(h+w)

    # Three panels side by side; the figure shape follows the crop's aspect ratio.
    fig, ax = plt.subplots(1, 3, figsize=(20*ratio_w, 20*ratio_h))
    show_imgs = [img, gt_mask, pred_mask]
    show_titles = [f"{CLASSES[c]}", "GT", "Pred"]
    for i, (show_img, show_title) in enumerate(zip(show_imgs, show_titles)):
        ax[i].imshow(show_img, cmap='gray')
        ax[i].set_title(show_title, fontsize=20)
        ax[i].set_xticks([])
        ax[i].set_yticks([])

    fig.suptitle(f"{idx}-{filename}", y = 0.7, fontsize=20)

    plt.tight_layout()
    plt.show()

In [None]:
# Choose the class to inspect (index into CLASSES).
cls_idx = 28

# Highest Dice score per class, read from the describe() summary.
max_val = df.describe().loc["max"]

# For every class, collect the row positions whose score equals that maximum
# (the second tuple element is the difference to the maximum, i.e. zero).
inlier = {c: [] for c in CLASSES}
for idx, (row_label, val) in enumerate(df.iterrows()):
    for c in CLASSES:
        if val[c] == max_val[c]:
            inlier[c].append((idx, val[c] - max_val[c]))

best_rows = inlier[CLASSES[cls_idx]]
max_idx = np.array(best_rows, dtype=np.int64)[:, 0]

# Visualise the first sample that reaches the maximum.
show_detail(max_idx[0], cls_idx, interval=10)
