# TOC

1. [Import](#1-import)
2. [필요한 정보 입력](#2-필요한-정보-입력)
3. [Meta data 분석](#3-meta-data-분석)   
    3.1. [성별 비교](#31-성별-비교)   
    3.2. [나이 비교](#32-나이-비교)   
    3.3. [체중(몸무게) 비교](#33-체중몸무게-비교)   
    3.4. [키(신장) 비교](#34-키신장-비교)   
4. [Meta data에 따른 결과 분석](#4-meta-data에-따른-결과-분석)   
    4.1. [성별 비교](#41-성별-비교)   
    4.2. [나이 비교](#42-나이-비교)   
    4.3. [체중(몸무게) 비교](#43-체중몸무게-비교)   
    4.4. [키(신장) 비교](#44-키신장-비교)   

# 1. Import

In [1]:
import os
os.chdir('/opt/ml/input/code/local')

import numpy as np
import albumentations as A
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn.functional as F

from dataset import XRayMetaDataset
from metric import dice_coef

# 2. 필요한 정보 입력

In [176]:
# Directory that holds meta_data.xlsx and the split CSV files read in section 3.
data_root = "/opt/ml/input/data"
# Checkpoint directory; section 4 loads "best_model.pt" from here.
save_dir = "/opt/ml/input/code/local/checkpoints/[test]Baseline1_1226"

In [None]:
# Class names used as the row index of the per-class Dice tables and as the
# x tick labels of the Dice bar charts: "finger-1" .. "finger-19" followed
# by ten named classes. The order is kept exactly as listed.
CLASSES = [f"finger-{number}" for number in range(1, 20)] + [
    "Trapezium",
    "Trapezoid",
    "Capitate",
    "Hamate",
    "Scaphoid",
    "Lunate",
    "Triquetrum",
    "Pisiform",
    "Radius",
    "Ulna",
]

# 3. Meta data 분석

In [31]:
# Metadata spreadsheet (its "ID" column is used below) ...
df = pd.read_excel(os.path.join(data_root, "meta_data.xlsx"), engine='openpyxl')
# ... and the CSV file lists of each split, read in the order
# train_all / train1 / val1 / test.
train_all_df, train_df, val_df, test_df = [
    pd.read_csv(os.path.join(data_root, csv_name))
    for csv_name in ("train_all.csv", "train1.csv", "val1.csv", "test.csv")
]

In [32]:
def _subject_ids(frame):
    """Return the distinct integer IDs parsed from *frame*'s "filenames" column.

    For every path, the second-to-last "/"-separated component is taken, its
    first two characters are dropped and the remainder is passed to int().
    """
    return list({int(path.split("/")[-2][2:]) for path in frame["filenames"].values})


meta_id = df["ID"].values
train_all_id = _subject_ids(train_all_df)
train_id = _subject_ids(train_df)
val_id = _subject_ids(val_df)
test_id = _subject_ids(test_df)

In [123]:
# Bar fill colours; plot_rate picks colors[i] for the i-th stacked category.
colors = ["royalblue", "tomato", "forestgreen", "gold"]
# X-axis names of the three bars drawn by plot_rate (rows: train, val, test).
labels = ["train", "val", "test"]

In [161]:
def plot_rate(apply, splits, kr_col, en_col):
    """Draw a stacked bar chart of category proportions for train / val / test.

    For each of the notebook-level ID lists ``train_id``, ``val_id`` and
    ``test_id``, the rows of ``df`` whose "ID" is in the list are selected,
    ``apply`` maps every ``kr_col`` value to a category key, and the share of
    each category becomes one segment of that split's bar.

    Args:
        apply: Callable mapping one raw ``kr_col`` value to a sortable
            category key.
        splits: Legend labels, one per category. They are paired with the
            categories by position in sorted key order, so they must be
            listed in that order.
        kr_col: Name of the ``df`` column to analyse.
        en_col: Title of the plot.
    """
    per_split = [
        np.array([apply(value) for value in df.loc[df["ID"].isin(ids), kr_col].values])
        for ids in (train_id, val_id, test_id)
    ]

    # One shared category axis for all three splits. Counting every split
    # with its own np.unique gives arrays of different length (np.stack
    # fails) or silently shifted columns as soon as a category is missing
    # from one split.
    categories = np.unique(
        np.concatenate([values for values in per_split if values.size])
    )

    rates = np.zeros((len(per_split), len(categories)))
    for row, values in enumerate(per_split):
        if not values.size:
            continue  # an empty split keeps an all-zero row
        present, counts = np.unique(values, return_counts=True)
        rates[row, np.searchsorted(categories, present)] = counts / counts.sum()

    _, ax = plt.subplots(1, 1, figsize=(10, 6))
    bottom = np.zeros(len(per_split))
    for i, split in enumerate(splits):
        ax.bar(
            labels,
            rates[:, i],
            # Wrap around instead of failing when there are more categories
            # than configured colours.
            color=colors[i % len(colors)],
            bottom=bottom,
            label=str(split),
            edgecolor='black',
        )
        bottom = bottom + rates[:, i]

    ax.legend()
    plt.ylabel("Rate")
    plt.title(en_col)
    plt.show()

## 3.1. 성별 비교

In [162]:
# Legend labels for the gender plot; plot_rate pairs them by position with
# the category keys returned by np.unique (sorted order).
splits = ["Male", "Female"]

In [163]:
def gender_apply(gender):
    """Return the last non-whitespace character of the gender cell.

    The value is converted with ``str()`` first, so any leading residue in
    the cell is ignored. Trailing whitespace is stripped before taking the
    last character; otherwise a cell such as ``"F "`` would form its own
    whitespace category.

    Raises:
        IndexError: If the cell is empty or contains only whitespace.
    """
    return str(gender).strip()[-1]

In [None]:
# Gender proportions per split, titled "Gender".
plot_rate(gender_apply, splits, "성별", "Gender")

## 3.2. 나이 비교

In [166]:
# Age bins as (low, high) pairs; age_apply tests low <= age < high. The
# tuples also serve as the legend labels in plot_rate.
splits = [(0, 25), (25, 29), (29, 36), (36, 70)]

In [167]:
def age_apply(age, bins=None):
    """Return the index of the first half-open bin ``[low, high)`` holding *age*.

    Args:
        age: Value to classify.
        bins: Optional sequence of ``(low, high)`` pairs. When omitted, the
            notebook-level ``splits`` variable is read at call time (the
            original behaviour). Later cells rebind ``splits``, so pass
            ``bins`` explicitly when calling this outside the cell order.

    Returns:
        The bin position, or ``None`` when no bin matches (this includes
        NaN, which fails every comparison).
    """
    if bins is None:
        bins = splits
    for index, interval in enumerate(bins):
        if interval[0] <= age < interval[1]:
            return index
    return None

In [None]:
# Age-bin proportions per split, titled "Age".
plot_rate(age_apply, splits, "나이", "Age")

## 3.3. 체중(몸무게) 비교

In [169]:
# Weight bins as (low, high) pairs; weight_apply tests low <= weight < high.
# The tuples also serve as the legend labels in plot_rate.
splits = [(0, 55), (55, 63), (63, 74), (74, 120)]

In [170]:
def weight_apply(weight, bins=None):
    """Return the index of the first half-open bin ``[low, high)`` holding *weight*.

    Args:
        weight: Value to classify.
        bins: Optional sequence of ``(low, high)`` pairs. When omitted, the
            notebook-level ``splits`` variable is read at call time (the
            original behaviour). Other cells rebind ``splits``, so pass
            ``bins`` explicitly when calling this outside the cell order.

    Returns:
        The bin position, or ``None`` when no bin matches (this includes
        NaN, which fails every comparison).
    """
    if bins is None:
        bins = splits
    for index, interval in enumerate(bins):
        if interval[0] <= weight < interval[1]:
            return index
    return None

In [None]:
# Weight-bin proportions per split, titled "Weight".
plot_rate(weight_apply, splits, "체중(몸무게)", "Weight")

## 3.4. 키(신장) 비교

In [172]:
# Height bins as (low, high) pairs; height_apply tests low <= height < high.
# The tuples also serve as the legend labels in plot_rate.
splits = [(0, 161), (161, 168), (168, 174), (174, 200)]

In [173]:
def height_apply(height, bins=None):
    """Return the index of the first half-open bin ``[low, high)`` holding *height*.

    Args:
        height: Value to classify.
        bins: Optional sequence of ``(low, high)`` pairs. When omitted, the
            notebook-level ``splits`` variable is read at call time (the
            original behaviour). Other cells rebind ``splits``, so pass
            ``bins`` explicitly when calling this outside the cell order.

    Returns:
        The bin position, or ``None`` when no bin matches (this includes
        NaN, which fails every comparison).
    """
    if bins is None:
        bins = splits
    for index, interval in enumerate(bins):
        if interval[0] <= height < interval[1]:
            return index
    return None

In [None]:
# Height-bin proportions per split, titled "Height".
plot_rate(height_apply, splits, "키(신장)", "Height")

# 4. Meta data에 따른 결과 분석

In [177]:
# 512x512 resize, passed to XRayMetaDataset as `transforms` in the cells below.
transform = A.Resize(512, 512)
# NOTE(review): torch.load unpickles the stored object, so only load
# checkpoints from a trusted source. No map_location is given and the model
# is not switched to eval mode here — confirm both are intended.
model = torch.load(os.path.join(save_dir, "best_model.pt"))
# Cut-off applied to the sigmoid outputs to obtain binary masks.
thr = 0.5

## 4.1. 성별 비교

In [None]:
# Number of samples per gender in each split, shown as a share of the split.
partials = ["M", "W"]  # TODO
splits = ["train1", "val1"]
new_df = pd.DataFrame(index=splits, columns=partials)
for split in splits:
    for partial in partials:
        # na=False: a missing gender cell counts as "no match" instead of
        # putting NaN into the boolean mask, which boolean indexing rejects.
        gender_mask = df["성별"].str.contains("남" if partial == "M" else "여", na=False)
        meta_info = ["ID" + format(x, "0>3") for x in df.loc[gender_mask, "ID"].values]
        dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split=split)
        # Single .loc call: the chained form new_df.loc[split][partial] = ...
        # may write into a temporary row and leave new_df unchanged.
        new_df.loc[split, partial] = len(dataset)
new_df.div(new_df.sum(axis=1), axis=0)

In [None]:
# Per-class Dice on the "val1" split, computed separately for each gender.
partials = ["M", "W"]  # TODO
new_df = pd.DataFrame(index=CLASSES, columns=partials)

# Inference only: put dropout / batch-norm layers into evaluation behaviour.
model.eval()

for partial in partials:
    # TODO
    # na=False: a missing gender cell counts as "no match".
    gender_mask = df["성별"].str.contains("남" if partial == "M" else "여", na=False)
    meta_info = ["ID" + format(x, "0>3") for x in df.loc[gender_mask, "ID"].values]
    dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split="val1")

    dices = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # keeps memory usage flat over the whole dataset.
    with torch.no_grad():
        for images, masks in tqdm(dataset):
            images, masks = images.unsqueeze(0), masks.unsqueeze(0)

            outputs = model(images.cuda())["out"]
            mask_size = tuple(masks.shape[-2:])

            # restore original size
            if tuple(outputs.shape[-2:]) != mask_size:
                outputs = F.interpolate(outputs, size=mask_size, mode="bilinear")

            outputs = (torch.sigmoid(outputs) > thr).cpu()
            dices.append(dice_coef(outputs, masks.cpu()))

    if not dices:
        # No sample in this group: torch.cat([]) would raise, so the column
        # simply keeps its initial NaN values.
        continue
    dices_per_class = torch.mean(torch.cat(dices, 0), 0)
    # Store a plain NumPy array rather than a tensor in the DataFrame.
    new_df[partial] = dices_per_class.numpy()

In [None]:
# Grouped bar chart: one bar per gender for every class.
fig, ax = plt.subplots(1, 1, figsize=(25, 8))

x = np.arange(len(CLASSES))
width = 0.3

for idx, partial in enumerate(partials):
    # Centre each group of bars on its class tick.
    offset = (2 * idx - len(partials) + 1) * width / 2
    # Label every series; without labels and a legend the bars of the
    # different groups cannot be told apart.
    ax.bar(x + offset, new_df[partial].astype(float), width=width, label=str(partial))

ax.set_xticks(x)
ax.set_xticklabels(CLASSES, fontsize=20, rotation=30)
ax.set_ylabel("Dice", fontsize=20)
ax.legend(title="Gender", fontsize=16)

# Values below 0.7 are cut off by this axis range.
plt.ylim([0.7, 1.0])
plt.tight_layout()
plt.show()

## 4.2. 나이 비교

In [None]:
# Number of samples per age bin in each split, shown as a share of the split.
partials = [(0, 25), (25, 29), (29, 36), (36, 70)]  # TODO
splits = ["train1", "val1"]
new_df = pd.DataFrame(index=splits, columns=partials)
for split in splits:
    for partial in partials:
        low, high = partial
        # Half-open bin: low <= value < high.
        in_bin = (df["나이"] >= low) & (df["나이"] < high)
        meta_info = ["ID" + format(x, "0>3") for x in df.loc[in_bin, "ID"].values]
        dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split=split)
        # Single .loc call: the chained form new_df.loc[split][partial] = ...
        # may write into a temporary row and leave new_df unchanged.
        new_df.loc[split, partial] = len(dataset)
new_df.div(new_df.sum(axis=1), axis=0)

In [None]:
# Per-class Dice on the "val1" split, computed separately for each age bin.
partials = [(0, 25), (25, 29), (29, 36), (36, 70)]  # TODO
new_df = pd.DataFrame(index=CLASSES, columns=partials)

# Inference only: put dropout / batch-norm layers into evaluation behaviour.
model.eval()

for partial in partials:
    # TODO
    low, high = partial
    # Half-open bin: low <= value < high.
    in_bin = (df["나이"] >= low) & (df["나이"] < high)

    meta_info = ["ID" + format(x, "0>3") for x in df.loc[in_bin, "ID"].values]
    dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split="val1")

    dices = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # keeps memory usage flat over the whole dataset.
    with torch.no_grad():
        for images, masks in tqdm(dataset):
            images, masks = images.unsqueeze(0), masks.unsqueeze(0)

            outputs = model(images.cuda())["out"]
            mask_size = tuple(masks.shape[-2:])

            # restore original size
            if tuple(outputs.shape[-2:]) != mask_size:
                outputs = F.interpolate(outputs, size=mask_size, mode="bilinear")

            outputs = (torch.sigmoid(outputs) > thr).cpu()
            dices.append(dice_coef(outputs, masks.cpu()))

    if not dices:
        # No sample in this bin: torch.cat([]) would raise, so the column
        # simply keeps its initial NaN values.
        continue
    dices_per_class = torch.mean(torch.cat(dices, 0), 0)
    # Store a plain NumPy array rather than a tensor in the DataFrame.
    new_df[partial] = dices_per_class.numpy()

In [None]:
# Grouped bar chart: one bar per age bin for every class.
fig, ax = plt.subplots(1, 1, figsize=(25, 8))

x = np.arange(len(CLASSES))
width = 0.15

for idx, partial in enumerate(partials):
    # Centre each group of bars on its class tick.
    offset = (2 * idx - len(partials) + 1) * width / 2
    # Label every series; without labels and a legend the bars of the
    # different bins cannot be told apart.
    ax.bar(x + offset, new_df[partial].astype(float), width=width, label=str(partial))

ax.set_xticks(x)
ax.set_xticklabels(CLASSES, fontsize=20, rotation=30)
ax.set_ylabel("Dice", fontsize=20)
ax.legend(title="Age", fontsize=16)

# Values below 0.7 are cut off by this axis range.
plt.ylim([0.7, 1.0])
plt.tight_layout()
plt.show()

## 4.3. 체중(몸무게) 비교

In [None]:
# Number of samples per weight bin in each split, shown as a share of the split.
partials = [(0, 55), (55, 63), (63, 74), (74, 120)]  # TODO
splits = ["train1", "val1"]
new_df = pd.DataFrame(index=splits, columns=partials)
for split in splits:
    for partial in partials:
        low, high = partial
        # Half-open bin: low <= value < high.
        in_bin = (df["체중(몸무게)"] >= low) & (df["체중(몸무게)"] < high)
        meta_info = ["ID" + format(x, "0>3") for x in df.loc[in_bin, "ID"].values]
        dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split=split)
        # Single .loc call: the chained form new_df.loc[split][partial] = ...
        # may write into a temporary row and leave new_df unchanged.
        new_df.loc[split, partial] = len(dataset)
new_df.div(new_df.sum(axis=1), axis=0)

In [None]:
# Per-class Dice on the "val1" split, computed separately for each weight bin.
partials = [(0, 55), (55, 63), (63, 74), (74, 120)]  # TODO
new_df = pd.DataFrame(index=CLASSES, columns=partials)

# Inference only: put dropout / batch-norm layers into evaluation behaviour.
model.eval()

for partial in partials:
    # TODO
    low, high = partial
    # Half-open bin: low <= value < high.
    in_bin = (df["체중(몸무게)"] >= low) & (df["체중(몸무게)"] < high)

    meta_info = ["ID" + format(x, "0>3") for x in df.loc[in_bin, "ID"].values]
    dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split="val1")

    dices = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # keeps memory usage flat over the whole dataset.
    with torch.no_grad():
        for images, masks in tqdm(dataset):
            images, masks = images.unsqueeze(0), masks.unsqueeze(0)

            outputs = model(images.cuda())["out"]
            mask_size = tuple(masks.shape[-2:])

            # restore original size
            if tuple(outputs.shape[-2:]) != mask_size:
                outputs = F.interpolate(outputs, size=mask_size, mode="bilinear")

            outputs = (torch.sigmoid(outputs) > thr).cpu()
            dices.append(dice_coef(outputs, masks.cpu()))

    if not dices:
        # No sample in this bin: torch.cat([]) would raise, so the column
        # simply keeps its initial NaN values.
        continue
    dices_per_class = torch.mean(torch.cat(dices, 0), 0)
    # Store a plain NumPy array rather than a tensor in the DataFrame.
    new_df[partial] = dices_per_class.numpy()

In [None]:
# Grouped bar chart: one bar per weight bin for every class.
fig, ax = plt.subplots(1, 1, figsize=(25, 8))

x = np.arange(len(CLASSES))
width = 0.15

for idx, partial in enumerate(partials):
    # Centre each group of bars on its class tick.
    offset = (2 * idx - len(partials) + 1) * width / 2
    # Label every series; without labels and a legend the bars of the
    # different bins cannot be told apart.
    ax.bar(x + offset, new_df[partial].astype(float), width=width, label=str(partial))

ax.set_xticks(x)
ax.set_xticklabels(CLASSES, fontsize=20, rotation=30)
ax.set_ylabel("Dice", fontsize=20)
ax.legend(title="Weight", fontsize=16)

# Values below 0.7 are cut off by this axis range.
plt.ylim([0.7, 1.0])
plt.tight_layout()
plt.show()

## 4.4. 키(신장) 비교

In [None]:
# Number of samples per height bin in each split, shown as a share of the split.
partials = [(0, 161), (161, 168), (168, 174), (174, 200)]  # TODO
splits = ["train1", "val1"]
new_df = pd.DataFrame(index=splits, columns=partials)
for split in splits:
    for partial in partials:
        low, high = partial
        # Half-open bin: low <= value < high.
        in_bin = (df["키(신장)"] >= low) & (df["키(신장)"] < high)
        meta_info = ["ID" + format(x, "0>3") for x in df.loc[in_bin, "ID"].values]
        dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split=split)
        # Single .loc call: the chained form new_df.loc[split][partial] = ...
        # may write into a temporary row and leave new_df unchanged.
        new_df.loc[split, partial] = len(dataset)
new_df.div(new_df.sum(axis=1), axis=0)

In [None]:
# Per-class Dice on the "val1" split, computed separately for each height bin.
partials = [(0, 161), (161, 168), (168, 174), (174, 200)]  # TODO
new_df = pd.DataFrame(index=CLASSES, columns=partials)

# Inference only: put dropout / batch-norm layers into evaluation behaviour.
model.eval()

for partial in partials:
    # TODO
    low, high = partial
    # Half-open bin: low <= value < high.
    in_bin = (df["키(신장)"] >= low) & (df["키(신장)"] < high)

    meta_info = ["ID" + format(x, "0>3") for x in df.loc[in_bin, "ID"].values]
    dataset = XRayMetaDataset(data_root, meta_info, transforms=transform, split="val1")

    dices = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # keeps memory usage flat over the whole dataset.
    with torch.no_grad():
        for images, masks in tqdm(dataset):
            images, masks = images.unsqueeze(0), masks.unsqueeze(0)

            outputs = model(images.cuda())["out"]
            mask_size = tuple(masks.shape[-2:])

            # restore original size
            if tuple(outputs.shape[-2:]) != mask_size:
                outputs = F.interpolate(outputs, size=mask_size, mode="bilinear")

            outputs = (torch.sigmoid(outputs) > thr).cpu()
            dices.append(dice_coef(outputs, masks.cpu()))

    if not dices:
        # No sample in this bin: torch.cat([]) would raise, so the column
        # simply keeps its initial NaN values.
        continue
    dices_per_class = torch.mean(torch.cat(dices, 0), 0)
    # Store a plain NumPy array rather than a tensor in the DataFrame.
    new_df[partial] = dices_per_class.numpy()

In [None]:
# Grouped bar chart: one bar per height bin for every class.
fig, ax = plt.subplots(1, 1, figsize=(25, 8))

x = np.arange(len(CLASSES))
width = 0.15

for idx, partial in enumerate(partials):
    # Centre each group of bars on its class tick.
    offset = (2 * idx - len(partials) + 1) * width / 2
    # Label every series; without labels and a legend the bars of the
    # different bins cannot be told apart.
    ax.bar(x + offset, new_df[partial].astype(float), width=width, label=str(partial))

ax.set_xticks(x)
ax.set_xticklabels(CLASSES, fontsize=20, rotation=30)
ax.set_ylabel("Dice", fontsize=20)
ax.legend(title="Height", fontsize=16)

# Values below 0.7 are cut off by this axis range.
plt.ylim([0.7, 1.0])
plt.tight_layout()
plt.show()