# 3. Data Analysis

> * input이 될 X에 대한 분석
>   * 전체 데이터 수:  4,500
>   * 이미지 사이즈: (384, 512)
>   * 한 사람당 사진의 개수: 7 [마스크 착용 5장, 이상하게 착용(코스크, 턱스크) 1장, 미착용 1장]
>   * 분석 대상이 되는 객체의 위치
>   * RGB 채널별 통계값
> * target이 될 y에 대한 분석 
>   * y값의 독립적 분포 
>       * ex) y_1의 분포는?
>   * y값 들간의 관계 분포 
>       * ex) y_1, y_2 정보를 섞은 분포는?
> * X, y 관계를 확인할 수 있는 분석
>   * X 특성과 y 특성 간의 분포 차이는 어떻게 나타날까요?
>   * 이미지 사이즈와 y 특성의 관계
>   * RGB 통계값과 y 특성의 관계
>   * 객체의 위치와 y 특성의 관계
>   * 데이터의 노이즈 확인 
>       * ex) y 값이 잘못 부여된 것이 있을까?

In [None]:
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from PIL import Image
from tqdm import tqdm

# Move the working directory two levels up; the data paths below are built
# from os.getcwd(). The move is relative, so re-running this cell climbs two
# more levels each time.
# NOTE(review): presumably this lands on the project root - confirm.
os.chdir("../../")

## 3.1. Meta info

1. 남/녀 비율
2. 나이대 비율: `<30`, `>=30 and <60`, `>=60`
3. 정확히 labeling 되었는지?
4. 중복된 데이터 있는지?

In [None]:
# Resolve <cwd>/input/data/train and load the per-person metadata table
# stored there as train.csv.
data_dir = os.path.join(os.getcwd(), 'input', 'data')
train_data_dir = os.path.join(data_dir, 'train')
train_data_meta_info = pd.read_csv(
    os.path.join(train_data_dir, 'train.csv'),
)

In [None]:
# Print the column dtypes and non-null counts of the metadata table.
train_data_meta_info.info()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
train_data_meta_info.describe(include='all')

In [None]:
# Find ids that occur more than once, then pull every row that carries the
# first such id so the conflicting records can be compared side by side.
is_dup_id = train_data_meta_info['id'].duplicated()
dup_id = train_data_meta_info.loc[is_dup_id, 'id']

first_dup_id = dup_id.values[0]
dup_datas_df = train_data_meta_info[train_data_meta_info['id'] == first_dup_id]
print(dup_datas_df)

특정 id가 중복되었지만, 나이가 다르다.   
id를 기준으로 구분을 하면 안 되겠다.

In [None]:
# Show every image of the people that share the duplicated id on a 4x4 grid.
image_data_dir = os.path.join(train_data_dir, 'images')

dup_datas_paths = dup_datas_df['path'].values
fig, axes = plt.subplots(4, 4, figsize=(18, 16))
axes = axes.flatten()
for panel in axes:
    panel.axis('off')

for path_idx, path in enumerate(dup_datas_paths):
    img_data_path = os.path.join(image_data_dir, path)
    # Hidden entries (names starting with '.') are dropped and take no slot.
    visible_titles = [name for name in os.listdir(img_data_path) if name[0] != '.']

    for title_idx, title in enumerate(visible_titles):
        image = np.array(Image.open(os.path.join(img_data_path, title)))
        # Each person owns a run of 8 consecutive axes (two rows of the grid).
        panel = axes[path_idx * 8 + title_idx]
        panel.imshow(image)
        panel.set_title(title)

plt.show()

In [None]:
# Peek at the first rows of the metadata table.
train_data_meta_info.head()

In [None]:
# Bar chart with the number of metadata rows per gender.
fig, ax = plt.subplots(1, 1, figsize=(10, 7))

gender_order = sorted(train_data_meta_info['gender'].unique())
sns.countplot(
    data=train_data_meta_info,
    x='gender',
    order=gender_order,
    ax=ax,
)

# Remove the frame, the bottom tick marks and the y axis decorations; only the
# category labels on the x axis remain.
ax.set(frame_on=False)
ax.tick_params(bottom=False)
ax.set_yticks([])
ax.set_ylabel('')
ax.set_xlabel('Gender', fontsize=13, fontweight='semibold')

def changeWidth(ax, new_width):
    """Resize every patch of ``ax`` to ``new_width`` around its own center.

    Each patch in ``ax.patches`` is modified in place: its width becomes
    ``new_width``, its x position is shifted by half of the width difference
    so the horizontal center does not move, and its line width is set to 2.0.
    """
    for bar in ax.patches:
        shift = (bar.get_width() - new_width) * .5
        bar.set_width(new_width)
        bar.set_x(bar.get_x() + shift)
        bar.set_linewidth(2.0)


def insertValueText(ax, val_height):
    """Annotate every patch of ``ax`` with its own height.

    For each patch in ``ax.patches`` the height is written through
    ``ax.text``, horizontally centered on the patch and placed ``val_height``
    above the patch height, with font size 10.
    """
    for bar in ax.patches:
        bar_height = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2.
        ax.text(center_x, bar_height + val_height, bar_height, ha='center', size=10)

# Write each bar's count 95 units above its top, then narrow the bars to a
# width of 0.3. changeWidth keeps each bar centered where it was.
insertValueText(ax, 95)
changeWidth(ax, 0.3)

# Title the chart and pad the data limits (15% horizontally, 25% vertically).
ax.set_title("Count Female and Male", fontsize=15, fontweight='semibold')
ax.margins(0.15, 0.25)

In [None]:
# Filled kernel density estimate of age, one curve per gender.
fig, ax = plt.subplots(1, 1, figsize=(10, 7))
sns.kdeplot(data=train_data_meta_info, x='age', hue='gender', fill=True, ax=ax)

# Hide the top and right borders of the plot.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)


In [None]:
def set_age_s(age):
    """Map an age to its age-group index.

    Returns 0 for ages below 30, 1 for ages from 30 up to (not including) 60,
    and 2 for everything else.
    """
    if age < 30:
        return 0
    return 1 if age < 60 else 2

# Add the age-group index (0, 1 or 2) of every row as the "age's" column.
train_data_meta_info["age's"] = train_data_meta_info['age'].apply(set_age_s)

In [None]:
# Bar chart with the number of metadata rows per age group, split by gender.
fig, ax = plt.subplots(1, 1, figsize=(10, 7))

age_group_order = sorted(train_data_meta_info["age's"].unique())
sns.countplot(
    data=train_data_meta_info,
    x="age's",
    hue='gender',
    order=age_group_order,
    ax=ax,
)

# Remove the frame, bottom tick marks and y axis decorations, and replace the
# numeric group indices on the x axis with readable age ranges.
ax.set(frame_on=False)
ax.tick_params(bottom=False)
ax.set_xticklabels(['~30', '30~60', '60~'])
ax.set_yticks([])
ax.set_ylabel('')
ax.set_xlabel("age's", fontsize=13, fontweight='semibold')

# Label the bars first, then narrow them around their centers.
insertValueText(ax, 15)
changeWidth(ax, 0.3)

ax.set_title("Count Age's", fontsize=15, fontweight='semibold')
ax.margins(0.15, 0.15)

## 3.2. image data

In [None]:
# image_path_class = pd.DataFrame(None, columns = ['path', 'label'])

# for idx, row in tqdm(train_data_meta_info.iterrows(), total=train_data_meta_info.shape[0]):
#     for img_file in os.listdir(os.path.join(image_data_dir, row['path'])):
#         if img_file[0] == '.':
#             continue
        
#         if img_file.split('.')[0] == 'normal':
#             mask = 2
#         elif img_file.split('.')[0] == 'incorrect_mask':
#             mask = 1
#         else:
#             mask = 0

#         gender = 0 if row['gender'] == 'male' else 1
#         data = {
#             'path': os.path.join(image_data_dir, row['path'], img_file),
#             'label': mask * 6 + gender * 3 + row["age's"]
#         }

#         image_path_class = image_path_class.append(data, ignore_index=True)

In [None]:
# The table built by the commented-out cell above was saved once with:
# image_path_class.to_csv(os.path.join(train_data_dir, 'labeling.csv'))
# Reload it from disk and drop the 'Unnamed: 0' column
# (presumably the index written by to_csv - confirm).
image_path_class = pd.read_csv(os.path.join(train_data_dir, 'labeling.csv')).drop('Unnamed: 0', axis=1)

In [None]:
# Peek at the first five rows of the per-image label table.
image_path_class.head(5)

In [None]:
# Summary statistics for every column of the per-image label table;
# include='all' also covers the non-numeric columns.
image_path_class.describe(include='all')

In [None]:
# Bar chart with the number of images per class label.
fig, ax = plt.subplots(1, 1, figsize=(10, 7))

label_order = sorted(image_path_class["label"].unique())
sns.countplot(
    data=image_path_class,
    x="label",
    order=label_order,
    ax=ax,
)

# Remove the frame, bottom tick marks and all axis titles.
ax.set(frame_on=False)
ax.tick_params(bottom=False)
ax.set_yticks([])
ax.set_ylabel('')
ax.set_xlabel("")

# Label the bars first, then narrow them around their centers.
insertValueText(ax, 55)
changeWidth(ax, 0.3)

ax.set_title("Count label", fontsize=15, fontweight='semibold')
ax.margins(0.15, 0.15)

label의 0-5(wear), 6-11(incorrect), 12-17(normal)의 분포는 같다.

나이대별 성별의 분포이다.

In [None]:
# One bucket of images per class label; every bucket is its own list object.
# NOTE(review): 18 presumably comes from label = mask * 6 + gender * 3 + age
# group, as in the commented-out labeling cell - confirm.
num_classes = 18
image_pil_list = [list() for _ in range(num_classes)]

In [None]:
# Read every image into memory and file it under its class label.
#
# Image.open is lazy: it only identifies the file and keeps it open until the
# pixel data is read. Collecting thousands of unread images therefore holds
# thousands of open file handles. Each file is opened in a `with` block and
# copy() (which forces the pixel data to be read) is stored instead, so the
# file is closed before the next one is opened.
for image_path, image_label in tqdm(
        zip(image_path_class['path'], image_path_class['label']),
        total=image_path_class.shape[0]):
    with Image.open(image_path) as image_file:
        image_pil_list[image_label].append(image_file.copy())

In [None]:
# Decode the first listed image to check the array shape of a sample.
first_image_path = image_path_class['path'][0]
image_element_np = np.array(Image.open(first_image_path))
print(f'image shape: {image_element_np.shape}')

In [None]:
# Turn every class bucket from a list of PIL images into one stacked ndarray.
for label, image_pil_list_label in enumerate(image_pil_list):
    print(f"Convert label {label} image to nd.array")
    # Convert in place, one image at a time, so the PIL image can be released
    # as soon as its array exists.
    for idx, image_pil in tqdm(enumerate(image_pil_list_label), total=len(image_pil_list_label)):
        image_pil_list_label[idx] = np.array(image_pil)
    image_pil_list[label] = np.array(image_pil_list_label)

# The buckets can hold different numbers of images, so they cannot be stacked
# into one rectangular array: np.array() on such a ragged list raises
# ValueError on current NumPy. Build a 1-D object array with one per-class
# stack per element; indexing and iterating it works as before.
image_pil_np = np.empty(len(image_pil_list), dtype=object)
for label, image_np_label in enumerate(image_pil_list):
    image_pil_np[label] = image_np_label

In [None]:
# Shape of the stacked images of class 0.
image_pil_np[0].shape

In [None]:
# Average all images of a class along the first axis to get one mean image
# per class.
class_means = []
for label, image_pil_np_label in enumerate(image_pil_np):
    print(f"Calculate label {label} mean")
    class_means.append(np.mean(image_pil_np_label, axis=0))

image_pil_np_avg = np.array(class_means)
image_pil_np_avg.shape
    

In [None]:
# Cast every per-class mean image to int (the fractional part is dropped).
image_pil_np_avg_int = np.array([
    image_pil_np_avg[idx].astype(int)
    for idx in tqdm(range(num_classes), total=num_classes)
])
image_pil_np_avg_int.shape

In [None]:
# Draw the mean image of every class on a 3x6 grid.
fig, axes = plt.subplots(3, 6, figsize=(20, 17))
fig.set_facecolor('white')

axes = axes.flatten()

for idx in tqdm(range(num_classes), total=num_classes):
    panel = axes[idx]
    panel.imshow(image_pil_np_avg_int[idx])
    # No ticks on either axis; the class index is the only caption.
    panel.tick_params(bottom=False)
    panel.set_yticks([])
    panel.set_xticks([])
    panel.set_ylabel('')
    panel.set_xlabel(f'Class - {idx}')

plt.show()

In [None]:
# Pick the first image whose label is 0 and fetch its file path.
# The path is selected by column name rather than by position, so the lookup
# does not depend on the column order of image_path_class.
label_0_element_idx = image_path_class.index[image_path_class['label'] == 0][0]
label_0_element_path = image_path_class.loc[label_0_element_idx, 'path']

In [None]:
# Compare one class-0 image with the class-0 mean image: the original, their
# sum, their average and their difference, each shown in full colour (first
# column) and per colour channel (remaining columns).
fig, axes = plt.subplots(4, 4, figsize=(10, 16))
fig.set_facecolor('white')

# Decode the sample once and reuse it for every derived image.
origin_image = np.array(Image.open(label_0_element_path))
class_0_average = image_pil_np_avg_int[0]

# The sum can exceed 255 and the difference can go below 0; imshow clips
# out-of-range values when it draws an RGB image.
new_image_plus = class_0_average + origin_image
new_image_avg = ((class_0_average + origin_image) / 2).astype(int)
new_image_minus = class_0_average - origin_image

images = [origin_image, new_image_plus, new_image_avg, new_image_minus]
titles = ['origin', 'average+origin', '(origin+average)/2', 'average-origin']

# Arrays decoded by Pillow are in R, G, B channel order, so channel 0 is red
# and channel 2 is blue.
channel_suffixes = ('-r', '-g', '-b')

for idx in tqdm(range(4), total=4):
    axes[idx][0].imshow(images[idx])
    axes[idx][0].set_xlabel(titles[idx])

    for channel_idx, suffix in enumerate(channel_suffixes):
        channel_ax = axes[idx][channel_idx + 1]
        channel_ax.imshow(images[idx][:, :, channel_idx])
        channel_ax.set_xlabel(titles[idx] + suffix)

# Strip ticks and y labels from every panel.
axes = axes.flatten()
for ax in axes:
    ax.tick_params(bottom=False)
    ax.set_yticks([])
    ax.set_xticks([])
    ax.set_ylabel('')

plt.show()

In [None]:
# Reorder the axes so the last axis (the colour channel) comes right after
# the class axis: (0, 1, 2, 3) -> (0, 3, 1, 2).
reshape_image_pil_np_avg_int = np.transpose(image_pil_np_avg_int, (0, 3, 1, 2))
reshape_image_pil_np_avg_int.shape

In [None]:
# Collapse the colour channels of every class mean image by averaging over
# the last axis, then cast to int.
image_pil_label_np_avg_int = [
    np.mean(image_pil_np_avg_int[idx], axis=2).astype(int)
    for idx in tqdm(range(num_classes), total=num_classes)
]

image_pil_label_np_avg_int[0].shape

In [None]:
# Per class: histogram of the channel-averaged mean image, overlaid with one
# intensity curve per colour channel.
fig, axes = plt.subplots(6, 3, figsize=(20, 34))
fig.set_facecolor('white')
axes = axes.flatten()

# Arrays decoded by Pillow are in R, G, B channel order, so channel 0 is
# drawn in red, channel 1 in green and channel 2 in blue.
colors = ('red', 'green', 'blue')

for idx in tqdm(range(num_classes), total=num_classes):
    axes[idx].hist(
        image_pil_label_np_avg_int[idx].ravel(),
        bins=256,
        range=(0,256),
    )
    axes[idx].set_xlabel(f'Class - {idx}')
    # Same fixed y range on every panel so the classes are comparable.
    axes[idx].set_ylim(0, 13000)

    for channel_idx, color in enumerate(colors):
        hist_value, _ = np.histogram(
            reshape_image_pil_np_avg_int[idx][channel_idx].ravel(),
            bins=256,
            range=(0,256),
            )
        axes[idx].plot(
            hist_value,
            color=color,
            alpha=0.5,
        )

plt.show()