## 데이터 분석 노트북

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

path = Path('/USER/data')
train_img_path = path /'train-image/image'
train_csv = pd.read_csv(path / 'train-metadata.csv', low_memory=False)
test_csv = pd.read_csv(path / 'test-metadata.csv', low_memory=False)

## Data Check

In [7]:
print(f'train_csv shape : {train_csv.shape}')
print(f'test_csv shape : {train_csv.shape}')

train_csv shape : (401059, 55)
test_csv shape : (401059, 55)


In [8]:
from PIL import Image
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# 시드 고정 
SEED = 12

# 메타데이터 헤드 확인
train_csv.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [9]:
# 결측치 확인
train_csv.isna().sum()

isic_id                              0
target                               0
patient_id                           0
age_approx                        2798
sex                              11517
anatom_site_general               5756
clin_size_long_diam_mm               0
image_type                           0
tbp_tile_type                        0
tbp_lv_A                             0
tbp_lv_Aext                          0
tbp_lv_B                             0
tbp_lv_Bext                          0
tbp_lv_C                             0
tbp_lv_Cext                          0
tbp_lv_H                             0
tbp_lv_Hext                          0
tbp_lv_L                             0
tbp_lv_Lext                          0
tbp_lv_areaMM2                       0
tbp_lv_area_perim_ratio              0
tbp_lv_color_std_mean                0
tbp_lv_deltaA                        0
tbp_lv_deltaB                        0
tbp_lv_deltaL                        0
tbp_lv_deltaLB           

In [10]:
test_csv.head()

Unnamed: 0,isic_id,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license
0,ISIC_0015657,IP_6074337,45.0,male,posterior torso,2.7,TBP tile: close-up,3D: XP,22.80433,20.00727,...,0.304827,1.281532,2.299935,0.479339,20,-155.0651,1511.222,113.9801,Memorial Sloan Kettering Cancer Center,CC-BY
1,ISIC_0015729,IP_1664139,35.0,female,lower extremity,2.52,TBP tile: close-up,3D: XP,16.64867,9.657964,...,0.0,1.27194,2.011223,0.42623,25,-112.36924,629.535889,-15.019287,"Frazer Institute, The University of Queensland...",CC-BY
2,ISIC_0015740,IP_7142616,65.0,male,posterior torso,3.16,TBP tile: close-up,3D: XP,24.25384,19.93738,...,0.230742,1.080308,2.705857,0.366071,110,-84.29282,1303.978,-28.57605,FNQH Cairns,CC-BY


In [11]:
test_csv.isna().sum()

isic_id                        0
patient_id                     0
age_approx                     0
sex                            0
anatom_site_general            0
clin_size_long_diam_mm         0
image_type                     0
tbp_tile_type                  0
tbp_lv_A                       0
tbp_lv_Aext                    0
tbp_lv_B                       0
tbp_lv_Bext                    0
tbp_lv_C                       0
tbp_lv_Cext                    0
tbp_lv_H                       0
tbp_lv_Hext                    0
tbp_lv_L                       0
tbp_lv_Lext                    0
tbp_lv_areaMM2                 0
tbp_lv_area_perim_ratio        0
tbp_lv_color_std_mean          0
tbp_lv_deltaA                  0
tbp_lv_deltaB                  0
tbp_lv_deltaL                  0
tbp_lv_deltaLB                 0
tbp_lv_deltaLBnorm             0
tbp_lv_eccentricity            0
tbp_lv_location                0
tbp_lv_location_simple         0
tbp_lv_minorAxisMM             0
tbp_lv_nev

In [15]:
train_csv[train_csv['target']==1].shape

(393, 55)

In [20]:
train_csv[train_csv['target']==0].shape

(400666, 55)

**Check the Images**

1) 베이스라인 참고한데서 id를 unique()라고 한 거 보면 중복이 있는지 체크 부탁드립니다.
2) 메타 데이터 결측치가 있습니다. 


In [None]:
def get_img_path(image_id):
    return f"{train_img_path}/{image_id}.jpg"


# Count the Images
train_img_path = path / 'train-image/image'
train_images = glob(str(train_img_path) + '/*')

df_positive = train_csv[train_csv["target"] == 1].reset_index(drop=True)
df_negative = train_csv[train_csv["target"] == 0].reset_index(drop=True)

print(f"Total num of train_img set: {len(train_images)}")
print(f'Train metadata shape : {train_csv.shape}')
print(f'sum of the targets: {train_csv.target.sum()}')
print(f'{train_csv["patient_id"].unique().shape}')
print(df_positive.shape)
print(df_negative.shape)

# 비율 맞추기(baseline 참고시 1: 20으로 했는데 좀 아이디어 있으면 고치시면 됩니다!)
# 데이터 불균형 존재
df = pd.concat([df_positive, df_negative.iloc[:df_positive.shape[0]*20, :]])
print("filtered>", df.shape, df.target.sum(), df["patient_id"].unique().shape)
df['file_path'] = df['isic_id'].apply(get_img_path)
df = df[ df["file_path"].isin(train_images) ].reset_index(drop=True)
df

In [None]:
def get_img_path(image_id):
    return f"{train_img_path}/{image_id}.jpg"


# Count the Images
train_img_path = path / 'train-image/image'
train_images = glob(str(train_img_path) + '/*')

df_positive = train_csv[train_csv["target"] == 1].reset_index(drop=True)
df_negative = train_csv[train_csv["target"] == 0].reset_index(drop=True)

print(f"Total num of train_img set: {len(train_images)}")
print(f'Train metadata shape : {train_csv.shape}')
print(f'sum of the targets: {train_csv.target.sum()}')
print(f'{train_csv["patient_id"].unique().shape}')
print(df_positive.shape)
print(df_negative.shape)

# 비율 맞추기(baseline 참고시 1: 20으로 했는데 좀 아이디어 있으면 고치시면 됩니다!)
# 데이터 불균형 존재
df = pd.concat([df_positive, df_negative.iloc[:df_positive.shape[0]*20, :]])
print("filtered>", df.shape, df.target.sum(), df["patient_id"].unique().shape)
df['file_path'] = df['isic_id'].apply(get_img_path)
df = df[ df["file_path"].isin(train_images) ].reset_index(drop=True)
df