In [None]:
import os
import sys
import glob
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image
import cv2
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# ---------------------------------------------------------------
# Directory settings: filesystem root and the dataset input folder.
# ---------------------------------------------------------------
ROOT_DIR = '/kaggle/'
# Dataset folder relative to ROOT_DIR; the trailing slash is kept on purpose
# so that INPUT_DIR itself ends with a separator.
DATASET_SUBDIR = 'input/siim-covid19-detection/'
INPUT_DIR = os.path.join(ROOT_DIR, DATASET_SUBDIR)

In [None]:
# List the entries directly under INPUT_DIR.
os.listdir(INPUT_DIR)

In [None]:
# Load the three CSV tables located directly under INPUT_DIR, in this order:
# image-level labels, study-level labels, sample submission.
train_image, train_study, sample_submission = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in (
        'train_image_level.csv',
        'train_study_level.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of each table; display() renders its arguments in order.
display(
    train_image.head(),
    train_study.head(),
    sample_submission.head(),
)

In [None]:
# Row counts of the image-level and study-level tables, one per line.
print(
    f'len(train_image): {len(train_image)}',
    f'len(train_study): {len(train_study)}',
    sep='\n',
)

In [None]:
# Distinct 'id' values vs. distinct 'StudyInstanceUID' values in the
# image-level table, one number per line.
print(
    train_image['id'].nunique(),
    train_image['StudyInstanceUID'].nunique(),
    sep='\n',
)

In [None]:
# Group the image-level rows by StudyInstanceUID and count the non-null values
# of every remaining column. The 'id' column of the result is what the next
# cells use as "number of image rows per study" (assumes 'id' is never null).
train_image_ = train_image.groupby('StudyInstanceUID').count()
train_image_

In [None]:
# Distribution of the per-study 'id' counts. The Series is passed as `x=`
# explicitly: seaborn >= 0.12 takes `data` as its only positional argument, and
# 0.11 already warned about positional use, so a bare positional Series is not
# reliably interpreted as the variable to count.
sns.countplot(x=train_image_['id']);

In [None]:
# Studies whose 'id' count is strictly greater than two.
train_image_.loc[train_image_['id'] > 2]

In [None]:
# Entries two directory levels below the train/0fd2db233deb folder.
# The pattern is built from INPUT_DIR instead of a working-directory-relative
# '../input/...' string, so the cell no longer depends on where the notebook
# is executed from and stays consistent with the other paths in this notebook.
path = glob.glob(os.path.join(INPUT_DIR, 'train', '0fd2db233deb', '*/*'))

同じStudyInstanceUIDを持つ画像が複数存在するstudyがあるため、同一患者の画像が重複して含まれている。リークを防ぐため、今回はこれらを全てStudyInstanceUID単位でgroup化して、同じfoldに固める必要がある。

In [None]:
# Display the sample submission table.
sample_submission

In [None]:
# Column names of the study-level table.
train_study.columns

In [None]:
# Per-row total of the four study-level label columns, stored in a new 'sum'
# column so the next cell can check how many labels each study carries.
# A column-wise sum replaces the row-by-row apply(); skipna=False keeps the
# behaviour of plain `+`, i.e. a missing label makes the whole total NaN.
study_label_cols = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]
train_study['sum'] = train_study[study_label_cols].sum(axis=1, skipna=False)
train_study

In [None]:
# Distinct values taken by the per-row label total.
train_study['sum'].unique()

train_studyのラベルには重複がない(1つのstudyに複数のラベルが同時に付くことはない)ため、マルチラベルではなく、各studyが4クラスのうち1つだけに属する単一ラベルの多クラス分類として扱える。

In [None]:
# Column totals of the four label columns, i.e. how many rows have each label set.
train_study[
    [
        'Negative for Pneumonia',
        'Typical Appearance',
        'Indeterminate Appearance',
        'Atypical Appearance',
    ]
].sum()

偏りはある程度あるが、思ったほど大きくない。

In [None]:
# List the entries under INPUT_DIR again before defining the train/test paths.
os.listdir(INPUT_DIR)

In [None]:
# Folders of the two splits, both located directly under INPUT_DIR.
TRAIN_PATH, TEST_PATH = (
    os.path.join(INPUT_DIR, split_dir) for split_dir in ('train', 'test')
)

In [None]:
# First rows of the image-level table, as a reminder of the 'id' format
# that the next cell splits on '_'.
train_image.head()

In [None]:
# Keep only the part of 'id' before the first underscore, giving each table a
# bare identifier column. train_study['study_id'] is the key later matched
# against train_image['StudyInstanceUID'] when the two tables are merged.
# The vectorized .str accessor replaces a per-element Python lambda and yields
# NaN for a missing id instead of raising AttributeError.
train_study['study_id'] = train_study['id'].str.split('_').str[0]
train_image['image_id'] = train_image['id'].str.split('_').str[0]

In [None]:
# Attach the study-level columns to every image row (default inner join of
# StudyInstanceUID against study_id), then drop the columns that are redundant
# or only served the earlier label check.
train = (
    train_image
    .merge(train_study, left_on='StudyInstanceUID', right_on='study_id')
    .drop(columns=['id_x', 'id_y', 'StudyInstanceUID', 'sum'])
)

In [None]:
# Display the merged image + study table.
train

In [None]:
# Non-null counts per study_id in the merged table, then the studies whose
# 'image_id' count is strictly greater than two.
train_group = train.groupby('study_id').count()
train_group.loc[train_group['image_id'] > 2]

In [None]:
# Entries two directory levels below the 8943d1d85097 folder in TRAIN_PATH;
# these are the files plotted further down.
study_glob = os.path.join(TRAIN_PATH, '8943d1d85097', '*/*')
paths = glob.glob(study_glob)
paths

In [None]:
def fix_inverted_radiograms(data, img):
    '''Undo MONOCHROME1 inversion and rescale the pixels to 8-bit.

    data: the .dcm dataset; only its PhotometricInterpretation attribute is read
    img: the .dcm pixel_array (any numeric array-like)

    Returns a np.uint8 array with the same shape as img. The values are
    min-max scaled so the smallest pixel maps to 0 and the largest to 255.
    A constant image (no dynamic range) and an empty image are returned as
    all-zero / empty uint8 arrays instead of going through a 0/0 division.
    '''
    # Work in float64 from the start so that neither the inversion nor the
    # min-shift can wrap around when the input is a narrow signed integer type.
    img = np.asarray(img, dtype=np.float64)

    if img.size == 0:
        # np.amax / np.min raise on empty input; there is nothing to scale.
        return img.astype(np.uint8)

    if data.PhotometricInterpretation == "MONOCHROME1":
        img = np.amax(img) - img

    img = img - np.min(img)
    peak = np.max(img)
    if peak == 0:
        # Every pixel is identical: dividing by the peak would be 0/0 -> NaN,
        # and casting NaN to uint8 is undefined. Return a black image.
        return np.zeros(img.shape, dtype=np.uint8)

    img = img / peak
    img = (img * 255).astype(np.uint8)

    return img

In [None]:
# Show the DICOM files listed in `paths` on a 3x3 grid, filled row by row.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(24, 24))
# Row-major flat view of the grid: position k is axes[k // 3, k % 3].
grid = axes.ravel()
# zip() stops at the ninth panel, so a study with more than nine files shows
# only its first nine instead of failing with an IndexError on axes[3, 0].
for ax, path in zip(grid, paths):
    dataset = pydicom.dcmread(path)
    image = apply_voi_lut(dataset.pixel_array, dataset)
    image = fix_inverted_radiograms(dataset, image)
    ax.imshow(image, cmap='gray')
# Hide the panels that received no image so the figure has no empty frames.
for ax in grid[len(paths):]:
    ax.set_visible(False)

study_idが同じ画像たちは、ほぼ同じ画像になっている。一部がフリップされるぐらいか。基本的には一枚を使えばよさそうだが、余裕があるなら全部使ってstudy levelを予測してアンサンブルするのもいいのかもしれない。