Simple EDA process

Install the GDCM package

In [1]:
# Install GDCM from conda-forge (-y answers the confirmation prompt automatically).
# NOTE(review): presumably needed so pydicom can decode compressed pixel data — confirm.
# NOTE(review): the recorded output of this cell shows a KeyboardInterrupt, i.e. the
# install did not finish in the saved run; let it complete before running the rest.
!conda install -c conda-forge gdcm -y

^C

CondaError: KeyboardInterrupt



Importing necessary packages

In [None]:
import numpy as np
import pandas as pd
import os
import pydicom
import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
from fastai.vision.all import *
from fastai.medical.imaging import *
warnings.filterwarnings('ignore')

Listing all the files available

In [None]:
# Root directory of the competition data; later cells build every path from it.
# `Path` and its `.ls()` helper are not imported explicitly here — they presumably
# come from the fastai star-import at the top of the notebook.
dataset_path = Path('../input/siim-covid19-detection')
l1=dataset_path.ls()
# Print one directory entry per line.
for l in l1:
    print(l)

Viewing Study Level CSV

In [None]:
# Read the study-level label table and show its size plus the first rows.
study_csv = dataset_path/'train_study_level.csv'
train_study_df = pd.read_csv(study_csv)
print(train_study_df.shape)
train_study_df.head()

Storing the unique study IDs

In [None]:
# Distinct values of the `id` column (sorted by np.unique); the cell displays how many there are.
lst = np.unique(train_study_df['id'])
len(lst)

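Print the unique label combinations at study level
Note: the target classes are one-hot encoded, so each row of the output is one distinct combination of the four class columns.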

In [None]:
# The four study-level target columns, in the order used for all later plots.
study_classes = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]
# Distinct rows of the label matrix, i.e. every label combination that actually occurs.
study_label_matrix = train_study_df[study_classes].values
np.unique(study_label_matrix, axis=0)

Plotting the Class vs Frequency graph - Study Level

In [None]:
# Column-wise sum of the label columns = number of studies per class.
class_totals = train_study_df[study_classes].values.sum(axis=0)
bar_positions = [1,2,3,4]

plt.figure(figsize=(10,5))
plt.bar(bar_positions, class_totals)
# Replace the numeric tick positions with the class names.
plt.xticks(bar_positions, study_classes)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

Viewing Image Level CSV

In [None]:
# Read the image-level annotation table and show its size plus the first rows.
image_csv = dataset_path/'train_image_level.csv'
train_image_df = pd.read_csv(image_csv)
print(train_image_df.shape)
train_image_df.head()

Splitting the label attribute of image level CSV

In [None]:
# Number of whitespace-separated tokens that make up one annotation in `label`.
# Later cells read token 0 as the class name and tokens 2..5 as x1, y1, x2, y2;
# token 1 is not used in this notebook (presumably a confidence value — confirm).
FIELDS_PER_BOX = 6


def _split_label(label):
    """Split a raw `label` string into a list of per-box token lists.

    The string is tokenised once on whitespace and then cut into consecutive
    groups of FIELDS_PER_BOX tokens (the last group may be shorter if the
    token count is not a multiple of FIELDS_PER_BOX).
    """
    tokens = label.split()
    return [tokens[start:start + FIELDS_PER_BOX]
            for start in range(0, len(tokens), FIELDS_PER_BOX)]


# Adds/overwrites the 'split_label' column; re-running the cell gives the same result.
train_image_df['split_label'] = train_image_df.label.apply(_split_label)
train_image_df.head()

Finding the class frequency and the bounding-box areas

In [None]:
# Collect, over every annotation of every image:
#   classes_freq - the class name (token 0), histogrammed below
#   bbox_areas   - the box area in squared pixels, plotted in the next cell
classes_freq = []
bbox_areas = []
for boxes in train_image_df.split_label:
    for box in boxes:
        classes_freq.append(box[0])
        # Tokens 2..5 are used as (x1, y1, x2, y2) elsewhere in this notebook.
        x_min, y_min, x_max, y_max = (float(tok) for tok in box[2:6])
        # area = width * height (the height must be a difference, not a product)
        bbox_areas.append((x_max - x_min) * (y_max - y_min))
# NOTE(review): every annotation is counted, whatever its class; filter on
# box[0] == 'opacity' first if only real boxes should contribute to the areas.
plt.hist(classes_freq)
plt.ylabel('Frequency')

Plotting Bounding Box Areas

In [None]:
# Histogram of the per-annotation box areas collected in `bbox_areas` by the previous cell
# (matplotlib's default binning is used).
plt.hist(bbox_areas)
plt.ylabel('Frequency')

Functions to convert .dcm to numpy arrays

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixel data as a uint8 array scaled to 0..255.

    Parameters
    ----------
    path : path-like
        Location of the DICOM file.
    voi_lut : bool
        If True, pass the raw pixels through pydicom's `apply_voi_lut`
        using the dataset's own metadata; otherwise use the raw pixel array.
    fix_monochrome : bool
        If True and the file's PhotometricInterpretation is "MONOCHROME1",
        invert the intensities (max - value).

    Returns
    -------
    numpy.ndarray
        Array of dtype uint8 with the same shape as the pixel data, min-max
        normalised to the 0..255 range. A constant image yields all zeros.
    """
    # `dcmread` replaces `read_file`, which is deprecated and removed in pydicom 3.0.
    dicom = pydicom.dcmread(path)
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    # Min-max normalisation: shift to start at 0, then scale by the peak value.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by zero would produce NaNs, so return black.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    data = (data*255).astype(np.uint8)
    return data

def plot_img(img, size=(7,7), is_rgb=True, title="", cmap='gray'):
    """Show a single image in a new matplotlib figure.

    `size` is the figure size passed to `plt.figure`, `title` becomes the
    figure's suptitle and `cmap` is forwarded to `imshow`. `is_rgb` is
    accepted but not used by the body.
    """
    figure = plt.figure(figsize=size)
    plt.imshow(img, cmap=cmap)
    figure.suptitle(title)
    plt.show()
    
def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500, 500)):
    """Show a sequence of images in a grid with `cols` columns.

    Parameters
    ----------
    imgs : sequence of arrays
        Images to display, placed left-to-right, top-to-bottom.
    cols : int
        Number of grid columns.
    size : number
        Side length of one grid cell; the figure is (cols*size, rows*size).
    is_rgb : bool
        Accepted for compatibility; not used by the body.
    title : str
        Figure-level title.
    cmap : str or None
        Colormap forwarded to `imshow`.
    img_size : tuple or None
        If not None, every image is resized with `cv2.resize(img, img_size)`
        before being shown.
    """
    # Ceiling division: exactly enough rows for all images. The previous
    # `len(imgs)//cols + 1` added an empty row whenever the count was a
    # multiple of `cols`. Keep at least one row so an empty input still
    # yields a valid figure size.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for position, img in enumerate(imgs, start=1):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        fig.add_subplot(rows, cols, position)
        plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()

Plotting x-ray images

In [None]:
# Gather the DICOM files under the train folder and display the first four of them.
train_dir = dataset_path/'train'
dicom_paths = get_dicom_files(train_dir)
imgs = []
for dcm_path in dicom_paths[:4]:
    imgs.append(dicom2array(dcm_path))
plot_imgs(imgs)

Studies with more than 3 images

In [None]:
# Count the DICOM files in each entry of the train folder, report the entries
# that hold more than 3 images, and plot the distribution of the counts.
num_images_per_study = []
for study_dir in (dataset_path/'train').ls():
    # Look the files up once per study; each call scans the study folder on disk.
    n_images = len(get_dicom_files(study_dir))
    num_images_per_study.append(n_images)
    if n_images > 3:
        print(f'Study {study_dir} has {n_images} images')
plt.hist(num_images_per_study)
plt.show()

Extracting image paths

In [None]:
def image_path(row):
    """Return the DICOM file that belongs to one row of the image-level table.

    The study folder is `dataset_path/'train'/row.StudyInstanceUID`; within it,
    the file whose stem equals the part of `row.id` before the first underscore
    is returned. Returns None when no file matches.
    """
    study_dir = dataset_path/'train'/row.StudyInstanceUID
    matches = (dcm for dcm in get_dicom_files(study_dir)
               if row.id.split('_')[0] == dcm.stem)
    return next(matches, None)

# Row-wise lookup; adds/overwrites the 'image_path' column.
train_image_df['image_path'] = train_image_df.apply(image_path, axis=1)
train_image_df.head()

Plotting bounding box in x-ray images

In [None]:
# Draw the annotated 'opacity' boxes on 8 randomly chosen training images.
imgs = []
image_paths = train_image_df['image_path'].values
thickness = 10  # outline width in pixels, drawn on the downscaled image
scale = 5       # images are shrunk by this factor before drawing; box coordinates are divided by it too
for _ in range(8):
    # Named `sample_path` (not `image_path`) so the `image_path` function defined
    # in the previous cell is not overwritten.
    # NOTE(review): `random` is not imported explicitly in this notebook; it is
    # presumably provided by the fastai star-import — confirm.
    sample_path = random.choice(image_paths)
    print(sample_path)
    img = dicom2array(path=sample_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    # Replicate the grayscale channel three times so a coloured box can be drawn.
    img = np.stack([img, img, img], axis=-1)
    boxes = train_image_df.loc[train_image_df['image_path'] == sample_path].split_label.values[0]
    for box in boxes:
        if box[0] == 'opacity':
            img = cv2.rectangle(img,
                                (int(float(box[2])/scale), int(float(box[3])/scale)),
                                (int(float(box[4])/scale), int(float(box[5])/scale)),
                                [255,0,0], thickness)
    img = cv2.resize(img, (500,500))
    imgs.append(img)
plot_imgs(imgs, cmap=None)