In [1]:
# Machine Learning and Data Science Imports
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
import tensorflow_hub as hub
from PIL import Image
import numpy as np
import pandas as pd
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from skimage import exposure
import scipy
import cv2

# Built-In Imports
from pathlib import Path
import warnings
import urllib
import zipfile
import shutil
import os
import re
import gc
import math
import time
from tqdm.notebook import tqdm  

# Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as patches
from matplotlib.colors import ListedColormap
# Shared font settings (family, size, colour) kept as a dict for reuse in figures.
FIG_FONT = dict(family="Helvetica, Arial", size=14, color="#7f7f7f")
# 15 colours sampled from seaborn's "Spectral" palette, each converted to a
# 0-255 RGB tuple and then to a plotly "rgb(r, g, b)" label string.
LABEL_COLORS = [px.colors.label_rgb(px.colors.convert_to_RGB_255(x)) for x in sns.color_palette("Spectral", 15)]
# Same list with the entry at index 8 removed (14 colours remain).
# NOTE(review): presumably index 8 is the position of "No finding" in an
# alphabetically sorted class-name list; in train.csv its class_id is 14, so
# confirm this index against the ordering used wherever this list is consumed.
LABEL_COLORS_WOUT_NO_FINDING = LABEL_COLORS[:8]+LABEL_COLORS[9:]


In [2]:
def get_dicom_img_paths(directory):
    """Recursively collect the paths of all ``*.dicom`` files under a directory.

    Args:
        directory: Root folder to search. May be a ``pathlib.Path`` or a plain
            string / ``os.PathLike``; it is wrapped in ``Path`` so both work.

    Returns:
        list[pathlib.Path]: Matching file paths, sorted so the result does not
        depend on the order in which the filesystem lists entries.
    """
    return sorted(Path(directory).rglob("*.dicom"))

def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    The file is parsed with ``pd.read_csv`` defaults; no columns are dropped
    or re-typed here.
    """
    frame = pd.read_csv(file_path)
    return frame

def process_and_resize_xray(path, size=None, keep_ratio=False, resample=Image.LANCZOS, voi_lut=True, fix_monochrome=True):
    """
    Read a DICOM file, convert its pixel data to an 8-bit image, and optionally resize it.

    Args:
        path: Location of the DICOM file to read.
        size: Target edge length in pixels. When None the image is not resized.
        keep_ratio: If True, shrink in place with ``Image.thumbnail`` so the image
            fits inside a ``size`` x ``size`` box; if False, resize to exactly
            ``size`` x ``size``.
        resample: PIL resampling filter passed to ``thumbnail`` / ``resize``.
        voi_lut: If True, run the pixel data through ``apply_voi_lut``.
        fix_monochrome: If True, invert the intensities when the file's
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        tuple: ``(image, original_dims)`` where ``image`` is a ``PIL.Image.Image``
        built from the uint8 data and ``original_dims`` is the shape of the
        DICOM pixel array before any resizing.

    Raises:
        IOError: If the file cannot be read as DICOM.
    """
    try:
        # dcmread is the supported reader; read_file was a deprecated alias
        # that newer pydicom releases no longer provide.
        dicom = pydicom.dcmread(path)
    except Exception as e:
        raise IOError(f"Could not read DICOM file: {path}") from e

    # Read the pixel data once and reuse it.
    pixels = dicom.pixel_array
    original_dims = pixels.shape

    data = apply_voi_lut(pixels, dicom) if voi_lut else pixels

    # Do the arithmetic in float64: in a narrow/signed integer dtype the
    # subtractions below (max - data, data - min) can silently wrap around.
    data = np.asarray(data, dtype=np.float64)

    # A file without the PhotometricInterpretation tag is left un-inverted
    # instead of raising AttributeError.
    if fix_monochrome and getattr(dicom, "PhotometricInterpretation", None) == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to [0, 1]; a constant image (peak == 0) stays all zeros.
    data = data - np.min(data)
    peak = np.max(data)
    if peak != 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    data_image = Image.fromarray(data)

    if size is not None:
        if keep_ratio:
            # thumbnail() modifies the image in place.
            data_image.thumbnail((size, size), resample)
        else:
            data_image = data_image.resize((size, size), resample)

    return data_image, original_dims

def process_directory(load_dir, save_dir, size=None, keep_ratio=False):
    """
    Convert every ``.dicom`` file directly inside ``load_dir`` to a PNG in ``save_dir``.

    Args:
        load_dir: Folder containing the DICOM files (not searched recursively).
        save_dir: Output folder; created if it does not exist.
        size: Forwarded to ``process_and_resize_xray`` as the target edge length.
        keep_ratio: Forwarded to ``process_and_resize_xray``.

    Returns:
        pandas.DataFrame: One row per processed file with columns ``image_id``
        (file name without the ``.dicom`` extension), ``dim_Z`` (first element
        of the original pixel-array shape, i.e. height) and ``dim_Y`` (second
        element, i.e. width). Rows are ordered by file name.
    """
    os.makedirs(save_dir, exist_ok=True)
    image_id, dim_Z, dim_Y = [], [], []

    suffix = '.dicom'
    # Filter before handing the list to tqdm so the progress total counts only
    # DICOM files, and sort so the output row order is reproducible.
    dicom_files = sorted(name for name in os.listdir(load_dir) if name.endswith(suffix))

    for file in tqdm(dicom_files):
        # Strip only the trailing extension; str.replace would also rewrite
        # any earlier '.dicom' occurring inside the file name.
        stem = file[:-len(suffix)]

        processed_image, original_dims = process_and_resize_xray(os.path.join(load_dir, file), size=size, keep_ratio=keep_ratio)
        processed_image.save(os.path.join(save_dir, stem + '.png'))

        image_id.append(stem)
        dim_Z.append(original_dims[0])  # Height
        dim_Y.append(original_dims[1])  # Width

    return pd.DataFrame({'image_id': image_id, 'dim_Z': dim_Z, 'dim_Y': dim_Y})

In [3]:
# Turn off pandas' chained-assignment check for the whole session
# (None means neither warn nor raise).
pd.set_option("mode.chained_assignment", None)

# %% [markdown]
# # Data Directory Setup

# %% [code]
# Root folder that holds the dataset (relative to the working directory)
DATA_DIR = Path("data")

# Sub-folders with the training and testing DICOM files
TRAIN_DIR = DATA_DIR.joinpath("train")
TEST_DIR = DATA_DIR.joinpath("test")

# Report whether each folder is present on disk
print(f"Existence of DATA Folder: {DATA_DIR.exists()}")
print(f"Existence of TRAIN Folder: {TRAIN_DIR.exists()}")
print(f"Existence of TEST Folder: {TEST_DIR.exists()}")

Existence of DATA Folder: True
Existence of TRAIN Folder: True
Existence of TEST Folder: True


In [4]:
# File Paths: recursively gather every *.dicom file under the train/test folders
train_dicom_paths = get_dicom_img_paths(TRAIN_DIR)
test_dicom_paths = get_dicom_img_paths(TEST_DIR)

# Load DataFrames: the training annotations and the sample submission file
train_df = load_data(DATA_DIR / "train.csv")
submission_df = load_data(DATA_DIR / "sample_submission.csv")

# Display Information: file counts, then a preview of each DataFrame
print(f"\nNumber of training files: {len(train_dicom_paths)}")
print(f"Number of testing files: {len(test_dicom_paths)}")

print("\nTraining DataFrame first 5 rows:\n")
# display() is provided by the IPython/Jupyter session (it is not imported here)
display(train_df.head())

print("\nSample Submission DataFrame first 5 rows:\n")
display(submission_df.head())

# Process Images and Save Dimensions
# Converts each training DICOM to a PNG under save_dir and records the
# original pixel-array dimensions of every image.
save_dir = 'Proc_data/train/'
size = 256  # Resize to 256x256
keep_ratio = False  # Do not keep the original aspect ratio

# process_directory joins paths with os.path, so TRAIN_DIR is passed as a string.
# It also creates save_dir (and therefore the parent 'Proc_data' folder).
dimensions_df = process_directory(str(TRAIN_DIR), save_dir, size=size, keep_ratio=keep_ratio)

# Save the DataFrame to a CSV file
# The target folder 'Proc_data' already exists at this point because it was
# created as the parent of save_dir during processing.
csv_file_path = 'Proc_data/Original_Image_Dimensions.csv'
dimensions_df.to_csv(csv_file_path, index=False)

print(f'Saved image dimensions to {csv_file_path}')


Number of training files: 10
Number of testing files: 0

Training DataFrame first 5 rows:



Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,



Sample Submission DataFrame first 5 rows:



Unnamed: 0,image_id,PredictionString
0,002a34c58c5b758217ed1f584ccbcfe9,14 1 0 0 1 1
1,004f33259ee4aef671c2b95d54e4be68,14 1 0 0 1 1
2,008bdde2af2462e86fd373a445d0f4cd,14 1 0 0 1 1
3,009bc039326338823ca3aa84381f17f1,14 1 0 0 1 1
4,00a2145de1886cb9eb88869c85d74080,14 1 0 0 1 1


  0%|          | 0/10 [00:00<?, ?it/s]



Saved image dimensions to Proc_data/Original_Image_Dimensions.csv
