In [1]:
import os
import pydicom
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import math

In [37]:
dataset_path = 'sample_images'
csv_file = 'stage1_labels2.csv'
labels_df = pd.read_csv(csv_file)

In [38]:
patients = os.listdir(dataset_path)

In [39]:
labels_df.head()

Unnamed: 0,id,cancer
0,0a38e7597ca26f9374f8ea2770ba870d,0
1,R_056,1
2,0ddeb08e9c97227853422bd71a2a695e,0
3,00cba091fa4ad62cc3200a657aeb957e,0
4,fbe0c3d6e4a50ca1c1bd3101515d0ab4,0


In [40]:
for patient in patients :
    label = labels_df.loc[patient, 'cancer']
    path = dataset_path + '/' + patient
    slices = [pydicom.dcmread(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    print(len(slices),slices[0].pixel_array.shape)

KeyError: '00cba091fa4ad62cc3200a657aeb957e'

In [41]:
len(patients)

74

In [42]:
IMG_PXL_SIZE = 150

for patient in patients[:1] :
    label = labels_df.loc[patient, 'cancer']
    path = dataset_path + '/' + patient
    slices = [pydicom.dcmread(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    fig = plt.figure()
    for num,each_slice in enumerate(slices[:16]) :
        y = fig.add_subplot(4,4,num+1)
        new_image = cv2.resize(np.array(each_slice.pixel_array),(IMG_PXL_SIZE,IMG_PXL_SIZE))
        y.imshow(new_image)
    plt.show()

KeyError: '00cba091fa4ad62cc3200a657aeb957e'

In [43]:
# Helper function to pad or crop slices
def adjust_slices(slices, target_count):
    if len(slices) < target_count:
        # Pad by repeating slices
        pad_count = target_count - len(slices)
        slices = np.pad(slices, ((0, pad_count), (0, 0), (0, 0)), mode="wrap")
    elif len(slices) > target_count:
        # Crop to target count
        start_idx = (len(slices) - target_count) // 2
        slices = slices[start_idx : start_idx + target_count]
    return slices

In [44]:
TARGET_SLICE_COUNT = 64  # Number of slices per patient
TARGET_SLICE_DIMENSIONS = (128, 128)  # Dimensions of each slice

In [45]:
# Helper function: Preprocess a single patient's folder
def preprocess_patient(patient_folder):
    slices = []
    for file_name in sorted(os.listdir(patient_folder)):
        file_path = os.path.join(patient_folder, file_name)
        try:
            ds = pydicom.dcmread(file_path)
            img = ds.pixel_array
            resized_img = cv2.resize(img, TARGET_SLICE_DIMENSIONS)
            slices.append(resized_img)
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    return adjust_slices(slices, TARGET_SLICE_COUNT)

In [46]:
labels = labels_df.set_index("id").to_dict()["cancer"]

In [47]:
# Step 2: Preprocess data
data = []
targets = []
for patient_id in os.listdir(dataset_path):
    patient_folder = os.path.join(dataset_path, patient_id)
    if os.path.isdir(patient_folder) and patient_id in labels:
        print(f"Processing patient: {patient_id}")
        processed_slices = preprocess_patient(patient_folder)
        data.append(processed_slices)
        targets.append(labels[patient_id])

Processing patient: 00cba091fa4ad62cc3200a657aeb957e
Processing patient: 00edff4f51a893d80dae2d42a7f45ad1
Processing patient: 0a099f2549429d29b32f349e95fb2244
Processing patient: 0a0c32c9e08cc2ea76a71649de56be6d
Processing patient: 0a38e7597ca26f9374f8ea2770ba870d
Processing patient: 0acbebb8d463b4b9ca88cf38431aac69
Processing patient: 0b20184e0cd497028bdd155d9fb42dc9
Processing patient: 0bd0e3056cbf23a1cb7f0f0b18446068
Processing patient: 0c0de3749d4fe175b7a5098b060982a1
Processing patient: 0c37613214faddf8701ca41e6d43f56e
Processing patient: 0c59313f52304e25d5a7dcf9877633b1
Processing patient: 0c60f4b87afcb3e2dfa65abbbf3ef2f9
Processing patient: 0c98fcb55e3f36d0c2b6507f62f4c5f1
Processing patient: 0c9d8314f9c69840e25febabb1229fa4
Processing patient: 0ca943d821204ceb089510f836a367fd
Processing patient: 0d06d764d3c07572074d468b4cff954f
Processing patient: 0d19f1c627df49eb223771c28548350e
Processing patient: 0d2fcf787026fece4e57be167d079383
Processing patient: 0d941a3ad6c889ac451caf89c4

In [48]:
# Convert to NumPy arrays
data = np.array(data)
targets = np.array(targets)

In [49]:
data.shape

(74, 64, 128, 128)

In [50]:
targets.shape

(74,)

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
from sklearn.model_selection import train_test_split

# Split data into train and validation sets (80% train, 20% validation)
X_train, X_valid, y_train, y_valid = train_test_split(data, targets, test_size=0.3, random_state=42)

In [56]:
# Step 4: Save preprocessed data
np.save("datasets/X_train.npy", X_train)
np.save("datasets/y_train.npy", y_train)
np.save("datasets/X_valid.npy", X_valid)
np.save("datasets/y_valid.npy", y_valid)


print("Data preprocessing complete!")
print(f"Train data:X {X_train.shape} - Y {y_train.shape}, Validation data:X {X_valid.shape} - Y {y_valid.shape}")

Data preprocessing complete!
Train data:X (51, 64, 128, 128) - Y (51,), Validation data:X (23, 64, 128, 128) - Y (23,)


In [55]:
labels_df['cancer'].value_counts()

0    37
1    37
Name: cancer, dtype: int64