In [1]:
import os
import numpy as np
import pandas as pd
import pydicom
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the CSV file with image paths and labels
df = pd.read_csv('RSNA_breast_cancer_data.csv')

# Function to load DICOM images and resize them to 224x224 (for ResNet)
def load_dicom_image(image_path, target_size=(224, 224)):
    dicom = pydicom.dcmread(image_path)
    image = dicom.pixel_array
    image_resized = cv2.resize(image, target_size)
    image_normalized = image_resized / np.max(image_resized)  # Normalize pixel values
    # Convert to 3 channels (RGB) by repeating the grayscale image
    image_rgb = np.stack([image_normalized] * 3, axis=-1)
    return image_rgb

# Load image paths and labels
image_paths = df['image_path'].values
labels = df['cancer'].values

# Load and preprocess all images
images = np.array([load_dicom_image(path) for path in image_paths])
labels = np.array(labels)

In [3]:
# Split data into training (70%), validation (10%), and test sets (20%)
X_train, X_temp, y_train, y_temp = train_test_split(images, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.60, random_state=42)  # 10% for val, 20% for test

# Normalize images to be between 0 and 1
X_train = X_train.astype('float32') / 255.0
X_val = X_val.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# Data augmentation generator
train_datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Create validation data generator (just rescaling)
val_datagen = ImageDataGenerator()

In [4]:
# Define a function to create the model with epochs as a parameter
def create_model(optimizer='adam', dropout_rate=0.5, num_dense_units=256, activation='relu', epochs=10):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    base_model.trainable = False  # Freeze the base layers
    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dense(num_dense_units, activation=activation),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    # Set epochs in model fitting during grid search
    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val))
    return model

# Define the hyperparameters to tune
param_grid = {
    'optimizer': ['adam', 'rmsprop', SGD(learning_rate=1e-3, momentum=0.9)],
    'dropout_rate': [0.3, 0.5, 0.7],
    'num_dense_units': [128, 256, 512],
    'activation': ['relu', 'tanh'],
    'batch_size': [16, 32, 64],
    'learning_rate': [1e-3, 1e-4, 1e-5]
}

# Wrap the model for GridSearchCV
def model_builder(optimizer='adam', dropout_rate=0.5, num_dense_units=256, activation='relu', batch_size=16, epochs=30):
    model = create_model(optimizer, dropout_rate, num_dense_units, activation, epochs)
    return model

In [None]:
# Create a GridSearchCV object (without using KerasClassifier)
grid_search = GridSearchCV(estimator=model_builder(),
                           param_grid=param_grid,
                           cv=3,
                           verbose=1,
                           n_jobs=-1,
                           scoring = 'f1')

# Create augmented data generators for GridSearchCV
train_generator = train_datagen.flow(X_train, y_train, batch_size=16)
val_generator = val_datagen.flow(X_val, y_val, batch_size=16)

# Fit the model with grid search
grid_result = grid_search.fit(train_generator, validation_data=val_generator)

Epoch 1/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 1s/step - accuracy: 0.4974 - loss: 0.8491 - val_accuracy: 0.5193 - val_loss: 0.6984
Epoch 2/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.5239 - loss: 0.7123 - val_accuracy: 0.5193 - val_loss: 0.6924
Epoch 3/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1s/step - accuracy: 0.4960 - loss: 0.7010 - val_accuracy: 0.4807 - val_loss: 0.6949
Epoch 4/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1s/step - accuracy: 0.4855 - loss: 0.6971 - val_accuracy: 0.5193 - val_loss: 0.6930
Epoch 5/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.4736 - loss: 0.6968 - val_accuracy: 0.4807 - val_loss: 0.6941
Epoch 6/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 1s/step - accuracy: 0.5043 - loss: 0.6926 - val_accuracy: 0.5193 - val_loss: 0.6924
Epoch 7/30
[1m52/52[0m [32m━━━━━━━━━━

AbortedError: Graph execution error:

Detected at node StatefulPartitionedCall/sequential_1_1/resnet50_1/conv5_block1_2_conv_1/BiasAdd defined at (most recent call last):
<stack traces unavailable>
Operation received an exception:Status: 1, message: could not create a memory object, in file tensorflow/core/kernels/mkl/mkl_conv_ops.cc:1112
	 [[{{node StatefulPartitionedCall/sequential_1_1/resnet50_1/conv5_block1_2_conv_1/BiasAdd}}]] [Op:__inference_multi_step_on_iterator_57101]

In [None]:
# Get the best parameters and the best score
print(f"Best score: {grid_result.best_score_}")
print(f"Best parameters: {grid_result.best_params_}")

In [None]:
# # Get the best model with the best parameters
# best_model = grid_result.best_estimator_.model

# # Retrain the best model on the full training data
# history = best_model.fit(
#     X_train, y_train, 
#     epochs=30,  # Specify the number of epochs
#     batch_size=32,  # Specify batch size
#     validation_data=(X_val, y_val)  # Validation data
# )

In [None]:
# Evaluate the best model on the test set
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print(f'\nTest accuracy: {test_acc}')

In [None]:
import matplotlib.pyplot as plt

# Create subplots for accuracy and loss
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Plot accuracy
ax1.plot(history.history['accuracy'], label='Train Accuracy')
ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Accuracy')
ax1.set_title('Accuracy over Epochs')
ax1.legend()

# Plot loss
ax2.plot(history.history['loss'], label='Train Loss')
ax2.plot(history.history['val_loss'], label='Validation Loss')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Loss')
ax2.set_title('Loss over Epochs')
ax2.legend()

# Display the plots
plt.tight_layout()
plt.show()