In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from PIL import Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.metrics import Recall
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
import tensorflow as tf

# Locations of the competition data: the train/test image folders, plus the
# label table and the sample-submission table loaded as DataFrames.
data_dir = 'histopathologic-cancer-detection'
sample_submission = pd.read_csv(f'{data_dir}/sample_submission.csv')
train_labels = pd.read_csv(f'{data_dir}/train_labels.csv')
train_dir = f'{data_dir}/train'
test_dir = f'{data_dir}/test'

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_labels.info()

print(train_labels.head())

sample_submission.info()

In [None]:
# Visualize some of the images
def show_images(ids, labels, path, title):
    """Display a single row of images, one subplot per image.

    Args:
        ids: Image ids (file names without the '.tif' extension). Must support
            len(), which sets the number of subplot columns.
        labels: Labels paired with `ids` by position; each is shown in the
            title of its subplot.
        path: Directory containing the '<id>.tif' files.
        title: Super-title drawn above the whole figure.
    """
    plt.figure(figsize=(15, 5))
    for i, (img_id, label) in enumerate(zip(ids, labels)):
        img_path = os.path.join(path, img_id + '.tif')
        plt.subplot(1, len(ids), i + 1)
        # The context manager releases the file handle as soon as the image
        # has been drawn; a bare Image.open() keeps it open until the object
        # is garbage collected.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.title(f"Label: {label}")
        plt.axis('off')
    plt.suptitle(title)
    plt.show()

# First five training ids labelled 0 (no cancer)
negative_ids = train_labels.loc[train_labels['label'] == 0, 'id'].head(5)
show_images(negative_ids, [0] * 5, train_dir, "Examples Without Cancer")

# First five training ids labelled 1 (cancer)
positive_ids = train_labels.loc[train_labels['label'] == 1, 'id'].head(5)
show_images(positive_ids, [1] * 5, train_dir, "Examples With Cancer")

In [None]:
# Report how many entries each image directory holds and its share of the total
num_train_images = len(os.listdir(train_dir))
num_test_images = len(os.listdir(test_dir))
total_len_of_dataset = num_train_images + num_test_images

for split_name, split_count in (("training", num_train_images), ("test", num_test_images)):
    # Guard the division so two empty directories report a share of 0.0
    # instead of raising ZeroDivisionError.
    split_share = split_count / total_len_of_dataset if total_len_of_dataset else 0.0
    # Print the count the label promises, and the rounded share next to it.
    print(f"Number of {split_name} images: {split_count} ({round(split_share, 2)} of the dataset)")

In [None]:
# Inspect one training image to learn the image size and colour channels.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which file
# is sampled may differ between machines.
sample_image_path = os.path.join(train_dir, os.listdir(train_dir)[0])
sample_image = Image.open(sample_image_path)
print(f"Sample image dimensions: {sample_image.size}")
# .mode is a string (e.g. 'RGB'), not a count; the channel count is the number
# of bands, so both are reported.
print(f"Number of channels in the sample image: {len(sample_image.getbands())} (mode: {sample_image.mode})\n\n")

# Bar chart of how many training samples carry each label
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='label', data=train_labels, ax=ax)
ax.set_title('Distribution of Labels in Training Set')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Image preprocessing
target_size = (60, 60)  # Size passed to the generators as target_size (downscaling)
batch_size = 256

train_datagen = ImageDataGenerator(rescale=1./255,  # Normalize from 0-255 to 0-1
                                   validation_split=.25  # Hold out 25% for validation
                                  )

# The label must be converted from int64 to string to work with the flow_from_dataframe function.
# astype(str) on an already-string column is a no-op, so re-running this cell is safe.
train_labels['label'] = train_labels['label'].astype(str)

# The file ids must carry the .tif extension to work with the flow_from_dataframe function.
# Only add it where it is missing: appending unconditionally would turn 'x.tif'
# into 'x.tif.tif' when this cell is executed a second time.
train_labels['id'] = train_labels['id'].apply(lambda x: x if x.endswith('.tif') else x + '.tif')

# Settings shared by the training and validation iterators; only `subset` differs.
# NOTE(review): `workers` and `use_multiprocessing` do not appear among the named
# parameters of flow_from_dataframe - presumably they are absorbed by **kwargs
# and have no effect here. Confirm against the installed Keras version.
flow_kwargs = dict(
    dataframe=train_labels,
    directory=train_dir,
    x_col='id',
    y_col='label',
    target_size=target_size,
    batch_size=batch_size,
    class_mode='binary',
    workers=4,
    use_multiprocessing=True,
    shuffle=True,
)

# Iterator over the training part of the split
train_data_generator = train_datagen.flow_from_dataframe(subset='training', **flow_kwargs)

# Iterator over the validation part of the split (validation_split on train_datagen)
validation_data_generator = train_datagen.flow_from_dataframe(subset='validation', **flow_kwargs)

In [None]:
# First model architecture: three Conv2D + MaxPooling2D stages with 16, 32 and
# 64 filters, followed by a dense classification head.
conv_blocks = []
for n_filters in (16, 32, 64):
    conv_blocks += [
        # 3x3 convolution with ReLU activation
        Conv2D(n_filters, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
    ]

model = Sequential([
    # Input layer: images of target_size with 3 channels
    Input(shape=(*target_size, 3)),
    *conv_blocks,
    Flatten(),
    # Dense layer followed by dropout for regularization
    Dense(256, activation='relu'),
    Dropout(0.15),
    # Single sigmoid unit for binary classification
    Dense(1, activation='sigmoid'),
])

# Track accuracy and recall alongside the binary cross-entropy loss
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', Recall(name='recall')],
)

model.summary()

# Stop once val_loss has gone 3 epochs without improving, and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Number of whole batches in each split (integer division)
steps_per_epoch = train_data_generator.samples // train_data_generator.batch_size
validation_steps = validation_data_generator.samples // validation_data_generator.batch_size

# Training the model
history = model.fit(
    x=train_data_generator,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    validation_data=validation_data_generator,
    validation_steps=validation_steps,
    callbacks=[early_stopping],
)

In [None]:
def plot_metric(history, metric_name):
    """Plot the training and validation curves of one metric and show them.

    Args:
        history: Object whose `.history` mapping holds per-epoch values keyed
            by metric name, with validation values under 'val_<metric_name>'.
        metric_name: Key of the training metric in `history.history`.
    """
    curves = history.history
    # Training curve first, then its validation counterpart
    for key in (metric_name, f'val_{metric_name}'):
        plt.plot(curves[key])
    plt.title(f'Model {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

# plot_metric() ends with plt.show(), so a figure created once up front is only
# used by the first metric and the later ones fall back to the default size.
# Open a fresh, equally sized figure for every metric instead.
for metric_name in ('accuracy', 'loss', 'recall'):
    plt.figure(figsize=(12, 4))
    plot_metric(history, metric_name)

In [None]:
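# NOTE: before re-enabling this cell, pass shuffle=False to flow_from_dataframe below.
# Its default is shuffle=True, which would return predictions in a different order
# than test_dataframe['id'] and attach them to the wrong ids in the submission.
# test_datagen = ImageDataGenerator(rescale=1./255, # Normalize from 0-255 to 0-1
#                                   )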

# test_files = os.listdir(test_dir)
# test_dataframe = pd.DataFrame(test_files, columns=['id'])
# test_data_generator = test_datagen.flow_from_dataframe(dataframe=test_dataframe,
#                                                     directory=test_dir,
#                                                     x_col='id',
#                                                     y_col=None,
#                                                     target_size=target_size,
#                                                     class_mode=None,
#                                                     workers=4,
#                                                     use_multiprocessing=True)

In [None]:
# predictions = model.predict(test_data_generator)
# predictions = np.transpose(predictions)[0]
# print(f"Number of predictions: {len(predictions)}")
# print(f"Number of test samples: {len(test_dataframe)}")
# binary_predictions = (predictions >= 0.5).astype(int).flatten() # convert the sigmoid 0-1 to either 0 or 1
# print(len(binary_predictions))
# submission_dataframe = pd.DataFrame()
# submission_dataframe['id'] = test_dataframe['id'].apply(lambda x: x.split('.')[0]) # remove .tif from the id
# submission_dataframe['label'] = binary_predictions
# submission_dataframe.to_csv('submission.csv', index=False)