In [None]:
#### Building CNN Model ####

In [None]:
!pip install tensorflow

In [None]:
####### Using new normalized, merged image data #####

import os
import re

# Directory that holds the normalized images
image_dir = '/sfs/weka/scratch/axu5pa/DS_4002_Project_3/PH2Dataset/normalized_images'

# Accept only names of the form 'IMD' + three digits + '_merged.bmp'
pattern = re.compile(r'^IMD\d{3}_merged\.bmp$')

# Walk the directory once, collecting the matching file names and, for each one,
# its full path keyed by the file name without the '.bmp' extension.
image_files = []
image_dict = {}
for file_name in os.listdir(image_dir):
    if not (file_name.endswith('.bmp') and pattern.match(file_name)):
        continue
    image_files.append(file_name)
    image_dict[os.path.splitext(file_name)[0]] = os.path.join(image_dir, file_name)

# Show which files passed the filter
print(f"Filtered Image Files: {image_files}")

In [None]:
##### Updating our data frame #####
import os

import pandas as pd

# Point every path at the merged image by inserting '_merged' right before the
# file extension. The pattern is anchored to the end of the string so only the
# final extension is touched, and the negative lookbehind skips paths that
# already end in '_merged.<ext>' -- re-running this cell therefore does not
# produce '..._merged_merged.bmp'.
df['Image Path'] = df['Image Path'].str.replace(
    r'(?<!_merged)(\.[^./]+)$', r'_merged\1', regex=True
)
print(df['Image Path'].head())

# Check if there are any missing matches: a row counts as missing when it has
# no path at all (null) or when the rewritten path is not a file on disk.
path_is_file = df['Image Path'].map(
    lambda p: isinstance(p, str) and os.path.isfile(p)
).astype(bool)
missing_images = df[~path_is_file]

if not missing_images.empty:
    print("Some images were not found:")
    print(missing_images)
else:
    print("All images matched successfully.")

In [None]:
##### What our dataframe looks like #####
# Render the current state of `df` (including the rewritten 'Image Path' column
# when the preceding cell has run) using the notebook's rich display.
display(df)

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Image Processing Libraries
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# TensorFlow and Keras for Deep Learning
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Scikit-learn for Data Preprocessing and Model Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Directory that holds the normalized images
image_dir = '/sfs/weka/scratch/axu5pa/DS_4002_Project_3/PH2Dataset/normalized_images'

# Accept 'IMD' + three digits, with an optional '_merged' suffix, ending in '.bmp'
pattern = re.compile(r'^IMD\d{3}(_merged)?\.bmp$')

# Collect the matching file names and index their full paths by the file name
# without its extension.
image_files = []
image_dict = {}
for file_name in os.listdir(image_dir):
    if file_name.endswith('.bmp') and pattern.match(file_name):
        image_files.append(file_name)
        image_dict[os.path.splitext(file_name)[0]] = os.path.join(image_dir, file_name)


def _image_number(item):
    """Return the first run of digits in the key of a (key, path) pair as an int."""
    return int(re.search(r'\d+', item[0]).group())


# Same mapping as image_dict, ordered by the numeric part of each key
sorted_image_dict = dict(sorted(image_dict.items(), key=_image_number))

# Order the DataFrame rows by the numeric part of 'Image Name'; the helper
# column exists only for the duration of the sort.
image_numbers = df['Image Name'].str.extract(r'(\d+)', expand=False).astype(int)
df = (
    df.assign(**{'Numeric Image Name': image_numbers})
    .sort_values(by='Numeric Image Name')
    .drop(columns=['Numeric Image Name'])
)

# Look up each row's merged image ('<Image Name>_merged') in image_dict;
# names without an entry get None.
df['Image Path'] = df['Image Name'].map(lambda name: image_dict.get(f'{name}_merged'))

# Split the rows into those with and without a resolved path and report both counts
path_found = df['Image Path'].notna()

successful_mappings = df[path_found]
print(f"\nNumber of successful image path mappings: {successful_mappings.shape[0]}")

failed_mappings = df[~path_found]
print(f"\nNumber of failed image path mappings: {failed_mappings.shape[0]}")

# Show a sample of the unresolved rows so the cause can be inspected
if not failed_mappings.empty:
    print("\nRows with failed image path mappings (missing images):")
    print(failed_mappings[['Image Name', 'Image Path']].head())

# Keep only rows whose image path was resolved (not null, not empty). The
# explicit .copy() makes df an independent frame, so adding 'Encoded Class'
# below does not write into a filtered slice of the previous frame (which is
# what triggers pandas' SettingWithCopyWarning).
df = df[df['Image Path'].notna() & (df['Image Path'] != '')].copy()

# Proceed only if at least one row is left
if df.shape[0] > 0:
    # Encode the 'Class' labels as integers. LabelEncoder numbers the distinct
    # values in sorted order; label_encoder.classes_ holds the code -> name mapping.
    label_encoder = LabelEncoder()
    df['Encoded Class'] = label_encoder.fit_transform(df['Class'])

    # 80% / 20% train/validation split, stratified on the encoded class, fixed seed
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['Encoded Class'], random_state=42)

    # Image size fed to the network (matches the ResNet50 input_shape used later)
    # and the generator batch size
    IMG_SIZE = (224, 224)
    BATCH_SIZE = 32

    # Training images are scaled by 1/255 and randomly augmented
    # NOTE(review): the ImageNet ResNet50 weights are documented to expect
    # keras.applications.resnet50.preprocess_input rather than 1/255 scaling --
    # confirm whether rescale is intentional here.
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        horizontal_flip=True,
        vertical_flip=True,
        rotation_range=30,            # random rotations up to 30 degrees
        zoom_range=0.3,               # random zoom
        shear_range=0.3,              # random shear
        brightness_range=[0.8, 1.2],  # random brightness factor
        fill_mode='nearest'
    )

    # Validation images are only rescaled, never augmented
    val_datagen = ImageDataGenerator(rescale=1./255)

    # Generators read the files named in 'Image Path' and yield the integer
    # labels from 'Encoded Class' unchanged (class_mode='raw')
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        x_col='Image Path',
        y_col='Encoded Class',
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='raw',
        shuffle=True
    )

    # shuffle=False keeps the validation order fixed
    val_generator = val_datagen.flow_from_dataframe(
        dataframe=val_df,
        x_col='Image Path',
        y_col='Encoded Class',
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='raw',
        shuffle=False
    )
else:
    print("Not enough valid data to proceed with train-test split.")


In [None]:
# Load the ImageNet-pretrained ResNet-50 without its classification head
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze every backbone layer first; a subset is re-enabled further down
base_model.trainable = False

# Classification head: global average pooling -> Dense(1024, ReLU) -> 3-way softmax
x = base_model.output
x = GlobalAveragePooling2D()(x)  # Global average pooling instead of flattening
x = Dense(1024, activation='relu')(x)
predictions = Dense(3, activation='softmax')(x)

# Full model: backbone input through to the new softmax output
model = Model(inputs=base_model.input, outputs=predictions)

# Re-enable training for the last 50 backbone layers (adjust this number as needed)
for layer in base_model.layers[-50:]:
    layer.trainable = True

# Compile once, after the trainable flags are final. Labels are integer class
# ids, hence sparse categorical cross-entropy.
# NOTE(review): this cell used to compile twice (lr=1e-5, then lr=1e-4); only
# the last compile ever took effect, so 1e-4 is kept. Lower it if the smaller
# fine-tuning rate was the intended one.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary to check architecture
model.summary()

In [None]:
# Train for up to 20 epochs, stopping early when validation loss stalls
from tensorflow.keras.callbacks import EarlyStopping

# Stop after 5 epochs without improvement in val_loss and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Step counts come from len(generator), i.e. the number of batches including a
# final partial one. Flooring with `n // BATCH_SIZE` skipped that last batch
# (so part of the validation set was never scored during training) and yields
# 0 steps when a set is smaller than one batch.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    validation_steps=len(val_generator),
    epochs=20,
    callbacks=[early_stopping]
)

In [None]:
# Two side-by-side panels: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Training vs. validation accuracy per epoch
accuracy_ax.plot(history.history['accuracy'], label='Train Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.legend(loc='upper left')

# Training vs. validation loss per epoch
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
#### More model evaluation ####

# Score the model on the validation generator; with the compiled metrics this
# returns the loss followed by the accuracy.
evaluation = model.evaluate(val_generator)
val_loss, val_accuracy = evaluation
print(
    f"Validation Loss: {val_loss}",
    f"Validation Accuracy: {val_accuracy * 100:.2f}%",
    sep="\n",
)


In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Image Processing Libraries
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# TensorFlow and Keras for Deep Learning
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Scikit-learn for Data Preprocessing and Model Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Directory that holds the normalized images
image_dir = '/sfs/weka/scratch/axu5pa/DS_4002_Project_3/PH2Dataset/normalized_images'

# Accept 'IMD' + three digits, with an optional '_merged' suffix, ending in '.bmp'
pattern = re.compile(r'^IMD\d{3}(_merged)?\.bmp$')

# Collect the matching file names and index their full paths by the file name
# without its extension.
image_files = []
image_dict = {}
for file_name in os.listdir(image_dir):
    if file_name.endswith('.bmp') and pattern.match(file_name):
        image_files.append(file_name)
        image_dict[os.path.splitext(file_name)[0]] = os.path.join(image_dir, file_name)


def _image_number(item):
    """Return the first run of digits in the key of a (key, path) pair as an int."""
    return int(re.search(r'\d+', item[0]).group())


# Same mapping as image_dict, ordered by the numeric part of each key
sorted_image_dict = dict(sorted(image_dict.items(), key=_image_number))

# Order the DataFrame rows by the numeric part of 'Image Name'; the helper
# column exists only for the duration of the sort.
image_numbers = df['Image Name'].str.extract(r'(\d+)', expand=False).astype(int)
df = (
    df.assign(**{'Numeric Image Name': image_numbers})
    .sort_values(by='Numeric Image Name')
    .drop(columns=['Numeric Image Name'])
)

# Look up each row's merged image ('<Image Name>_merged') in image_dict;
# names without an entry get None.
df['Image Path'] = df['Image Name'].map(lambda name: image_dict.get(f'{name}_merged'))

# Split the rows into those with and without a resolved path and report both counts
path_found = df['Image Path'].notna()

successful_mappings = df[path_found]
print(f"\nNumber of successful image path mappings: {successful_mappings.shape[0]}")

failed_mappings = df[~path_found]
print(f"\nNumber of failed image path mappings: {failed_mappings.shape[0]}")

# Show a sample of the unresolved rows so the cause can be inspected
if not failed_mappings.empty:
    print("\nRows with failed image path mappings (missing images):")
    print(failed_mappings[['Image Name', 'Image Path']].head())

# Image size fed to the network and generator batch size
IMG_SIZE = (224, 224)
BATCH_SIZE = 64

# Drop rows whose image path could not be resolved (null or empty), as the
# earlier training cell does, so they are neither counted in the split nor
# handed to flow_from_dataframe as missing file names. The .copy() makes df an
# independent frame so the column assignment below does not write into a slice.
df = df[df['Image Path'].notna() & (df['Image Path'] != '')].copy()

# Encode the 'Class' labels as integers. LabelEncoder numbers the distinct
# values in sorted order; label_encoder.classes_ holds the code -> name mapping.
label_encoder = LabelEncoder()
df['Encoded Class'] = label_encoder.fit_transform(df['Class'])

# 80% / 20% train/validation split, stratified on the encoded class, fixed seed
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['Encoded Class'], random_state=42)

# Print sizes of training and validation sets for debugging purposes
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Random augmentations applied to training images only
augmentation_options = dict(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=50,
    zoom_range=0.5,
    shear_range=0.5,
    brightness_range=[0.6, 1.4],
    fill_mode='nearest',
)

# Both pipelines scale pixel values by 1/255; only the training one augments
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)
val_datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by both generators: file paths come from 'Image Path' and the
# integer labels from 'Encoded Class' are passed through unchanged ('raw')
flow_options = dict(
    x_col='Image Path',
    y_col='Encoded Class',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='raw',
)

# Training batches are shuffled; validation order is kept fixed
train_generator = train_datagen.flow_from_dataframe(dataframe=train_df, shuffle=True, **flow_options)
val_generator = val_datagen.flow_from_dataframe(dataframe=val_df, shuffle=False, **flow_options)

# ImageNet-pretrained ResNet-50 backbone without its classification head
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Mark the backbone as frozen; the loop further down sets trainable=True on
# every one of its layers again.
base_model.trainable = False

# Head stacked on the backbone output, applied in order:
# pooling -> batch normalization -> Dense(1024, ReLU) -> 50% dropout
x = base_model.output
head_layers = (
    GlobalAveragePooling2D(),
    BatchNormalization(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
)
for head_layer in head_layers:
    x = head_layer(x)

# Three-unit softmax output, one unit per class
predictions = Dense(3, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# Set trainable=True on every backbone layer (all of them, not only the last few)
for layer in base_model.layers:
    layer.trainable = True

# Adam with a small learning rate; labels are integer class ids, hence the
# sparse categorical cross-entropy loss
model.compile(
    optimizer=Adam(learning_rate=0.00001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Callbacks, both watching validation loss:
# - EarlyStopping: stop after 10 epochs without improvement and restore the best weights
# - ReduceLROnPlateau: multiply the learning rate by 0.2 after 3 stalled epochs, down to 1e-7
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-7)

# Step counts come from len(generator), i.e. the number of batches including a
# final partial one. Flooring with `n // BATCH_SIZE` gives 0 training steps
# when the training set is smaller than one batch, and skips the trailing
# partial batch otherwise -- which also made the validation loss monitored by
# the callbacks cover fewer images than model.evaluate(val_generator) does.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    validation_steps=len(val_generator),
    epochs=50,
    callbacks=[early_stopping, reduce_lr_on_plateau],
    verbose=2  # one log line per epoch instead of a progress bar
)

# Accuracy (left) and loss (right) per epoch, training vs. validation
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# One entry per panel: history keys and labels of its two curves, title, y-axis label
panels = [
    ('accuracy', 'Train Accuracy', 'val_accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'Train Loss', 'val_loss', 'Validation Loss', 'Model Loss', 'Loss'),
]

for ax, (train_key, train_label, val_key, val_label, title, y_label) in zip(axes, panels):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper left')

fig.tight_layout()
plt.show()

# Score the trained model on the validation generator; with the compiled
# metrics this returns the loss followed by the accuracy.
evaluation = model.evaluate(val_generator)
val_loss, val_accuracy = evaluation
print(
    f"Validation Loss: {val_loss}",
    f"Validation Accuracy: {val_accuracy * 100:.2f}%",
    sep="\n",
)

In [None]:
from sklearn.metrics import classification_report

# Predicted class = index of the largest softmax output for each validation image
class_probabilities = model.predict(val_generator)
y_pred = class_probabilities.argmax(axis=1)

# True integer labels as stored on the validation generator (created with shuffle=False)
y_true = val_generator.labels

# Per-class precision / recall / F1, labelled with the original class names
report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report)

In [None]:
#### Classification ####