# Data Loading

In [1]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Dataset root; the class sub-folders listed below are resolved relative to it
data_dir = '/kaggle/input/cell-images-for-detecting-malaria/cell_images'
class_labels = ['Parasitized', 'Uninfected']

# Loader settings: square target resolution, mini-batch size and the fraction
# of the data kept for training (the remainder is held out)
img_height = img_width = 128
batch_size = 32
train_split = 0.8

In [2]:
# Held-out fraction, shared by both generators so the 'training' and
# 'validation' subsets are cut with exactly the same value.
holdout_fraction = 1 - train_split

# Training pipeline: pixel values multiplied by 1/255 plus random augmentation
train_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=holdout_fraction,
)

# Held-out pipeline: same rescaling, no augmentation
test_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    validation_split=holdout_fraction,
)

# Arguments that are identical for both directory iterators
flow_kwargs = dict(
    directory=data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
    classes=class_labels,
)

# Augmented, shuffled batches for fitting
train_generator = train_datagen.flow_from_directory(
    subset='training', shuffle=True, **flow_kwargs
)

# Un-augmented batches in a fixed order for evaluation
test_generator = test_datagen.flow_from_directory(
    subset='validation', shuffle=False, **flow_kwargs
)

Found 22048 images belonging to 2 classes.
Found 5510 images belonging to 2 classes.


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

# Class folders, derived from the shared label list instead of a second
# hard-coded copy of the class names.
class_dirs = [os.path.join(data_dir, class_name) for class_name in class_labels]

# Take the per-class counts from the generators themselves so the chart shows
# the split that is actually fed to the model. Counting os.listdir() entries
# would include any non-image file in the folders, and int(total * train_split)
# is not guaranteed to round the way the generators do.
# `classes` holds one integer class index per image; the indices are expected
# to follow the order of the `classes=class_labels` argument given to
# flow_from_directory.
n_classes = len(class_labels)
train_counts = np.bincount(train_generator.classes, minlength=n_classes).tolist()
test_counts = np.bincount(test_generator.classes, minlength=n_classes).tolist()

# Grouped bar chart: one pair of bars (training / test) per class
x = np.arange(n_classes)  # Label locations
width = 0.35  # Width of bars

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, train_counts, width, label='Training')
rects2 = ax.bar(x + width/2, test_counts, width, label='Test')

# Labels, title and tick names
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Distribution in Training and Test Sets')
ax.set_xticks(x)
ax.set_xticklabels(class_labels)
ax.legend()


def add_labels(rects):
    """Write each bar's height as an integer just above the bar on `ax`."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{int(height)}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points above the bar top
                    textcoords="offset points",
                    ha='center', va='bottom')


add_labels(rects1)
add_labels(rects2)

plt.show()

In [None]:
from PIL import Image
import cv2
import matplotlib.pyplot as plt

# Maximum number of images sampled per class for the size / intensity statistics
stats_sample_size = 1000

# Per-class accumulators: (width, height) tuples and flattened grayscale values
image_data = {class_name: {'sizes': [], 'pixel_intensities': []}
              for class_name in class_labels}

# Single pass over the sampled files of every class
for class_name in class_labels:
    class_path = os.path.join(data_dir, class_name)

    # Filter to .png BEFORE truncating so non-image entries do not use up part
    # of the sample, and sort because os.listdir() returns entries in arbitrary
    # order (keeps the sample identical between runs).
    png_names = sorted(f for f in os.listdir(class_path) if f.endswith(".png"))

    for img_name in png_names[:stats_sample_size]:
        img_path = os.path.join(class_path, img_name)

        # Grayscale pixel values via OpenCV
        img_cv = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img_cv is None:
            # cv2.imread reports an unreadable file by returning None rather
            # than raising; skip it instead of crashing on .ravel()
            continue

        # (width, height) via PIL; the context manager closes the file handle
        with Image.open(img_path) as img:
            image_data[class_name]['sizes'].append(img.size)

        # tolist() yields plain Python ints, avoiding one NumPy scalar object
        # per pixel that extending with the array itself would create
        image_data[class_name]['pixel_intensities'].extend(img_cv.ravel().tolist())

In [None]:
# Scatter of (width, height) for every sampled image, one series per class
fig, ax = plt.subplots(figsize=(8, 6))
for label, stats in image_data.items():
    # 'sizes' is a list of (width, height) pairs; transpose into two sequences
    xs, ys = zip(*stats['sizes'])
    ax.scatter(xs, ys, alpha=0.5, label=label)
ax.set_title('Image Dimensions Distribution')
ax.set_xlabel('Width')
ax.set_ylabel('Height')
ax.legend()
plt.show()

In [None]:
# Overlaid grayscale-intensity histograms, one per class
fig, ax = plt.subplots(figsize=(10, 6))
for label, stats in image_data.items():
    ax.hist(stats['pixel_intensities'], bins=50, alpha=0.6, label=label)
ax.set(
    title='Pixel Intensity Distribution',
    xlabel='Pixel Intensity',
    ylabel='Frequency',
)
ax.legend()
plt.show()

In [None]:
# Maximum number of images sampled per class for the colour statistics
color_sample_size = 500

# (dict key, index into the array returned by cv2.imread, bar colour, legend text).
# cv2.imread returns channels in B, G, R order, hence red is index 2.
channel_specs = (
    ('R', 2, 'red', 'Red Channel'),
    ('G', 1, 'green', 'Green Channel'),
    ('B', 0, 'blue', 'Blue Channel'),
)

# Per-class, per-channel lists of pixel values
color_channels = {class_name: {'R': [], 'G': [], 'B': []}
                  for class_name in class_labels}

for class_name in class_labels:
    class_path = os.path.join(data_dir, class_name)

    # Filter to .png BEFORE truncating so non-image entries do not use up part
    # of the sample; sort because os.listdir() order is arbitrary.
    png_names = sorted(f for f in os.listdir(class_path) if f.endswith(".png"))

    for img_name in png_names[:color_sample_size]:
        img = cv2.imread(os.path.join(class_path, img_name))
        if img is None:
            # Unreadable file: cv2.imread returns None instead of raising
            continue
        for key, bgr_index, _, _ in channel_specs:
            # tolist() gives plain ints instead of one NumPy scalar per pixel
            color_channels[class_name][key].extend(img[:, :, bgr_index].ravel().tolist())

# Plot the RGB distributions: one panel per class (however many there are).
# squeeze=False keeps `axes` two-dimensional even with a single class.
fig, axes = plt.subplots(1, len(color_channels), figsize=(15, 5), squeeze=False)
for ax, (class_name, channels) in zip(axes[0], color_channels.items()):
    for key, _, bar_color, legend_text in channel_specs:
        ax.hist(channels[key], bins=50, alpha=0.5, color=bar_color, label=legend_text)
    ax.set_title(f'Color Channel Distribution - {class_name}')
    ax.set_xlabel('Pixel Intensity')
    ax.set_ylabel('Frequency')
    ax.legend()
fig.tight_layout()
plt.show()


In [None]:
import os
import matplotlib.pyplot as plt
from PIL import Image

def plot_class_samples(data_dir, class_labels, num_samples=5):
    """Display up to ``num_samples`` images from each class in a grid.

    The grid has one row per class and ``num_samples`` columns; the first
    image of each row carries the class name as its title.

    Args:
        data_dir: Directory containing one sub-folder per entry of ``class_labels``.
        class_labels: Names of the class sub-folders, in row order.
        num_samples: Number of columns, i.e. images shown per class. Classes
            with fewer matching files leave the remaining cells blank.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    image_exts = ('.png', '.jpg', '.jpeg')

    # squeeze=False keeps `axes` two-dimensional even for a single class or a
    # single sample, so axes[i, j] is always a valid index.
    fig, axes = plt.subplots(len(class_labels), num_samples,
                             figsize=(15, len(class_labels) * 3),
                             squeeze=False)
    fig.suptitle("Sample Images from Each Class", fontsize=16, y=0.92)

    # Hide every frame up front so cells that receive no image stay blank
    # instead of showing empty tick boxes.
    for ax in axes.ravel():
        ax.axis('off')

    for i, class_name in enumerate(class_labels):
        class_path = os.path.join(data_dir, class_name)
        # Case-insensitive extension match; sorted because os.listdir() order
        # is arbitrary and the same samples should appear on every run.
        image_files = sorted(
            f for f in os.listdir(class_path) if f.lower().endswith(image_exts)
        )[:num_samples]

        for j, img_file in enumerate(image_files):
            ax = axes[i, j]
            # The context manager closes the file once imshow has taken the pixels
            with Image.open(os.path.join(class_path, img_file)) as image:
                ax.imshow(image)
            ax.set_title(class_name if j == 0 else "")

    plt.tight_layout()
    plt.show()

plot_class_samples(data_dir, class_labels, num_samples=5)

# CNN Modelling

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Build the CNN layer by layer: three conv + max-pool stages with 32, 64 and
# 128 filters, followed by a dense classifier head.
model = Sequential()

# First stage also declares the input shape (height, width, 3 channels)
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(img_height, img_width, 3)))
model.add(MaxPooling2D((2, 2)))

# Remaining stages differ only in their filter count
for n_filters in (64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))

# Classifier head: flatten, one hidden layer with dropout, single sigmoid unit
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # binary classification output

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer table
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Draw the layer graph with output shapes and layer names and write it to a
# PNG file; expand_nested=False leaves any nested models unexpanded.
architecture_path = 'cnn_model_architecture.png'
plot_model(model, to_file=architecture_path, show_shapes=True,
           show_layer_names=True, expand_nested=False)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# File that receives the best model seen during training
checkpoint_path = "best_model_cnn.keras"

# Both callbacks watch validation loss, treat lower as better and log what they do
monitor_kwargs = {'monitor': 'val_loss', 'mode': 'min', 'verbose': 1}

# Overwrite the checkpoint only when the monitored value improves
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
    **monitor_kwargs,
)

# Stop after 5 consecutive epochs without improvement
early_stopping_callback = EarlyStopping(patience=5, **monitor_kwargs)

In [None]:
# Upper bound on the number of training epochs; the EarlyStopping callback may
# end training sooner.
# NOTE(review): `epochs` was passed to fit() without being assigned in any cell
# visible above, which raises NameError on a fresh kernel. 30 is a placeholder
# cap — confirm or adjust the intended value.
epochs = 30

# Fit on the augmented training batches and validate on the held-out batches;
# the callbacks save the best model and stop when validation loss stalls.
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=test_generator,
    callbacks=[checkpoint_callback, early_stopping_callback]
)