# Data Loading

In [1]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Dataset root (Kaggle input directory) and the class sub-folders to read from it
data_dir = '/kaggle/input/cell-images-for-detecting-malaria/cell_images'
class_labels = ['Parasitized', 'Uninfected']

# Image size fed to the generators, batch size, and the fraction of the data
# used for training (the remainder becomes the held-out split)
img_height = 128
img_width = 128
batch_size = 32
train_split = 0.8

In [2]:
# Seed passed to flow_from_directory so shuffling and random augmentation are
# repeatable between runs instead of depending on the global RNG state.
random_seed = 42

# Held-out fraction shared by BOTH generators: they must agree, otherwise the
# 'training' and 'validation' subsets would overlap or leave gaps.
# round() strips float noise (1 - 0.8 evaluates to 0.19999999999999996).
val_split = round(1 - train_split, 10)

# ImageDataGenerator for training with augmentation
train_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=val_split
)

# ImageDataGenerator for the held-out subset: rescaling only, no augmentation
test_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    validation_split=val_split
)

# Arguments common to both subsets, kept in one place so they cannot drift apart
flow_kwargs = dict(
    directory=data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
    classes=class_labels,  # explicit order fixes the label index of each class
    seed=random_seed,
)

# Training subset: augmented and shuffled
train_generator = train_datagen.flow_from_directory(
    subset='training',
    shuffle=True,
    **flow_kwargs
)

# Held-out ('validation') subset: not shuffled, so sample order stays stable
test_generator = test_datagen.flow_from_directory(
    subset='validation',
    shuffle=False,
    **flow_kwargs
)

Found 22048 images belonging to 2 classes.
Found 5510 images belonging to 2 classes.


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

# Folder of each class, in the same order as class_labels
class_dirs = [os.path.join(data_dir, class_name) for class_name in class_labels]

# Per-class sample counts taken from the generators themselves, so the chart
# shows the split that was actually loaded. Re-deriving it from
# len(os.listdir(...)) * train_split would count every file in the folder
# (not only the images the generators found) and applies its own rounding,
# which need not match the generators' split.
# Index i of each list corresponds to class_labels[i] because the generators
# were created with classes=class_labels.
n_classes = len(class_labels)
train_counts = np.bincount(train_generator.classes, minlength=n_classes).tolist()
test_counts = np.bincount(test_generator.classes, minlength=n_classes).tolist()

# Grouped bar chart: for every class, one bar for training and one for test
x = np.arange(len(class_labels))  # centre position of each class group
width = 0.35  # width of a single bar

fig, ax = plt.subplots(figsize=(10, 6))
half_width = width / 2  # offset that places the two bars side by side
rects1 = ax.bar(x - half_width, train_counts, width, label='Training')
rects2 = ax.bar(x + half_width, test_counts, width, label='Test')

# Axis labels and title in one call, then class names on the ticks and a legend
ax.set(
    xlabel='Class',
    ylabel='Count',
    title='Class Distribution in Training and Test Sets',
)
ax.set_xticks(x)
ax.set_xticklabels(class_labels)
ax.legend()

# Adding data labels
def add_labels(rects, ax=None):
    """Write each bar's height (truncated to an int) just above the bar.

    Parameters
    ----------
    rects : iterable of bar patches
        Bars to annotate, e.g. the container returned by ``Axes.bar``.
    ax : optional
        Axes to annotate on. When omitted, each bar is annotated on the axes
        it belongs to (``rect.axes``), so the function does not rely on a
        global ``ax`` that a later plotting cell may have rebound.
    """
    for rect in rects:
        target_ax = rect.axes if ax is None else ax
        height = rect.get_height()
        target_ax.annotate(
            f'{int(height)}',
            # anchor at the top-centre of the bar
            xy=(rect.get_x() + rect.get_width() / 2, height),
            # nudge the text 3 points upward so it does not touch the bar
            xytext=(0, 3),
            textcoords="offset points",
            ha='center', va='bottom',
        )

# Label the training bars first, then the test bars, and render the figure
for bar_group in (rects1, rects2):
    add_labels(bar_group)

plt.show()

In [None]:
from PIL import Image
import cv2
import matplotlib.pyplot as plt

# Upper bound on the number of images inspected per class
max_images_per_class = 1000

# Per-class accumulators: (width, height) tuples and flattened grayscale values
image_data = {class_name: {'sizes': [], 'pixel_intensities': []}
              for class_name in class_labels}

# Iterate through the dataset once
for class_name in class_labels:
    class_path = os.path.join(data_dir, class_name)

    # Keep only .png files BEFORE truncating, so non-image files in the folder
    # do not use up slots in the sample. os.listdir order is arbitrary, so the
    # sample is "some N images", not a specific N.
    png_names = [name for name in os.listdir(class_path) if name.endswith(".png")]

    for img_name in png_names[:max_images_per_class]:
        img_path = os.path.join(class_path, img_name)

        # Read as grayscale with OpenCV. imread signals failure by returning
        # None rather than raising, so skip unreadable files before recording
        # anything for them (keeps 'sizes' and intensities in step).
        img_cv = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img_cv is None:
            continue

        # Get image size using PIL; the context manager closes the file handle
        with Image.open(img_path) as img:
            image_data[class_name]['sizes'].append(img.size)

        # tolist() converts the pixels to plain Python ints in one step instead
        # of appending one NumPy scalar object per pixel to the list
        image_data[class_name]['pixel_intensities'].extend(img_cv.ravel().tolist())