### Importing the libraries 

In [1]:
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
import warnings
from PIL import Image
import cv2
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
from sklearn.utils.class_weight import compute_class_weight

ImportError: cannot import name 'VisibleDeprecationWarning' from 'numpy' (unknown location)

### Reading both our train and test data & EDA

In [None]:
# Locations of the dataset root and of its train/test split folders
dataset_dir = 'dataset'
train_dir, test_dir = (
    os.path.join(dataset_dir, split) for split in ('train', 'test')
)

# Function to display sample images
def display_samples(image_dir, label, num_samples=5):
    """Show the first ``num_samples`` images of one class in a single row.

    Args:
        image_dir: Directory that contains one sub-folder per class.
        label: Name of the class sub-folder to take the samples from.
        num_samples: Maximum number of images to show; also the number of
            columns reserved in the figure.
    """
    folder = os.path.join(image_dir, label)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # displayed samples the same on every run.
    images = sorted(os.listdir(folder))[:num_samples]
    plt.figure(figsize=(15, 5))
    for i, img_name in enumerate(images):
        img_path = os.path.join(folder, img_name)
        plt.subplot(1, num_samples, i + 1)
        # Image.open keeps the underlying file open; the context manager
        # releases the handle once imshow has read the pixels.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.title(f"{label.capitalize()} Sample {i + 1}")
        plt.axis('off')
    plt.show()


# Display samples from training set
display_samples(train_dir, 'benign')
display_samples(train_dir, 'malignant')

Looking at these benign samples, we can already observe some challenges in the dataset:

1. Variability in Skin Tone & Lighting

Some images appear pinkish, while others have a neutral skin tone. This variation in color may affect model performance if not handled correctly.

2. Presence of Artifacts

In some images, there are hairs, scratches, and other marks that might not be related to the lesion itself. These could introduce noise into the model.

3. Uneven Focus & Resolution

Some images seem sharper than others, which may impact feature extraction.

#### Checking resolution consistency

In [None]:
# Define function to check image resolutions
def check_image_resolutions(directory):
    """Collect the width and height of every image in both class folders.

    Args:
        directory: Directory containing the 'benign' and 'malignant'
            sub-folders.

    Returns:
        A ``(widths, heights)`` tuple of parallel lists with one entry per
        image, benign images first.
    """
    widths, heights = [], []

    for label in ['benign', 'malignant']:
        folder = os.path.join(directory, label)
        for img_name in os.listdir(folder):
            img_path = os.path.join(folder, img_name)
            # Close each file as soon as its size has been read, so that
            # scanning a large folder does not accumulate open file handles.
            with Image.open(img_path) as img:
                widths.append(img.width)
                heights.append(img.height)

    return widths, heights

# Get image resolutions from train dataset
widths, heights = check_image_resolutions(train_dir)

# Plot distribution of image resolutions
plt.figure(figsize=(12, 5))
sns.scatterplot(x=widths, y=heights, alpha=0.5)
plt.xlabel("Width")
plt.ylabel("Height")
plt.title("Image Resolution Distribution")
plt.show()

The single clustered point in the scatter plot indicates that there is no significant variation in image sizes, meaning we don’t need extra resizing adjustments.

#### Class Distribution (Image Counts per Class)

Finding the distribution of benign and malignant cases helps identify potential class imbalances, which can influence model performance.

In [None]:
# Function to count images in each category
def count_images(directory):
    """Return how many entries each class folder under ``directory`` holds.

    Args:
        directory: Directory containing the 'benign' and 'malignant'
            sub-folders.

    Returns:
        A dict mapping 'benign' and 'malignant' (in that order) to the
        number of entries listed in the corresponding sub-folder.
    """
    totals = {}
    for class_name in ('benign', 'malignant'):
        class_folder = os.path.join(directory, class_name)
        totals[class_name] = len(os.listdir(class_folder))
    return totals

# Tally the per-class image counts of the train and test splits
train_counts, test_counts = map(count_images, (train_dir, test_dir))

# Report the tallies, one line per split
print(f"Training Set - Benign: {train_counts['benign']}, Malignant: {train_counts['malignant']}")
print(f"Testing Set - Benign: {test_counts['benign']}, Malignant: {test_counts['malignant']}")

While the testing set is balanced, the training set exhibits a slight imbalance, with benign cases outnumbering malignant ones. Although this imbalance isn't severe, it's essential to address it to ensure robust model performance.

#### Visualizing the class distribution provides a clearer picture of any imbalances.

In [None]:
# Function to plot class distribution
def plot_class_distribution(counts, title):
    """Draw a bar chart with one bar per class.

    Args:
        counts: Mapping of class name to number of images.
        title: Title shown above the chart.
    """
    class_names = list(counts)
    image_totals = [counts[name] for name in class_names]

    plt.figure(figsize=(8, 5))
    plt.bar(class_names, image_totals, color=['blue', 'red'])
    plt.title(title)
    plt.xlabel('Category')
    plt.ylabel('Number of Images')
    plt.show()

# One chart per split: training first, then testing
plot_class_distribution(train_counts, 'Training Set Class Distribution')
plot_class_distribution(test_counts, 'Testing Set Class Distribution')

#### Addressing class imbalance
To further enhance model performance and address class imbalance, consider augmenting both classes and incorporating class weights during training. These strategies collectively contribute to a more robust and generalizable model.

In [None]:
# Augmentation pipeline shared by both class-specific generators
data_gen = ImageDataGenerator(
    rescale=1./255,            # multiply every pixel value by 1/255
    # Random geometric perturbations
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)


def _flow_single_class(class_name):
    """Return an augmented batch iterator restricted to one class folder of train_dir."""
    return data_gen.flow_from_directory(
        train_dir,
        classes=[class_name],
        target_size=(224, 224),
        batch_size=32,
        class_mode='binary'
    )


# One generator per class, so each class can be sampled independently
benign_gen = _flow_single_class('benign')
malignant_gen = _flow_single_class('malignant')

In [None]:
# Combine Generators
def combined_generator(gen1, gen2, labels=None):
    """Yield batches built by stacking one batch from each of two generators.

    Args:
        gen1: Iterator yielding ``(images, labels)`` batches.
        gen2: Iterator yielding ``(images, labels)`` batches.
        labels: Optional pair ``(label_for_gen1, label_for_gen2)``. When
            given, every sample coming from ``gen1`` / ``gen2`` is relabelled
            with the matching value. The default ``None`` keeps whatever
            labels the generators produced.

    Yields:
        ``(images, labels)`` tuples in which the samples of ``gen1`` come
        first, followed by those of ``gen2`` (concatenated along axis 0).
    """
    while True:
        try:
            # The built-in next() works for any iterator, whereas a .next()
            # method is not available on every Keras iterator version.
            batch1 = next(gen1)
            batch2 = next(gen2)
        except StopIteration:
            # A finite source ran dry: stop cleanly instead of letting
            # StopIteration surface as a RuntimeError inside this generator.
            return

        labels1, labels2 = batch1[1], batch2[1]
        if labels is not None:
            # full_like keeps the shape and dtype of the original label arrays.
            labels1 = np.full_like(labels1, labels[0])
            labels2 = np.full_like(labels2, labels[1])

        images = np.concatenate((batch1[0], batch2[0]), axis=0)
        batch_labels = np.concatenate((labels1, labels2), axis=0)
        yield images, batch_labels


# Each source generator was created with a one-element `classes` list, so it
# labels every image with class index 0. Relabel explicitly so that
# benign = 0 and malignant = 1 in the combined batches.
train_generator = combined_generator(benign_gen, malignant_gen, labels=(0, 1))

# Adjust Class Weights

# Define class labels; the position in this list is the class index
# (benign = 0, malignant = 1)
class_labels = ['benign', 'malignant']
class_indices = np.arange(len(class_labels))

# Expand the per-class training counts into one integer label per image.
# Passing just the two class names as `y` would describe one sample per
# class and always produce weights of 1.0, ignoring the real imbalance.
train_labels = np.repeat(
    class_indices,
    [train_counts[name] for name in class_labels]
)

# Compute class weights (inversely proportional to class frequency)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_indices,
    y=train_labels
)

# Convert to dictionary keyed by class index
class_weights = dict(enumerate(class_weights))

In [None]:
# Pass the class_weights dictionary through the class_weight argument of model.fit during training (fit_generator is deprecated in TensorFlow 2.x; fit accepts generators directly).