In [5]:
import pandas as pd
import numpy as np
import os
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, random_split, WeightedRandomSampler, Subset
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torchvision.datasets import ImageFolder
import torch.optim as optim
import shutil

In [4]:
# Root folder of the raw dataset: one sub-directory per patient, each of which
# may contain a '0' (benign) and/or a '1' (cancerous) folder of image files.
base_dir = "HistopathologyImages"

# Keep only the entries of base_dir that are directories (one per patient).
patient_dirs = [
    candidate
    for candidate in (os.path.join(base_dir, entry) for entry in os.listdir(base_dir))
    if os.path.isdir(candidate)
]


def _count_entries(class_folder):
    """Total number of directory entries in every existing `<patient>/<class_folder>`."""
    total = 0
    for patient_dir in patient_dirs:
        class_dir = os.path.join(patient_dir, class_folder)
        # A patient may lack one of the two class folders; skip it in that case.
        if os.path.isdir(class_dir):
            total += len(os.listdir(class_dir))
    return total


# Every directory entry is counted; nothing filters on file extension.
benign_image_count = _count_entries('0')
cancer_image_count = _count_entries('1')

print(f"Number of benign image files: {benign_image_count}")
print(f"Number of cancerous image files: {cancer_image_count}")

Number of benign image files: 198738
Number of cancerous image files: 78786


In [6]:
# Function to move files to respective directories
def move_files(file_paths, labels, destination):
    """Move each file into ``destination/<label>/``, creating label folders on demand.

    Args:
        file_paths: Paths of the files to move.
        labels: One label per entry of ``file_paths``, in the same order. Each
            label names the sub-directory of ``destination`` that receives the
            file; non-string labels (e.g. ints) are converted with ``str()``.
        destination: Root directory under which the per-label folders live.

    Raises:
        ValueError: If ``file_paths`` and ``labels`` differ in length. The check
            runs before anything is moved, so no file is touched in that case.

    Note:
        Files are moved, not copied, so the source locations no longer exist
        afterwards. Any error raised by ``shutil.move`` (for instance when the
        target folder already holds an entry with the same name) propagates to
        the caller unchanged.
    """
    # Materialise both inputs so their lengths can be compared up front;
    # plain zip() would silently drop the unmatched tail and leave files behind.
    file_paths = list(file_paths)
    labels = list(labels)
    if len(file_paths) != len(labels):
        raise ValueError(
            f"file_paths and labels must have the same length "
            f"(got {len(file_paths)} and {len(labels)})"
        )

    # Create each label directory once instead of once per file.
    label_dirs = {}
    for file_path, label in zip(file_paths, labels):
        label_dir = label_dirs.get(label)
        if label_dir is None:
            label_dir = os.path.join(destination, str(label))
            os.makedirs(label_dir, exist_ok=True)
            label_dirs[label] = label_dir
        shutil.move(file_path, label_dir)

In [7]:
# Move files from original directory to new one with separate cancerous and benign subdirectories
output_dir = 'HistopathologySplit'  # Directory to store the split data

# Create training and validation directories
train_dir = os.path.join(output_dir, 'train')
val_dir = os.path.join(output_dir, 'val')


def _list_labelled_images(parent_dir):
    """Return (paths, labels) for every entry found in parent_dir/'0' and parent_dir/'1'."""
    found_paths, found_labels = [], []
    for label in ['0', '1']:
        label_dir = os.path.join(parent_dir, label)
        if os.path.isdir(label_dir):
            for image_name in os.listdir(label_dir):
                found_paths.append(os.path.join(label_dir, image_name))
                found_labels.append(label)
    return found_paths, found_labels


# Initialize lists for image paths and labels
image_paths = []
labels = []

# Collect image file paths and labels from every patient directory
for patient_id in os.listdir(base_dir):
    patient_path = os.path.join(base_dir, patient_id)

    if os.path.isdir(patient_path):
        patient_paths, patient_labels = _list_labelled_images(patient_path)
        image_paths.extend(patient_paths)
        labels.extend(patient_labels)

if image_paths:
    # Create a stratified train-test split with 20% being used for validation data.
    # NOTE(review): the split is made per image, not per patient, so images from
    # one patient can end up in both train and val. If patient-level
    # generalisation matters, consider a group-aware split — confirm intent.
    train_paths, val_paths, train_labels, val_labels = train_test_split(
        image_paths, labels, test_size=0.2, stratify=labels, random_state=42
    )

    # Move training and validation files. After this, train_paths/val_paths
    # still hold the pre-move source locations.
    move_files(train_paths, train_labels, train_dir)
    move_files(val_paths, val_labels, val_dir)

    print(f"Training and validation datasets created at '{output_dir}'.")
else:
    # The move above empties base_dir, so on a re-run (e.g. Restart & Run All)
    # there is nothing left to split and train_test_split would fail on empty
    # input. Fall back to the split already on disk instead of crashing.
    train_paths, train_labels = _list_labelled_images(train_dir)
    val_paths, val_labels = _list_labelled_images(val_dir)
    if not train_paths and not val_paths:
        raise FileNotFoundError(
            f"No images found under '{base_dir}' and no existing split at '{output_dir}'."
        )
    print(f"No unsplit images left in '{base_dir}'; reusing existing split at '{output_dir}'.")

print(f"Number of training images: {len(train_paths)}")
print(f"Number of validation images: {len(val_paths)}")

Training and validation datasets created at 'HistopathologySplit'.
Number of training images: 222019
Number of validation images: 55505


In [10]:
# Define image transformations for the training and validation datasets.
# NOTE(review): neither pipeline resizes or crops, so batching relies on all
# images having the same size — confirm this holds for the dataset.
train_transforms = transforms.Compose([
    # Augmentations applied to the image as loaded, before tensor conversion
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.15, contrast=0.1, saturation=0.2, hue=0.05),
    transforms.RandomRotation(degrees=15),
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 5)),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    # Convert to a tensor and normalise per channel.
    # NOTE(review): these mean/std values look like the usual ImageNet
    # statistics — confirm they are the intended ones for this data.
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    # RandomErasing only works on tensors, so it must come after ToTensor.
    # Placed after Normalize, its default fill value of 0 equals the
    # per-channel mean in normalised space.
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), ratio=(0.3, 3.3)),
])

# Validation uses no augmentation: tensor conversion and the same normalisation only.
val_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [13]:
# Load the dataset with the defined transformations
train_dataset = datasets.ImageFolder(root=f'{output_dir}/train', transform=train_transforms)
val_dataset = datasets.ImageFolder(root=f'{output_dir}/val', transform=val_transforms)

# Class index of every training sample, in dataset order
targets = train_dataset.targets

# Calculate class weights as the inverse of class frequencies, so samples of the
# less frequent class get a higher weight and are drawn more often by the sampler.
class_counts = np.bincount(targets)  # Count the number of samples per class
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)

# Look up every sample's weight with one tensor index rather than building a
# Python list of one 0-d tensor per sample; the resulting values are the same.
sample_weights = class_weights[torch.as_tensor(targets, dtype=torch.long)]

# Create the WeightedRandomSampler from the per-sample weights. It draws
# len(sample_weights) indices per epoch with replacement, so within one epoch
# some images can repeat and others can be skipped.
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Create DataLoaders with the sampler for training
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler, num_workers=4)

# Create DataLoader for validation without a sampler (fixed order, no re-weighting)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# Print sample stats (optional)
print(f"Number of training samples: {len(train_loader.dataset)}")
print(f"Number of validation samples: {len(val_loader.dataset)}")

Number of training samples: 222019
Number of validation samples: 55505
