In [34]:
import pandas as pd
import numpy as np
import os
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, random_split, WeightedRandomSampler, Subset
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torchvision.datasets import ImageFolder
import torch.optim as optim
import shutil
import torch.nn.functional as F

In [2]:
# Path to the original image directory
base_dir = "HistopathologyImages"

cancer_image_count = 0
benign_image_count = 0

# Traverse through patient directories
for patient_id in os.listdir(base_dir):
    patient_path = os.path.join(base_dir, patient_id)
    
    # Check if the path is a directory
    if os.path.isdir(patient_path):
        # Count benign files
        benign_dir = os.path.join(patient_path, '0')
        if os.path.isdir(benign_dir):
            benign_image_count += len(os.listdir(benign_dir))
        
        # Count cancerous files
        cancerous_dir = os.path.join(patient_path, '1')
        if os.path.isdir(cancerous_dir):
            cancer_image_count += len(os.listdir(cancerous_dir))

print(f"Number of benign image files: {benign_image_count}")
print(f"Number of cancerous image files: {cancer_image_count}")

Number of benign image files: 0
Number of cancerous image files: 0


In [5]:
# Function to move files to respective directories
def move_files(file_paths, labels, destination):
    for file_path, label in zip(file_paths, labels):
        label_dir = os.path.join(destination, label)
        os.makedirs(label_dir, exist_ok=True)
        shutil.move(file_path, label_dir)

# Output directory to store the split data
output_dir = 'HistopathologySplit'

In [4]:
# Create training, validation, and test directories
train_dir = os.path.join(output_dir, 'train')
val_dir = os.path.join(output_dir, 'val')
test_dir = os.path.join(output_dir, 'test')

# Initialize lists for image paths and labels
image_paths = []
labels = []

# Collect image file paths and labels
for patient_id in os.listdir(base_dir):
    patient_path = os.path.join(base_dir, patient_id)
    
    if os.path.isdir(patient_path):
        for label in ['0', '1']:
            label_dir = os.path.join(patient_path, label)
            if os.path.isdir(label_dir):
                for image_name in os.listdir(label_dir):
                    image_path = os.path.join(label_dir, image_name)
                    image_paths.append(image_path)
                    labels.append(label)

# Create a stratified train-validation-test split (20% for validation, 10% for test)
train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    image_paths, labels, test_size=0.3, stratify=labels, random_state=42
)

# Further split the temp set into validation and test (2/3 for validation, 1/3 for test)
val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths, temp_labels, test_size=1/3, stratify=temp_labels, random_state=42
)

# Move training, validation, and test files
move_files(train_paths, train_labels, train_dir)
move_files(val_paths, val_labels, val_dir)
move_files(test_paths, test_labels, test_dir)

print(f"Training, validation, and test datasets created at '{output_dir}'.")
print(f"Number of training images: {len(train_paths)}")
print(f"Number of validation images: {len(val_paths)}")
print(f"Number of test images: {len(test_paths)}")

Training, validation, and test datasets created at 'HistopathologySplit'.
Number of training images: 194266
Number of validation images: 55505
Number of test images: 27753


In [40]:
# Define image transformations for the training and validation datasets
train_transforms = transforms.Compose([transforms.RandomRotation(degrees=15),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.15, contrast=0.1, saturation=0.2, hue=0.05),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), ratio=(0.3, 3.3)),
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 5)),
    # transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])                                       
                                      ])

val_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [51]:
# Load the dataset with the defined transformations
train_dataset = datasets.ImageFolder(root=f'{output_dir}/train', transform=train_transforms)
val_dataset = datasets.ImageFolder(root=f'{output_dir}/val', transform=val_transforms)
test_dataset = datasets.ImageFolder(root=f'{output_dir}/test', transform=test_transforms)

# Extract class labels from the dataset
targets = train_dataset.targets

# Calculate class weights, which are the inverse of class frequencies. Classes with fewer samples will get assigned a higher 
# weight (ensuring that the minority class receives a higher weight, making it more likely to be sampled during training).
class_counts = np.bincount(targets)  # Count the number of samples per class
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

sample_weights = [class_weights_tensor[label].item() for label in targets]  # Use train_labels for correct indexing

# Create the WeightedRandomSampler
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Create DataLoaders with the sampler for training
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler, num_workers=4)

# Create DataLoader for validation and test without a sampler
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Print sample stats (optional)
print(f"Number of training samples: {len(train_loader.dataset)}")
print(f"Number of validation samples: {len(val_loader.dataset)}")
print(f"Number of test samples: {len(test_loader.dataset)}")

Number of training samples: 194266
Number of validation samples: 55505
Number of test samples: 27753


  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)


In [52]:
# Test if the loader works
images, labels = next(iter(train_loader))
print(images.shape)  # Check tensor shape

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/abey/.local/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/abey/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/abey/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/abey/.local/lib/python3.10/site-packages/torchvision/datasets/folder.py", line 247, in __getitem__
    sample = self.transform(sample)
  File "/home/abey/.local/lib/python3.10/site-packages/torchvision/transforms/transforms.py", line 95, in __call__
    img = t(img)
  File "/home/abey/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/abey/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/abey/.local/lib/python3.10/site-packages/torchvision/transforms/transforms.py", line 1731, in forward
    if value is not None and not (len(value) in (1, img.shape[-3])):
  File "/usr/lib/python3/dist-packages/PIL/Image.py", line 519, in __getattr__
    raise AttributeError(name)
AttributeError: shape. Did you mean: 'save'?


In [43]:
class CNN(nn.Module):
    def __init__(self, num_classes=2):
        super(CNN, self).__init__()

        # First Convolutional Block for 3 input channels RGB
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.bn1 = nn.BatchNorm2d(32) #Normalizes the convolution output to have a mean close to 0 and a standard deviation close to 1.
        self.dropout1 = nn.Dropout2d(0.2)  # Apply dropout on feature maps

        # Second Convolutional Block
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.dropout2 = nn.Dropout2d(0.2)  # Apply dropout on feature maps

        # Third Convolutional Block
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.bn3 = nn.BatchNorm2d(128)
        self.dropout3 = nn.Dropout2d(0.2)  # Apply dropout on feature maps

        # Fully Connected Layer Block. Second FC layer will output the number of classes.
        self.fc1 = nn.Linear(128 * 6 * 6, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout_fc = nn.Dropout(0.5)  # Apply dropout after fully connected layer

    def forward(self, x):
        # Apply the first convolutional block (Conv -> BatchNorm -> ReLU -> Pool -> Dropout)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        x = self.dropout1(x)  # Apply dropout after convolutional layer

        # Apply the second convolutional block
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)
        x = self.dropout2(x)  # Apply dropout after convolutional layer

        # Apply the third convolutional block
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool3(x)
        x = self.dropout3(x)  # Apply dropout after convolutional layer

        # Flatten the output of the convolutional layers
        x = x.view(-1, 128 * 6 * 6)

        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout_fc(x)  # Apply dropout after fully connected layer
        x = self.fc2(x)

        return x


In [44]:
# Instantiate model, define loss function and optimizer
model = CNN()
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001) # Add L2 regularization to the optimizer

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == labels).sum().item()
            total_train += labels.size(0)

        train_accuracy = 100 * correct_train / total_train
        avg_train_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%")

        # Validation step
        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_val += (predicted == labels).sum().item()
                total_val += labels.size(0)

        val_accuracy = 100 * correct_val / total_val
        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.2f}%\n")

        # Save the model if validation loss has improved
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)
            print(f"Model saved with validation loss: {avg_val_loss:.4f}")

In [45]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/abey/.local/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/abey/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/abey/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/abey/.local/lib/python3.10/site-packages/torchvision/datasets/folder.py", line 247, in __getitem__
    sample = self.transform(sample)
  File "/home/abey/.local/lib/python3.10/site-packages/torchvision/transforms/transforms.py", line 95, in __call__
    img = t(img)
  File "/home/abey/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/abey/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/abey/.local/lib/python3.10/site-packages/torchvision/transforms/transforms.py", line 1731, in forward
    if value is not None and not (len(value) in (1, img.shape[-3])):
  File "/usr/lib/python3/dist-packages/PIL/Image.py", line 519, in __getattr__
    raise AttributeError(name)
AttributeError: shape. Did you mean: 'save'?


In [29]:
# Simplified dataset and transform
# transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
# ])

train_transforms = transforms.Compose([transforms.RandomRotation(degrees=15),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.15, contrast=0.1, saturation=0.2, hue=0.05),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), ratio=(0.3, 3.3)),
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 5)),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])                                       
                                      ])

# Temporary use of a smaller dataset for debugging
train_dataset = datasets.ImageFolder(root=f'{output_dir}/train', transform=transform)
# Check sampler by adding it back after confirming transformations are working
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler, num_workers=4)

# Test if the loader works without the sampler
images, labels = next(iter(train_loader))
print(images.shape)  # Check tensor shape

torch.Size([32, 3, 50, 50])
