In [4]:
import torch
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
# Define transforms for data preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [28]:
class ISIC2017Dataset(Dataset):
    def __init__(self, data_dir, labels_file, transform=None):
        self.transform = transform
        self.data_dir = data_dir
        self.labels_file = labels_file

        self.data = []
        self.classes = []
        self.missing_files = []

        self._load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = os.path.join(self.data_dir, self.data[idx][0])
        try:
            image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            # If file not found, add to missing_files list and skip the image
            self.missing_files.append(image_path)
            return None, None

        label = self.data[idx][1]

        if self.transform:
            image = self.transform(image)

        return image, label

    def _load_data(self):
        df = pd.read_csv(self.labels_file)

        for idx, row in df.iterrows():
            image_name = row["image_id"]
            melanoma_label = row["melanoma"]
            seborrheic_keratosis_label = row["seborrheic_keratosis"]
            nevus_label = row.get("nevus", -1)  # Assign a default value if "nevus" column is missing

            # Modify the labels to create three classes: melanoma, nevus, seborrheic keratosis
            if melanoma_label == 1:
                label = 0  # Melanoma
            elif seborrheic_keratosis_label == 1:
                label = 2  # Seborrheic Keratosis
            else:
                label = 1  # Nevus

            # Append the image name and label to the data list
            self.data.append((image_name, label))

            # Add the label to the classes list if not already present
            if label not in self.classes:
                self.classes.append(label)

In [29]:
def train(model, criterion, optimizer, train_loader, valid_loader, num_epochs=50, patience=5):
    best_auc = 0.0
    best_epoch = 0
    early_stop_count = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        train_loss /= len(train_loader.dataset)

        # Validate the model
        model.eval()
        valid_loss = 0.0
        all_labels = []
        all_probs = []
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                valid_loss += loss.item() * inputs.size(0)
                probs = torch.softmax(outputs, dim=1)
                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())

        valid_loss /= len(valid_loader.dataset)
        valid_auc = roc_auc_score(all_labels, all_probs, multi_class='ovr', average='weighted')

        print(f"Epoch [{epoch + 1}/{num_epochs}] - "
              f"Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Valid AUC: {valid_auc:.4f}")

        # Check for early stopping
        if valid_auc > best_auc:
            best_auc = valid_auc
            best_epoch = epoch
            early_stop_count = 0

            # Save the best model checkpoint
            torch.save(model.state_dict(), "C:\Work\Cancer Detection\2ndModel\model.pth")
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break
    model.load_state_dict(torch.load("C:\Work\Cancer Detection\2ndModel\model.pth"))
    return best_epoch

In [30]:
# Load the dataset paths

train_data_dir = "C:\Work\Cancer Detection\Data\Train"
val_data_dir = "C:\Work\Cancer Detection\Data\Val"
test_data_dir = "C:\Work\Cancer Detection\Data\Test"
train_labels_file = "C:\Work\Cancer Detection\Data\Train.csv"
val_labels_file = "C:\Work\Cancer Detection\Data\Val.csv"
test_labels_file = "C:\Work\Cancer Detection\Data\Test.csv"
num_epochs = 10

learning_rate = 0.001 #0.0001

# Create train and validation datasets
train_dataset = ISIC2017Dataset(train_data_dir, train_labels_file, transform)
valid_dataset = ISIC2017Dataset(val_data_dir, val_labels_file, transform)
test_dataset = ISIC2017Dataset(test_data_dir, test_labels_file, transform)

# Increase the batch size to 64
batch_size = 64

# Create data loaders for training, validation, and test data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Filter out None values from the datasets
train_dataset.data = [data for data in train_dataset.data if data[0] is not None]
valid_dataset.data = [data for data in valid_dataset.data if data[0] is not None]
test_dataset.data = [data for data in test_dataset.data if data[0] is not None]

# Print the missing files
print("Missing Files:", train_dataset.missing_files)

Missing Files: []


In [31]:
# Define the ResNet-50 model with dropout
class ResNetWithDropout(nn.Module):
    def __init__(self):
        super(ResNetWithDropout, self).__init__()
        self.resnet_model = resnet50(pretrained=True)
        num_ftrs = self.resnet_model.fc.in_features
        self.resnet_model.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(num_ftrs, len(train_dataset.classes))
        )

    def forward(self, x):
        return self.resnet_model(x)

In [32]:
# Train the model and get the best epoch using early stopping
model = ResNetWithDropout().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_epoch = train(model, criterion, optimizer, train_loader, valid_loader)



TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>