## Load Data

In [12]:
import glob as gb
import os
import random
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [13]:
# Seed every random number generator this notebook can draw from, not just NumPy.
# The CNN's weight initialisation and the shuffling DataLoader use torch's
# generator, so seeding NumPy alone does not make training repeatable.
# Python's `random` is seeded as well because some torchvision releases
# sample augmentation parameters from it.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [14]:
# Dataset root, plus the directories holding the training and test splits.
data_dir = './melanoma_cancer_dataset'
train_data = f'{data_dir}/train'
test_data = f'{data_dir}/test'

In [15]:

# Function to load images and labels
def load_data(data_path, max_images=None):
    """Load the `.jpg` images under `data_path` together with integer class labels.

    `data_path` is expected to contain one sub-folder per class, named
    'benign' (label 0) and 'malignant' (label 1). Any other directory entry
    is ignored. Every image is resized to 100x100 pixels.

    Args:
        data_path: Directory that holds the class sub-folders.
        max_images: Optional cap on the number of images loaded *per class*.
            `None` (the default) loads everything; `0` loads nothing.

    Returns:
        A tuple `(images, labels)` of NumPy arrays. `images` stacks the
        resized arrays exactly as returned by `cv2.imread` (OpenCV's default
        BGR channel order); `labels` holds the matching 0/1 class ids.
    """
    class_ = {'benign': 0, 'malignant': 1}
    images = []
    labels = []
    # Sort directory entries and file names so the sample order does not
    # depend on the operating system's listing order.
    for folder in sorted(os.listdir(data_path)):
        if folder not in class_:
            # Stray files or extra directories must not abort the load with a KeyError.
            continue
        pattern = os.path.join(data_path, folder, '*.jpg')
        # Number of images successfully loaded for this class so far
        loaded = 0
        for img_path in sorted(gb.glob(pattern)):
            # Check the cap before reading so that max_images=0 really loads nothing.
            if max_images is not None and loaded >= max_images:
                break
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable/corrupt file by returning None;
                # passing that on to cv2.resize would raise an opaque OpenCV error.
                warnings.warn(f'Skipping unreadable image: {img_path}')
                continue
            images.append(cv2.resize(img, (100, 100)))
            labels.append(class_[folder])
            loaded += 1
    return np.array(images), np.array(labels)


In [16]:
# Load the complete training and test splits into memory.
# max_images=None means "no per-class cap"; pass a small integer instead
# when a quick run on a subset is wanted.
X_train, y_train = load_data(train_data, max_images=None)
X_test, y_test = load_data(test_data, max_images=None)

print(X_train.shape, y_train.shape)

(9605, 100, 100, 3) (9605,)


In [30]:
# Normalize the data with a per-feature MinMaxScaler.
# The scaler needs 2-D input, so each image is flattened to one row first and
# the scaled result is reshaped back to the original image layout afterwards.
X_train_reshape = X_train.reshape(len(X_train), -1)
X_test_reshape = X_test.reshape(len(X_test), -1)

# Fit on the training split only; the test split reuses the training statistics.
scaler = MinMaxScaler()
scaler.fit(X_train_reshape)

X_train_scaled = scaler.transform(X_train_reshape).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_reshape).reshape(X_test.shape)
print(X_train_scaled.shape)

(9605, 100, 100, 3)


In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader

In [46]:
# Input pipeline for the CNN: ImageFolder reads images from the class
# sub-folders, and the transforms resize them and convert them to tensors.
IMAGE_SIZE = (100, 100)  # the CNN's first fully connected layer is sized for 100x100 inputs
BATCH_SIZE = 32

# Training images additionally get light augmentation (random flip / rotation).
train_transforms = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
])

# Evaluation images are only resized and converted, so results are deterministic.
val_transforms = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
])

# Load the dataset from the directories defined in the path configuration
# cell instead of repeating the literal paths here.
# NOTE: the "val" dataset/loader reads the *test* split.
train_dataset = datasets.ImageFolder(root=train_data, transform=train_transforms)
val_dataset = datasets.ImageFolder(root=test_data, transform=val_transforms)

# Shuffle only the training data; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [47]:
# Pull one mini-batch from the training loader as a sanity check.
dataiter = iter(train_loader)
images, labels = next(dataiter)

# Report the shape of the first image in the batch and its label
first_image, first_label = images[0], labels[0]
print(f"Size of image: {first_image.shape}")
print(f"Label: {first_label}")

Size of image: torch.Size([3, 100, 100])
Label: 0


## CNN Model

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [22]:
class CNN(nn.Module):
    """Small convolutional classifier: three conv + max-pool stages, then two linear layers.

    Args:
        num_classes: Number of output logits.
        input_size: Spatial size of the input images, either a single int for
            square images or a `(height, width)` pair. Defaults to 100, which
            yields the original 64 * 12 * 12 flattened feature size.

    Raises:
        ValueError: If the input is too small to survive three 2x2 poolings.

    The forward pass returns raw logits of shape `(batch, num_classes)`;
    no softmax is applied.
    """

    def __init__(self, num_classes=2, input_size=100):
        super().__init__()
        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size

        # The 3x3 convolutions use padding=1 and stride=1, so they keep the
        # spatial size. Each 2x2 max-pool halves it (rounding down); three
        # poolings are therefore a floor division by 8 (100 -> 50 -> 25 -> 12).
        feat_h, feat_w = height // 8, width // 8
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f'input_size {input_size!r} is too small: at least 8 pixels per side are required'
            )

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Conv3 output is 64 channels by feat_h x feat_w spatial size
        self.fc1 = nn.Linear(64 * feat_h * feat_w, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Map a `(batch, 3, H, W)` image tensor to `(batch, num_classes)` logits."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten everything except the batch dimension for the fully connected layers
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [23]:
# Two-class CNN trained with cross-entropy on its raw logits, optimised with Adam.
LEARNING_RATE = 1e-3

model = CNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


In [48]:
# Train the CNN.
# Progress used to be printed every 2000 mini-batches, but one epoch is only
# about 300 mini-batches (9605 images at batch size 32), so nothing was ever
# shown. Report a running average every LOG_EVERY batches plus an epoch summary.
NUM_EPOCHS = 2
LOG_EVERY = 50  # mini-batches between progress lines

model.train()  # make sure the model is in training mode before optimising
for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times

    running_loss = 0.0  # loss accumulated since the last progress line
    epoch_loss = 0.0    # loss accumulated over the whole epoch
    num_batches = 0
    for i, (inputs, labels) in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate statistics
        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1
        if (i + 1) % LOG_EVERY == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / LOG_EVERY:.3f}')
            running_loss = 0.0

    # Guard against an empty loader so the summary never divides by zero.
    if num_batches:
        print(f'[{epoch + 1}] mean loss over {num_batches} batches: {epoch_loss / num_batches:.3f}')

print('Finished Training')

KeyboardInterrupt: 

In [None]:
# Persist only the learned parameters (the state_dict), not the whole module object.
PATH = './cnn.pth'
torch.save(obj=model.state_dict(), f=PATH)

In [None]:
# Measure classification accuracy on the images served by val_loader.
correct = 0
total = 0
model.eval()  # evaluation mode; matters as soon as the model gains dropout or batch-norm layers
with torch.no_grad():  # Disable gradient tracking
    for images, labels in val_loader:
        outputs = model(images)
        # The model returns raw logits; the predicted class is the index of the largest one.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

if total == 0:
    # An empty loader would otherwise end in a ZeroDivisionError.
    print('No test images found; accuracy is undefined.')
else:
    print(f'Accuracy of the model on the test images: {100 * correct / total}%')
