In [None]:
from datasets import load_dataset

In [None]:
# Hugging Face Hub repository id of the skin-lesion dataset used in this notebook.
hf_repo_id = "preetsojitra/binary-2K-samples-skin-lesion-HM10000"
# Load the dataset; later cells index the result by split name ("train", "test").
dataset = load_dataset(hf_repo_id)

In [None]:
# Display the loaded dataset object to inspect what it contains.
dataset

In [None]:
# Keep a handle on the training split for quick manual inspection.
train_ds = dataset["train"]

In [None]:
# Peek at the "image" field of one arbitrarily chosen training record (index 1500).
train_ds[1500]["image"]

In [None]:
# Peek at the "label" field of the same training record (index 1500).
train_ds[1500]["label"]

In [None]:
from torchvision import transforms

In [None]:
# Preprocessing pipeline applied to every image: resize to a fixed 128x128,
# convert to a tensor, then normalize each of the three channels with
# mean 0.5 and std 0.5.
data_tranforms = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

In [None]:
import torch
from torch.utils.data import Dataset

In [None]:
class SkinDataset(Dataset):
  """Map-style PyTorch dataset over an indexable collection of lesion records.

  Every record is read through its "image" and "label" entries; the label
  must be one of the keys of ``label_map`` ("benign" or "malignant").
  """

  def __init__(self, hf_dataset, transforms=None):
    """Remember the wrapped dataset and the optional image transform.

    Args:
      hf_dataset: Sized, indexable collection whose items expose "image"
        and "label" entries.
      transforms: Optional callable applied to each image; skipped when falsy.
    """
    self.dataset = hf_dataset
    self.transforms = transforms
    # String class names -> integer class indices used as training targets.
    self.label_map = {"benign": 0, "malignant": 1}

  def __len__(self):
    """Return the number of records in the wrapped dataset."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(image, label)`` for the record at ``idx``.

    The image is passed through ``self.transforms`` when one is set; the label
    is returned as a 0-dim ``torch.long`` tensor. An unknown label string
    raises ``KeyError``.
    """
    record = self.dataset[idx]

    raw_image = record["image"]
    image = self.transforms(raw_image) if self.transforms else raw_image

    class_index = self.label_map[record["label"]]
    return image, torch.tensor(class_index, dtype=torch.long)

In [None]:
# Wrap each split in the PyTorch dataset; both splits share the same preprocessing.
train_pyt_ds = SkinDataset(hf_dataset=dataset["train"], transforms=data_tranforms)
test_pyt_ds = SkinDataset(hf_dataset=dataset["test"], transforms=data_tranforms)

In [None]:
# Display the first (image, label) pair produced by the wrapped training set.
train_pyt_ds[0]

In [None]:
# Unpack the first training sample and check the shape of its transformed image.
img, label = train_pyt_ds[0]
img.shape

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Number of samples per mini-batch, shared by both loaders.
BATCH_SIZE = 32

# Training batches are shuffled; test batches are served in dataset order.
train_loader = DataLoader(dataset=train_pyt_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_pyt_ds, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Pull a single batch from the training loader and display the image/label batch shapes.
images, labels = next(iter(train_loader))
images.shape, labels.shape

In [None]:
import torch.nn as nn

In [None]:
class SimpleCNN(nn.Module):
  """Small two-stage convolutional classifier for 3-channel images.

  Each convolutional stage is Conv2d(3x3, padding 1) -> ReLU -> 2x2 max-pool,
  so the spatial size is halved twice. The classifier head is sized for
  128x128 inputs (32 channels * 32 * 32 after the two pools); other input
  sizes will fail at the first linear layer.

  Args:
    num_classes: Number of output logits produced per image. Defaults to 2.
  """

  def __init__(self, num_classes=2):
    super().__init__()

    # Stage 1: 3 -> 16 channels, spatial size halved by the pool.
    self.conv1 = nn.Sequential(
        nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )
    # Stage 2: 16 -> 32 channels, spatial size halved again.
    self.conv2 = nn.Sequential(
        nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )
    self.classifier = nn.Sequential(
        nn.Flatten(),
        # 32 channels * 32 * 32 spatial positions for a 128x128 input.
        nn.Linear(in_features=32*32*32, out_features=128),
        nn.ReLU(),
        # Honour the constructor argument instead of a hard-coded 2.
        nn.Linear(in_features=128, out_features=num_classes)
    )

  def forward(self, x):
    """Return raw class logits of shape (batch, num_classes) for image batch ``x``."""
    x = self.conv1(x)
    x = self.conv2(x)
    x = self.classifier(x)
    return x

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the network, move it to the chosen device, and print its layers.
model = SimpleCNN()
model = model.to(device)
print(model)

In [None]:
# Cross-entropy over the model's raw logits, optimized with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Number of full passes over the training loader.
NUM_EPOCHS = 5

# Loss history: one mean value per epoch, plus every individual batch loss.
train_loss = []
batch_loss = []

for epoch in range(NUM_EPOCHS):
  model.train()
  running_loss = 0.0

  print(f"\n Epoch {epoch+1}/{NUM_EPOCHS}")
  print("-"*60)

  total_batches = len(train_loader)
  for batch_idx, (images, labels) in enumerate(train_loader):
    images = images.to(device)
    labels = labels.to(device)

    # Clear stale gradients, then forward pass, backward pass, and weight update.
    optimizer.zero_grad()
    outputs = model(images)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()

    # Read the scalar loss once and reuse it for all bookkeeping.
    loss_value = loss.item()
    running_loss += loss_value
    batch_loss.append(loss_value)

    # Report every 10th batch and always the final batch of the epoch.
    batches_done = batch_idx + 1
    is_last_batch = batches_done == total_batches
    if batches_done % 10 == 0 or is_last_batch:
      avg_loss_so_far = running_loss / batches_done
      print(f"Batch {batches_done}/{total_batches} | Loss: {loss_value:.4f} | Avg loss: {avg_loss_so_far:.4f}")

  avg_train_loss = running_loss / total_batches
  train_loss.append(avg_train_loss)
  print(f"\n Epoch {epoch+1} Summary | Average Training Loss: {avg_train_loss:.4f}")

In [None]:
# Evaluate the trained model on the test loader and report top-1 accuracy.
model.eval()  # Evaluation mode: switches layers such as dropout/batch-norm to inference behaviour.
correct_predictions = 0
total_samples = 0

total_batches = len(test_loader)

with torch.no_grad():  # No gradients are needed for evaluation.
    for batch_idx, (images, labels) in enumerate(test_loader):
        images, labels = images.to(device), labels.to(device)

        # Forward pass: the model returns one logit per class.
        outputs = model(images)

        # Predicted class = index of the largest logit. `argmax` replaces the
        # legacy `torch.max(outputs.data, 1)`; `.data` is unnecessary under no_grad.
        predicted = outputs.argmax(dim=1)

        total_samples += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        # Print progress every 10 batches and on the final batch.
        if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == total_batches:
            current_accuracy = (correct_predictions / total_samples) * 100
            print(f"  Batch [{batch_idx+1}/{total_batches}] | Samples: {total_samples} | Current Accuracy: {current_accuracy:.2f}%")

# An empty test loader would otherwise raise ZeroDivisionError here; report
# NaN instead so the summary still prints and the 0/0 count is visible.
if total_samples > 0:
    accuracy = (correct_predictions / total_samples) * 100
else:
    accuracy = float("nan")
print("\n" + "=" * 50)
print(f"Final Model Accuracy on Test Set: {accuracy:.2f}%")
print(f"Correct Predictions: {correct_predictions}/{total_samples}")
print("=" * 50)