# Veggie16 Experiment 1

The final experiment conducted using the proposed Veggie16 model. This notebook has been adapted to run on the Kaggle kernel.

The experiment aims to maximize the potential of the Veggie16 architecture, building on takeaways from previous experiments. The goal is to achieve the best possible parameterization of the Veggie16 architecture.

In [None]:
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils as utils
from torchvision import datasets, transforms

import histopathology as hcd # Helper code stored as Kaggle Script

Path to the Histopathologic Cancer Detection dataset, preloaded into this Kaggle kernel.

In [None]:
# Root directory of the competition data; the 'train' and 'test' image folders
# and the 'train_labels.csv' file are all resolved relative to this path.
DATASET_PATH = '/kaggle/input/histopathologic-cancer-detection'

### Relevant Model Decisions:

**Transforms:**

1. Morphological closing
2. Random rotation in the range [-90°, +90°]
3. Center crop to 48x48
4. `ToTensor()`
5. Channel-wise mean normalization 

**Model:** Veggie16 network adapted from VGG-16 architecture. Convolutional layers pretrained and frozen.

**Criterion:** Categorical Cross-Entropy

**Optimizer:** Adam

**Training Hyperparameters:** (See Below)

In [None]:
# Train/validation split distribution: percentage of samples placed in the
# training partition (the remainder becomes the validation partition).
train_pct = 80
# Training parameters
batch_size = 50  # batch size used by every DataLoader in this notebook
num_epochs = 25  # number of epochs passed to trainer.train
# Optimizer parameters
learning_rate = 1e-4  # learning rate handed to Adam
# For mean normalization. Computed over the dataset.
# One value per RGB channel; both lists are passed to hcd.transforms.ToNormalized.
rgb_means = [0.70025474, 0.54378763, 0.6961546]
rgb_stds = [0.23917262, 0.28227101, 0.2156419]

### Transform, Split, and Load the Dataset

Get the PCam dataset (Histopathologic Cancer Detection) and apply the transforms.

In [None]:
# Locate the labelled images and their label file inside the Kaggle input folder.
image_dir = os.path.join(DATASET_PATH, 'train')
csv_path = os.path.join(DATASET_PATH, 'train_labels.csv')

# Preprocessing pipeline, applied to each image in the order listed.
pcam_transform = transforms.Compose([
    hcd.transforms.ToClosed(),
    transforms.RandomRotation(90),
    transforms.CenterCrop(48),
    transforms.ToTensor(),
    hcd.transforms.ToNormalized(rgb_means, rgb_stds),
])

pcam_dataset = hcd.dataset.PCam(image_dir, csv_path, pcam_transform)
print(f'PCam has {len(pcam_dataset)} samples')

Partition the dataset into a training and validation set.

In [None]:
# Work out how many samples go to each partition.
num_samples = len(pcam_dataset)
train_size = int(train_pct/100 * num_samples)
val_size = num_samples - train_size
print(f'Splitting PCam {train_pct}%/{100-train_pct}% into train/validation sets')

# A dedicated generator with a fixed seed (42) keeps the partition identical
# from run to run.
split_rng = torch.Generator().manual_seed(42)
train_set, val_set = utils.data.random_split(
    pcam_dataset,
    [train_size, val_size],
    generator=split_rng,
)

Create PyTorch DataLoaders for each dataset.

In [None]:
# Reshuffle the training samples at the start of every epoch so the optimizer
# does not see the same batches in the same order each time.
train_loader = utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=4, shuffle=True)
# The validation loader keeps the DataLoader default (no shuffling).
val_loader = utils.data.DataLoader(val_set, batch_size=batch_size, num_workers=4)

### Use GPU for Training

In [None]:
# Prefer the first CUDA device when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Device:', device)

### Define the Model, Criterion, and Optimizer

If using the Kaggle kernel, turn on the internet to run this cell so it can download the pretrained VGG-16 weights. 

In [None]:
# Build the network with pretrained weights, then move it to the chosen device.
# NOTE(review): the notebook's summary describes the convolutional layers as
# frozen, but freeze_weights=False is passed here — confirm which is intended.
model = hcd.models.Veggie16(pretrained=True, freeze_weights=False)
model = model.to(device)

# Loss function and optimizer; Adam updates every parameter the model exposes.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

print(model)

### Create a `Trainer` and Load Weights

In [None]:
# Bundle the model, device, and both data loaders into the helper object that
# is used below for training, evaluation, and saving the model.
trainer = hcd.training.Trainer(model, device, train_loader, val_loader)
# Checkpoint loading is intentionally disabled in this run; uncomment to enable.
# trainer.load_checkpoint()

### Train Model on Training Partition

In [None]:
# Run training for num_epochs epochs.
# NOTE(review): output_freq is presumably a progress-logging interval — confirm
# against hcd.training.Trainer.
res = trainer.train(criterion, optimizer, num_epochs=num_epochs, output_freq=1000)
# Unpack the recorded histories: training/validation losses and accuracies.
# The plotting cell below draws each of them against the epoch number.
losses_tr, losses_va, accs_tr, accs_va = res

### Plot Loss and Accuracy over the epochs.

In [None]:
# Plot the training loss and accuracy histories with the project helper, then
# write the current matplotlib figure to the Kaggle working directory.
fig, ax = hcd.evaluation.plot_loss_and_accuracy(losses_tr, accs_tr)
plt.savefig('/kaggle/working/rates.png')

In [None]:
import numpy as np

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# Epoch numbers for the x-axis, starting at 1.
eps = np.arange(1, num_epochs+1)

# Left panel: each recorded loss is divided by the number of batches in the
# corresponding loader to give a per-batch figure.
ax[0].plot(eps, np.array(losses_tr)/len(train_loader), 'r-', label='Train Loss')
ax[0].plot(eps, np.array(losses_va)/len(val_loader), 'g-', label='Validation Loss')
ax[0].set_ylabel('Loss per Batch')
ax[0].set_xlabel('Epoch')
ax[0].legend()
# Titles follow num_epochs so they stay correct when the epoch count changes.
ax[0].set_title(f'{model.__class__.__name__} Loss/Batch over {num_epochs} Epochs')

# Right panel: training and validation accuracy.
ax[1].plot(eps, np.array(accs_tr), 'b-', label='Train Accuracy')
ax[1].plot(eps, np.array(accs_va), 'y-', label='Validation Accuracy')
ax[1].set_ylabel('Accuracy')
ax[1].set_xlabel('Epoch')
ax[1].legend()
ax[1].set_title(f'{model.__class__.__name__} Accuracy over {num_epochs} Epochs')

plt.savefig('./veggie16_revised_rates.png')

### Evaluate Model on Validation Partition

In [None]:
# Ask the trainer for its evaluation metrics and print a small report.
score, accuracy, loss = trainer.evaluate(criterion)

print(f'Evaluating {model.__class__.__name__} on validation set:')
print('-' * 30)
for metric_name, metric_value in (('F1-Score:', score),
                                  ('Accuracy:', accuracy),
                                  ('Loss:', loss)):
    print(metric_name, metric_value)

### ROC Analysis, Test-Set Predictions, and Saving the Trained Model

In [None]:
from sklearn.metrics import roc_curve
def roc(model, data_loader, device, subsample=1.0):
    """Estimate the ROC curve and the area under it for the model on a dataset.

    The model is switched to evaluation mode before scoring and is left in
    that mode afterwards.

    Args:
        model: A PyTorch model. It must expose a `log_softmax` callable that is
            applied to the raw outputs; column 1 of the result is used as the
            positive-class score.
        data_loader: A PyTorch DataLoader yielding (images, labels) batches.
            When subsample < 1.0 only the leading batches are used, so the
            loader should be shuffled for the subsample to be representative.
        device: Device for running model.
        subsample: Fraction (0.0 to 1.0) of the loader's batches to use when
            estimating the ROC curve.

    Return:
        auc: Area under the ROC curve.
        fpr: False positive rate of model at various thresholds.
        tpr: True positive rate of model at same thresholds as FPR.
        thresholds: Score thresholds at which FPR/TPR were evaluated. They are
            on the scale of the `log_softmax` output, not of probabilities.
    """
    y_true = []
    y_hat = []
    # Evaluation mode, matching predict_to_csv, so layers that behave
    # differently during training do not perturb the scores.
    model.eval()
    with torch.no_grad():
        num_batches = len(data_loader)
        sample_size = int(subsample * num_batches)

        for i, (images, labels) in enumerate(data_loader, start=1):
            if i > sample_size:
                break
            images = images.to(device)
            labels = labels.long().flatten().to(device)
            # Forward pass and score the positive class
            outputs = model(images)
            # Assuming a standard log-softmax, these scores are a monotonic
            # transform of the class probabilities, so the ROC curve is the same.
            scores = model.log_softmax(outputs)
            # Update
            y_hat.extend(scores[:, 1].tolist())
            y_true.extend(labels.tolist())
            if i % 100 == 0:
                print(f'Computed predictions for sample [{i}/{sample_size}]')
    fpr, tpr, thresholds = roc_curve(y_true, y_hat)
    # Trapezoidal rule written out explicitly so it does not depend on
    # np.trapz, which newer NumPy releases deprecate.
    auc = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)
    return auc, fpr, tpr, thresholds

def plot_roc(fpr, tpr, auc, model_name):
    """Plot ROC curve for a model on a new matplotlib figure.

    The figure is created but neither shown nor saved here; it is left as the
    current pyplot figure so the caller can save it.

    Args:
        fpr: False positive rate at various thresholds.
        tpr: True positive rate at same thresholds as fpr.
        auc: Computed area under the ROC curve (shown in the legend).
        model_name: The name of the model (shown in the title).
    """
    plt.figure()
    # ROC curve, drawn once so the legend entry matches the visible line.
    plt.plot(fpr, tpr, 'b-', lw=2, label=f'ROC curve (Area = {auc:.4f})')
    # No discrimination line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    # Scale axes; the small margins keep the curve off the left/top borders.
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    # Labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title(f'Receiver Operating Characteristic Curve for {model_name}')

In [None]:
# ROC curve of the model on the training partition, saved next to the notebook.
auc, fpr, tpr, thresholds = roc(model=model, data_loader=train_loader, device=device)
plot_roc(fpr=fpr, tpr=tpr, auc=auc, model_name=model.__class__.__name__)
plt.savefig('./veggie16_revised_train_roc.png')

In [None]:
# ROC curve of the model on the validation partition. This rebinds auc, fpr,
# tpr, and thresholds, replacing the values from the training-set run.
auc, fpr, tpr, thresholds = roc(model=model, data_loader=val_loader, device=device)
plot_roc(fpr=fpr, tpr=tpr, auc=auc, model_name=model.__class__.__name__)
plt.savefig('./veggie16_revised_val_roc.png')

In [None]:
import csv
def predict_to_csv(model, unlabeled_loader, device, col_names, csv_path, batch_size=50):
    """Saves model predictions to a CSV file with an `id,label` header.

    The model is switched to evaluation mode and left in that mode. Each
    output row holds a file name from `col_names` with its extension removed
    and the predicted class index (argmax over the model's scores).

    Args:
        model: A PyTorch model exposing a `log_softmax` callable that is
            applied to the raw outputs.
        unlabeled_loader: A PyTorch DataLoader that returns only images. It
            must iterate in the same order as `col_names` (i.e. unshuffled).
        device: Device for running model.
        col_names: File names of the samples, one per sample, in loader order.
        csv_path: Output path of csv.
        batch_size: Unused; accepted for backward compatibility. Rows are
            matched to `col_names` with a running counter, so the result is
            correct for any loader batch size, including a short final batch.

    Raises:
        ValueError: If the directory part of `csv_path` does not exist.
    """
    # A bare file name has an empty directory part; treat it as the current
    # directory instead of rejecting it.
    out_dir = os.path.dirname(csv_path) or '.'
    if not os.path.isdir(out_dir):
        raise ValueError(f'Attempted to save predictions to invalid directory: {csv_path}')
    model.eval()
    with torch.no_grad():
        # newline='' lets the csv module control line endings itself.
        with open(csv_path, 'w', newline='') as csvfile:
            predictions_writer = csv.writer(csvfile)
            predictions_writer.writerow(['id', 'label'])

            num_steps = len(unlabeled_loader)
            row = 0  # index into col_names of the next sample to be written
            for i, images in enumerate(unlabeled_loader):
                images = images.to(device)
                outputs = model(images)
                scores = model.log_softmax(outputs)
                predictions = torch.argmax(scores, dim=1)

                for label in predictions.tolist():
                    sample_id = os.path.splitext(col_names[row])[0]
                    predictions_writer.writerow([sample_id, label])
                    row += 1
                if i % 100 == 0:
                    print(f'Predictions written for batch [{i}/{num_steps}]')
    print(f'Saved model predictions to: {csv_path}')

In [None]:
import PIL
from torch.utils.data import Dataset
class UnlabeledPCam(Dataset):
    """The Patch Camelyon (PCam) dataset, without ground truth labels [1].

    Every entry of the image directory is treated as one sample. Entries are
    kept in sorted order in `image_paths`, so index i always refers to the
    same file regardless of the order the file system lists them in.

    Retrieved from https://www.kaggle.com/c/histopathologic-cancer-detection/.

    [1] B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling. "Rotation 
        Equivariant CNNs for Digital Pathology". arXiv:1806.03962
    """

    def __init__(self, image_dir, transform=None):
        """Create a PyTorch dataset of images from PCam.

        Args:
            image_dir: Folder with image data in file system.
            transform: Optional callable applied to each loaded image.

        Raises:
            ValueError: If `image_dir` is not an existing directory.
        """
        # isdir is False for missing paths as well as for non-directories.
        if not os.path.isdir(image_dir):
            raise ValueError(f'Proposed image directory {image_dir} is not on this file system.')
        self.image_dir = image_dir
        self.transform = transform
        # File names only (not full paths); sorted for a reproducible order.
        self.image_paths = sorted(os.listdir(self.image_dir))
        self.num_samples = len(self.image_paths)

    def __len__(self):
        """Returns the length of the unlabeled PCam dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """Get the image at a given index in the PCam dataset.

        Args:
            idx: Integer index, or a tensor holding one.

        Returns:
            The image converted to RGB, passed through `transform` if one
            was given.
        """
        if torch.is_tensor(idx):
            # Tensors provide tolist(); there is no to_list() method.
            idx = idx.tolist()
        image_path = os.path.join(self.image_dir, self.image_paths[idx])
        image = PIL.Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image

In [None]:
# Debug output: when the notebook is run top to bottom, image_dir still holds
# the training image folder here; it is rebound to the test folder afterwards.
print(image_dir)

In [None]:
# Rebind image_dir to the folder holding the unlabeled test images.
image_dir = os.path.join(DATASET_PATH, 'test')

# Test-time preprocessing: closing, center crop, tensor conversion, and
# normalization with the same channel statistics (no random rotation).
test_set = UnlabeledPCam(
    image_dir=image_dir,
    transform=transforms.Compose([
        hcd.transforms.ToClosed(),
        transforms.CenterCrop(48),
        transforms.ToTensor(),
        hcd.transforms.ToNormalized(rgb_means, rgb_stds),
    ]),
)
test_loader = utils.data.DataLoader(dataset=test_set, batch_size=batch_size, num_workers=4)

In [None]:
# Write one predicted label per test image. The configured batch_size is
# passed (rather than a literal) so it always matches the one used to build
# test_loader.
predict_to_csv(model, test_loader, device, test_set.image_paths,
               './preds_veggie16_revised.csv', batch_size=batch_size)

In [None]:
# Persist the trained model through the Trainer helper.
# NOTE(review): the output location is decided inside hcd.training and is not
# visible in this notebook — confirm where the file is written.
trainer.save_final_model()