Let's begin by importing the necessary packages:

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import Dataset
import csv, math
from PIL import Image
import os

#from utils import CancerDataset
from torch.utils.data import DataLoader

# Input data files are available in the "../input/" directory.
import os
print(os.listdir("../input"))

['test', 'sample_submission.csv', 'train_labels.csv', 'train']


Next we will define a CancerDataset, that inherits from the PyTorch "Dataset" class. This will be helpful in iterating through our data in batches, shuffling, and augmenting our data. 

In [2]:
class CancerDataset(Dataset):
    """Cancer Detection Dataset."""

    def __init__(self, df, image_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.labels = df.label      
        self.im_names = df.id
        self.image_dir = image_dir
        if transform is not None:
            self.transform = transform
        else:
            self.transform = transforms.Compose([
                              transforms.ToTensor()])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        im_name = self.im_names[idx]
        im_path = os.path.join(self.image_dir, im_name + '.tif')
        im = Image.open(im_path)
        im = self.transform(im)

        label = np.array(int(self.labels[idx]))

        sample = {'image': im.type(torch.cuda.FloatTensor),
                  'label': torch.from_numpy(label).type(torch.cuda.LongTensor)}

        #if self.transform:
        #    sample = self.transform(sample)

        return sample

Here we will define our network structure. For this task, we've chosen 3 layers of convolution and pooling, followed by 3 dense layers.

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # input = 96 x 96 x 3
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, bias=True)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, bias=True)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=2, stride=1, bias=True)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64*10*10, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 2)
        #self.sig = nn.Sigmoid()
        
    def forward(self, x):
        # Pytorch is formatted as batch x channels x rows x cols
        #x = x.permute(0, 3, 1, 2)
        x = self.conv1(x)
        x = self.pool(F.relu(self.bn1(x)))
        x = self.conv2(x)
        x = self.pool(F.relu(self.bn2(x)))
        x = self.conv3(x)
        x = self.pool(F.relu(self.bn3(x)))

        x = x.view(-1, 64*10*10)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train(model, train_dl, batch_size, num_epochs, learning_rate=0.001):

    # Loss and optimizer

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adamax(model.parameters(), lr=learning_rate)
    #criterion = nn.BCEWithLogitsLoss()
    #optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    loss_list = []
    acc_list = []

    #Print all of the hyperparameters of the training iteration:
    print("===== HYPERPARAMETERS =====")
    print("batch_size=", batch_size)
    print("epochs=", num_epochs)
    print("learning_rate=", learning_rate)
    print("=" * 27)

    for epoch in range(num_epochs):
        for i, data in enumerate(train_dl):
            im_batch = data['image']
            label_batch = data['label']

            # Run the forward pass
            optimizer.zero_grad()

            outputs = model(im_batch)
            #print(outputs.shape)
            #print(outputs)
            
            loss = criterion(outputs, label_batch)
            loss_list.append(loss.item())

            # Backprop and perform Adam optimisation
            loss.backward()
            optimizer.step()

            # Track the accuracy
            total = label_batch.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct = (predicted == label_batch).sum().item()
            acc_list.append(correct / total)
            if (i + 1) % 200 == 0:
                print(loss.item())

We first initialize our datasets and pass them to the dataloaders

Then we configure the device (either GPU or CPU) and initiate training.

In [4]:
traincsv = '../input/train_labels.csv'
im_dir = '../input/train/'
train_df = pd.read_csv(traincsv)
im_transform = transforms.Compose([
                  transforms.RandomHorizontalFlip(), 
                  transforms.RandomVerticalFlip(),
                  transforms.RandomRotation(20),
                  transforms.ToTensor()])
                  #transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5])])

num_epochs = 3
batch_size = 64

# Initialize dataset
ds = CancerDataset(train_df, image_dir=im_dir, transform=im_transform)
#https://stackoverflow.com/questions/50544730/how-do-i-split-a-custom-dataset-into-training-and-test-datasets
train_size = int(0.8 * len(ds))
val_size = len(ds) - train_size
train_ds, val_ds = torch.utils.data.random_split(ds, [train_size, val_size])
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True, num_workers=0)

# Device configuration - use GPU if available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = Net().to(device)

train(model, train_dl, batch_size, num_epochs, 0.001)

===== HYPERPARAMETERS =====
batch_size= 64
epochs= 3
learning_rate= 0.001
0.4608437418937683
0.4939935505390167
0.26370421051979065
0.25264474749565125
0.30566319823265076
0.3037013113498688
0.2593081295490265
0.37768977880477905
0.2698407769203186
0.30770060420036316
0.39942196011543274
0.37669020891189575
0.4490520656108856
0.3043595552444458
0.28148573637008667
0.3729715645313263
0.3450397849082947
0.3085711896419525
0.3312935531139374
0.3430314064025879
0.24619035422801971
0.21712902188301086
0.27644211053848267
0.5876179337501526
0.25410646200180054
0.2204357385635376
0.283459335565567
0.28090935945510864
0.2225927859544754
0.2801467478275299
0.4364542067050934
0.3367641568183899
0.3021238446235657
0.29901692271232605
0.3375479280948639
0.3891212046146393
0.3377498388290405
0.17018640041351318
0.20743441581726074


Now that Training has completed, we can evaluate the model. It is important here that we use the command 'torch.no_grad()' to ensure that we no longer update gradients. We are only running inference. 

In [5]:
model.eval()  # evaluate using moving mean and variance from batch norm
with torch.no_grad():
    correct = 0
    total = 0
    for i, data in enumerate(val_dl):
        im_batch = data['image']
        label_batch = data['label']

        # Run the forward pass
        outputs = model(im_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += label_batch.size(0)
        correct += (predicted == label_batch).sum().item()
        if i>4:
            break
    print('Test Accuracy: {} %'.format(100 * correct / total))


Test Accuracy: 88.54166666666667 %


Once we have evaluated our model and we are satisfied, we can generate a dataset and dataloader from the test set.

In [6]:
im_dir = '../input/test/'
testcsv = '../input/sample_submission.csv'
test_df = pd.read_csv(testcsv)

# Initialize dataset
test_ds = CancerDataset(test_df, image_dir=im_dir, transform=None)
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, num_workers=0)

Next we run inference on the test set. 

In [7]:
model.eval()  # evaluate using moving mean and variance from batch norm
with torch.no_grad():
    preds = []
    for i, data in enumerate(test_dl):
        im_batch = data['image']

        # Run the forward pass
        outputs = model(im_batch)
        _, predicted = torch.max(outputs.data, 1)
        predicted = predicted.detach().cpu().numpy()
        #predicted = predicted.numpy()
        for i in predicted:
            preds.append(i)
        #print(len(preds))

    print(len(preds))

    

57458


Save predictions for competition submission!

In [8]:
sub_df = pd.read_csv('../input/sample_submission.csv')
sub_df.drop('label', axis=1)
sub_df['label'] = preds
sub_df.to_csv('submission.csv', index=False)