In [None]:
## pip if needed
!pip install neptune
#!pip install -U 'neptune-client'
!pip install torchsummary

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import copy
from PIL import Image
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
import torch.optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from torchsummary import summary
import neptune
#from neptune.new.integrations.pytorch_lightning import NeptuneLogger


# Fix the global random seed through Lightning before any data splitting, weight
# initialisation or augmentation happens, so a fresh "Run All" is repeatable.
pl.seed_everything(2023)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Setting up neptune logging

# The Neptune API token is a credential and must never be stored in the notebook.
# It is read from the NEPTUNE_API_TOKEN environment variable instead (on Kaggle the
# value can be kept as a notebook secret and exported into os.environ before this cell).
# SECURITY: any token that has ever been committed in plain text should be treated as
# leaked and revoked / regenerated in the Neptune UI.
neptune_api_token = os.environ.get("NEPTUNE_API_TOKEN")
if not neptune_api_token:
    raise RuntimeError(
        "NEPTUNE_API_TOKEN is not set; export your Neptune API token as an "
        "environment variable before starting a run."
    )

run = neptune.init_run(
    project="a-dev-walker/DL-final-project",
    api_token=neptune_api_token,
)

In [None]:
## Load the training labels into the working space
# Only the label table is read here (nothing is downloaded); the cells below use its
# `id` and `label` columns. The images themselves are opened lazily by the dataset.
labels_df = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

In [None]:
## Inspect the class balance: number of rows for each distinct value of `label`
print(labels_df['label'].value_counts())

In [None]:
## Creating the dataset object

class HistopathDataset(Dataset):
    """Map-style dataset pairing ``.tif`` image tiles with their labels.

    Args:
        data_loc: Directory that contains the image files.
        image_ids: Iterable of image identifiers (file names without the ``.tif``
            extension), in the same order as ``labels``.
        labels: Labels aligned *positionally* with ``image_ids``. A pandas Series
            (any index), list, tuple or NumPy array is accepted.
        transform: Optional callable applied to the loaded PIL image.

    Raises:
        ValueError: If ``image_ids`` and ``labels`` do not have the same length.
    """

    def __init__(self, data_loc, image_ids, labels, transform=None):
        self.data_loc = data_loc
        self.image_ids = image_ids
        self.labels = labels
        self.transform = transform

        self.image_file_names = [image_id + ".tif" for image_id in self.image_ids]
        self.image_file_paths = [
            os.path.join(self.data_loc, file_name) for file_name in self.image_file_names
        ]

        # Positional view of the labels so lookups do not depend on pandas-only `.iloc`
        # and are unaffected by whatever index a Series carries after a shuffle/split.
        self._label_values = np.asarray(self.labels)

        # A silent mismatch would pair images with the wrong labels or fail mid-epoch.
        if len(self.image_file_paths) != len(self._label_values):
            raise ValueError(
                f"image_ids and labels must have the same length, got "
                f"{len(self.image_file_paths)} and {len(self._label_values)}"
            )

    def __len__(self):
        """Return the number of (image, label) pairs."""
        return len(self.image_file_paths)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return ``(image, label)``.

        The image is the transformed result when ``transform`` is set, otherwise an
        RGB PIL image.
        """
        # `convert` forces the pixel data to be read and returns an independent image,
        # so the underlying file handle can be closed right away by the `with` block.
        with Image.open(self.image_file_paths[idx]) as image_file:
            img = image_file.convert("RGB")
        label = self._label_values[idx]

        if self.transform:
            img = self.transform(img)

        return img, label

    
    
# Training-time augmentation: random flips and rotations show the network a variety of
# orientations. Histopathology images have no canonical orientation, but the capture
# process could still introduce a directional bias that we would like to eliminate.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(96, 96)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.5),
        transforms.RandomRotation(degrees=45),
        transforms.ToTensor(),
    ]
)

# Validation and test images are only resized and converted to tensors (no augmentation)
test_val_transform = transforms.Compose(
    [
        transforms.Resize(size=(96, 96)),
        transforms.ToTensor(),
    ]
)





In [None]:
## Splitting the data, putting it into a dataset, and using a dataloader

data_location = "/kaggle/input/histopathologic-cancer-detection/train"
BATCH_SIZE = 32
# Pinning the split seed makes this cell idempotent: re-running it (e.g. after training)
# cannot reshuffle samples between the train / validation / test partitions.
SPLIT_SEED = 2023

overall_labels = labels_df['label'].reset_index(drop=True)

# 70 % train, 15 % validation, 15 % test. Stratifying on the label keeps the class
# ratio identical in every partition, which matters because the classes are imbalanced.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    labels_df['id'],
    overall_labels,
    test_size=.3,
    random_state=SPLIT_SEED,
    stratify=overall_labels,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=.5,
    random_state=SPLIT_SEED,
    stratify=y_val_test,
)

# Creating datasets (augmentation is applied to the training split only)
train_dataset = HistopathDataset(data_loc=data_location, image_ids=X_train, labels=y_train, transform=train_transform)
val_dataset = HistopathDataset(data_loc=data_location, image_ids=X_val, labels=y_val, transform=test_val_transform)
test_dataset = HistopathDataset(data_loc=data_location, image_ids=X_test, labels=y_test, transform=test_val_transform)

# Creating data loaders. Only the training loader shuffles: evaluation needs no shuffling,
# and a fixed order keeps per-batch averaged metrics identical from run to run.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)



In [None]:
# Quality check: number of samples in each dataset, then number of batches per loader
size_report = (
    ("train dataset size:", train_dataset),
    ("validation dataset size:", val_dataset),
    ("test dataset size:", test_dataset),
    ("train dataloader size:", train_dataloader),
    ("validation dataloader size:", val_dataloader),
    ("test dataloader size:", test_dataloader),
)

for size_message, sized_object in size_report:
    print(size_message, len(sized_object))


In [None]:
## Creating the Model architecture

class HistopathClassifier(pl.LightningModule):
    """Small CNN binary classifier for 3-channel 96x96 image tiles.

    Three conv -> ReLU -> max-pool stages (3 -> 16 -> 32 -> 64 channels) halve the
    spatial size each time (96 -> 48 -> 24 -> 12). The flattened features go through
    two fully connected layers with dropout, and ``forward`` returns sigmoid
    probabilities of shape ``(batch, 1)``.

    ``train_step`` / ``validate_step`` are helpers for a hand-written training loop;
    they are not the Lightning ``training_step`` / ``validation_step`` hooks.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 12 * 12, 512)
        self.fc2 = nn.Linear(512, 1)
        self.dropout = nn.Dropout(0.25)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

    def forward(self, x):
        """Return the positive-class probability for every sample, shape ``(batch, 1)``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten everything except the batch dimension. Unlike `view(-1, C*H*W)` this can
        # never silently fold a wrongly sized input into a different batch size; fc1 will
        # raise a shape error instead.
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return self.sigmoid(x)

    def train_step(self, inputs, labels, criterion, optimizer):
        """Run one optimisation step on a batch and return the loss as a Python float."""
        # Make sure dropout is active even if an earlier call left the module in eval mode.
        self.train()
        optimizer.zero_grad()
        # squeeze(1) drops only the trailing unit dimension, so a batch holding a single
        # sample keeps shape (1,) and still lines up with `labels`.
        outputs = self(inputs).squeeze(1)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        return loss.item()

    def validate_step(self, inputs, labels, criterion):
        """Return the loss on a batch without tracking gradients or applying dropout."""
        was_training = self.training
        # Dropout must be disabled for evaluation, otherwise the validation loss is noisy
        # and systematically pessimistic.
        self.eval()
        try:
            with torch.no_grad():
                outputs = self(inputs).squeeze(1)
                loss = criterion(outputs, labels.float())
        finally:
            # Restore whatever mode the caller had selected.
            self.train(was_training)
        return loss.item()



In [None]:
## Build the model, loss, optimiser and LR scheduler, then print a layer summary

num_epochs = 20

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = HistopathClassifier()
model.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Scale the learning rate by `factor` once the monitored (minimised) value has stopped
# improving for `patience` scheduler steps
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=10,
    verbose=True,
)

summary(model, input_size=(3, 96, 96), device=device.type)

In [None]:
## Train the model, logging per-epoch losses and the best checkpoint to Neptune

weight_path = "weights.pt"
best_loss = float('inf')  # initialize best loss to a large value

run["config/model"] = type(model).__name__
run["config/criterion"] = type(criterion).__name__
run["config/optimizer"] = type(optimizer).__name__


for epoch in range(num_epochs):
    # ---- Training pass (dropout enabled) ----
    model.train()
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        running_loss += model.train_step(inputs, labels, criterion, optimizer)

    # Average over the number of batches. Using len() rather than a leaked loop index
    # avoids a NameError / stale value when a loader yields no batches.
    epoch_loss = running_loss / len(train_dataloader)
    run["train/loss"].append(epoch_loss)

    # ---- Validation pass (dropout disabled) ----
    model.eval()
    val_loss = 0.0
    for inputs, labels in val_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        val_loss += model.validate_step(inputs, labels, criterion)

    epoch_val_loss = val_loss / len(val_dataloader)

    # Log validation loss to Neptune and let the scheduler react to it
    run["val/loss"].append(epoch_val_loss)
    scheduler.step(epoch_val_loss)

    print(f"Epoch: {epoch + 1}, Train Loss: {epoch_loss}, Val Loss: {epoch_val_loss}")

    # Keep the weights of the best epoch so far (lowest validation loss)
    if epoch_val_loss < best_loss:
        best_loss = epoch_val_loss
        best_model_wts = copy.deepcopy(model.state_dict())

        # Store the weights in a local file and upload that same file
        torch.save(model.state_dict(), weight_path)
        run["model_checkpoints/my_model"].upload(weight_path)
        print("Copied best model weights!")


print("Finished Training")

In [None]:
## Evaluate the best checkpoint on the held-out test set (loss and accuracy)

test_loss = 0.0
correct = 0
total = 0

# Restore the best weights saved during training. `map_location` lets the checkpoint
# load on whatever device is in use now, even if it was written from a different one.
model.load_state_dict(torch.load(weight_path, map_location=device))

# Evaluation mode disables dropout; without it the reported test metrics are computed
# on randomly perturbed activations.
model.eval()

with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        # squeeze(1) keeps the batch dimension, so a final batch containing a single
        # sample still matches the shape of `labels`.
        probabilities = model(inputs).squeeze(1)

        loss = criterion(probabilities, labels.float())
        test_loss += loss.item()

        # Calculate accuracy with a 0.5 decision threshold
        predicted = torch.round(probabilities)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Average the per-batch losses and compute overall accuracy.
# NOTE: the test metrics are only printed; they are not logged to Neptune.
test_loss = test_loss / len(test_dataloader)
test_accuracy = correct / total

print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Close the Neptune run after logging all metrics
run.stop()

In [None]:
## Creating the ROC curve for the model
from sklearn.metrics import roc_curve, auc

model.eval()  # disable dropout so the predicted probabilities are deterministic

# Initialize lists to store true labels and predicted probabilities
true_labels = []
predicted_probs = []

with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Store true labels
        true_labels.extend(labels.cpu().numpy().reshape(-1))

        # Get predicted probabilities and store them. reshape(-1) always yields a 1-D
        # array; squeeze() would collapse a one-sample batch to a 0-d array, which
        # list.extend cannot iterate over.
        outputs = model(inputs)
        predicted_probs.extend(outputs.cpu().numpy().reshape(-1))

# Convert lists to numpy arrays
true_labels = np.array(true_labels)
predicted_probs = np.array(predicted_probs)


In [None]:
## Plotting the ROC curve for the model

import matplotlib.pyplot as plt

# Compute the ROC operating points and the area under the curve
fpr, tpr, _ = roc_curve(true_labels, predicted_probs)
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()

# Model ROC curve, plus the diagonal that a random classifier would follow
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")

plt.show()