# Mac Training Using Fluorescent and Brightfield Channels

We train a predefined "MacNet" CNN to identify alveolar (tissue resident) macrophages versus bone marrow (proxy for monocyte-derived) macrophages. 

The input will be 1-4 channels of brightfield, lipid stain (BODIPY), nuclear stain (Hoechst), mitochondria stain (MitoTracker Red), or cell autofluorescence in green/red/blue channels.

The output will be a classification of the cell into one of the classes listed in `PATHS` below (currently alveolar, bone marrow, and monocyte), with one class per data file.

## Constants/Variables

In [None]:
# One pickle file per cell class. The position of a file in PATHS is used as
# the integer class label when the data is loaded.
PATHS = [
    r"..\data\processed\first_batch\{}.pickle".format(cell_type)
    for cell_type in ("alveolar", "marrow", "monocyte")
]
num_classes = len(PATHS)  # one output class per data file
input_ch = 1              # number of image channels fed to the network
NUM_FOLDS = 5             # number of cross-validation folds
NUM_BATCHES = 4           # batch size, i.e. samples per batch (not a batch count)
NUM_EPOCHS = 20           # training epochs per fold

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pickle
from utils import *

import torch
from torch.utils.data import DataLoader
from torch import nn
import torchvision.transforms as transforms

from torch.utils.tensorboard import SummaryWriter

from MacDataset import MacDataset
import macnet
import matplotlib.pyplot as plt
import numpy as np
import random

## Transforms
This section defines the transforms used to augment the base data for training and testing.

In [None]:
# Training pipeline: `standardize_input` followed by `rotate_90_input`.
# Both helpers come from `utils`; judging by their names they normalise the
# sample and apply a 90-degree rotation augmentation -- confirm in utils.
train_transforms = transforms.Compose([standardize_input(), rotate_90_input()])

# Evaluation pipeline: the same standardisation step, without the rotation.
test_transforms = transforms.Compose([standardize_input()])

In [None]:
# Load each class's pickled data set and relabel it with its index in PATHS.
raw_images = []  # raw_images[k] holds the images loaded from PATHS[k]
raw_labels = []  # raw_labels[k] holds the matching labels, all set to k
for i, path in enumerate(PATHS):
    # NOTE(review): pickle.load can execute arbitrary code while loading, so
    # PATHS must only point at trusted, locally generated files.
    # The context manager guarantees the file handle is closed again; the
    # previous bare open() call left it to the garbage collector.
    with open(path, "rb") as data_file:
        path_data = pickle.load(data_file)
    # Overwrite the stored labels in place with this file's class index.
    path_data["labels"][:] = i
    raw_images.append(path_data["images"])
    raw_labels.append(path_data["labels"])


## Balance \# Images Per Class

This balances the number of samples per class by randomly deleting samples until the classes are even. This seems to work better than the equal classes sampler.

It first detects the smallest class, then creates a randomly shuffled list of indices for each class, of which only the first min_len are kept. As this list is random, different samples should be selected each time.


In [None]:
# Size of the smallest class; every class is cut down to this many samples.
min_len = min((len(class_labels) for class_labels in raw_labels), default=0)

# Keep a random subset of min_len samples from each class.
balanced_images = []
balanced_labels = []
for class_images, class_labels in zip(raw_images, raw_labels):
    keep = list(range(len(class_labels)))
    random.shuffle(keep)
    keep = keep[:min_len]
    balanced_images.append(class_images[keep])
    balanced_labels.append(class_labels[keep])

# Merge the per-class subsets into one image stack and one label vector.
images = np.vstack(balanced_images)
labels = np.hstack(balanced_labels)

# Shuffle all sample indices and cut them into NUM_FOLDS folds.
fold_idx = list(range(len(labels)))
random.shuffle(fold_idx)
fold_idx = np.array_split(fold_idx, NUM_FOLDS)

# array_split may hand out folds that differ in length by one; drop the first
# index of every longer fold so that all folds end up the same size.
min_len = min([len(labels)] + [len(fold) for fold in fold_idx])
fold_idx = [
    np.delete(fold, 0) if len(fold) > min_len else fold
    for fold in fold_idx
]

# Split for fold 0: that fold's images are held out, the rest are for training.
i = 0
held_out = fold_idx[i]
testing_images = images[held_out]
training_images = np.delete(images, held_out, axis=0)

In [None]:
def train(dataloader, model, loss_fn, optimizer, batches, output=None):
    """Train `model` for one epoch and report the epoch's loss and accuracy.

    Args:
        dataloader: Iterable of batches where element 0 holds the images and
            element 1 the labels; must expose `.dataset` (used only for the
            progress print).
        model: Network to optimise; it is switched to training mode here.
        loss_fn: Callable invoked as `loss_fn(input=pred, target=labels)`.
        optimizer: Optimiser already bound to the model's parameters.
        batches: Nominal batch size. Kept for backward compatibility; the
            number of processed samples is now counted per batch instead.
        output: Optional list; `(mean batch loss, accuracy)` is appended.

    Returns:
        Accuracy over all samples seen in this epoch (0.0 if there were none).

    Relies on the notebook-level global `device`.
    """
    size = len(dataloader.dataset)
    # Enter training mode BEFORE the first forward pass. The model is left in
    # eval mode by test(), so switching only after the forward pass ran the
    # first batch of every later epoch in eval mode.
    model.train()

    seen = 0          # samples actually processed (last batch may be short)
    correct = 0       # correctly classified samples
    loss_sum = 0.0    # running sum of per-batch losses
    num_batches = 0

    for data in dataloader:
        # Only channel 0 of the image tensor is fed to the network.
        X, y = data[0][:, [0], :, :].to(device), data[1].to(device)
        # Compute prediction error
        pred = model(X.float())
        loss = loss_fn(input=pred, target=y.to(torch.long))
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        num_batches += 1
        correct += (y == torch.argmax(pred, 1)).sum().item()
        seen += len(y)

    # Guard against an empty dataloader instead of dividing by zero.
    avg_loss = loss_sum / num_batches if num_batches else 0.0
    final_training_acc = correct / seen if seen else 0.0

    print(f"Avg. Loss: {avg_loss:>7f}, Accuracy: {final_training_acc:>.2%} [{size:>5d}/{size:>5d}]")
    print()
    if output is not None:
        output.append((avg_loss, final_training_acc))
    return final_training_acc
    

def test(dataloader, model, acc_out, test_results=None, criterion=None):
    """Evaluate `model` on `dataloader` and report mean loss and accuracy.

    Args:
        dataloader: Iterable of batches where element 0 holds the images and
            element 1 the labels.
        model: Network to evaluate; it is switched to eval mode here.
        acc_out: List to which `(mean batch loss, accuracy)` is appended.
        test_results: Optional list; for every batch a tuple
            `(labels, raw predictions)` moved to the CPU is appended.
        criterion: Optional loss callable invoked as
            `criterion(input=pred, target=labels)`. When omitted, the
            notebook-level global `loss_fn` is used, as before.

    Returns:
        Accuracy over all samples seen (0.0 if there were none).

    Relies on the notebook-level global `device`.
    """
    if criterion is None:
        # Backward-compatible fallback to the global defined by the driver cell.
        criterion = loss_fn

    model.eval()
    loss_sum, correct = 0.0, 0
    seen = 0          # samples actually drawn; a sampler may not yield len(dataset)
    num_batches = 0
    with torch.no_grad():
        for data in dataloader:
            # Only channel 0 of the image tensor is fed to the network.
            X, y = data[0][:, [0], :, :].to(device), data[1].to(device)
            pred = model(X.float())
            loss_sum += criterion(input=pred, target=y.to(torch.long)).item()
            correct += (y == torch.argmax(pred, 1)).sum().item()
            seen += len(y)
            num_batches += 1
            if test_results is not None:
                test_results.append((y.cpu(), pred.cpu()))

    # Each batch contributes one loss value, so average over batches (not over
    # samples); accuracy is averaged over the samples that were evaluated.
    test_loss = loss_sum / num_batches if num_batches else 0.0
    correct = correct / seen if seen else 0.0
    print(f"\nTest Error: \nAvg. Loss: {test_loss:>7f}, Accuracy: {correct:>0.2%}\n")

    acc_out.append((test_loss, correct))
    return correct

In [None]:
# K-fold cross-validation driver: for every fold, train a fresh network, test
# it after each epoch, and save the model, its held-out data and the histories.
# NOTE: the "*_err*" / "*_error" names hold the accuracies returned by
# train() and test(), not error rates.
test_err_fold = []
train_err_fold = []

test_results_fold = [[] for _ in range(NUM_FOLDS)]  # last-epoch (labels, preds) per fold
train_hist = [[] for _ in range(NUM_FOLDS)]         # per-epoch train() output per fold
test_hist = [[] for _ in range(NUM_FOLDS)]          # per-epoch test() output per fold

# Get cpu or gpu device for training (same for every fold, so detect it once).
device = "cuda" if torch.cuda.is_available() else "cpu"

for i in range(NUM_FOLDS):
    print(f"\n FOLD {i + 1} OF {NUM_FOLDS}")
    print("=========================================================\n")

    # Fold i is held out for testing; every other sample is used for training.
    held_out = fold_idx[i]
    testing_images = images[held_out]
    testing_labels = labels[held_out]
    training_images = np.delete(images, held_out, axis=0)
    training_labels = np.delete(labels, held_out, axis=0)

    train_data = MacDataset(training_images, training_labels,
                            transform=train_transforms)
    test_data = MacDataset(testing_images, testing_labels,
                           transform=test_transforms)

    train_sampler = equal_classes_sampler(train_data.labels)
    test_sampler = equal_classes_sampler(test_data.labels)

    # shuffle must stay False because a sampler is supplied.
    dataloader = DataLoader(train_data, batch_size=NUM_BATCHES, sampler=train_sampler,
                            shuffle=False, num_workers=0)
    dataloader_test = DataLoader(test_data, batch_size=NUM_BATCHES, sampler=test_sampler,
                                 shuffle=False, num_workers=0)

    print("Using {} device".format(device))

    # Fresh model, loss and optimiser for every fold.
    model = macnet.Net(num_classes, input_ch).to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

    print("\nTraining Start")

    training_error = []
    testing_error = []
    for t in range(NUM_EPOCHS):
        print(f"Epoch {t+1}\n-------------------------------")

        print("\nTraining Error:")
        training_error.append(
            train(dataloader, model, loss_fn, optimizer, NUM_BATCHES, output=train_hist[i])
        )
        # Raw per-batch predictions are only collected for the final epoch.
        is_last_epoch = t == NUM_EPOCHS - 1
        testing_error.append(
            test(dataloader_test, model,
                 acc_out=test_hist[i],
                 test_results=test_results_fold[i] if is_last_epoch else None)
        )

    train_err_fold.append(training_error[-1])
    curr_testing_error = testing_error[-1]

    print("saving model")
    torch.save(model, f"./model_fold_{i}")
    # Context managers make sure every output file is flushed and closed.
    with open(f"./test_data_fold_{i}", "wb") as out_file:
        pickle.dump(test_data, out_file)

    test_err_fold.append(testing_error[-1])

train_err_fold = [round(error, 4) for error in train_err_fold]
test_err_fold = [round(error, 4) for error in test_err_fold]
print("training errors per fold")
print(train_err_fold)
print("testing errors per fold")
print(test_err_fold)
print("saving training and testing data")
with open("model_test_results.p", "wb") as out_file:
    pickle.dump(test_results_fold, out_file)
with open("model_train_hist.p", "wb") as out_file:
    pickle.dump(train_hist, out_file)