# Setup Necessary Things

Follow the instructions at the URL below to set up `kaggle.json` in the Colab session.

https://www.kaggle.com/general/74235

In [None]:
# upload kaggle.json
# Opens Colab's upload widget; pick the kaggle.json API token so that it is
# written to the current working directory, where the next cell copies it from.
from google.colab import files
files.upload()

In [None]:
# kaggle already available in google colab, uncomment if not using google colab
# !pip install -q kaggle

# copy the uploaded API token into ~/.kaggle and make it readable/writable by the owner only
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# download the blood-cells dataset archive and extract it into ./blood-cells
!kaggle datasets download -d paultimothymooney/blood-cells
!unzip -q blood-cells.zip -d blood-cells

Import Necessary Libraries

In [None]:
# common imports
import os, cv2, itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
from time import time

# plotting imports
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns
# global seaborn look-and-feel applied to every figure drawn after this point
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# random seed; in the cells visible here it is only referenced by the commented-out t-SNE code
RS = 123

# data processing imports
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

# performance metrics
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve, auc

# pytorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import kaiming_uniform_, xavier_uniform_
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import Adam

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm


# torchvision imports
import torchvision.utils as utils #import make_grid
import torchvision.transforms as transforms
import torchvision.models as models

# tensorboard imports
from torch.utils.tensorboard import SummaryWriter
import tensorflow as tf
import tensorboard as tb
# Point tf.io.gfile at TensorBoard's stub implementation.
# NOTE(review): presumably the usual workaround for SummaryWriter.add_embedding failing
# when both TensorFlow and TensorBoard are installed — confirm it is still needed.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

# use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'running on {device}')


# Define Custom Network

Adjust ResNet for Blood Cell Dataset

In [None]:
class BloodCellResNet(nn.Module):
    """ResNet-50 backbone followed by a small fully connected classification head.

    The backbone's final layer is replaced by a 256-unit linear layer; the head
    then narrows 256 -> 128 -> 64 -> 32 -> NUM_CLASSES. ReLU and dropout are
    applied after the backbone and after every hidden head layer.

    Args:
        NUM_CLASSES: number of output logits (one per class).
    """

    def __init__(self, NUM_CLASSES):
        super().__init__()
        self.resnet50 = models.resnet50(pretrained=True)
        num_ftrs = self.resnet50.fc.in_features
        # swap the 1000-way ImageNet classifier for a 256-dimensional feature layer
        self.resnet50.fc = nn.Linear(num_ftrs, 256)

        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, NUM_CLASSES)

    def forward(self, x):
        """Return raw (un-normalized) class logits for the batch ``x``."""
        # F.dropout defaults to training=True; tie it to self.training so that
        # model.eval() really disables dropout and inference is deterministic.
        x = F.dropout(F.relu(self.resnet50(x)), training=self.training)

        for hidden_layer in (self.fc2, self.fc3, self.fc4):
            x = F.dropout(F.relu(hidden_layer(x)), training=self.training)

        # no activation on the last layer: nn.CrossEntropyLoss expects logits
        return self.fc5(x)


# Define Custom Dataset

To encapsulate Blood Cell dataset

In [None]:
class BloodCellDataset(Dataset):
    """Blood-cell images stored as ``<path>/<CLASS_NAME>/<image file>``.

    Only file paths are kept in memory; each image is read from disk and
    resized to 224x224 when it is indexed.

    Args:
        path: directory containing one sub-directory per class.
        transform: optional callable applied to the resized image array.
    """

    # index in this list == integer label of the class
    CLASSES = ['NEUTROPHIL', 'EOSINOPHIL', 'MONOCYTE', 'LYMPHOCYTE']

    def __init__(self, path, transform=None):
        self.images, self.labels = self.get_data(path)
        self.transform = transform
        self.classes = list(self.CLASSES)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        image_path = self.images[idx]
        # NOTE(review): cv2.imread yields BGR channel order — confirm the
        # normalization statistics used by callers were computed on BGR data.
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            raise FileNotFoundError(f'could not read image: {image_path}')
        image = cv2.resize(image, (224, 224))
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

    def get_data(self, folder):
        """Collect image paths and integer labels from the class sub-directories.

        Hidden entries (names starting with '.') are skipped entirely, both at
        the class-folder level and inside each class folder. Entries are
        visited in sorted order so the sample order is reproducible.

        Returns:
            Tuple ``(paths, labels)`` of equally long NumPy arrays.

        Raises:
            KeyError: if a non-hidden entry of ``folder`` is not a known class name.
        """
        mappings = {name: index for index, name in enumerate(self.CLASSES)}
        images = []
        labels = []
        for subtype in sorted(os.listdir(folder)):
            if subtype.startswith('.'):
                continue
            label = mappings[subtype]
            subtype_dir = os.path.join(folder, subtype)
            image_names = [name for name in sorted(os.listdir(subtype_dir)) if not name.startswith('.')]
            for image_name in tqdm(image_names):
                images.append(os.path.join(subtype_dir, image_name))
                labels.append(label)
        return np.asarray(images), np.asarray(labels)


Define important variables

In [None]:
# directories path
DATASET_ROOT = '/content/blood-cells/dataset2-master/dataset2-master/images'
PATH_TRAIN = os.path.join(DATASET_ROOT, 'TRAIN')
PATH_VALIDATION = os.path.join(DATASET_ROOT, 'TEST')
PATH_TEST = os.path.join(DATASET_ROOT, 'TEST_SIMPLE')
PATH_LOG_DIR = '/content/log_dir_blood_cells/'
PATH_SAVED_MODELS = '/content/saved_models'
# create the checkpoint directory when it is missing; exist_ok keeps re-running this cell harmless
os.makedirs(PATH_SAVED_MODELS, exist_ok=True)

# setting hyper-parameters
BATCH_SIZE_TRAIN = 8
BATCH_SIZE_VAL = 16
BATCH_SIZE_TEST = 16
lr = 0.0025


# Create Dataset and Dataloader Objects

In [None]:
# create dataset objects
# Every split goes through the same ToTensor + Normalize pipeline: a model trained on
# normalized inputs must also be validated and tested on normalized inputs.
NORMALIZE_MEAN = (0.6606, 0.6414, 0.6787)
NORMALIZE_STD = (0.2602, 0.2627, 0.2635)
transform_common = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
])
dataset_train = BloodCellDataset(path=PATH_TRAIN, transform=transform_common)
dataset_val = BloodCellDataset(path=PATH_VALIDATION, transform=transform_common)
dataset_test = BloodCellDataset(path=PATH_TEST, transform=transform_common)
print('\ndataset_train:', len(dataset_train))
print('dataset_val:', len(dataset_val))
print('dataset_test:', len(dataset_test))


In [None]:
# create dataloader objects
# only the training loader reshuffles its samples; validation and test keep a fixed order
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL, shuffle=False)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE_TEST, shuffle=False)

# report the number of batches per split
for caption, loader in (
    ('dataloader_train:', dataloader_train),
    ('dataloader_val:', dataloader_val),
    ('dataloader_test:', dataloader_test),
):
    print(caption, len(loader))


# Training

In [None]:
def train_model(model, dataloader_train, dataloader_val=None, epochs=1, lr=0.01, debug=False):
    """Train ``model`` with Adam and cross-entropy loss, logging metrics to TensorBoard.

    After every epoch the mean batch loss, accuracy and micro-averaged F1 of
    the training pass are printed and written to ``PATH_LOG_DIR``. When
    ``dataloader_val`` is given, the same metrics are computed with
    ``evaluate_model`` and logged under the ``'val'`` series of the same
    charts. Every 5th epoch the model's state dict is saved into
    ``PATH_SAVED_MODELS``.

    Args:
        model: network to optimize; it is updated in place.
        dataloader_train: loader yielding ``(inputs, targets)`` batches; its
            dataset must expose a ``classes`` attribute.
        dataloader_val: optional loader evaluated after each epoch.
        epochs: number of passes over ``dataloader_train``.
        lr: learning rate passed to Adam.
        debug: when True, print tensor shapes/dtypes and the loss for every batch.
    """
    TAG = '[train_model]'
    num_classes = len(dataloader_train.dataset.classes)
    # define loss function
    criterion = nn.CrossEntropyLoss()
    # define optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    # make sure the checkpoint directory exists before the first save
    os.makedirs(PATH_SAVED_MODELS, exist_ok=True)
    # create object for writing to tensorboard
    writer = SummaryWriter(PATH_LOG_DIR)

    try:
        for epoch in range(epochs):
            y_true, y_pred, y_score = list(), list(), list()
            epoch_loss = 0
            # evaluate_model() leaves the model in eval mode, so switch back every epoch
            model.train()

            for i, batch in tqdm(enumerate(dataloader_train), leave=False, total=len(dataloader_train), position=0):
                inputs, targets = batch
                inputs, targets = inputs.float(), targets.long()
                inputs, targets = inputs.to(device), targets.to(device)
                if debug:
                    print('\n', TAG, '[inputs]', inputs.shape, inputs.dtype)
                    print('\n', TAG, '[targets]', targets.shape, targets.dtype)

                optimizer.zero_grad()
                yhat = model(inputs)
                if debug: print(TAG, '[yhat]', yhat.shape, yhat.dtype)
                loss = criterion(yhat, targets)
                if debug: print(TAG, '[loss]', loss.item())
                loss.backward()
                optimizer.step()

                yhat = yhat.detach().cpu().numpy()
                if debug: print('\n', TAG, '[yhat]', yhat.shape)
                targets = targets.detach().cpu().numpy().reshape((-1, 1))
                epoch_loss += loss.item()
                y_true.append(targets)
                y_pred.append(yhat.argmax(axis=1).reshape((-1, 1)))
                y_score.append(yhat.reshape((-1, num_classes)))

            y_true, y_pred, y_score = np.vstack(y_true), np.vstack(y_pred), np.vstack(y_score)
            epoch_loss = epoch_loss / len(dataloader_train)
            metric_accuracy_score = accuracy_score(y_true, y_pred)
            metric_f1_score = f1_score(y_true, y_pred, average='micro')

            print(f'\n[training] | {epoch+1}/{epochs} | loss={epoch_loss:8.6f} | accuracy={metric_accuracy_score:8.6f} | f1_score={metric_f1_score:8.6f}')
            # write metric measures to tensorboard
            writer.add_scalars('Loss', {'train': epoch_loss}, epoch+1)
            writer.add_scalars('Accuracy', {'train': metric_accuracy_score}, epoch+1)
            writer.add_scalars('F1_Score', {'train': metric_f1_score}, epoch+1)

            if dataloader_val:
                validation_loss, val_accuracy_score, val_f1_score = evaluate_model(dataloader_val, model)
                print(f'\n[validation] | {epoch+1}/{epochs} | loss={validation_loss:8.6f} | accuracy={val_accuracy_score:8.6f} | f1_score={val_f1_score:8.6f}')
                # log under a distinct 'val' series so the curves do not overwrite or
                # get mixed up with the 'train' series of the same chart
                writer.add_scalars('Loss', {'val': validation_loss}, epoch+1)
                writer.add_scalars('Accuracy', {'val': val_accuracy_score}, epoch+1)
                writer.add_scalars('F1_Score', {'val': val_f1_score}, epoch+1)

            if (epoch+1) % 5 == 0:
                # state_dict() must be *called*; the file name carries the 1-based epoch number
                checkpoint_path = os.path.join(PATH_SAVED_MODELS, f'blood_cell_resnet50_lr{lr}_e{epoch+1}.pt')
                torch.save(model.state_dict(), checkpoint_path)
    finally:
        # flush pending events even when training is interrupted
        writer.close()


Method for performing evaluation on dataloader

In [None]:
def evaluate_model(dataloader, model, debug=False):
    """Evaluate ``model`` on every batch of ``dataloader`` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        dataloader: loader yielding ``(inputs, targets)`` batches; its dataset
            must expose a ``classes`` attribute.
        model: network to evaluate.
        debug: when True, print tensor shapes/dtypes and the loss for every batch.

    Returns:
        Tuple ``(mean loss per batch, accuracy, micro-averaged F1)``.
    """
    TAG = '[evaluate_model]'
    num_classes = len(dataloader.dataset.classes)
    y_true, y_pred, y_score = list(), list(), list()
    total_loss = 0
    criterion = nn.CrossEntropyLoss()
    model.eval()
    # no_grad: skip building autograd graphs, which only costs memory during evaluation
    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), leave=False, total=len(dataloader), position=0):
            inputs, targets = batch
            inputs, targets = inputs.float(), targets.long()
            inputs, targets = inputs.to(device), targets.to(device)
            if debug:
                print('\n', TAG, '[inputs]', inputs.shape, inputs.dtype)
                print('\n', TAG, '[targets]', targets.shape, targets.dtype)

            yhat = model(inputs)
            if debug: print('\n', TAG, '[yhat]', yhat.shape, yhat.dtype)
            loss = criterion(yhat, targets)
            if debug: print('\n', TAG, '[loss]', loss.item())

            yhat = yhat.detach().cpu().numpy()
            if debug: print('\n', TAG, '[yhat]', yhat.shape)
            targets = targets.detach().cpu().numpy().reshape((-1, 1))
            total_loss += loss.item()
            y_true.append(targets)
            y_pred.append(yhat.argmax(axis=1).reshape((-1, 1)))
            y_score.append(yhat.reshape((-1, num_classes)))

    y_true, y_pred, y_score = np.vstack(y_true), np.vstack(y_pred), np.vstack(y_score)
    if debug: print('\n', TAG, 'y_true.shape:', y_true.shape, 'y_pred.shape:', y_pred.shape, 'y_score.shape:', y_score.shape)
    if debug: print('\n', TAG, np.unique(y_true), np.unique(y_pred))

    total_loss = total_loss / len(dataloader)
    metric_accuracy_score = accuracy_score(y_true, y_pred)
    metric_f1_score = f1_score(y_true, y_pred, average='micro')

    return (total_loss, metric_accuracy_score, metric_f1_score)


In [None]:
# build the 4-class network, then move its parameters to the selected device
model = BloodCellResNet(NUM_CLASSES=4)
model = model.to(device)
# print(model)

Free cached GPU memory to avoid CUDA "out of memory" errors before training

In [None]:
# run Python's garbage collector first, then ask PyTorch to release its cached CUDA memory
import gc
gc.collect()
torch.cuda.empty_cache()
# print the GPU status reported by the driver
!nvidia-smi

In [None]:
epochs = 10
lr = 0.0025
train_model(model, dataloader_train, dataloader_val, epochs, lr, debug=False)
# save the final weights of the model that was just trained
torch.save(model.state_dict(), f'blood_cell_resnet50_e{epochs}.pt')

# Tensorboard

In [None]:
# !rm -rf /content/log_dir_blood_cells

In [None]:
# load the TensorBoard notebook extension and open it on the training log directory
# (the path is the same directory as PATH_LOG_DIR)
%load_ext tensorboard
%tensorboard --logdir "/content/log_dir_blood_cells"

# Testing

In [None]:
# for loading saved model from .pt file
# The file name must match the one written by the training cell
# (f'blood_cell_resnet50_e{epochs}.pt' with epochs = 10).
CHECKPOINT_PATH = 'blood_cell_resnet50_e10.pt'
model = BloodCellResNet(4).to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
model.eval()

In [None]:
# score the test split with the currently loaded model
test_metrics = evaluate_model(dataloader_test, model, debug=False)
total_loss, metric_accuracy_score, metric_f1_score = test_metrics
print(f"\n[testing] | accuracy={metric_accuracy_score} | loss={total_loss} | f1_score={metric_f1_score}")

# Confusion Matrix
---

In [None]:
def get_predictions(dataloader, model):
    """Run ``model`` over ``dataloader`` and collect labels, predictions and scores.

    The model is switched to eval mode and gradients are not tracked.

    Args:
        dataloader: loader yielding ``(inputs, targets)`` batches; its dataset
            must expose a ``classes`` attribute.
        model: network used for inference.

    Returns:
        Tuple ``(y_true, y_pred, y_score)`` of NumPy arrays: true labels and
        arg-max predictions as column vectors, and the raw model outputs with
        one column per class.
    """
    num_classes = len(dataloader.dataset.classes)
    y_true, y_pred, y_score = list(), list(), list()
    model.eval()
    # no_grad: inference only, so do not build autograd graphs for the forward passes
    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, leave=False, total=len(dataloader), position=0):
            inputs = inputs.float().to(device)
            yhat = model(inputs).detach().cpu().numpy()

            y_true.append(targets.long().cpu().numpy().reshape((-1, 1)))
            y_pred.append(yhat.argmax(axis=1).reshape((-1, 1)))
            y_score.append(yhat.reshape((-1, num_classes)))

    return (np.vstack(y_true), np.vstack(y_pred), np.vstack(y_score))


In [None]:
# collect true labels, predicted labels and raw scores for each split, in train/val/test order
(
    (train_true, train_pred, train_score),
    (val_true, val_pred, val_score),
    (test_true, test_pred, test_score),
) = (
    get_predictions(split_loader, model)
    for split_loader in (dataloader_train, dataloader_val, dataloader_test)
)

In [None]:
# confusion matrix of true vs. predicted labels for the training split
train_confusion_matrix = confusion_matrix(y_true=train_true, y_pred=train_pred)
print('\n[train_confusion_matrix]\n', train_confusion_matrix)

# ... for the validation split
val_confusion_matrix = confusion_matrix(y_true=val_true, y_pred=val_pred)
print('\n[val_confusion_matrix]\n', val_confusion_matrix)

# ... and for the test split
test_confusion_matrix = confusion_matrix(y_true=test_true, y_pred=test_pred)
print('\n[test_confusion_matrix]\n', test_confusion_matrix)

# Visualization | PCA, T-SNE, UMAP
---

In [None]:
# Utility function to visualize the outputs of PCA and t-SNE
def custom_scatter(x, colors, title=None):
    """Scatter-plot a 2-D embedding, one colour per class, with class-id annotations.

    Args:
        x: array whose first two columns are the plotted coordinates.
        colors: per-point class labels. They are cast to ``int`` and used as
            palette indices, so they are expected to be 0 .. num_classes - 1.
        title: optional figure title; nothing is drawn when it is None.

    Returns:
        Tuple ``(figure, axes, scatter artist, list of text artists)``.
    """
    colors = np.array(colors).reshape(-1)
    # choose a color palette with seaborn, one colour per distinct label
    num_classes = len(np.unique(colors))
    palette = np.array(sns.color_palette("hls", num_classes))

    # create a scatter plot.
    f = plt.figure(figsize=(15, 15))
    if title is not None:
        f.suptitle(title)
    ax = plt.subplot(aspect='equal')

    # the builtin int replaces the np.int alias, which was removed in NumPy 1.24
    sc = ax.scatter(x[:, 0], x[:, 1], lw=0, s=40, c=palette[colors.astype(int)])
    ax.axis('off')
    ax.axis('tight')

    # annotate every class with its id
    txts = []

    for i in range(num_classes):
        # Position of each label at median of data points.
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        # white outline keeps the label readable on top of the points
        txt.set_path_effects([PathEffects.Stroke(linewidth=5, foreground="w"), PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts


In [None]:
# NOTE: this label order differs from the one used by BloodCellDataset
# (NEUTROPHIL=0, EOSINOPHIL=1, MONOCYTE=2, LYMPHOCYTE=3); the ids below are
# only used to colour and annotate the visualizations.
mappings = {'EOSINOPHIL': 0, 'LYMPHOCYTE': 1, 'MONOCYTE': 2, 'NEUTROPHIL': 3}
root_path = '/content/blood-cells/dataset2-master/dataset2-master/images/TRAIN/'
images = []
labels = []
num_images = 500  # maximum number of images taken from each class folder

for folder_name in os.listdir(root_path):
    if folder_name.startswith('.'):
        # hidden entries are not class folders
        continue
    print('folder_name ->', folder_name)
    image_label = mappings[folder_name]
    folder_path = os.path.join(root_path, folder_name)
    for i, image_name in enumerate(os.listdir(folder_path)):
        if i >= num_images: break
        image_path = os.path.join(root_path, folder_name, image_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for files it cannot decode; skip them
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.resize(image, (128, 128))
        images.append(image)
        # append the label only after the image was read so both lists stay aligned
        labels.append(image_label)

# flatten every 128x128 image into one row; use the real sample count instead of
# assuming exactly num_images readable files in each of 4 folders
images = np.array(images).reshape(len(images), -1)
labels = np.array(labels)
print('images.shape ->', images.shape)
print('labels.shape ->', labels.shape)

# normalize images
print(f'images.mean(): {images.mean():10.6f}, images.std(): {images.std():10.6f}')
images = (images - images.mean())
print(f'images.mean(): {images.mean():10.6f}, images.std(): {images.std():10.6f}')
images = images / images.std()
print(f'images.mean(): {images.mean():10.6f}, images.std(): {images.std():10.6f}')

Save the images to a TensorBoard `SummaryWriter` embedding for the PCA / t-SNE projector

In [None]:
# writer = SummaryWriter()
# label_images = images.reshape(2000, 1, 128, 128)
# print(label_images.shape, label_images.dtype)
# writer.add_embedding(images, metadata=labels, label_img=torch.from_numpy(label_images))
# writer.close()

Manually Create PCA, TSNE using library functions

In [None]:
# use the flattened, normalized images and their labels prepared in the cell above
x_subset = images
y_subset = labels

# PCA Visualization
# `time` is the function imported with `from time import time`, so it is called directly
time_start = time()
components = 10
pca = PCA(n_components=components)
pca_result = pca.fit_transform(x_subset)
print(pca_result.shape)
print('PCA done! Time elapsed: {} seconds'.format(time() - time_start))

# one column per principal component: pca1 .. pca<components>
pca_df = pd.DataFrame(pca_result, columns=[f'pca{i+1}' for i in range(components)])

variances = pca.explained_variance_ratio_[:components]
print('Variance explained per principal component:\n{}'.format(variances))

# create a scree plot
f = plt.figure(figsize=(12, 12))
ax = plt.subplot(aspect='equal')
ax.bar(range(components), variances)
ax.scatter(range(components), variances)
ax.axis('tight')

two_comp = pca_df[['pca1','pca2']] # taking first and second principal component
print('# Visualizing the PCA output, components = (pca1, pca2)')
# custom_scatter takes the figure title as its third argument
f, ax, sc, txts = custom_scatter(two_comp.values, y_subset, 'PCA (pca1, pca2)')
print(f, ax, sc, txts, sep='\n') # Visualizing the PCA output

# two_comp = pca_df[['pca1','pca3']] # taking first and second principal component
# print('# Visualizing the PCA output, components = (pca1, pca3)')
# f, ax, sc, txts = custom_scatter(two_comp.values, y_subset)
# print(f, ax, sc, txts, sep='\n') # Visualizing the PCA output

# two_comp = pca_df[['pca2','pca3']] # taking first and second principal component
# print('# Visualizing the PCA output, components = (pca2, pca3)')
# f, ax, sc, txts = custom_scatter(two_comp.values, y_subset)
# print(f, ax, sc, txts, sep='\n') # Visualizing the PCA output

# # t-SNE Visualization
# time_start = time.time()
# tsne = TSNE(random_state=RS).fit_transform(x_subset)
# print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
# print('# Visualizing the t-SNE output')
# custom_scatter(tsne, y_subset)

# # Recommended Approach, first PCA then t-SNE
# time_start = time.time()
# pca_10 = PCA(n_components=10)
# pca_result_10 = pca_10.fit_transform(x_subset)
# print('PCA with 10 components done! Time elapsed: {} seconds'.format(time.time()-time_start))
# print('Cumulative variance explained by 10 principal components: {}'.format(np.sum(pca_10.explained_variance_ratio_)))
# time_start = time.time()
# pca_then_tsne = TSNE(random_state=RS).fit_transform(pca_result_10)
# print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
# print('# Visualizing the PCA then t-SNE output')
# custom_scatter(pca_then_tsne, y_subset)


## UMAP

In [None]:
points = 8000  # rows embedded per UMAP fit
components = 2
distances = ['hamming', 'dice', 'jaccard', 'russellrao', 'kulsinski', 'rogerstanimoto', 'sokalmichener', 'sokalsneath', 'yule']
umap_dfs = {}
umap_columns = [f'umap{i+1}' for i in range(components)]

# NOTE(review): df_new_features is not defined in any cell visible here; it is used as a
# DataFrame whose last column holds the labels and whose other columns hold the features.
# Confirm where it is created before relying on Restart & Run All.
for distance in distances:
    chunk_frames = []
    # embed the rows in consecutive chunks of at most `points` rows, each with its own UMAP fit
    for start in range(0, len(df_new_features), points):
        end = min(start + points, len(df_new_features))
        df_main = df_new_features.iloc[start:end]

        x_subset = df_main.iloc[:, :-1].values
        y_subset = df_main.iloc[:, -1].values

        # `time` is the function imported with `from time import time`
        time_start = time()
        umap1 = UMAP(metric=distance, n_components=components, n_neighbors=25)
        umap_result = umap1.fit_transform(x_subset)

        print(umap_result.shape)
        print('UMAP done! Time elapsed: {} seconds'.format(time() - time_start))

        chunk_frames.append(pd.DataFrame(umap_result, columns=umap_columns))

    # concatenate once per metric instead of growing a DataFrame inside the loop
    if chunk_frames:
        umap_df = pd.concat(chunk_frames, ignore_index=True)
    else:
        umap_df = pd.DataFrame(columns=umap_columns)
    umap_dfs[distance] = umap_df


In [None]:
# draw one scatter plot per distance metric, titled with the metric name
for distance in distances:
    two_comp = umap_dfs[distance][['umap1', 'umap2']]  # first and second UMAP component
    embedding = two_comp.values[:]
    # labels come from the last column, limited to as many rows as were embedded
    point_labels = df_new_features.iloc[:two_comp.values.shape[0], -1]
    f, ax, sc, txts = custom_scatter(embedding, point_labels, distance)
