# CNN Model Figure Generation

Generate plots from data recorded during training/testing

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
from utils import *

import torch
from torch.utils.data import DataLoader

# Testing Accuracy

# Plot Testing/Training Error Curve
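Plot the training/testing curves of the first fold. The recorded history is now loaded from pickle files; the older CSV-based plotting code in the cell below is currently disabled.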

In [None]:
import csv
# https://stackoverflow.com/questions/14091387/creating-a-dictionary-from-a-csv-file
# Load the recorded test results and training history of the current run.
# NOTE(review): pickle.load can execute arbitrary code -- only load run
# artifacts that you produced yourself.
# The context managers close the file handles once loading is done.
with open('curr_run/model_test_results.p', "rb") as results_file:
    testing_results = pickle.load(results_file)
with open('curr_run/model_train_hist.p', "rb") as history_file:
    training_results = pickle.load(history_file)

print(len(training_results))

# Disabled: legacy CSV-based loss/accuracy plots. Kept as comments instead of
# a bare string literal, which Jupyter would echo as this cell's output.
# `reader` is not defined in this cell, so a csv reader has to be recreated
# before this code can be re-enabled.
#
# result = {}
# for row in reader:
#     for column, value in row.items():  # consider .iteritems() for Python 2
#         result.setdefault(column, []).append(float(value))
#
# result["epoch"] = [int(i) for i in result["epoch"]]
#
# plt.plot(result["epoch"], result["train_loss"], result["test_loss"])
# plt.title("Average Loss Per Epoch")
# plt.legend(["training", "testing"])
# plt.show()
#
# plt.plot(result["epoch"], result["train_acc"], result["test_acc"])
# plt.ylim([0.6,1])
# plt.title("Average Accuracy Per Epoch")
# plt.legend(["training", "testing"])
# plt.show()

# Confusion Matrix Plotting

https://stackoverflow.com/questions/53290306/confusion-matrix-and-test-accuracy-for-pytorch-transfer-learning-tutorial

In [None]:
# Build one confusion matrix (raw counts and row-normalised) and one test
# accuracy per cross-validation fold from the pickled test results.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted
# run artifacts.
with open("runs/tr_md_run/model_test_results.p", "rb") as results_file:
    data = pickle.load(results_file)

confusion_matrices = []  # raw counts per fold, indexed [true class, predicted class]
matrices = []            # the same matrices with every row divided by its row sum
test_accuracies = []
num_folds = len(data)

for fold in range(num_folds):
    # Transpose the fold's records: element 0 holds the label tensors,
    # element 1 the network output tensors.
    curr_data = list(zip(*data[fold]))
    data[fold] = {"true": curr_data[0], "pred": curr_data[1]}

    labels = torch.flatten(torch.stack(data[fold]["true"]))

    outputs = data[fold]["pred"]
    num_classes = len(outputs[0][0])
    # Merge the leading two dimensions, then take the arg-max class per sample.
    outputs = torch.flatten(torch.stack(outputs), end_dim=1)
    outputs = np.array(torch.argmax(outputs, 1))

    fold_counts = np.zeros((num_classes, num_classes))
    for t, p in zip(labels, outputs):
        fold_counts[int(t), int(p)] += 1
    confusion_matrices.append(fold_counts)

    # Row-normalise so each row sums to 1 (a row without samples yields NaN).
    matrix = fold_counts / fold_counts.sum(axis=1, keepdims=True)
    matrices.append(matrix)

    # Accuracy = correctly classified samples (diagonal) / all samples.
    test_accuracy = np.trace(fold_counts) / fold_counts.sum()
    test_accuracies.append(test_accuracy)

for i, accuracy in enumerate(test_accuracies):
    print(f"Fold: {i}, Test Accuracy: {accuracy:>.2%}")

# A single print: the former print(print(...)) also emitted a stray "None".
print(f"Average: Test Accuracy: {sum(test_accuracies)/num_folds:>.2%}")

In [None]:
import seaborn as sn
import pandas as pd

axis_labels = ["M0", "M1", "M2"]
num_classes = len(axis_labels)

# Raw counts summed over all folds. Kept as counts because the classification
# report cell reuses `matrix_sum` as the count matrix of the "macnet" model.
matrix_sum = np.zeros_like(matrices[0])
for mat in confusion_matrices:
    matrix_sum += mat

# The heatmap is annotated with a percent format, so it needs fractions in
# [0, 1]: average the row-normalised per-fold matrices. Dividing the summed
# raw counts by the number of folds would display counts as percentages.
matrix_macnet = np.mean(matrices, axis=0)

matrix_df = pd.DataFrame(matrix_macnet, index=axis_labels, columns=axis_labels)
sn.set(font_scale=1.4) # for label size
sn.heatmap(matrix_df, annot=True, fmt='.2%') # font size
plt.title('Aggregate Results of 5-Fold Cross Validation')
plt.show()

In [None]:
from sklearn.metrics import classification_report
target_names = ['TR', 'MD', 'Monocyte']
archs = ["macnet", "knn", "random forest"]
roc_data = []

# One confusion matrix per architecture; "macnet" uses the counts aggregated
# over the folds, the baselines are loaded from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted
# run artifacts.
arch_matrices = {"macnet": matrix_sum}
for arch_name, matrix_path in (
    ("knn", "runs/tr_md_run/knn_conf_matrix.p"),
    ("random forest", "runs/tr_md_run/random_forest_conf_matrix.p"),
):
    # The context manager closes the file handle once the matrix is loaded.
    with open(matrix_path, "rb") as matrix_file:
        arch_matrices[arch_name] = pickle.load(matrix_file)

In [None]:

# For every architecture, expand its confusion matrix back into per-sample
# label/prediction vectors, run sklearn's classification report on them and
# collect the weighted-average metrics plus the accuracy.
for arch in archs:
    matrix_count = arch_matrices[arch]

    # Row i contributes row.sum() samples whose true label is i.
    labels = np.concatenate([
        np.full(row.sum().astype(int), float(i))
        for i, row in enumerate(matrix_count)
    ])
    # Cell (i, j) contributes row[j] samples predicted as class j. The leading
    # empty float array keeps the result floating point.
    preds = np.concatenate([
        np.concatenate(
            [np.array([])]
            + [np.full(row[j].astype(int), j) for j in range(len(row))]
        )
        for row in matrix_count
    ])

    res = classification_report(labels, preds, target_names=target_names, digits=4, output_dict=True)
    print(arch)

    weighted = res["weighted avg"]
    del weighted["support"]  # support is a sample count, not a metric to plot
    for type_data, value in weighted.items():
        roc_data.append([arch, type_data, value])
        print(type_data, f"{value:0.2%}    ", end="")

    roc_data.append([arch, "accuracy", res["accuracy"]])
    print("accuracy", f"{res['accuracy']:0.2%}    ", end="")
    print()


# Long-format table: one row per (architecture, metric) pair.
roc_data = pd.DataFrame(roc_data, columns=["class", "data_type", "value"])


In [None]:
# Grouped bar chart of the collected metrics: one group per architecture
# (column "class"), one bar per metric (column "data_type").
sn.set_theme(style="whitegrid")
sn.set(font_scale = 1.5)

# Tick positions every 10 percentage points, labelled as percentages.
tick_positions = np.arange(0,1,0.1)
tick_labels = [f'{position:.2%}' for position in tick_positions]

metrics_frame = pd.DataFrame(roc_data)
g = sn.catplot(
    data=metrics_frame,
    kind="bar",
    x="class",
    y="value",
    hue="data_type",
    ci="sd",
    palette="dark",
    height=5,
    aspect=2,
)

g.set(yticks=tick_positions)
g.set(yticklabels=tick_labels)
# Zoom into the upper half of the value range.
g.set(ylim=(0.5, 1))

g.set_axis_labels("Class", "Percent")
g.legend.set_title("Metric")

# Receiver Operating Characteristic Curve (ROC)


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# Compute per-class and micro-averaged ROC curves from the first record of
# the pickled ROC data.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted
# run artifacts.
with open("model_roc_data.p", "rb") as roc_file:
    roc_data = pickle.load(roc_file)

labels = roc_data[0]["true"]
labels = torch.flatten(torch.stack(labels))
# One indicator column per class (one-vs-rest).
labels_b = label_binarize(labels, classes=[0,1,2])

outputs = roc_data[0]["outputs"]
outputs = torch.flatten(torch.stack(outputs), end_dim=1)
# Softmax turns the network outputs into per-class scores that sum to 1.
outputs = np.array(torch.softmax(outputs, 1))

fpr = dict()
tpr = dict()
roc_auc = dict()

# Iterate over the binarised label columns instead of a `num_classes`
# variable left behind by an unrelated cell.
for i in range(labels_b.shape[1]):
    fpr[i], tpr[i], _ = roc_curve(labels_b[:, i], outputs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool all (class, sample) decisions into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(labels_b.ravel(), outputs.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Disabled: per-class ROC plot. Kept as comments instead of a bare string
# literal, which Jupyter would echo as this cell's output. It reads the
# `fpr`, `tpr` and `roc_auc` dictionaries for classes 0, 1 and 2.
#
# plt.figure()
# lw = 2
# plt.plot(fpr[0], tpr[0], color='darkorange',
#          lw=lw, label='alveolar (area = %0.4f)' % roc_auc[0])
# plt.plot(fpr[1], tpr[1], color='red',
#          lw=lw, label='marrow (area = %0.4f)' % roc_auc[1])
# plt.plot(fpr[2], tpr[2], color='green',
#          lw=lw, label='monocyte (area = %0.4f)' % roc_auc[2])
# plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.9, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic of 3-class CNN')
# plt.legend(loc="lower right")
# plt.show()

# T-SNE 

The standardization operation used in training is redefined here because `transforms.Compose` could not be made to work properly with the Dataset object. TODO: fix this so the training transform can be reused directly.

In [None]:
def standardize_input(image):
    """Standardize every channel of every image in a batch.

    Each (sample, channel) slice is shifted to zero mean and scaled to unit
    standard deviation, computed over all remaining (spatial) dimensions.

    Args:
        image: torch tensor whose dimension 0 indexes the samples and whose
            dimension 1 indexes the channels, e.g. (batch, channel, H, W).

    Returns:
        A new float32 ``torch.Tensor`` of the same shape. The input tensor is
        left untouched.
    """
    pixels = image.detach().cpu().numpy()
    # Integer images must be promoted, otherwise the standardized values
    # would be truncated back to integers.
    if not np.issubdtype(pixels.dtype, np.floating):
        pixels = pixels.astype(np.float32)

    # Reduce over everything except the sample and channel dimensions.
    spatial_axes = tuple(range(2, pixels.ndim))
    means = pixels.mean(axis=spatial_axes, keepdims=True)
    stdevs = pixels.std(axis=spatial_axes, keepdims=True)
    # A constant channel has zero spread; divide by 1 so it becomes all zeros
    # instead of NaN.
    stdevs = np.where(stdevs == 0, 1, stdevs)

    # `pixels - means` allocates a new array, so the memory shared with the
    # caller's tensor is never written to.
    standardized = (pixels - means) / stdevs

    output = torch.Tensor(standardized)
    return output

Load testing data and run inference

In [None]:
print("Using existing trained model")
# NOTE(review): torch.load / pickle.load unpickle arbitrary objects and can
# execute code -- only load model and data files you trust.
net = torch.load('./model_fold_0')
net.to("cpu")
net.eval()

# The context manager closes the file handle once the data set is loaded.
with open("./test_data_fold_0", "rb") as test_file:
    test_data = pickle.load(test_file)
test_sampler = equal_classes_sampler(test_data.labels)

dataloader_test = DataLoader(test_data, batch_size=1, sampler=test_sampler,
                        shuffle=False, num_workers=0)

activation = {}
def get_activation(name):
    """Return a forward hook that stores the hooked module's detached output in ``activation[name]``."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

# Capture the output of the `fc3` layer on every forward pass.
net.fc3.register_forward_hook(get_activation('fc3'))

# Collect one record per sample and build the frame once at the end instead
# of growing a DataFrame row by row.
inference_records = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for batch, data in enumerate(dataloader_test):
        X, y = data[0].detach().clone(), data[1].detach().clone()
        X = standardize_input(X)
        # Only the first two channels are fed to the network.
        pred = net(X[:,0:2,:,:].float())
        pred_class = torch.argmax(pred,1).item()

        inference_records.append({
            'X': data[0].detach().numpy()[0,:,:,:],  # raw (unstandardized) image
            'y': y.detach().numpy(),
            'pred': pred_class,
            'activation': np.array(activation['fc3'][0]),
        })

infer_results = pd.DataFrame(inference_records, columns=['X', 'y', 'pred', 'activation'])
    

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns

# Project the captured layer activations to 2-D with t-SNE.
# The estimator gets its own name: rebinding `TSNE` to the instance shadows
# the imported class, so running this cell a second time would fail.
# NOTE(review): no random_state is set, so the embedding differs between
# runs -- consider fixing a seed for reproducible figures.
tsne_model = TSNE(n_components=2, perplexity=40, n_iter=10000, learning_rate=200)
activations = np.stack(infer_results["activation"].to_numpy())
tsne_results = tsne_model.fit_transform(activations)
df_tsne = pd.DataFrame(tsne_results, columns=['t-sne-one', 't-sne-two'])
# drop() returns a new frame that is only displayed as the cell output;
# `infer_results` itself keeps its "activation" column.
infer_results.drop(["activation"], axis=1)

In [None]:
from scipy import ndimage

def calculate_intensity(img, channel_num, diag=False):
    """Return the mean intensity of the "lit" pixels of one image channel.

    The background level is estimated as the 90th-percentile value of the
    2-pixel-wide border of the channel. It is subtracted, negative values are
    clipped to zero, and a 3x3 median filter removes isolated pixels. The
    result is the mean of the remaining non-zero pixels.

    Args:
        img: array indexed as ``img[channel_num]``, giving a 2-D channel image.
        channel_num: index of the channel to measure.
        diag: when True, print diagnostics and show the raw and cleaned
            channel side by side.

    Returns:
        Mean intensity of the non-zero pixels after cleaning, or ``0.0`` when
        no pixel remains above the background.
    """
    arr = np.copy(img[channel_num])

    # Border sample: top/bottom rows and left/right columns, two pixels wide.
    # The strips overlap, so the corner pixels are counted twice.
    border = np.concatenate([
        arr[:2, :].ravel(),
        arr[:, :2].ravel(),
        arr[-2:, :].ravel(),
        arr[:, -2:].ravel(),
    ])
    border.sort()
    avg = border[int(len(border) * 0.9)]  # 90th percentile of the border values

    arr2 = arr - avg
    arr2[arr2 < 0] = 0
    arr2 = ndimage.median_filter(arr2, size=3)
    num_non_zero = np.count_nonzero(arr2)
    num_total = np.sum(arr2)
    # Without any lit pixel the mean is undefined; report 0 rather than NaN.
    avg2 = num_total / num_non_zero if num_non_zero else 0.0

    # Share of lit pixels relative to the actual channel size.
    lit_pct = num_non_zero / arr2.size * 100

    if diag:
        print("average intensity of lit pixels: ", round(avg2,2))
        print("percentage \"lit\": ", round(lit_pct, 2))
        toshow = [arr, arr2]
        labels = ["Stain", "Clean Stain"]
        num_show = len(toshow)
        f, axarr = plt.subplots(1,num_show, figsize=(8, 4))
        for i in range(num_show):
            axarr[i].imshow(toshow[i])
            axarr[i].grid(False)
            axarr[i].set_title(labels[i])
            axarr[i].get_xaxis().set_visible(False)
            axarr[i].get_yaxis().set_visible(False)

        plt.show()
    return avg2

In [None]:
CD80_brightnesses = []
CD206_brightnesses = []

# (CD80 threshold, CD206 threshold) pairs to compare.
marker_thresholds = [(7,7),(10,10),(12,12)]
num_thresholds = len(marker_thresholds)
# 0 for CD80(blue), 1 for CD206(red), 2 for both (purple), 3 for neither (grey)
markers = [[] for _ in range(num_thresholds)]
# Marker code for each (CD80 above threshold, CD206 above threshold) outcome.
marker_codes = {(True, True): 2, (True, False): 0, (False, True): 1, (False, False): 3}

for _, row in infer_results.iterrows():
    # Channel 2 is measured as CD80 and channel 3 as CD206.
    CD80_brightness = calculate_intensity(row["X"], 2)
    CD206_brightness = calculate_intensity(row["X"], 3)
    CD80_brightnesses.append(CD80_brightness)
    CD206_brightnesses.append(CD206_brightness)

    for j, threshold in enumerate(marker_thresholds):
        cond_1 = bool(CD80_brightness > threshold[0])
        cond_2 = bool(CD206_brightness > threshold[1])
        markers[j].append(marker_codes[(cond_1, cond_2)])

# The "X" column stays in `infer_results`: the former
# `infer_results.drop(["X"], axis=1)` discarded its result and removed
# nothing, and this cell needs "X" again whenever it is re-run.

# Assign by position; the lists hold exactly one entry per row, so no index
# alignment is required.
infer_results["CD80_meas"] = CD80_brightnesses
infer_results["CD206_meas"] = CD206_brightnesses
for i, thresh in enumerate(marker_thresholds):
    col_name = "markers_" + str(thresh)
    infer_results[col_name] = markers[i]



In [None]:

# Histogram of the measured CD80/CD206 intensities, with an optional textual
# bin count that is switched off by default.
bin_range = [*range(0, 41, 5), 60, 100]

bin_switch = False

if bin_switch:
    cd80_counts = infer_results["CD80_meas"].value_counts(bins=bin_range)
    print(cd80_counts.sort_index())
    print("--------------------")
    cd206_counts = infer_results["CD206_meas"].value_counts(bins=bin_range)
    print(cd206_counts.sort_index())

plt.figure(figsize=(16,8))
# Histogram bin edges: 0 to 70 in steps of 5.
bins = list(range(0,71,5))
measurements = [infer_results["CD80_meas"], infer_results["CD206_meas"]]
plt.hist(measurements, bins, label=["CD80", "CD206"], color=['b', 'r'])
plt.legend(loc='upper right')
plt.xticks(np.arange(0, 70, step=5))
plt.title("Intensity Distribution of CD80/CD206")
plt.xlabel("Average pixel intensity")
plt.ylabel("Num Count")
plt.show()

In [None]:
def _scatter_tsne(axis, label_values, palette, title):
    """Draw the t-SNE embedding on `axis`, coloured by `label_values`.

    `palette` maps every label to its colour; its key order is also used as
    the legend (hue) order. The labels are stored in ``df_tsne['label']``.
    """
    df_tsne['label'] = list(label_values)
    sns.scatterplot(
        x="t-sne-one", y="t-sne-two",
        hue="label",
        # Explicit label -> colour mapping. sns.set_palette() returns None,
        # so its result cannot be used as a palette.
        palette=palette,
        data=df_tsne,
        hue_order=list(palette),
        legend="full",
        alpha=0.5,
        ax=axis,
    )
    axis.set_title(title)


# One panel per marker threshold pair plus one panel for the class labels.
fig, ax = plt.subplots(1, num_thresholds+1, figsize=(10*num_thresholds+10,10))

marker_phenotype = {0: "CD80+",
                    1: "CD206+",
                    2: "CD80+/CD206+",
                    3: "CD80-/CD206-"}
marker_palette = {"CD80+": "#0a70c4",
                  "CD206+": "#db0d0d",
                  "CD80+/CD206+": "#660ddb",
                  "CD80-/CD206-": "#b5b5b5"}

for i, thresh in enumerate(marker_thresholds):
    col_name = "markers_" + str(thresh)
    _scatter_tsne(
        ax[i],
        [marker_phenotype[int(ele)] for ele in infer_results[col_name]],
        marker_palette,
        col_name,
    )

class_phenotype = {0: "M0",
                   1: "M1",
                   2: "M2"}
class_palette = {"M0": "#ffc814",
                 "M1": "#0a70c4",
                 "M2": "#db0d0d"}
# Last panel: colour by the label stored in the "y" column.
_scatter_tsne(
    ax[num_thresholds],
    [class_phenotype[int(ele)] for ele in infer_results['y']],
    class_palette,
    "CNN Classification",
)
plt.show()
