# CNN Model Figure Generation

Generate plots from data recorded during training/testing

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pickle
from utils import *

import torch
from torch.utils.data import DataLoader

# Testing Accuracy

# Plot Testing/Training Error Curve
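Plot the training history of the first fold. The cell below loads the recorded results from pickle files; the CSV-based plotting code is kept but disabled.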

In [None]:
import csv

# Load the recorded test results and training history. Context managers close
# the file handles as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by your own runs.
with open('curr_run/model_test_results.p', "rb") as results_file:
    testing_results = pickle.load(results_file)
with open('curr_run/model_train_hist.p', "rb") as history_file:
    training_results = pickle.load(history_file)

print(len(training_results))

# The string literal below is disabled CSV-based plotting code: it is never
# executed, and it relies on a `reader` object that this cell does not create.
# Dictionary-from-CSV approach taken from:
# https://stackoverflow.com/questions/14091387/creating-a-dictionary-from-a-csv-file
"""
result = {}
for row in reader:
    for column, value in row.items():  # consider .iteritems() for Python 2
        result.setdefault(column, []).append(float(value))

result["epoch"] = [int(i) for i in result["epoch"]]

plt.plot(result["epoch"], result["train_loss"], result["test_loss"])
plt.title("Average Loss Per Epoch")
plt.legend(["training", "testing"])
plt.show()

plt.plot(result["epoch"], result["train_acc"], result["test_acc"])
plt.ylim([0.6,1])
plt.title("Average Accuracy Per Epoch")
plt.legend(["training", "testing"])
plt.show()
"""

# Confusion Matrix Plotting

https://stackoverflow.com/questions/53290306/confusion-matrix-and-test-accuracy-for-pytorch-transfer-learning-tutorial

In [None]:
# Build one confusion matrix per fold from the pickled test results, then
# report the per-fold and mean test accuracy.
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by your own runs.
with open("curr_run/model_test_results.p", "rb") as results_file:
    data = pickle.load(results_file)

confusion_matrices = []  # raw counts per fold, indexed [true class, predicted class]
matrices = []            # the same matrices with every row normalised to sum to 1
test_accuracies = []
num_folds = len(data)
for fold in range(num_folds):
    # Transpose the fold's records so that element 0 holds all "true" entries
    # and element 1 all "pred" entries (presumably one entry per batch -- TODO confirm).
    curr_data = list(zip(*data[fold]))
    data[fold] = {"true": curr_data[0], "pred": curr_data[1]}

    labels = torch.flatten(torch.stack(data[fold]["true"]))

    outputs = data[fold]["pred"]
    num_classes = len(outputs[0][0])
    # Merge the first two dimensions, then take the arg-max along dim 1 as the
    # predicted class index.
    outputs = torch.flatten(torch.stack(outputs), end_dim=1)
    outputs = np.array(torch.argmax(outputs, 1))

    # np.add.at accumulates unbuffered, so repeated (true, pred) pairs are each
    # counted -- equivalent to incrementing one cell per sample in a loop.
    counts = np.zeros((num_classes, num_classes))
    np.add.at(counts, (np.array(labels).astype(int), outputs), 1)
    confusion_matrices.append(counts)

    # Row-normalise: each row becomes the distribution of predictions for one
    # true class. A class with no samples yields NaN (0/0), as before.
    matrix = counts / counts.sum(axis=1, keepdims=True)
    matrices.append(matrix)

    # Accuracy = correctly classified samples (the diagonal) / all samples.
    test_accuracy = np.trace(counts) / counts.sum()
    test_accuracies.append(test_accuracy)

for i, accuracy in enumerate(test_accuracies):
    print(f"Fold: {i}, Test Accuracy: {accuracy:>.2%}")

# Single print: the original nested print(print(...)) also emitted "None".
print(f"Average: Test Accuracy: {sum(test_accuracies)/num_folds:>.2%}")

In [None]:
import seaborn as sn
import pandas as pd

# Average the row-normalised confusion matrices of all folds and render the
# result as an annotated heatmap.
axis_labels = ["alveolar", "marrow", "monocyte"]
num_classes = len(axis_labels)
# Element-wise mean over the folds (the name is kept for any later cell that reads it).
matrix_sum = np.mean(matrices, axis=0)

matrix_df = pd.DataFrame(matrix_sum, index=axis_labels, columns=axis_labels)
sn.set(font_scale=1.4)  # for label size
sn.heatmap(matrix_df, annot=True, fmt='.2%')  # annotate each cell as a percentage
# Derive the fold count from the data instead of hard-coding "5".
plt.title(f'Aggregate Results of {len(matrices)}-Fold Cross Validation')
plt.show()

# Receiver Operating Characteristic Curve (ROC)


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# Compute per-class and micro-averaged ROC curves from the pickled outputs.
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by your own runs.
with open("model_roc_data.p", "rb") as roc_file:
    roc_data = pickle.load(roc_file)

# Only the first entry of roc_data is used here.
labels = roc_data[0]["true"]
labels = torch.flatten(torch.stack(labels))
# One indicator column per class id.
labels_b = label_binarize(labels, classes=[0,1,2])

outputs = roc_data[0]["outputs"]
outputs = torch.flatten(torch.stack(outputs), end_dim=1)
# Softmax along dim 1 turns the stored outputs into per-class scores.
outputs = np.array(torch.softmax(outputs, 1))

fpr = {}
tpr = {}
roc_auc = {}

# `num_classes` comes from an earlier cell of this notebook.
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(labels_b[:, i], outputs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (indicator, score) pair across all classes.
fpr["micro"], tpr["micro"], _ = roc_curve(labels_b.ravel(), outputs.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Draw the per-class ROC curves on one set of axes, with the area under each
# curve shown in the legend.
line_width = 2
curve_specs = (
    (0, 'darkorange', 'alveolar (area = %0.4f)'),
    (1, 'red', 'marrow (area = %0.4f)'),
    (2, 'green', 'monocyte (area = %0.4f)'),
)

plt.figure()
for class_idx, curve_color, label_template in curve_specs:
    plt.plot(fpr[class_idx], tpr[class_idx], color=curve_color,
             lw=line_width, label=label_template % roc_auc[class_idx])
# Dashed y = x reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

# The y-axis is restricted to the top of the plot: [0.9, 1.05].
plt.xlim([0.0, 1.0])
plt.ylim([0.9, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic of 3-class CNN')
plt.legend(loc="lower right")
plt.show()

# T-SNE 

In [None]:
# Load the trained model and test set of fold 0 and wrap the test set in a
# single-batch DataLoader.
print("Using existing trained model")
# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only
# machine; without it torch.load tries to restore tensors to their original device.
net = torch.load('./models/model_fold_0', map_location="cpu")
net.to("cpu")
net.eval()

# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by your own runs.
with open("./models/test_data_fold_0", "rb") as test_data_file:
    test_data = pickle.load(test_data_file)
# equal_classes_sampler is not defined in this notebook (presumably it comes
# from `utils` via the star import -- TODO confirm).
test_sampler = equal_classes_sampler(test_data.labels)

# batch_size=len(test_data): one batch is as large as the whole test set.
dataloader_test = DataLoader(test_data, batch_size=len(test_data), sampler=test_sampler,
                        shuffle=False, num_workers=0)

In [None]:

# Capture the output of the network's `fc2` layer during a forward pass over
# the test loader.
activation = {}


def get_activation(name):
    """Return a forward hook that stores a layer's detached output.

    The hook writes the output tensor to ``activation[name]``, replacing
    whatever an earlier forward pass stored under the same key.
    """
    def hook(module, inputs, output):
        activation[name] = output.detach()
    return hook


# Keep the handle so the hook can be removed afterwards; otherwise every
# re-run of this cell would register one more hook on the same layer.
hook_handle = net.fc2.register_forward_hook(get_activation('fc2'))
X = None
y = None
try:
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        for sample in dataloader_test:
            # Keep only channel 0; indexing with [0] preserves the channel dimension.
            X, y = sample[0][:, [0], :, :], sample[1]
            # Running the model triggers the hook. X, y, pred and
            # activation['fc2'] end up holding the values of the last batch.
            pred = net(X.float())
finally:
    hook_handle.remove()


In [None]:

from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns

# Embed the captured fc2 activations in 2-D with t-SNE and plot them,
# coloured by class label.
fc2_activation = np.array(activation['fc2'])

# Bind the estimator to its own name. The original `TSNE = TSNE(...)` replaced
# the class with an instance, so re-running the cell raised TypeError.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# check this against the installed version.
tsne_model = TSNE(n_components=2, perplexity=50, n_iter=10000, learning_rate=100)

tsne_results = tsne_model.fit_transform(fc2_activation)

df_tsne = pd.DataFrame(tsne_results, columns=['t-sne-one', 't-sne-two'])
phenotype = {0: "alveolar",
             1: "marrow",
             2: "monocyte"}
# `y` holds the labels of the batch that produced the captured activations.
df_tsne['label'] = [phenotype[int(ele)] for ele in y]

plt.figure(figsize=(10,10))
sns.scatterplot(
    x="t-sne-one", y="t-sne-two",
    hue="label",
    palette=sns.color_palette("hls", 3),
    data=df_tsne,
    legend="full",
    alpha=0.5
)
plt.show()
