# Evaluate Results with Multiclass ROC Analysis

**Goal of this analysis:** Understand our best model's ability to distinguish between the different tumor classes, how the model handles some class imbalance, and generate visuals for threshold selection and comparison.


**1) Prepare Data:** Convert the multiclass labels into a binary format for analysis.

**2) Compute ROC and AUC:** Calculate ROC curves and AUC scores for each class using a One-versus-Rest (OvR) strategy.

**3) Average AUC:** Compute both micro-averaged and macro-averaged AUC scores to evaluate overall model performance.

**4) Plotting Curves:** Plot the ROC curves for each class and the averaged ROC curves to visualize the model's performance.

## Importing Libraries

In [None]:
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import (
    RocCurveDisplay,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

## Load in Test Data

In [None]:
# Read the saved test inputs and labels (paths are relative to the notebook)
X_test, Y_test = (np.load(path) for path in ('X_test.npy', 'Y_test.npy'))

## One-versus-Rest Classification

In [None]:
# One-hot encode the test labels for one-vs-rest ROC analysis.
# The class count is taken from the same class list used for binarization
# (rather than from Y_test.max()) so that n_classes always equals the number
# of columns in Y_test_bin, even if the highest class is absent from Y_test.
classes = [0, 1, 2, 3]
n_classes = len(classes)
Y_test_bin = label_binarize(Y_test, classes=classes)

## Initial Attempt

### Generate ROC Curves and AUC Scores

In [None]:
# Restore the saved hybrid model (the best performer)
best_model = tf.keras.models.load_model("hybrid_transformer_cnn_model.keras")

# Predicted scores for every test sample, one column per class
Y_score = best_model.predict(X_test)

# One-vs-rest ROC curve and AUC per class, keyed by class index
fpr, tpr, roc_auc = {}, {}, {}
for cls in range(n_classes):
    cls_fpr, cls_tpr, _ = roc_curve(Y_test_bin[:, cls], Y_score[:, cls])
    fpr[cls] = cls_fpr
    tpr[cls] = cls_tpr
    roc_auc[cls] = auc(cls_fpr, cls_tpr)

# Micro-average: flatten labels and scores so every (sample, class) pair
# counts as one binary decision
micro_fpr, micro_tpr, _ = roc_curve(Y_test_bin.ravel(), Y_score.ravel())
fpr["micro"], tpr["micro"] = micro_fpr, micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

# Macro-average: evaluate each class curve on the union of all false
# positive rates, then take the unweighted mean of the true positive rates
all_fpr = np.unique(np.concatenate([fpr[cls] for cls in range(n_classes)]))

mean_tpr = np.zeros_like(all_fpr)
for cls in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[cls], tpr[cls])
mean_tpr /= n_classes

fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(all_fpr, mean_tpr)


### Visualizations

In [None]:
# Draw every ROC curve on a single figure
plt.figure()

# Averaged curves first, as thick dotted lines
for avg_key, avg_color in (("micro", "deeppink"), ("macro", "navy")):
    plt.plot(
        fpr[avg_key],
        tpr[avg_key],
        label=f'{avg_key}-average ROC curve (area = {roc_auc[avg_key]:0.2f})',
        color=avg_color,
        linestyle=':',
        linewidth=4,
    )

# One solid curve per class, each with its own color
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green'])
for cls_idx, cls_color in zip(range(n_classes), colors):
    cls_label = f'ROC curve of class {cls_idx} (area = {roc_auc[cls_idx]:0.2f})'
    plt.plot(fpr[cls_idx], tpr[cls_idx], color=cls_color, lw=2, label=cls_label)

# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis limits, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
plt.show()


## Modular approach to run evaluation on all models (baseline, 2-layer CNN, 3-layer CNN, hybrid)

In [None]:
def _compute_roc_curves(y_true_bin, y_score):
    """Compute per-class, micro-averaged and macro-averaged ROC curves.

    Args:
        y_true_bin: One-hot array of true labels, shape (n_samples, n_classes).
        y_score: Predicted class scores, same shape as ``y_true_bin``.

    Returns:
        Three dicts ``(fpr, tpr, aucs)`` keyed by class index (0..n_classes-1)
        plus the string keys ``"micro"`` and ``"macro"``.
    """
    n_classes = y_true_bin.shape[1]
    fpr, tpr, aucs = {}, {}, {}

    # One-vs-rest curve for each class
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_score[:, i])
        aucs[i] = auc(fpr[i], tpr[i])

    # Micro-average: both arrays are flattened so each (sample, class) pair
    # is one binary decision. The labels MUST be one-hot here; raveling raw
    # integer labels gives an array of a different length than the scores.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_score.ravel())
    aucs["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate every class curve onto the union of all
    # false positive rates and take the unweighted mean
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
    aucs["macro"] = auc(all_fpr, mean_tpr)

    return fpr, tpr, aucs


def evaluate_model(model, X_test, Y_test, class_names):
    """Plot ROC curves and a confusion matrix for ``model`` and print a report.

    Args:
        model: Object whose ``predict(X_test)`` returns a 2-D array with one
            score column per entry of ``class_names``.
        X_test: Test inputs, passed unchanged to ``model.predict``.
        Y_test: Integer class labels; label ``i`` corresponds to
            ``class_names[i]`` and to score column ``i``.
        class_names: Display names of the classes, ordered by class index.

    Returns:
        None. Two figures are shown and the classification report is printed.

    Raises:
        ValueError: If the prediction array does not have exactly
            ``len(class_names)`` columns.
    """
    n_classes = len(class_names)
    labels = np.arange(n_classes)

    # Predict probabilities
    Y_test_pred = np.asarray(model.predict(X_test))
    if Y_test_pred.ndim != 2 or Y_test_pred.shape[1] != n_classes:
        raise ValueError(
            f"model.predict returned shape {Y_test_pred.shape}; "
            f"expected (n_samples, {n_classes}) to match class_names"
        )

    # Flatten labels to 1-D and one-hot encode them for the ROC computations
    y_true = np.asarray(Y_test).ravel()
    y_true_bin = (y_true[:, None] == labels).astype(int)

    # compute ROC AUC for each tumor class plus micro and macro averages
    fpr, tpr, aucs = _compute_roc_curves(y_true_bin, Y_test_pred)

    # plot ROC curves
    plt.figure(figsize=(10, 7))
    plt.plot(fpr["micro"], tpr["micro"], label=f'Micro-average ROC curve (area = {aucs["micro"]:.2f})')
    plt.plot(fpr["macro"], tpr["macro"], label=f'Macro-average ROC curve (area = {aucs["macro"]:.2f})')
    for i, class_name in enumerate(class_names):
        plt.plot(fpr[i], tpr[i], label=f'ROC curve of class {class_name} (area = {aucs[i]:.2f})')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # generate confusion matrix; `labels` pins one row/column per class so the
    # tick labels stay aligned even if a class never appears in the test set
    Y_test_pred_classes = np.argmax(Y_test_pred, axis=1)
    conf_matrix = confusion_matrix(y_true, Y_test_pred_classes, labels=labels)

    # plot confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greens',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.show()

    # gen classification report for analysis
    report = classification_report(y_true, Y_test_pred_classes,
                                   labels=labels, target_names=class_names)
    print("Classification Report:\n", report)


In [None]:
# Display names for the classes, ordered by class index
class_names = ['No Tumor', 'Meningioma', 'Glioma', 'Pituitary']

# Banner text paired with the model it introduces, in evaluation order.
# NOTE(review): none of these model variables are defined in the visible part
# of this notebook - confirm they are created or loaded before this cell runs.
models_to_evaluate = [
    ("Evaluating Baseline Model:", baseline_model),
    ("\nEvaluating 2-Layer CNN Model:", model_2_layers),
    ("\nEvaluating 3-Layer CNN Model:", model_3_layers),
    ("\nEvaluating Hybrid Model:", hybrid_transformer_cnn_model),
]

# Run the same evaluation on each model
for banner, candidate in models_to_evaluate:
    print(banner)
    evaluate_model(candidate, X_test, Y_test, class_names)