In [1]:
import os, sys
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.models import load_model

In [2]:
# Make the sibling project folders importable from this notebook. The paths
# are resolved relative to the current working directory at execution time.
sys.path.extend(
    os.path.abspath(os.path.join('..', folder_name))
    for folder_name in ('data_processing', 'models')
)

In [None]:
from triplet_preprocessing import test_triplet_dataset

### Evaluates the triplet models on the test dataset by computing similarity scores, generating ROC and PR curves, and calculating performance metrics at a given threshold

In [9]:
def _cosine_similarity_01(left, right):
    """Row-wise cosine similarity of two embedding batches, rescaled to [0, 1].

    Rows whose norm product is zero get a cosine of 0 (score 0.5) instead of
    NaN, so downstream curve computations never receive non-finite scores.
    """
    left = np.asarray(left, dtype=np.float64)
    right = np.asarray(right, dtype=np.float64)
    if left.shape[0] == 0:
        return np.empty(0, dtype=np.float64)

    # Flatten each sample so the computation is a plain vector dot product.
    left = left.reshape(left.shape[0], -1)
    right = right.reshape(right.shape[0], -1)

    dots = np.sum(left * right, axis=1)
    norm_products = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Only divide where the denominator is non-zero; the rest keep the 0 default.
    cosine = np.divide(dots, norm_products, out=np.zeros_like(dots), where=norm_products > 0)

    # Clip away floating-point overshoot, then map [-1, 1] -> [0, 1].
    return (np.clip(cosine, -1.0, 1.0) + 1.0) / 2.0


def _threshold_metrics(y_true, y_scores, threshold):
    """Accuracy, precision, recall and F1 for `y_scores >= threshold` predictions."""
    predicted_positive = y_scores >= threshold
    actual_positive = y_true == 1

    tp = int(np.sum(predicted_positive & actual_positive))
    fp = int(np.sum(predicted_positive & ~actual_positive))
    tn = int(np.sum(~predicted_positive & ~actual_positive))
    fn = int(np.sum(~predicted_positive & actual_positive))

    # F1 = 2TP / (2TP + FP + FN): algebraically equal to 2PR / (P + R) but
    # well defined (0) when TP == 0, where the P/R form evaluates 0/0.
    f1_denominator = 2 * tp + fp + fn
    return {
        'accuracy': (tp + tn) / len(y_true),
        'precision': tp / (tp + fp) if (tp + fp) > 0 else 0.0,
        'recall': tp / (tp + fn) if (tp + fn) > 0 else 0.0,
        'f1': 2 * tp / f1_denominator if f1_denominator > 0 else 0.0,
    }


def _plot_test_evaluation(fpr, tpr, roc_auc, recall, precision, pr_auc,
                          positive_similarities, negative_similarities, threshold):
    """Draw the ROC curve, the PR curve and both similarity histograms side by side."""
    fig, (roc_ax, pr_ax, hist_ax) = plt.subplots(1, 3, figsize=(15, 5))

    # ROC curve
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Test ROC Curve')
    roc_ax.legend(loc="lower right")

    # Precision-Recall curve
    pr_ax.plot(recall, precision, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.2f})')
    pr_ax.set_xlim([0.0, 1.0])
    pr_ax.set_ylim([0.0, 1.05])
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Test Precision-Recall Curve')
    pr_ax.legend(loc="lower right")

    # Similarity distributions (0-1 range)
    hist_ax.hist(positive_similarities, bins=50, alpha=0.5, label='Positive Pairs', density=True)
    hist_ax.hist(negative_similarities, bins=50, alpha=0.5, label='Negative Pairs', density=True)
    hist_ax.axvline(x=threshold, color='r', linestyle='--', label='Optimal Threshold')
    hist_ax.set_xlabel('Similarity Score (0-1)')
    hist_ax.set_ylabel('Density')
    hist_ax.set_title('Test Similarity Distributions')
    hist_ax.legend()

    fig.tight_layout()
    plt.show()


def evaluate_on_test(
        model,
        test_dataset,
        optimal_threshold=0.75,
        num_test_steps=2000
):
    """Evaluate a triplet model on a test dataset and plot the results.

    For every (anchors, positives, negatives) batch, the model's
    'EmbeddingNetwork' layer embeds all three inputs. Anchor/positive and
    anchor/negative cosine similarities are rescaled from [-1, 1] to [0, 1]
    and used as scores for a binary task (positive pair = 1, negative pair = 0).

    Args:
        model: Object exposing ``get_layer('EmbeddingNetwork')``; the returned
            layer must provide ``predict(inputs, verbose=0)``.
        test_dataset: Object exposing ``take(n)`` that yields
            ``(anchors, positives, negatives)`` batches
            (presumably a ``tf.data.Dataset`` -- confirm against the loader).
        optimal_threshold: Cutoff on the rescaled [0, 1] similarity; scores
            greater than or equal to it are predicted as positive pairs.
        num_test_steps: Maximum number of batches taken from ``test_dataset``.

    Returns:
        dict with float values for the keys 'accuracy', 'precision', 'recall',
        'f1', 'roc_auc' and 'pr_auc'.

    Raises:
        ValueError: If the dataset yields no triplets, since no metric can be
            computed from an empty score set.
    """
    embedding_network = model.get_layer('EmbeddingNetwork')
    positive_batches = []
    negative_batches = []

    print("Computing similarities from test dataset")
    for anchors, positives, negatives in tqdm(test_dataset.take(num_test_steps)):
        # Compute embeddings
        anchor_embeddings = embedding_network.predict(anchors, verbose=0)
        positive_embeddings = embedding_network.predict(positives, verbose=0)
        negative_embeddings = embedding_network.predict(negatives, verbose=0)

        # Whole-batch cosine similarities, already rescaled to [0, 1]
        positive_batches.append(_cosine_similarity_01(anchor_embeddings, positive_embeddings))
        negative_batches.append(_cosine_similarity_01(anchor_embeddings, negative_embeddings))

    positive_similarities = np.concatenate(positive_batches) if positive_batches else np.empty(0)
    negative_similarities = np.concatenate(negative_batches) if negative_batches else np.empty(0)
    if positive_similarities.size == 0:
        raise ValueError(
            "test_dataset yielded no triplets; cannot evaluate on an empty test set"
        )

    # Prepare test data
    y_true = np.concatenate([np.ones(len(positive_similarities)),
                             np.zeros(len(negative_similarities))])
    y_scores = np.concatenate([positive_similarities, negative_similarities])

    # Compute ROC and PR curves
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    roc_auc = float(auc(fpr, tpr))
    pr_auc = float(auc(recall, precision))

    # Calculate metrics at the supplied threshold
    test_metrics = _threshold_metrics(y_true, y_scores, optimal_threshold)
    test_metrics['roc_auc'] = roc_auc
    test_metrics['pr_auc'] = pr_auc

    _plot_test_evaluation(fpr, tpr, roc_auc, recall, precision, pr_auc,
                          positive_similarities, negative_similarities,
                          optimal_threshold)

    print("\nTest Results:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.3f}")

    return test_metrics

### Example of evaluating a model

In [None]:
# Load the saved triplet model without restoring its training configuration,
# then compile it with default settings.
v6 = load_model(
    '../results/siamese/triplet/triplet_v6/triplet_v6.h5',
    compile=False,
)
v6.compile()

In [None]:
# Evaluate the loaded model on the test triplets with the default threshold
# and step count; the returned metrics dict is kept for later inspection.
test_results = evaluate_on_test(
    model=v6,
    test_dataset=test_triplet_dataset,
)