## Task 5: Results Evaluation

Import libraries

In [None]:
import wandb
import os
# WANDB_NOTEBOOK_NAME is set before wandb.login() runs; presumably wandb reads it to
# associate the session with this notebook file — confirm against the wandb docs.
os.environ["WANDB_NOTEBOOK_NAME"] = "result_evaluation.ipynb"
wandb.login()
import pickle
import numpy as np
import matplotlib.pyplot as plt

Functions

In [None]:
# Compare the features of the original and the perturbed samples
def calculate_perturbated_feature(arr1, arr2):
    """Count, along axis 0, how many entries differ between two arrays.

    Args:
        arr1: Array-like holding the original samples.
        arr2: Array-like holding the perturbed samples; its shape must be
            equal to, or broadcastable against, the shape of ``arr1``.

    Returns:
        Integer numpy array: the element-wise "is different" indicator summed
        over axis 0 (for 2-D input, one count per column).

    Raises:
        ValueError: If the shapes of the two inputs are incompatible.
    """
    # Accept plain lists as well as ndarrays: on lists, `!=` would yield a
    # single bool instead of an element-wise matrix.
    original = np.asarray(arr1)
    perturbed = np.asarray(arr2)

    # Fail early and clearly on incompatible shapes instead of relying on the
    # version-dependent behaviour of `!=` for mismatched arrays.
    try:
        np.broadcast(original, perturbed)
    except ValueError as exc:
        raise ValueError(
            f"Cannot compare arrays with shapes {original.shape} and {perturbed.shape}."
        ) from exc

    # True wherever the two inputs disagree
    diff_matrix = original != perturbed

    # Summing with an integer dtype counts the True entries (1 for True, 0 for False)
    return diff_matrix.sum(axis=0, dtype=int)

# Load attack results from a pickle file into a dictionary
def load_results(dataset_name, model_name, attack_name, results_dir="results"):
    """Load the pickled result dictionary of one dataset/model/attack run.

    The file is expected at
    ``{results_dir}/{dataset_name}_{model_name}_{attack_name}.pickle``.

    Args:
        dataset_name: Dataset part of the file name.
        model_name: Model part of the file name.
        attack_name: Attack part of the file name.
        results_dir: Directory that holds the pickle files (default ``"results"``).

    Returns:
        The unpickled dictionary, or ``None`` when the file does not exist.

    Raises:
        TypeError: If the file exists but its content is not a dictionary.
    """
    file_path = f"{results_dir}/{dataset_name}_{model_name}_{attack_name}.pickle"

    # A missing file is not an error: report it and return None so the caller can skip it.
    if not os.path.exists(file_path):
        print(f"The file '{file_path}' does not exist. Skip.")
        return None

    # SECURITY: pickle.load can execute arbitrary code. Only load result files
    # that were produced by a trusted run.
    with open(file_path, 'rb') as handle:
        loaded_results = pickle.load(handle)

    # Ensure the loaded object is a dictionary; report the type that was actually found.
    if not isinstance(loaded_results, dict):
        raise TypeError(f"The loaded object is not a dictionary, which is {type(loaded_results)}.")

    return loaded_results

Load pickle files

In [None]:
# Experiment grid: the names below are combined into
# "<dataset>_<model>_<attack>" keys and result file names.
dataset_list = [
    "Adult",
    "Electricity",
    "Higgs",
    "BankMarketing",
    "house_16H",
    "GermanCredit",
    "jm1",
    "BreastCancer",
]
model_list = [
    "LogisticRegression",
    "MLP",
    "TabTransformer",
    "FTTransformer",
]
attack_list = [
    "L2Gaussian",
    "L2Uniform",
    "LinfUniform",
    "LinfFGSM",
    "LinfPGD",
    "LinfBIM",
    "L2CarliniWagner",
    "L2DeepFool",
]

In [None]:
# Gather the result of every dataset/model/attack combination under the key
# "<dataset>_<model>_<attack>"; whatever load_results returns is stored as-is.
all_results = {}
for dataset_name in dataset_list:
    for model_name in model_list:
        for attack_name in attack_list:
            all_results[f"{dataset_name}_{model_name}_{attack_name}"] = load_results(
                dataset_name, model_name, attack_name
            )

    

Plotting:

In [None]:
# Combination inspected by the single-figure cells that follow
dataset_name, model_name, attack_name = "Adult", "MLP", "LinfPGD"

In [None]:
# Look the selected run up once, then unpack the metrics used by the plots.
selected_result = all_results[f"{dataset_name}_{model_name}_{attack_name}"]["result"]

attack_success_rates = selected_result['attack_success_rates']
average_distances = selected_result['average_distances']
(
    l0_distances,
    l1_distances,
    l2_distances,
    linf_distances,
    mahalanobis_distances,
    sensitivity,
) = (
    average_distances[metric_name]
    for metric_name in (
        'L0 Distance',
        'L1 Distance',
        'L2 Distance',
        'Linf Distance',
        'Mahalanobis Distance',
        'Sensitivity',
    )
)
outliner_rates = selected_result['outliner_rates']
epsilons = selected_result['epsilons']
paths = selected_result['numpy_path']

1. Plot the relationship between epsilon and attack success rate

In [None]:
# Attack success rate as a function of epsilon
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epsilons, attack_success_rates, marker='o', linestyle='-')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} - Success Rate vs. Epsilon',
    xlabel='Epsilon',
    ylabel='Success Rate',
)
ax.grid(True)
plt.show()

2. Plot the relationship between epsilon and the distance metrics

In [None]:
# Average Linf distance as a function of epsilon
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epsilons, linf_distances, marker='o', linestyle='-')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} - Linf Distance vs. Epsilon',
    xlabel='Epsilon',
    ylabel='Linf Distance',
)
ax.grid(True)
plt.show()


In [None]:
# Average L2 distance as a function of epsilon
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epsilons, l2_distances, marker='o', linestyle='-')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} - L2 Distance vs. Epsilon',
    xlabel='Epsilon',
    ylabel='L2 Distance',
)
ax.grid(True)
plt.show()

In [None]:
# Average L1 distance as a function of epsilon
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epsilons, l1_distances, marker='o', linestyle='-')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} - L1 Distance vs. Epsilon',
    xlabel='Epsilon',
    ylabel='L1 Distance',
)
# ax.set_ylim(0, 6)  # optional fixed y-range, disabled
ax.grid(True)
plt.show()

3. Plot the relationship between epsilon and sparsity metrics

In [None]:
# Average L0 distance (sparsity) as a function of epsilon
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epsilons, l0_distances, marker='o', linestyle='-')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} - Sparsity vs. Epsilon',
    xlabel='Epsilon',
    ylabel='L0 Distance',
)
# ax.set_ylim(0, 6)  # optional fixed y-range, disabled
ax.grid(True)
plt.show()

4. Plot the relationship between L2 distance and attack success rate

In [None]:
# Attack success rate plotted against the average L2 distance
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(l2_distances, attack_success_rates, marker='o', linestyle='-')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} - L2 Distance vs. Success Rate',
    xlabel='L2 Distance',
    ylabel='Success Rate',
)
ax.grid(True)
plt.show()

5. Which features are perturbed the most?

In [None]:
# One entry per epsilon: the per-feature difference counts between the arrays
# stored under "success_arr" and "success_arr_adv" for that epsilon.
matrix_list = [
    calculate_perturbated_feature(
        np.load(paths[step]["success_arr"]),
        np.load(paths[step]["success_arr_adv"]),
    )
    for step in range(len(epsilons))
]

In [None]:
# Heatmap of the difference counts: one row per entry of matrix_list
fig, ax = plt.subplots(figsize=(12, 8))
heatmap = ax.imshow(matrix_list, cmap='viridis', aspect='auto')
# Overlay thin grey grid lines
ax.grid(which='both', color='grey', linewidth=0.1)
fig.colorbar(heatmap, ax=ax, label='Values')
ax.set(
    title=f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name} Heatmap of Perturbated Features',
    xlabel='Features',
    ylabel='Epsilon',
)
plt.show()

Use subplots to draw all the figures in a single grid

In [None]:

def _plot_metric_panel(axis, x_values, y_values, title, xlabel, ylabel):
    """Draw one marker-line panel with title, axis labels and grid on ``axis``."""
    axis.plot(x_values, y_values, marker='o', linestyle='-')
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)
    axis.grid(True)


def plot_in_grid(all_results, dataset_name, model_name, attack_name):
    """Plot all metrics of one dataset/model/attack run in a 3x3 grid and save it.

    Eight panels show a metric against epsilon; the ninth is a heatmap of the
    per-feature difference counts between the arrays stored under
    ``"success_arr"`` and ``"success_arr_adv"`` for every epsilon. The figure
    is written to ``results/figures/{dataset_name}_{model_name}_{attack_name}.png``,
    displayed, and then closed.

    Args:
        all_results: Mapping from ``"{dataset}_{model}_{attack}"`` to a result
            dictionary (with a ``"result"`` entry) or ``None``.
        dataset_name: Dataset part of the key.
        model_name: Model part of the key.
        attack_name: Attack part of the key.

    Returns:
        None. Nothing is drawn when the stored entry is ``None``.

    Raises:
        KeyError: If the combination or one of the expected metric keys is missing.
    """
    entry = all_results[f"{dataset_name}_{model_name}_{attack_name}"]
    if entry is None:
        # Nothing was stored for this combination, so there is nothing to plot.
        print(f"No results for '{dataset_name}_{model_name}_{attack_name}'. Skip.")
        return

    result = entry["result"]
    average_distances = result['average_distances']
    epsilons = result['epsilons']
    paths = result['numpy_path']

    # (y values, panel title, y label) in row-major grid order; x is always epsilon.
    panels = [
        (result['attack_success_rates'], 'Success Rate vs. Epsilon', 'Success Rate'),
        (average_distances['L0 Distance'], 'Sparsity vs. Epsilon', 'Sparsity'),
        (average_distances['L1 Distance'], 'L1 Distance vs. Epsilon', 'L1 Distance'),
        (average_distances['L2 Distance'], 'L2 Distance vs. Epsilon', 'L2 Distance'),
        (average_distances['Linf Distance'], 'Linf Distance vs. Epsilon', 'Linf Distance'),
        (average_distances['Mahalanobis Distance'], 'Mahalanobis Distance vs. Epsilon', 'Mahalanobis Distance'),
        (result['outliner_rates'], 'Outlier Rate vs. Epsilon', 'Outlier Rate'),
        (average_distances['Sensitivity'], 'Sensitivity vs. Epsilon', 'Sensitivity'),
    ]

    fig, ax = plt.subplots(3, 3, figsize=(18, 14))
    fig.suptitle(f'Model: {model_name} - Dataset: {dataset_name} - Attack: {attack_name}', y=0.92, fontsize=16)

    # zip stops after the eight metric panels, leaving ax[2, 2] for the heatmap.
    for axis, (values, title, ylabel) in zip(ax.flat, panels):
        _plot_metric_panel(axis, epsilons, values, title, 'Epsilon', ylabel)

    # One heatmap row per epsilon: per-feature difference counts.
    matrix_list = []
    for i in range(len(epsilons)):
        success_arr = np.load(paths[i]["success_arr"])
        success_arr_adv = np.load(paths[i]["success_arr_adv"])
        matrix_list.append(calculate_perturbated_feature(success_arr, success_arr_adv))

    heatmap_ax = ax[2, 2]
    heatmap_ax.imshow(matrix_list, cmap='viridis', aspect='auto')
    heatmap_ax.grid(which='both', color='grey', linewidth=0.1)
    heatmap_ax.set_title('Heatmap of Perturbated Features')
    heatmap_ax.set_xlabel('Features')
    heatmap_ax.set_ylabel('Epsilon')

    # Save before showing, then close: this function is called once per
    # combination, and unclosed figures would otherwise pile up in memory.
    os.makedirs("results/figures", exist_ok=True)
    fig.savefig(f"results/figures/{dataset_name}_{model_name}_{attack_name}.png")
    plt.show()
    plt.close(fig)



In [None]:
# Draw the summary grid for every combination that has a stored result.
for dataset_name in dataset_list:
    for model_name in model_list:
        for attack_name in attack_list:
            if all_results[f"{dataset_name}_{model_name}_{attack_name}"] is None:
                continue
            plot_in_grid(all_results, dataset_name, model_name, attack_name)

Plot different models and different attacks in one figure for comparison (same dataset)

In [None]:
def plot_multiline(all_results, dataset_name, models, attacks):
    """Overlay several model/attack runs of one dataset in a 4x3 comparison grid.

    Every model/attack combination with a stored result contributes one line
    (labelled ``"{model}/{attack}"``) to each of the twelve panels. The figure
    is saved to ``results/figures/{dataset_name}_{model}_{attack}.png``, where
    the model or attack part is ``"All"`` when more than one name was given,
    then displayed and closed.

    Args:
        all_results: Mapping from ``"{dataset}_{model}_{attack}"`` to a result
            dictionary (with a ``"result"`` entry) or ``None``.
        dataset_name: Dataset whose runs are compared.
        models: Non-empty sequence of model names.
        attacks: Non-empty sequence of attack names.

    Raises:
        ValueError: If ``models`` or ``attacks`` is empty.
        KeyError: If a combination or one of the expected metric keys is missing.
    """
    # Check up front so that no figure is created before failing.
    if len(models) == 0 or len(attacks) == 0:
        raise ValueError("models and attacks must each contain at least one name.")

    # (x series, y series, title, x label, y label) in row-major grid order.
    panels = [
        ('epsilon', 'success', 'Success Rate vs. Epsilon', 'Epsilon', 'Success Rate'),
        ('epsilon', 'l0', 'Sparsity vs. Epsilon', 'Epsilon', 'Sparsity'),
        ('epsilon', 'l1', 'L1 Distance vs. Epsilon', 'Epsilon', 'L1 Distance'),
        ('epsilon', 'l2', 'L2 Distance vs. Epsilon', 'Epsilon', 'L2 Distance'),
        ('epsilon', 'linf', 'Linf Distance vs. Epsilon', 'Epsilon', 'Linf Distance'),
        ('epsilon', 'mahalanobis', 'Mahalanobis Distance vs. Epsilon', 'Epsilon', 'Mahalanobis Distance'),
        ('epsilon', 'outlier', 'Outlier Rate vs. Epsilon', 'Epsilon', 'Outlier Rate'),
        ('epsilon', 'sensitivity', 'Sensitivity vs. Epsilon', 'Epsilon', 'Sensitivity'),
        ('l0', 'success', 'Sparsity vs. Success Rate', 'L0 Distance', 'Success Rate'),
        ('l2', 'success', 'L2 Distance vs. Success Rate', 'L2 Distance', 'Success Rate'),
        ('linf', 'success', 'Linf Distance vs. Success Rate', 'Linf Distance', 'Success Rate'),
        ('outlier', 'success', 'Outlier Rate vs. Success Rate', 'Outlier Rate', 'Success Rate'),
    ]

    fig, ax = plt.subplots(4, 3, figsize=(18, 19))
    axes = list(ax.flat)

    for model_name in models:
        for attack_name in attacks:
            entry = all_results[f"{dataset_name}_{model_name}_{attack_name}"]
            if entry is None:
                continue

            result = entry["result"]
            average_distances = result['average_distances']
            series = {
                'epsilon': result['epsilons'],
                'success': result['attack_success_rates'],
                'l0': average_distances['L0 Distance'],
                'l1': average_distances['L1 Distance'],
                'l2': average_distances['L2 Distance'],
                'linf': average_distances['Linf Distance'],
                'mahalanobis': average_distances['Mahalanobis Distance'],
                'sensitivity': average_distances['Sensitivity'],
                'outlier': result['outliner_rates'],
            }

            line_label = f"{model_name}/{attack_name}"
            for axis, (x_key, y_key, _title, _xlabel, _ylabel) in zip(axes, panels):
                axis.plot(series[x_key], series[y_key], marker='o', linestyle='-', label=line_label)

    for axis, (_x_key, _y_key, title, xlabel, ylabel) in zip(axes, panels):
        axis.set_title(title)
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
        axis.grid(True)

    # A single shared legend, anchored to the last panel and placed outside the grid.
    # Skipped when no run was plotted, to avoid an empty-legend warning.
    legend_ax = ax[3, 2]
    handles, _labels = legend_ax.get_legend_handles_labels()
    if handles:
        legend_ax.legend(bbox_to_anchor=(1.05, 1.0), loc='lower left')

    os.makedirs("results/figures", exist_ok=True)
    model_n = "All" if len(models) > 1 else models[0]
    attack_n = "All" if len(attacks) > 1 else attacks[0]

    fig.suptitle(f'Dataset: {dataset_name} | Model: {model_n} | Attack: {attack_n}', y=0.92, fontsize=16)

    fig.savefig(f"results/figures/{dataset_name}_{model_n}_{attack_n}.png", bbox_inches='tight')

    plt.show()
    # Close explicitly: this function is called many times in a row and
    # unclosed figures would otherwise accumulate in memory.
    plt.close(fig)

In [None]:
# Per dataset: first all models and attacks together, then one figure per
# model (all attacks), then one figure per attack (all models).
for dataset_name in dataset_list:
    comparisons = [(model_list, attack_list)]
    comparisons += [([single_model], attack_list) for single_model in model_list]
    comparisons += [(model_list, [single_attack]) for single_attack in attack_list]
    for models, attacks in comparisons:
        plot_multiline(all_results, dataset_name, models, attacks)


Additional: PCA to visualize the data

In [None]:
# Position within `paths` used by the PCA cells below; -1 picks the last entry
# (presumably the largest epsilon — confirm the ordering of `paths`).
idx = -1

In [None]:
# Step 1: bring in the PCA implementation
from sklearn.decomposition import PCA

# Step 2: read the two arrays of the selected entry and stack them row-wise
selected_paths = paths[idx]
success_arr = np.load(selected_paths["success_arr"])
success_arr_adv = np.load(selected_paths["success_arr_adv"])

combined_array = np.vstack([success_arr, success_arr_adv])

In [None]:
# Step 3: reduce the data to two dimensions with PCA
n_components = 2  # two components, so the projection can be drawn as a 2-D scatter plot

# Fit once on the stacked data so both arrays share the same projection
pca = PCA(n_components=n_components).fit(combined_array)

# Project each array separately with the shared fit
success_arr_pca = pca.transform(success_arr)
success_arr_adv_pca = pca.transform(success_arr_adv)


In [None]:
# Step 4: Plot the first rows of each projected array in a dot plot
plt.figure(figsize=(10, 6))

# Show at most 100 pairs. Clamp to the number of available rows so that the
# arrow loop below cannot index past the end when fewer samples exist.
num = min(100, len(success_arr_pca), len(success_arr_adv_pca))

# Projected original samples
plt.scatter(success_arr_pca[:num, 0], success_arr_pca[:num, 1], color='blue', label=f'Original (First {num})')

# Projected adversarial samples
plt.scatter(success_arr_adv_pca[:num, 0], success_arr_adv_pca[:num, 1], color='red', label=f'Adversarial Examples (First {num})')

# Draw an arrow from each original point to the adversarial point at the same row
for i in range(num):
    plt.arrow(success_arr_pca[i, 0], success_arr_pca[i, 1],
              success_arr_adv_pca[i, 0] - success_arr_pca[i, 0], success_arr_adv_pca[i, 1] - success_arr_pca[i, 1],
              color='gray', alpha=0.5, width=0.002, head_width=0.04)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.title(f'PCA Plot of Original and Success Adversarial Examples (First {num}) - With Categorical Features')
plt.show()
