# MNIST: Training and Testing on a Clean Dataset & Adversarial Detection

## Imports and MNIST loading

In [None]:
# Imports all the module paths
import sys

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm.notebook import tnrange, tqdm

sys.path.append("../../")

# Loads the rest of the modules
# File containing all the required training methods
import defences.mnist as defences

# For testing
import utils.clean_test as clean_test

# Contains the data loaders
import utils.dataloaders as dataloaders

# For printing outcomes
# import utils.printing as printing

# Example printing, but I removed it to simplify results
# for epsilon in epsilons:
#     printing.print_attack(
#         model,
#         testSetLoader,
#         "FGSM",
#         attacks["FGSM"],
#         epsilon=epsilon,
#     )

In [None]:
# Pick the PyTorch device for the whole notebook: CUDA when a GPU is
# visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Notebook will use PyTorch Device: " + device.upper())

## Load the dataset

In [None]:
# Folder holding the dataset files, relative to this notebook
DATA_ROOT = "../../datasets/"

# The helper returns three loaders; the middle one is discarded here
# (presumably the validation loader, which is requested with size 0).
mnist_loaders = dataloaders.get_MNIST_data_loaders(
    DATA_ROOT, trainSetSize=50000, validationSetSize=0, batchSize=128
)
trainSetLoader, _, testSetLoader = mnist_loaders

## Attacks and Their Results

In [None]:
# A possible attacks array (for nice printing):
# Some attacks use a helper library
import torchattacks

import attacks.fgsm as fgsm
import attacks.ifgsm as ifgsm
import attacks.pgd as pgd
import utils.attacking as attacking

# Cross-entropy loss object kept at notebook scope
loss_function = nn.CrossEntropyLoss()

# Display name -> attack function
attacks = {
    "FGSM": fgsm.fgsm_attack,
    "I-FGSM": ifgsm.ifgsm_attack,
    "PGD": pgd.pgd_attack,
}

## Train a model (let's say standard for now)

In [None]:
# Folder where trained MNIST models are stored
SAVE_LOAD_ROOT = "../../models_data/MNIST"

# Path handed to the training helper together with load_if_available=True
standard_model_path = SAVE_LOAD_ROOT + "/mnist_standard_mahalanobis"

standard_model = defences.standard_training(
    trainSetLoader, load_if_available=True, load_path=standard_model_path
)

In [None]:
# Evaluate the standard model on the test loader (unperturbed inputs);
# the helper comes from utils.clean_test.
clean_test.test_trained_model(standard_model, testSetLoader)

In [None]:
# A possible attacks array (for nice printing):
# Some attacks use a helper library
import torchattacks

import attacks.fgsm as fgsm
import attacks.ifgsm as ifgsm
import attacks.pgd as pgd
import utils.attacking as attacking

# Display name -> attack function (same entries as the table built earlier
# in the notebook, re-created so this cell can run on its own)
attacks = {
    "FGSM": fgsm.fgsm_attack,
    "I-FGSM": ifgsm.ifgsm_attack,
    "PGD": pgd.pgd_attack,
}

## A Simple Unified Framework for Detecting Out-of-Distribution Samples and Adversarial Attacks
https://arxiv.org/abs/1807.03888

### Step 1: extract shape of features

In [None]:
# Extract shape of features (useful for later)
# A small random batch is enough here: only the channel count (dimension 1)
# of each hidden-layer output is read, never the values themselves.
# Use the notebook-wide `device` instead of a hard-coded .cuda() so the cell
# also runs on CPU-only machines.
fake_input = torch.rand(2, 1, 28, 28).to(device)

# Temporary list of features (second element returned by feature_list)
out_list = standard_model.feature_list(fake_input)[1]

# Construct the feature list: one channel count per hidden layer.
# Stored as a float array, as before; consumers cast entries with int().
num_feature_layers = len(out_list)
feature_list = np.array([out.size(1) for out in out_list], dtype=float)

print(feature_list)

### Step 2: extract mean and covariance

In [None]:
import sklearn.covariance


def sample_estimator(trainSetLoader, model, feature_list):
    """Estimate per-class feature means and a shared precision matrix per layer.

    Every item of ``trainSetLoader.dataset`` is pushed through
    ``model.feature_list`` one image at a time. Each hidden-layer output is
    averaged over its trailing (spatial) dimensions, leaving one vector per
    layer, and the vectors are grouped by label.

    Args:
        trainSetLoader: loader whose ``.dataset`` yields ``(image, label)``
            pairs; each image must be a tensor reshapeable to (1, 1, 28, 28).
        model: network exposing ``feature_list(image)`` that returns
            ``(logits, list_of_hidden_features)``.
        feature_list: one entry per hidden layer; only its length is used
            (the feature sizes are taken from the model outputs themselves).

    Returns:
        A pair ``(sample_class_mean, precision)``:
        ``sample_class_mean[layer]`` is a ``(num_classes, num_features)``
        tensor of class means, and ``precision[layer]`` is the inverse of the
        covariance fitted on the class-centred features of all classes
        together. Both live on the notebook-wide ``device``.

    Raises:
        ValueError: if some class has no samples in the dataset, because its
            mean would be undefined.
    """
    # Number of classes
    num_classes = 10

    # Here this also applies to layer wise features
    num_output = len(feature_list)
    covariance_estimator = sklearn.covariance.EmpiricalCovariance(
        assume_centered=False
    )

    # collected[<layer>][<label>] is a list of (1, num_features) rows.
    # Rows are gathered in Python lists and concatenated once at the end;
    # concatenating inside the loop re-copies the whole matrix per image.
    collected = [[[] for _ in range(num_classes)] for _ in range(num_output)]
    correct = 0

    # Only forward passes are needed, so no autograd graph is recorded.
    with torch.no_grad():
        for image, label in tqdm(
            trainSetLoader.dataset,
            desc="Going through the images one by one",
            leave=False,
        ):
            label = int(label)

            # Make a single-image batch on the right device
            image = image.reshape(1, 1, 28, 28).to(device)

            # This is for extracting the feature list
            output, out_features = model.feature_list(image)

            # Compute the accuracy
            prediction = output.max(1)[1]
            correct += int(prediction.item() == label)

            # Get hidden features: average away everything after the channel
            # dimension and store the single row under its label
            for layer in range(num_output):
                hidden = out_features[layer]
                hidden = hidden.view(hidden.size(0), hidden.size(1), -1)
                hidden = torch.mean(hidden, 2)
                collected[layer][label].append(hidden[0].view(1, -1))

    sample_class_mean = []
    precision = []
    for layer in range(num_output):
        missing = [c for c in range(num_classes) if not collected[layer][c]]
        if missing:
            raise ValueError(
                "No samples found for class(es) {}".format(missing)
            )

        # One (num_samples_of_class, num_features) matrix per class
        class_features = [torch.cat(rows, 0) for rows in collected[layer]]

        class_means = torch.stack(
            [features.mean(0) for features in class_features]
        ).to(device)
        sample_class_mean.append(class_means)

        # Centre every class on its own mean, then fit one covariance that
        # is shared by all classes
        centred = torch.cat(
            [
                features - mean
                for features, mean in zip(class_features, class_means)
            ],
            0,
        )

        # Find inverse
        covariance_estimator.fit(centred.cpu().numpy())
        layer_precision = torch.from_numpy(covariance_estimator.precision_)
        precision.append(layer_precision.float().to(device))

    # This is just a print from the code, which helps me understand
    # what they were actually doing
    print(
        "\n Training Accuracy:({:.2f}%)\n".format(
            100.0 * correct / len(trainSetLoader.dataset)
        )
    )

    return sample_class_mean, precision

In [None]:
# Fit the class means and precision on the TRAINING split. The scores below
# are computed on the test images, so estimating the Gaussians on those same
# test images would leak evaluation data into the detector (and the
# "Training Accuracy" printed by sample_estimator would be mislabelled).
sample_mean, precision = sample_estimator(
    trainSetLoader, standard_model, feature_list=feature_list
)

# Looks very ugly, I don't recommend printing this
# print(sample_mean, precision)

In [None]:
def _gaussian_scores(features, class_means, layer_precision):
    """Return a (batch, num_classes) matrix of -0.5 * squared Mahalanobis distances."""
    per_class = []
    for class_mean in class_means:
        centred = features - class_mean
        per_class.append(
            -0.5 * torch.mm(torch.mm(centred, layer_precision), centred.t()).diag()
        )
    return torch.stack(per_class, dim=1)


# Compute the proposed (in the paper) Mahalanobis confidence score on (adversarial)
# samples at a specific layer index with set magnitude
def mahalanobis_score(
    trainSetLoader,
    model,
    sample_mean,
    precision,
    layer_index,
    magnitude,
):
    """Score every dataset item with the Mahalanobis confidence at one layer.

    For each image: extract the features of layer ``layer_index``, find the
    closest class Gaussian, nudge the image by ``magnitude`` in the direction
    that reduces the distance to that Gaussian, and record the best class
    score of the nudged image.

    Args:
        trainSetLoader: loader whose ``.dataset`` yields ``(image, label)``
            pairs; labels are ignored. Each image must be a tensor
            reshapeable to (1, 1, 28, 28).
        model: network exposing ``intermediate_forward(image, layer_index)``.
        sample_mean: per-layer ``(num_classes, num_features)`` class means.
        precision: per-layer shared precision matrices.
        layer_index: which hidden layer to score.
        magnitude: step size of the input pre-processing (0.0 disables it).

    Returns:
        A list with one float32 score per dataset item, in dataset order.
    """
    class_means = sample_mean[layer_index]
    layer_precision = precision[layer_index]
    mahalanobis = []

    # Do it one by one so I do not go fully insane and cry
    for image, _ in tqdm(
        trainSetLoader.dataset,
        desc="Going through the images one by one",
        leave=False,
    ):
        # Make a single-image batch that tracks gradients w.r.t. the pixels
        image = image.reshape(1, 1, 28, 28).to(device)
        image = image.detach().requires_grad_(True)

        # Extract features and average away the trailing (spatial) dimensions
        out_features = model.intermediate_forward(image, layer_index)
        out_features = out_features.view(
            out_features.size(0), out_features.size(1), -1
        )
        out_features = torch.mean(out_features, 2)

        # Closest class under the Gaussian score (no gradient needed here)
        gaussian_score = _gaussian_scores(
            out_features.detach(), class_means, layer_precision
        )
        sample_pred = gaussian_score.max(1)[1]

        # Input_processing: loss is the distance to the closest class
        batch_sample_mean = class_means.index_select(0, sample_pred)
        zero_f = out_features - batch_sample_mean
        pure_gau = (
            -0.5 * torch.mm(torch.mm(zero_f, layer_precision), zero_f.t()).diag()
        )
        loss = torch.mean(-pure_gau)

        # Only the gradient w.r.t. the image is needed; autograd.grad avoids
        # accumulating gradients in the model parameters on every image.
        image_grad = torch.autograd.grad(loss, image)[0]

        # +1 where the gradient is >= 0, -1 elsewhere
        gradient = (torch.ge(image_grad, 0).float() - 0.5) * 2

        # NOTE: the code this was ported from divided channels 0-2 by the
        # CIFAR normalisation stds. MNIST images have a single channel, so
        # indexing channels 1 and 2 fails; for one channel such a division
        # would only rescale `magnitude`, so it is dropped.
        tempInputs = torch.add(image.detach(), gradient, alpha=-magnitude)

        # Score the pre-processed image against every class, keep the best
        with torch.no_grad():
            noise_out_features = model.intermediate_forward(tempInputs, layer_index)
            noise_out_features = noise_out_features.view(
                noise_out_features.size(0), noise_out_features.size(1), -1
            )
            noise_out_features = torch.mean(noise_out_features, 2)
            noise_gaussian_score = _gaussian_scores(
                noise_out_features, class_means, layer_precision
            )
            noise_gaussian_score, _ = torch.max(noise_gaussian_score, dim=1)

        mahalanobis.extend(noise_gaussian_score.cpu().numpy())

    return mahalanobis

In [None]:
# In here you finally compute the Mahalanobis score and hope for the best
# magnitude_list = [0.0, 0.01, 0.005, 0.002, 0.0014, 0.001, 0.0005]
magnitude_list = [0.0]
for magnitude in magnitude_list:
    print("Mahalanobis score with the following magnitude: {}".format(magnitude))

    # First do the score on clean data: one column of scores per feature
    # layer, one row per test image
    layer_columns = []
    for feature_num in range(num_feature_layers):
        mahalanobis_in_temp = mahalanobis_score(
            testSetLoader,
            standard_model,
            sample_mean,
            precision,
            feature_num,
            magnitude,
        )
        mahalanobis_in_temp = np.asarray(mahalanobis_in_temp, dtype=np.float32)
        layer_columns.append(
            mahalanobis_in_temp.reshape((mahalanobis_in_temp.shape[0], -1))
        )

    # Join the columns of all layers side by side. Previously each new
    # layer was concatenated with itself instead of with the scores gathered
    # so far, which failed for any layer after the first.
    mahalanobis_in = np.concatenate(layer_columns, axis=1)

    # Then do the score on adversarial data
#     for i in range(num_output):
#         M_out = get_Mahalanobis_score_adv(
#             model,
#             test_adv_data,
#             test_label,
#             args.num_classes,
#             args.outf,
#             args.net_type,
#             sample_mean,
#             precision,
#             i,
#             magnitude,
#         )
#         M_out = np.asarray(M_out, dtype=np.float32)
#         if i == 0:
#             Mahalanobis_out = M_out.reshape((M_out.shape[0], -1))
#         else:
#             Mahalanobis_out = np.concatenate(
#                 (Mahalanobis_out, M_out.reshape((M_out.shape[0], -1))), axis=1
#             )

#     # Then do the score on noisy data
#     for i in range(num_output):
#         M_noisy = lib_generation.get_Mahalanobis_score_adv(
#             model,
#             test_noisy_data,
#             test_label,
#             args.num_classes,
#             args.outf,
#             args.net_type,
#             sample_mean,
#             precision,
#             i,
#             magnitude,
#         )
#         M_noisy = np.asarray(M_noisy, dtype=np.float32)
#         if i == 0:
#             Mahalanobis_noisy = M_noisy.reshape((M_noisy.shape[0], -1))
#         else:
#             Mahalanobis_noisy = np.concatenate(
#                 (Mahalanobis_noisy, M_noisy.reshape((M_noisy.shape[0], -1))), axis=1
#             )

#     Mahalanobis_in = np.asarray(Mahalanobis_in, dtype=np.float32)
#     Mahalanobis_out = np.asarray(Mahalanobis_out, dtype=np.float32)
#     Mahalanobis_noisy = np.asarray(Mahalanobis_noisy, dtype=np.float32)
#     Mahalanobis_pos = np.concatenate((Mahalanobis_in, Mahalanobis_noisy))

#     Mahalanobis_data, Mahalanobis_labels = lib_generation.merge_and_generate_labels(
#         Mahalanobis_out, Mahalanobis_pos
#     )
#     file_name = os.path.join(
#         args.outf,
#         "Mahalanobis_%s_%s_%s.npy" % (str(magnitude), args.dataset, args.adv_type),
#     )

#     Mahalanobis_data = np.concatenate((Mahalanobis_data, Mahalanobis_labels), axis=1)
#     np.save(file_name, Mahalanobis_data)