In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.imagenet_utils import decode_predictions, preprocess_input
import cv2

class AdversarialAnalyzer:
    """Compare attribution maps of an ImageNet classifier on clean and FGSM inputs.

    The analyzer wraps a pretrained VGG16 or ResNet50, builds an FGSM
    adversarial example and produces Grad-CAM and Integrated-Gradients maps
    for both the clean and the perturbed image.

    All image arrays handled by this class are in the "caffe" preprocessed
    space produced by ``imagenet_utils.preprocess_input`` with its default
    mode: channels-last, BGR channel order, per-channel ImageNet mean
    subtracted, no rescaling (values are roughly in [-124, 152]).
    """

    # Per-channel means subtracted by caffe-mode preprocessing, in BGR order.
    _CAFFE_MEAN_BGR = (103.939, 116.779, 123.68)

    def __init__(self, model_name='VGG16'):
        """Load the pretrained classifier.

        Args:
            model_name: 'VGG16' selects VGG16; any other value selects
                ResNet50 (kept as a fallback for backward compatibility).
        """
        if model_name == 'VGG16':
            self.model = VGG16(weights='imagenet')
            self.last_conv_layer = 'block5_conv3'
        else:
            self.model = ResNet50(weights='imagenet')
            self.last_conv_layer = 'conv5_block3_out'

        self.model_name = model_name

    # ------------------------------------------------------------------
    # Small numeric helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize01(values):
        """Min-max scale ``values`` to [0, 1]; a flat or non-finite range maps to zeros."""
        values = np.asarray(values, dtype=np.float32)
        if values.size == 0:
            return values
        low = values.min()
        span = values.max() - low
        # A constant map (e.g. an FGSM perturbation, which is +-epsilon
        # everywhere) would otherwise divide by zero and yield NaNs.
        if not np.isfinite(span) or span <= 0:
            return np.zeros_like(values)
        return (values - low) / span

    @staticmethod
    def _deprocess_caffe(img_array):
        """Invert caffe preprocessing and return an RGB float image in [0, 255].

        Accepts either a single (H, W, 3) image or a batch, in which case the
        first image is used.
        """
        img = np.array(img_array, dtype=np.float32)
        if img.ndim == 4:
            img = img[0]
        img = img + np.asarray(AdversarialAnalyzer._CAFFE_MEAN_BGR, dtype=np.float32)
        img = img[..., ::-1]  # BGR -> RGB
        return np.clip(img, 0.0, 255.0)

    def _logit_fn(self):
        """Return a callable mapping an input batch to pre-softmax class scores.

        The softmax of the Keras application models is fused into the final
        Dense layer, so no layer output holds the logits. They are rebuilt
        here from the final layer's input and its kernel/bias.
        """
        head = self.model.layers[-1]
        feature_model = tf.keras.models.Model(self.model.input, head.input)

        def logits(batch):
            features = feature_model(batch, training=False)
            return tf.matmul(features, head.kernel) + head.bias

        return logits

    # ------------------------------------------------------------------
    # Data loading and attack
    # ------------------------------------------------------------------
    def load_and_preprocess_image(self, img_path, target_size=(224, 224)):
        """Load an image file and preprocess it for the model.

        Args:
            img_path: Path of the image file.
            target_size: (height, width) the image is resized to.

        Returns:
            Array of shape (1, height, width, 3) in caffe preprocessed space.
        """
        img = image.load_img(img_path, target_size=target_size)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        return img_array

    def create_adversarial_example_fgsm(self, img_array, target_class=None, epsilon=0.1):
        """
        Create adversarial example using Fast Gradient Sign Method (FGSM)

        Args:
            img_array: Input image array (batched, caffe preprocessed space)
            target_class: Target class for targeted attack (None for untargeted)
            epsilon: Perturbation strength, in the units of ``img_array``
                (i.e. on the 0-255 intensity scale, not a [-1, 1] scale)

        Returns:
            Numpy array with the same shape as ``img_array``.
        """
        img_tensor = tf.convert_to_tensor(img_array, dtype=tf.float32)

        with tf.GradientTape() as tape:
            tape.watch(img_tensor)
            predictions = self.model(img_tensor, training=False)
            num_classes = predictions.shape[-1]

            if target_class is None:
                # Untargeted: increase the loss of the currently predicted
                # class. The label must be a constant one-hot vector; using
                # the predictions themselves as labels lets gradients flow
                # through the "labels" and optimises entropy instead.
                labels = tf.one_hot(tf.argmax(predictions, axis=-1), num_classes)
                loss = tf.keras.losses.categorical_crossentropy(labels, predictions)
            else:
                # Targeted: ascending the negated loss moves towards the target.
                labels = tf.one_hot(target_class, num_classes)
                labels = tf.broadcast_to(labels, tf.shape(predictions))
                loss = -tf.keras.losses.categorical_crossentropy(labels, predictions)

        gradients = tape.gradient(loss, img_tensor)
        adversarial_img = img_tensor + epsilon * tf.sign(gradients)

        # Keep the result a valid image: after mean subtraction channel c
        # spans [-mean_c, 255 - mean_c]. Clipping to [-1, 1] would wipe out
        # almost all image content in this space.
        mean = tf.constant(self._CAFFE_MEAN_BGR, dtype=tf.float32)
        adversarial_img = tf.minimum(tf.maximum(adversarial_img, -mean), 255.0 - mean)

        return adversarial_img.numpy()

    # ------------------------------------------------------------------
    # Grad-CAM
    # ------------------------------------------------------------------
    def _grad_cam_from_score(self, img_array, class_idx, layer_name, use_logits):
        """Compute a Grad-CAM map from either logits or softmax probabilities."""
        conv_layer = self.model.get_layer(layer_name)
        head = self.model.layers[-1]
        top_output = head.input if use_logits else self.model.output
        grad_model = tf.keras.models.Model(
            self.model.input, [conv_layer.output, top_output]
        )

        with tf.GradientTape() as tape:
            conv_outputs, top = grad_model(img_array)
            if use_logits:
                # Rebuild the final Dense layer without its softmax.
                scores = tf.matmul(top, head.kernel) + head.bias
            else:
                scores = top
            class_output = scores[:, class_idx]

        grads = tape.gradient(class_output, conv_outputs)
        # Channel importance = gradient averaged over batch and space.
        pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

        heatmap = tf.squeeze(conv_outputs[0] @ pooled_grads[..., tf.newaxis])
        heatmap = tf.maximum(heatmap, 0)
        # divide_no_nan returns 0 when the map has no positive evidence,
        # instead of NaN/inf from a zero denominator.
        heatmap = tf.math.divide_no_nan(heatmap, tf.reduce_max(heatmap))

        return heatmap.numpy()

    def grad_cam(self, img_array, class_idx, layer_name=None):
        """Grad-CAM on the pre-softmax class score.

        Args:
            img_array: Batched, preprocessed input image.
            class_idx: Index of the class to explain.
            layer_name: Convolutional layer to explain; defaults to
                ``self.last_conv_layer``.

        Returns:
            2-D heatmap in [0, 1] at the resolution of the chosen layer.
        """
        if layer_name is None:
            layer_name = self.last_conv_layer
        return self._grad_cam_from_score(img_array, class_idx, layer_name, use_logits=True)

    def grad_cam_softmax(self, img_array, class_idx):
        """Grad-CAM on the softmax probability of ``class_idx``.

        The model output already is a softmax distribution, so it is used
        directly (applying softmax again would flatten the scores).

        Returns:
            2-D heatmap in [0, 1] at the resolution of ``self.last_conv_layer``.
        """
        return self._grad_cam_from_score(
            img_array, class_idx, self.last_conv_layer, use_logits=False
        )

    # ------------------------------------------------------------------
    # Integrated Gradients
    # ------------------------------------------------------------------
    def _integrated_gradients(self, score_fn, img_array, class_idx, baseline, steps):
        """Integrated Gradients of ``score_fn(x)[:, class_idx]`` along a straight path.

        Gradients are averaged over ``steps + 1`` evenly spaced points between
        the baseline and the image and multiplied by (image - baseline).
        """
        target = tf.convert_to_tensor(img_array[0], dtype=tf.float32)
        if baseline is None:
            # Zero in preprocessed space, i.e. the per-channel mean colour.
            start = tf.zeros_like(target)
        else:
            start = tf.convert_to_tensor(baseline[0], dtype=tf.float32)
        delta = target - start

        alphas = tf.linspace(0.0, 1.0, steps + 1)
        path = start[tf.newaxis] + alphas[:, tf.newaxis, tf.newaxis, tf.newaxis] * delta[tf.newaxis]

        total_gradients = tf.zeros_like(target)
        batch_size = 32  # bounds memory use of a single forward/backward pass

        for begin in range(0, steps + 1, batch_size):
            batch_inputs = path[begin:begin + batch_size]

            with tf.GradientTape() as tape:
                tape.watch(batch_inputs)
                class_outputs = score_fn(batch_inputs)[:, class_idx]

            batch_gradients = tape.gradient(class_outputs, batch_inputs)
            total_gradients += tf.reduce_sum(batch_gradients, axis=0)

        attributions = delta * (total_gradients / (steps + 1))
        return np.expand_dims(attributions.numpy(), axis=0)

    def integrated_gradients(self, img_array, class_idx, baseline=None, steps=50):
        """Integrated Gradients of the softmax probability of ``class_idx``.

        Args:
            img_array: Batched, preprocessed input image (first item is used).
            class_idx: Index of the class to explain.
            baseline: Batched baseline image; defaults to all zeros.
            steps: Number of interpolation intervals.

        Returns:
            Attributions of shape (1, H, W, C).
        """
        return self._integrated_gradients(
            lambda batch: self.model(batch, training=False),
            img_array, class_idx, baseline, steps,
        )

    def integrated_gradients_pre_softmax(self, img_array, class_idx, baseline=None, steps=50):
        """Integrated Gradients of the pre-softmax score (logit) of ``class_idx``.

        Args:
            img_array: Batched, preprocessed input image (first item is used).
            class_idx: Index of the class to explain.
            baseline: Batched baseline image; defaults to all zeros.
            steps: Number of interpolation intervals.

        Returns:
            Attributions of shape (1, H, W, C).
        """
        return self._integrated_gradients(
            self._logit_fn(), img_array, class_idx, baseline, steps
        )

    # ------------------------------------------------------------------
    # Visualisation and report
    # ------------------------------------------------------------------
    def visualize_heatmap(self, img_array, heatmap, alpha=0.6):
        """Blend a heatmap (values in [0, 1]) over the de-processed RGB image.

        Args:
            img_array: Batched image in caffe preprocessed space.
            heatmap: 2-D map; it is resized to the image resolution.
            alpha: Weight of the heatmap in the blend.

        Returns:
            uint8 RGB image of shape (H, W, 3).
        """
        # Undo mean subtraction and BGR ordering so colours display correctly.
        rgb = self._deprocess_caffe(img_array)

        heat = np.nan_to_num(np.asarray(heatmap, dtype=np.float32))
        heat = cv2.resize(heat, (rgb.shape[1], rgb.shape[0]))
        heat = np.uint8(255 * np.clip(heat, 0.0, 1.0))
        heat_colored = cv2.applyColorMap(heat, cv2.COLORMAP_JET)
        heat_colored = cv2.cvtColor(heat_colored, cv2.COLOR_BGR2RGB)

        overlayed = heat_colored * alpha + rgb * (1 - alpha)
        return np.uint8(np.clip(overlayed, 0, 255))

    def analyze_adversarial_example(self, img_path, epsilon=0.1, target_class=None):
        """Attack one image, plot all attribution maps and print summary metrics.

        Args:
            img_path: Path of the image file to analyse.
            epsilon: FGSM perturbation strength (see
                ``create_adversarial_example_fgsm``).
            target_class: Target class for a targeted attack, or None.

        Returns:
            Dict with the clean/adversarial images, their Grad-CAM and IG
            results and the Grad-CAM correlation.
        """
        original_img = self.load_and_preprocess_image(img_path)

        original_preds = self.model.predict(original_img)
        original_class = int(np.argmax(original_preds[0]))
        original_confidence = np.max(original_preds[0])

        print(f"Original prediction: {decode_predictions(original_preds, top=1)[0][0][1]} "
              f"(confidence: {original_confidence:.3f})")

        adversarial_img = self.create_adversarial_example_fgsm(
            original_img, target_class, epsilon
        )

        adv_preds = self.model.predict(adversarial_img)
        adv_class = int(np.argmax(adv_preds[0]))
        adv_confidence = np.max(adv_preds[0])

        print(f"Adversarial prediction: {decode_predictions(adv_preds, top=1)[0][0][1]} "
              f"(confidence: {adv_confidence:.3f})")

        original_gradcam = self.grad_cam(original_img, original_class)
        original_gradcam_softmax = self.grad_cam_softmax(original_img, original_class)
        original_ig = self.integrated_gradients(original_img, original_class)
        original_ig_pre_softmax = self.integrated_gradients_pre_softmax(original_img, original_class)

        adv_gradcam = self.grad_cam(adversarial_img, adv_class)
        adv_gradcam_softmax = self.grad_cam_softmax(adversarial_img, adv_class)
        adv_ig = self.integrated_gradients(adversarial_img, adv_class)
        adv_ig_pre_softmax = self.integrated_gradients_pre_softmax(adversarial_img, adv_class)

        def saliency(attributions):
            # Collapse channels to one magnitude map and scale it to [0, 1].
            return self._normalize01(np.mean(np.abs(attributions[0]), axis=2))

        ig_vis = saliency(original_ig)
        adv_ig_vis = saliency(adv_ig)
        ig_pre_softmax_vis = saliency(original_ig_pre_softmax)
        adv_ig_pre_softmax_vis = saliency(adv_ig_pre_softmax)

        perturbation = adversarial_img - original_img
        perturbation_vis = saliency(perturbation)
        attention_diff = np.abs(adv_gradcam - original_gradcam)

        # (row, column, picture, title, colormap) for every subplot.
        panels = [
            (0, 0, self._deprocess_caffe(original_img) / 255.0, 'Original Image', None),
            (0, 1, self.visualize_heatmap(original_img, original_gradcam),
             'Original: Grad-CAM', None),
            (0, 2, self.visualize_heatmap(original_img, original_gradcam_softmax),
             'Original: Grad-CAM (Softmax)', None),
            (0, 3, self.visualize_heatmap(original_img, ig_vis),
             'Original: Integrated Gradients', None),
            (1, 0, self._deprocess_caffe(adversarial_img) / 255.0, 'Adversarial Image', None),
            (1, 1, self.visualize_heatmap(adversarial_img, adv_gradcam),
             'Adversarial: Grad-CAM', None),
            (1, 2, self.visualize_heatmap(adversarial_img, adv_gradcam_softmax),
             'Adversarial: Grad-CAM (Softmax)', None),
            (1, 3, self.visualize_heatmap(adversarial_img, adv_ig_vis),
             'Adversarial: Integrated Gradients', None),
            (2, 0, self.visualize_heatmap(original_img, ig_pre_softmax_vis),
             'Original: IG (Pre-softmax)', None),
            (2, 1, self.visualize_heatmap(adversarial_img, adv_ig_pre_softmax_vis),
             'Adversarial: IG (Pre-softmax)', None),
            (2, 2, perturbation_vis, 'Adversarial Perturbation', 'hot'),
            (2, 3, attention_diff, 'Grad-CAM Difference', 'hot'),
        ]

        _, axes = plt.subplots(3, 4, figsize=(20, 15))
        for row, col, picture, title, cmap in panels:
            axis = axes[row, col]
            axis.imshow(picture, cmap=cmap)
            axis.set_title(title)
            axis.axis('off')

        plt.tight_layout()
        plt.show()

        print("\n=== Quantitative Analysis ===")
        print(f"L2 norm of perturbation: {np.linalg.norm(perturbation):.4f}")
        print(f"Max perturbation: {np.max(np.abs(perturbation)):.4f}")

        # NaN when either map is constant (e.g. an all-zero heatmap).
        attention_correlation = np.corrcoef(
            original_gradcam.flatten(), adv_gradcam.flatten()
        )[0, 1]
        print(f"Grad-CAM correlation (original vs adversarial): {attention_correlation:.4f}")

        gradcam_softmax_diff = np.mean(np.abs(original_gradcam_softmax - original_gradcam))
        ig_softmax_diff = np.mean(np.abs(ig_vis - ig_pre_softmax_vis))

        print(f"Grad-CAM difference (softmax vs pre-softmax): {gradcam_softmax_diff:.4f}")
        print(f"IG difference (softmax vs pre-softmax): {ig_softmax_diff:.4f}")

        return {
            'original_img': original_img,
            'adversarial_img': adversarial_img,
            'original_gradcam': original_gradcam,
            'adv_gradcam': adv_gradcam,
            'original_ig': original_ig,
            'adv_ig': adv_ig,
            'attention_correlation': attention_correlation
        }

if __name__ == "__main__":
    # Build the analyzer around VGG16; pass 'ResNet50' to use ResNet50 instead.
    analyzer = AdversarialAnalyzer(model_name='VGG16')

    # Image to attack and explain (path inside the Colab runtime).
    img_path = '/content/istockphoto-91843294-612x612.jpg'
    results = analyzer.analyze_adversarial_example(img_path=img_path, epsilon=0.1)

Output hidden; open in https://colab.research.google.com to view.