In [None]:
# Import necessary libraries

!pip install lime scikit-image matplotlib
from lime import lime_image
from skimage.segmentation import mark_boundaries
import matplotlib.pyplot as plt

from tensorflow.keras.datasets import mnist  # Only for dataset loading
import cupy as cp  # GPU-based operations
import numpy as np
from tqdm import tqdm
import abc

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=e2fb1b228bb243ff940ff50726c39fcabe9c7c4df70dcb0b5594eac1f07ff7bb
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
# Define im2col function
def im2col(input_data, kernel_size, stride=1):
    """
    Unfold every sliding window of a 4D tensor into one row of a 2D matrix.

    :param input_data: Tensor of shape (batch_size, channels, height, width)
    :param kernel_size: Tuple of (kernel_height, kernel_width)
    :param stride: Stride of the sliding window
    :return: Matrix of shape (batch_size * out_height * out_width,
             channels * kernel_height * kernel_width). Rows are ordered by
             (batch, out_y, out_x) and columns by (channel, kernel_y, kernel_x),
             which is the layout col2im reshapes from.
    """
    batch_size, channels, height, width = input_data.shape
    kernel_height, kernel_width = kernel_size
    out_height = (height - kernel_height) // stride + 1
    out_width = (width - kernel_width) // stride + 1

    # cols[b, c, ky, kx, oy, ox] = input[b, c, oy * stride + ky, ox * stride + kx]
    cols = cp.zeros((batch_size, channels, kernel_height, kernel_width, out_height, out_width), dtype=input_data.dtype)

    for y in range(kernel_height):
        y_max = y + stride * out_height
        for x in range(kernel_width):
            x_max = x + stride * out_width
            cols[:, :, y, x, :, :] = input_data[:, :, y:y_max:stride, x:x_max:stride]

    # Bring (batch, out_y, out_x) to the front so each row of the result is one
    # complete receptive field flattened as (channel, kernel_y, kernel_x).
    # Any other axis order mixes values from different windows into a row.
    cols = cols.transpose(0, 4, 5, 1, 2, 3).reshape(batch_size * out_height * out_width, -1)
    return cols

# Define col2im function
def col2im(cols, input_shape, kernel_size, stride=1):
    """
    Fold a column matrix back into a 4D tensor, summing overlapping windows.

    :param cols: Columns of shape (batch_size * out_height * out_width, in_channels * kernel_height * kernel_width)
    :param input_shape: Target shape (batch_size, in_channels, height, width)
    :param kernel_size: Tuple of (kernel_height, kernel_width)
    :param stride: Stride of the convolution
    :return: Tensor of shape input_shape in which every position holds the sum
             of all column entries that were taken from it
    """
    n_batch, n_channels, in_h, in_w = input_shape
    k_h, k_w = kernel_size
    n_rows = (in_h - k_h) // stride + 1
    n_cols = (in_w - k_w) // stride + 1

    # (batch * rows * cols, C * kh * kw) -> (batch, C, rows, cols, kh, kw)
    patches = cols.reshape(n_batch, n_rows, n_cols, n_channels, k_h, k_w)
    patches = patches.transpose(0, 3, 1, 2, 4, 5)

    folded = cp.zeros(input_shape, dtype=cols.dtype)
    row_span = stride * n_rows
    col_span = stride * n_cols

    # Each kernel offset contributes one strided plane of the result.
    for dy in range(k_h):
        for dx in range(k_w):
            folded[:, :, dy:dy + row_span:stride, dx:dx + col_span:stride] += patches[:, :, :, :, dy, dx]

    return folded

In [None]:
# Base module class for layers
class Module(abc.ABC):
    """
    Abstract base class for every layer.

    Holds a display name plus two dictionaries: `weights` for the parameters
    and `grad_weights` for the gradients stored by `backward`.
    """

    def __init__(self, name):
        self.name = name
        # Two distinct, per-instance dictionaries (never shared between layers).
        self.weights, self.grad_weights = {}, {}

    @abc.abstractmethod
    def forward(self, input):
        """Compute the layer output for `input`."""

    @abc.abstractmethod
    def backward(self, grad_output):
        """Propagate `grad_output` back through the layer."""

    def __repr__(self):
        return self.name

# Composition for stacking layers
class Composition(Module):
    """
    Sequential container: runs its submodules in order on the forward pass and
    in reverse order on the backward pass.

    Parameters and gradients of the submodules are exposed under the keys
    '<submodule name>.<weight name>'.
    """

    def __init__(self, submodules):
        joined_names = ', '.join(sub.name for sub in submodules)
        super().__init__(f"Composition({joined_names})")
        self.submodules = submodules

        # Flat view of all parameters; the values are the submodules' own arrays.
        self.weights = {
            f'{sub.name}.{w_name}': weight
            for sub in self.submodules
            for w_name, weight in sub.weights.items()
        }

    def forward(self, input):
        activation = input
        for layer in self.submodules:
            activation = layer.forward(activation)
        return activation

    def backward(self, grad_output):
        grad = grad_output
        for layer in reversed(self.submodules):
            grad = layer.backward(grad)

        # Rebuild the flat gradient view from what each submodule just stored.
        self.grad_weights = {
            f'{sub.name}.{w_name}': grad_weight
            for sub in self.submodules
            for w_name, grad_weight in sub.grad_weights.items()
        }
        return grad

In [None]:
# Optimized Convolutional Layer Using im2col and col2im (corrected)
class Convolution(Module):
    """
    2D convolution implemented as a single matrix product via im2col / col2im.

    Weights are stored flattened: 'W' has shape
    (out_channels, in_channels * kernel_height * kernel_width) and 'b' has
    shape (out_channels,). Both are float32.
    """

    def __init__(self, name, in_channels, out_channels, kernel_size, padding=0, stride=1):
        """
        :param name: Layer name (used as key prefix by Composition)
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels (filters)
        :param kernel_size: Tuple (kernel_height, kernel_width), or a single int for a square kernel
        :param padding: Zero padding added to each side of both spatial dimensions
        :param stride: Stride of the convolution
        """
        super().__init__(name)
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride

        # He initialization. The scale is a plain Python float and the cast to
        # float32 happens after scaling: multiplying a float32 CuPy array by a
        # 0-d float64 CuPy array (what cp.sqrt returns) promotes to float64.
        fan_in = in_channels * kernel_size[0] * kernel_size[1]
        he_scale = (2.0 / fan_in) ** 0.5
        self.weights = {
            'W': (cp.random.randn(out_channels, fan_in) * he_scale).astype(cp.float32),
            'b': cp.zeros(out_channels, dtype=cp.float32)
        }

    def forward(self, input):
        """
        :param input: Tensor of shape (batch_size, in_channels, height, width)
        :return: Tensor of shape (batch_size, out_channels, out_height, out_width)
        """
        self.input = input
        batch_size = input.shape[0]
        kernel_height, kernel_width = self.kernel_size
        stride = self.stride

        # Apply zero padding to the two spatial axes only
        if self.padding > 0:
            pad = (self.padding, self.padding)
            input_padded = cp.pad(input, ((0, 0), (0, 0), pad, pad), mode='constant')
        else:
            input_padded = input

        # Kept for backward, which needs the padded shape for col2im
        self.input_padded = input_padded
        _, _, padded_height, padded_width = input_padded.shape

        # Compute output dimensions
        out_height = (padded_height - kernel_height) // stride + 1
        out_width = (padded_width - kernel_width) // stride + 1

        # Shape: (batch_size*out_height*out_width, in_channels*kernel_height*kernel_width)
        self.cols = im2col(input_padded, self.kernel_size, self.stride)

        # Shape: (batch_size*out_height*out_width, out_channels)
        out = self.cols @ self.weights['W'].T + self.weights['b'][None, :]

        # Shape: (batch_size, out_channels, out_height, out_width)
        return out.reshape(batch_size, out_height, out_width, self.out_channels).transpose(0, 3, 1, 2)

    def backward(self, grad_output):
        """
        Store the gradients for 'W' and 'b' in grad_weights and return the
        gradient with respect to the (unpadded) input.

        :param grad_output: Tensor of shape (batch_size, out_channels, out_height, out_width)
        :return: Tensor with the shape of the input given to forward
        """
        out_channels = grad_output.shape[1]

        # Shape: (batch_size*out_height*out_width, out_channels)
        grad_output_reshaped = grad_output.transpose(0, 2, 3, 1).reshape(-1, out_channels)

        # Gradients w.r.t. weights and biases
        grad_W = grad_output_reshaped.T @ self.cols  # (out_channels, in_channels*kh*kw)
        grad_b = grad_output_reshaped.sum(axis=0)    # (out_channels,)

        # Gradient w.r.t. the im2col matrix, folded back onto the padded input
        grad_cols = grad_output_reshaped @ self.weights['W']
        grad_input_padded = col2im(grad_cols, self.input_padded.shape, self.kernel_size, self.stride)

        # Remove padding if applied
        if self.padding > 0:
            grad_input = grad_input_padded[:, :, self.padding:-self.padding, self.padding:-self.padding]
        else:
            grad_input = grad_input_padded

        self.grad_weights['W'] = grad_W
        self.grad_weights['b'] = grad_b

        return grad_input

# Linear (Fully Connected) Layer
class Linear(Module):
    """
    Fully connected layer computing `input @ W.T + b`.

    'W' has shape (n_out_features, n_in_features) and 'b' has shape
    (n_out_features,); both are drawn uniformly from
    [-1/sqrt(n_in_features), 1/sqrt(n_in_features)) and cast to float32.
    """

    def __init__(self, name='Linear', n_in_features=256, n_out_features=10):
        super().__init__(name)
        bound = 1.0 / cp.sqrt(n_in_features)
        weight_matrix = cp.random.uniform(-bound, bound, (n_out_features, n_in_features))
        bias_vector = cp.random.uniform(-bound, bound, (n_out_features,))
        self.weights = {
            'W': weight_matrix.astype(cp.float32),
            'b': bias_vector.astype(cp.float32)
        }

    def forward(self, input):
        # Cached because backward needs it for the weight gradient.
        self.input = input
        projected = input @ self.weights['W'].T
        return projected + self.weights['b']

    def backward(self, grad_output):
        # Parameter gradients: (n_out_features, n_in_features) and (n_out_features,)
        self.grad_weights['W'] = grad_output.T @ self.input
        self.grad_weights['b'] = cp.sum(grad_output, axis=0)

        # Gradient flowing to the previous layer
        return grad_output @ self.weights['W']


# Flatten Layer
class Flatten(Module):
    """Collapse every axis after the batch axis into one; backward restores the shape."""

    def __init__(self, name='Flatten'):
        super().__init__(name)

    def forward(self, input):
        # Remember the incoming shape so backward can undo the reshape.
        self.input_shape = input.shape
        return input.reshape(input.shape[0], -1)

    def backward(self, grad_output):
        original_shape = self.input_shape
        return grad_output.reshape(original_shape)

In [None]:
# ReLU Activation Layer
class ReLU(Module):
    """Element-wise rectifier: max(0, x)."""

    def __init__(self, name='ReLU'):
        super().__init__(name)

    def forward(self, input):
        # The input is cached; backward derives the pass-through mask from it.
        self.input = input
        rectified = cp.maximum(0, input)
        return rectified

    def backward(self, grad_output):
        # Gradient passes only where the forward input was strictly positive.
        positive_mask = self.input > 0
        return grad_output * positive_mask

# Response Normalization Activation Layer (Optimized and Vectorized)
class ResponseNormalization(Module):
    """
    Cross-channel response normalization:

        s = k + (alpha / 2) * sum_c x_c ** 2        (sum over axis 1)
        y = x / s ** beta
    """

    def __init__(self, name='ResponseNormalization', alpha=0.05, beta=0.5, k=1.0):
        """
        Initialize the Response Normalization layer.
        :param alpha: Scaling parameter in the normalization formula
        :param beta: Exponent parameter in the normalization formula
        :param k: Bias term to avoid division by zero
        """
        super().__init__(name)
        self.alpha = alpha
        self.beta = beta
        self.k = k

    def forward(self, input):
        """
        Forward pass for the response normalization layer.
        :param input: Input tensor of shape (batch_size, depth, height, width)
        :return: Normalized tensor of the same shape
        """
        self.input = input
        # Base of the power, cached separately because backward needs
        # s ** (beta + 1), which cannot be obtained as norm ** (beta + 1).
        self.scale = self.k + (self.alpha / 2) * cp.sum(input ** 2, axis=1, keepdims=True)
        # Normalization denominator s ** beta
        self.norm = self.scale ** self.beta
        return input / self.norm

    def backward(self, grad_output):
        """
        Backward pass for the response normalization layer.

        With y_i = x_i * s ** -beta and ds/dx_j = alpha * x_j:

            dL/dx_j = g_j / s ** beta
                      - alpha * beta * x_j * sum_i(x_i * g_i) / s ** (beta + 1)

        :param grad_output: Gradient of the loss w.r.t. the output tensor
        :return: Gradient of the loss w.r.t. the input tensor
        """
        channel_dot = cp.sum(self.input * grad_output, axis=1, keepdims=True)
        # s ** (beta + 1) == s ** beta * s == norm * scale
        correction = self.alpha * self.beta * self.input * channel_dot / (self.norm * self.scale)
        return grad_output / self.norm - correction

# Softmax Activation Layer
class Softmax(Module):
    """Row-wise softmax over axis 1 of a (batch_size, num_classes) input."""

    def __init__(self, name='Softmax'):
        super().__init__(name)

    def forward(self, input):
        """
        :param input: Scores of shape (batch_size, num_classes)
        :return: Probabilities of the same shape; each row sums to 1
        """
        # Shift input for numerical stability (softmax is shift-invariant)
        input_shifted = input - cp.max(input, axis=1, keepdims=True)
        exp_scores = cp.exp(input_shifted)
        self.output = exp_scores / cp.sum(exp_scores, axis=1, keepdims=True)
        return self.output

    def backward(self, grad_output):
        """
        Compute the gradient of the loss with respect to the input of the Softmax layer.

        For one sample the Jacobian is diag(y) - y y^T, so
        J @ g = y * g - y * (y . g) = y * (g - sum(y * g)).
        Evaluating that closed form for the whole batch avoids building a
        (num_classes x num_classes) matrix per sample in a Python loop.

        :param grad_output: Gradient from the loss with respect to the Softmax output.
        :return: Gradient with respect to the input of the Softmax layer.
        """
        probs = self.output
        weighted_sum = cp.sum(probs * grad_output, axis=1, keepdims=True)
        grad_input = probs * (grad_output - weighted_sum)
        # Keep the dtype of the forward output, as the loop-based version did.
        return grad_input.astype(probs.dtype, copy=False)

In [None]:
# Corrected Stochastic Pooling Layer
class StochasticPooling(Module):
    """
    Stochastic pooling over non-overlapping windows.

    In every window one element is drawn with probability proportional to its
    absolute value, and that element is the window's output. The draw happens
    on every forward call; this class has no separate deterministic mode.
    """

    def __init__(self, name='StochasticPooling', pool_size=(2, 2)):
        """
        Initialize the Stochastic Pooling layer.
        :param pool_size: Tuple (height, width) specifying the pooling region size.
        """
        super().__init__(name)
        self.pool_size = pool_size

    @staticmethod
    def _index_grids(batch_size, channels, out_h, out_w):
        """Return four index arrays of shape (batch, channels, out_h, out_w) addressing every output cell."""
        full_shape = (batch_size, channels, out_h, out_w)
        return (
            cp.broadcast_to(cp.arange(batch_size)[:, None, None, None], full_shape),
            cp.broadcast_to(cp.arange(channels)[None, :, None, None], full_shape),
            cp.broadcast_to(cp.arange(out_h)[None, None, :, None], full_shape),
            cp.broadcast_to(cp.arange(out_w)[None, None, None, :], full_shape),
        )

    def forward(self, input):
        """
        Forward pass for stochastic pooling.
        :param input: 4D tensor (batch_size, channels, height, width)
        :return: Tensor of shape (batch_size, channels, height // pool_h, width // pool_w)
        """
        self.input = input
        batch_size, channels, height, width = input.shape
        pool_h, pool_w = self.pool_size
        out_h = height // pool_h
        out_w = width // pool_w
        window = pool_h * pool_w

        # Trailing rows/columns that do not fill a whole window are ignored
        # (they receive zero gradient in backward).
        cropped = input[:, :, :out_h * pool_h, :out_w * pool_w]

        # (batch, channels, out_h, out_w, pool_h * pool_w): one window per last axis
        windows = cropped.reshape(batch_size, channels, out_h, pool_h, out_w, pool_w)
        windows = windows.transpose(0, 1, 2, 4, 3, 5).reshape(batch_size, channels, out_h, out_w, window)

        # Selection probabilities from absolute values
        magnitudes = cp.abs(windows)
        probabilities = magnitudes / (cp.sum(magnitudes, axis=-1, keepdims=True) + 1e-12)  # Prevent division by zero
        cumulative_probs = cp.cumsum(probabilities, axis=-1)

        # One uniform draw per window
        random_vals = cp.random.rand(batch_size, channels, out_h, out_w, 1)

        # Inverse-CDF sampling: the chosen index is the first i with
        # cumulative[i] > r, i.e. the NUMBER of entries with cumulative <= r.
        # Subtracting 1 here would merge the first two probabilities onto
        # index 0 and make the last element practically unreachable.
        sampled = cp.sum(cumulative_probs <= random_vals, axis=-1)
        # Rounding can leave cumulative[-1] slightly below 1, and an all-zero
        # window has all cumulative values equal to 0; clip both into range.
        sampled = cp.clip(sampled, 0, window - 1)
        self.sampled_indices_flat = sampled

        batch_idx, channel_idx, row_idx, col_idx = self._index_grids(batch_size, channels, out_h, out_w)
        return windows[batch_idx, channel_idx, row_idx, col_idx, sampled]

    def backward(self, grad_output):
        """
        Backward pass for stochastic pooling: each output gradient is routed to
        the input element that was sampled in forward; all others get zero.
        :param grad_output: Gradient of loss with respect to output
        :return: Gradient of loss with respect to input
        """
        batch_size, channels, out_h, out_w = grad_output.shape
        pool_h, pool_w = self.pool_size

        grad_input = cp.zeros_like(self.input, dtype=cp.float32)

        # Flat window index -> (row, column) offset inside the window
        offset_h = self.sampled_indices_flat // pool_w
        offset_w = self.sampled_indices_flat % pool_w

        batch_idx, channel_idx, row_idx, col_idx = self._index_grids(batch_size, channels, out_h, out_w)

        # Windows do not overlap, so every target position is written once.
        grad_input[
            batch_idx,
            channel_idx,
            row_idx * pool_h + offset_h,
            col_idx * pool_w + offset_w
        ] = grad_output

        return grad_input

In [None]:
# Combined Softmax and Cross-Entropy Loss
class SoftmaxCrossEntropy(Module):
    """Softmax followed by categorical cross-entropy, averaged over the batch."""

    def __init__(self, name='SoftmaxCrossEntropy'):
        super().__init__(name)

    def forward(self, input, target):
        """
        Compute class probabilities and the mean cross-entropy loss.
        :param input: Logits (batch_size, num_classes)
        :param target: True labels as one-hot vectors (batch_size, num_classes)
        :return: Scalar loss value
        """
        # Subtracting the row maximum keeps exp() from overflowing.
        stabilised = input - cp.max(input, axis=1, keepdims=True)
        exponentials = cp.exp(stabilised)
        row_totals = cp.sum(exponentials, axis=1, keepdims=True)

        self.output = exponentials / row_totals
        self.target = target

        # The small constant guards log(0).
        log_likelihood = target * cp.log(self.output + 1e-12)
        self.loss = -cp.sum(log_likelihood) / input.shape[0]
        return self.loss

    def backward(self):
        """
        Gradient of the mean loss with respect to the logits.
        :return: Gradient with respect to input logits
        """
        n_samples = self.target.shape[0]
        residual = self.output - self.target
        return residual / n_samples

# Mini-Batch SGD Optimizer (corrected)
class MiniBatchSGD:
    """
    Plain stochastic gradient descent: w <- w - lr * grad.

    Works on any object exposing `weights` and `grad_weights` dictionaries
    with matching keys, whether the keys are prefixed ('Conv1.W', as produced
    by Composition) or bare ('W', 'b', as stored by a single layer).
    """

    def __init__(self, module, lr=1e-3, batch_size=32):
        """
        :param module: Object whose `weights` are updated from its `grad_weights`
        :param lr: Learning rate
        :param batch_size: Stored on the optimizer; not used by step()
        """
        self.module = module
        self.lr = lr
        self.batch_size = batch_size

    def step(self):
        """
        Apply one in-place update to every 'W' and 'b' entry.

        :raises KeyError: if a gradient has no parameter with the same key
        """
        weights = self.module.weights
        for name, grad in self.module.grad_weights.items():
            # Look at the last dotted component so that both 'Layer.W' and a
            # bare 'W' are recognised; other entries are left untouched.
            leaf = name.rsplit('.', 1)[-1]
            if leaf not in ('W', 'b'):
                continue
            if name not in weights:
                raise KeyError(f"Weight key '{name}' not found in module.weights.")
            # In-place, so arrays shared with the owning layer are updated too.
            weights[name] -= self.lr * grad

In [None]:
# Define the CNN model without SoftmaxCrossEntropy
def create_cnn_model():
    """
    Build the CNN that maps a 1x28x28 image to 10 logits.

    The shape comments assume 28x28 single-channel input, no padding, stride 1.
    """
    layers = []

    # Convolution + ReLU: 1x28x28 -> 16x25x25
    layers.append(Convolution(name='Conv1', in_channels=1, out_channels=16, kernel_size=(4, 4), padding=0, stride=1))
    layers.append(ReLU(name='ReLU1'))

    # Convolution + response normalization: 16x25x25 -> 32x24x24
    layers.append(Convolution(name='Conv2', in_channels=16, out_channels=32, kernel_size=(2, 2), padding=0, stride=1))
    layers.append(ResponseNormalization(name='Norm1', alpha=0.05, beta=0.5, k=1.0))

    # Two stochastic pooling stages: 32x24x24 -> 32x12x12 -> 32x6x6
    layers.append(StochasticPooling(name='Pool1', pool_size=(2, 2)))
    layers.append(StochasticPooling(name='Pool2', pool_size=(2, 2)))

    # Convolution: 32x6x6 -> 64x4x4
    layers.append(Convolution(name='Conv3', in_channels=32, out_channels=64, kernel_size=(3, 3), padding=0, stride=1))

    # Classifier head: 64x4x4 -> 1024 -> 10
    layers.append(Flatten(name='Flatten1'))
    layers.append(Linear(name='FC1', n_in_features=1024, n_out_features=10))

    return Composition(layers)

# Load and preprocess the MNIST dataset
def load_preprocess_data():
    """
    Load MNIST, scale pixels to [0, 1], add a channel axis and one-hot encode
    the labels. Every returned array lives on the GPU.

    :return: X_train, y_train, X_test, y_test
    """
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    print("Original training data shape:", X_train.shape)
    print("Original test data shape:", X_test.shape)

    # uint8 pixels -> float32 in [0, 1] on the GPU, laid out as
    # (batch_size, channels, height, width)
    X_train = (cp.asarray(X_train, dtype=cp.float32) / 255.0).reshape(-1, 1, 28, 28)
    X_test = (cp.asarray(X_test, dtype=cp.float32) / 255.0).reshape(-1, 1, 28, 28)

    print("Reshaped training data shape:", X_train.shape)
    print("Reshaped test data shape:", X_test.shape)

    # Indexing the identity matrix with the labels yields one-hot rows
    num_classes = 10
    y_train = cp.eye(num_classes, dtype=cp.float32)[y_train]
    y_test = cp.eye(num_classes, dtype=cp.float32)[y_test]

    print("One-hot encoded training labels shape:", y_train.shape)
    print("One-hot encoded test labels shape:", y_test.shape)

    return X_train, y_train, X_test, y_test

# Instantiate the model, loss function, and optimizer
def instantiate_components():
    """
    Create a fresh model together with its loss function and optimizer.

    :return: (model, loss_fn, optimizer); the optimizer is bound to the model
             and uses lr=0.01, batch_size=32
    """
    model = create_cnn_model()
    return (
        model,
        SoftmaxCrossEntropy(),
        MiniBatchSGD(module=model, lr=0.01, batch_size=32),
    )

def train_model(model, loss_fn, optimizer, X_train, y_train, num_epochs=10, batch_size=32):
    """
    Train `model` with mini-batch gradient descent and print the mean loss and
    accuracy after every epoch.

    :param model: Module with forward/backward (e.g. a Composition)
    :param loss_fn: Loss with forward(logits, target), backward() and an
                    `output` attribute holding the class probabilities
    :param optimizer: Object with step(), already bound to `model`
    :param X_train: Inputs, first axis is the sample axis
    :param y_train: One-hot targets aligned with X_train
    :param num_epochs: Number of passes over the data (0 performs no training)
    :param batch_size: Number of samples per update
    :raises ValueError: if X_train contains no samples
    """
    # Batches are gathered directly on the device; only the index vector of
    # each batch is uploaded, instead of staging the whole dataset on the host
    # and re-uploading every batch.
    X_train = cp.asarray(X_train)
    y_train = cp.asarray(y_train)

    num_samples = X_train.shape[0]
    if num_samples == 0:
        raise ValueError("train_model requires at least one training sample.")

    for epoch in range(num_epochs):
        # A new sample order every epoch
        permutation = np.random.permutation(num_samples)

        epoch_loss = 0.0
        correct_predictions = 0

        for start in range(0, num_samples, batch_size):
            batch_idx = cp.asarray(permutation[start:start + batch_size])
            X_batch = X_train[batch_idx]
            y_batch = y_train[batch_idx]

            # Forward pass
            logits = model.forward(X_batch)
            loss = loss_fn.forward(logits, y_batch)
            # loss is a per-sample mean; weight it by the batch length so a
            # shorter final batch does not distort the epoch average.
            epoch_loss += loss * X_batch.shape[0]

            predictions = cp.argmax(loss_fn.output, axis=1)
            targets = cp.argmax(y_batch, axis=1)
            correct_predictions += cp.sum(predictions == targets)

            # Backward pass
            grad_loss = loss_fn.backward()
            model.backward(grad_loss)
            optimizer.step()

            # Free GPU memory after each batch
            del X_batch, y_batch, batch_idx, logits, grad_loss, predictions, targets
            cp.get_default_memory_pool().free_all_blocks()

        avg_loss = float(epoch_loss) / num_samples
        accuracy = float(correct_predictions) / num_samples
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f} - Accuracy: {accuracy:.4f}")

    # Free memory after training
    cp.get_default_memory_pool().free_all_blocks()

In [None]:
# LIME
from lime import lime_image
from skimage.segmentation import mark_boundaries
from skimage.transform import resize
import matplotlib.pyplot as plt
from skimage.segmentation import quickshift

def explain_prediction(model, image, label, num_features=5, hide_rest=True):
    """
    Show a LIME explanation of the model's score for class `label` on one image.

    :param model: Module whose forward() returns logits
    :param image: CuPy array of shape (1, channels, height, width)
    :param label: Class index to explain
    :param num_features: Number of superpixels to include in the mask
    :param hide_rest: Passed through to get_image_and_mask
    """
    # LIME works on host-side (H, W, 3) images: bring the sample to NumPy,
    # move channels last and replicate a single channel three times.
    host_image = image.get()[0].transpose(1, 2, 0)
    if host_image.shape[-1] == 1:
        host_image = np.repeat(host_image, 3, axis=-1)
    lime_input = resize(host_image, (28, 28), anti_aliasing=True)

    def predict_fn(images):
        # (N, H, W, C) host batch -> (N, C, H, W) device batch
        batch = cp.asarray(images, dtype=cp.float32).transpose(0, 3, 1, 2)
        if batch.shape[1] == 3:
            # Weighted RGB -> single grey channel, the input the model expects
            grey = (batch[:, 0, :, :] * 0.2989 +
                    batch[:, 1, :, :] * 0.5870 +
                    batch[:, 2, :, :] * 0.1140)
            batch = grey.reshape(batch.shape[0], 1, batch.shape[2], batch.shape[3])
        logits = model.forward(batch)
        # Numerically stable softmax over the class axis
        unnormalised = cp.exp(logits - cp.max(logits, axis=1, keepdims=True))
        probabilities = unnormalised / cp.sum(unnormalised, axis=1, keepdims=True)
        return probabilities.get()

    def segmentation_fn(x):
        return quickshift(x, kernel_size=1, max_dist=5, ratio=0.2)

    explainer = lime_image.LimeImageExplainer()
    explanation = explainer.explain_instance(
        lime_input,
        predict_fn,
        labels=[label],
        hide_color=0,
        num_samples=1000,
        segmentation_fn=segmentation_fn
    )

    # positive_only=False keeps both supporting and opposing superpixels
    highlighted, mask = explanation.get_image_and_mask(
        label=label,
        positive_only=False,
        num_features=num_features,
        hide_rest=hide_rest
    )

    plt.imshow(mark_boundaries(highlighted, mask))
    plt.title(f"LIME Explanation for Class {label}")
    plt.axis('off')
    plt.show()

In [None]:
def evaluate_model(model, loss_fn, X_test, y_test, batch_size=64, num_explanations=5):
    """
    Print the mean loss and accuracy of `model` on a test set, then show LIME
    explanations for the first few samples.

    :param model: Module whose forward() returns logits
    :param loss_fn: Loss with forward(logits, target) returning the batch mean
    :param X_test: Inputs, first axis is the sample axis
    :param y_test: One-hot targets aligned with X_test
    :param batch_size: Number of samples evaluated per forward pass
    :param num_explanations: Number of leading samples to explain with LIME
    :raises ValueError: if X_test contains no samples
    """
    num_samples = X_test.shape[0]
    if num_samples == 0:
        raise ValueError("evaluate_model requires at least one test sample.")

    total_loss = 0.0
    correct_predictions = 0

    for start in tqdm(range(0, num_samples, batch_size), desc="Evaluating"):
        X_batch = X_test[start:start + batch_size]
        y_batch = y_test[start:start + batch_size]

        # Forward pass: Get logits from the model
        logits = model.forward(X_batch)

        # loss_fn returns a per-sample mean; weight it by the batch length so
        # the shorter final batch counts in proportion to its size.
        loss = loss_fn.forward(logits, y_batch)
        total_loss += loss * X_batch.shape[0]

        # Softmax is monotonic per row, so the argmax of the logits is the
        # argmax of the probabilities.
        predictions = cp.argmax(logits, axis=1)
        targets = cp.argmax(y_batch, axis=1)
        correct_predictions += cp.sum(predictions == targets)

        # Free memory for the current batch
        del X_batch, y_batch, logits, loss, predictions, targets
        cp.get_default_memory_pool().free_all_blocks()

    # Calculate average loss and accuracy over samples
    avg_loss = float(total_loss) / num_samples
    accuracy = float(correct_predictions) / num_samples
    print(f"Test Loss: {avg_loss:.4f} - Test Accuracy: {accuracy:.4f}")

    # Generate LIME explanations for a few test samples
    print("\nGenerating LIME explanations for sample predictions...")
    for idx in range(min(num_explanations, num_samples)):
        image = X_test[idx:idx+1]
        label = cp.argmax(y_test[idx]).item()
        explain_prediction(model, image, label, num_features=5, hide_rest=True)


def evaluate_in_batches(model, X, y, batch_size=64):
    """
    Compute classification accuracy batch by batch to bound GPU memory use.

    :param model: Module whose forward() returns logits
    :param X: Inputs, first axis is the sample axis
    :param y: One-hot targets aligned with X
    :param batch_size: Number of samples per forward pass
    :return: Fraction of samples whose predicted class equals the target class
    """
    total = X.shape[0]
    n_batches = (total + batch_size - 1) // batch_size
    hits = 0

    for batch_no in range(n_batches):
        lo = batch_no * batch_size
        hi = min(lo + batch_size, total)
        X_batch, y_batch = X[lo:hi], y[lo:hi]

        probs = Softmax().forward(model.forward(X_batch))
        hits += cp.sum(cp.argmax(probs, axis=1) == cp.argmax(y_batch, axis=1))

        # Release this batch's device memory before the next one
        del X_batch, y_batch, probs
        cp.get_default_memory_pool().free_all_blocks()

    return hits / total

def hyperparameter_tuning(X_train, y_train, X_test, y_test, hyperparams_list):
    """
    Train one fresh model per hyperparameter set and report the best one.

    NOTE(review): configurations are ranked by accuracy on (X_test, y_test),
    so the winning score is not an unbiased estimate; pass a validation split
    here if a clean test score is needed.

    :param X_train: Training inputs
    :param y_train: One-hot training targets
    :param X_test: Inputs used to score each configuration
    :param y_test: One-hot targets used to score each configuration
    :param hyperparams_list: List of dicts with optional keys 'lr' (default
                             0.001), 'batch_size' (default 64) and 'num_epochs' (default 10)
    :return: (best_params, best_accuracy); (None, -1) if the list is empty
    """
    best_accuracy = -1
    best_params = None

    for params in hyperparams_list:
        lr = params.get('lr', 0.001)
        batch_size = params.get('batch_size', 64)
        num_epochs = params.get('num_epochs', 10)

        # Every configuration starts from newly initialized weights
        model = create_cnn_model()
        loss_fn = SoftmaxCrossEntropy()
        optimizer = MiniBatchSGD(module=model, lr=lr, batch_size=batch_size)

        train_model(model, loss_fn, optimizer, X_train, y_train, num_epochs=num_epochs, batch_size=batch_size)

        # Evaluate model in batches to avoid OOM; convert to a host float so
        # the comparison and the returned value do not depend on device arrays
        accuracy = float(evaluate_in_batches(model, X_test, y_test, batch_size=64))
        print(f"Hyperparams: lr={lr}, batch_size={batch_size}, num_epochs={num_epochs} -> Test Accuracy: {accuracy}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params

    print(f"Best Hyperparams: {best_params} with Test Accuracy: {best_accuracy}")
    return best_params, best_accuracy

if __name__ == "__main__":
    # Candidate configurations, each trained from scratch and scored in turn
    hyperparams_list = [
        dict(lr=0.001, batch_size=64, num_epochs=10),
        dict(lr=0.005, batch_size=32, num_epochs=50),
        dict(lr=0.0005, batch_size=128, num_epochs=100),
    ]

    X_train, y_train, X_test, y_test = load_preprocess_data()

    hyperparameter_tuning(X_train, y_train, X_test, y_test, hyperparams_list)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Original training data shape: (60000, 28, 28)
Original test data shape: (10000, 28, 28)
Reshaped training data shape: (60000, 1, 28, 28)
Reshaped test data shape: (10000, 1, 28, 28)
One-hot encoded training labels shape: (60000, 10)
One-hot encoded test labels shape: (10000, 10)
Epoch 1/10 - Loss: 2.0719 - Accuracy: 0.3293
Epoch 2/10 - Loss: 1.6738 - Accuracy: 0.5832
Epoch 3/10 - Loss: 1.4259 - Accuracy: 0.6507
Epoch 4/10 - Loss: 1.2554 - Accuracy: 0.6876
Epoch 5/10 - Loss: 1.1444 - Accuracy: 0.7036
Epoch 6/10 - Loss: 1.0588 - Accuracy: 0.7204
Epoch 7/10 - Loss: 0.9942 - Accuracy: 0.7318
Epoch 8/10 - Loss: 0.9462 - Accuracy: 0.7361
Epoch 9/10 - Loss: 0.9025 - Accuracy: 0.7440
Epoch 10/10 - Loss: 0.8733 - Accuracy: 0.7507
Hyperparams: lr=0.001, batch_size=64, num_epochs=10 -> Test Accuracy: 0.7604
Epoch 1/50 - Lo