In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np


In [30]:
# Run on the GPU when PyTorch reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


Using device: cpu


In [31]:
# Every image goes through torchvision's ToTensor transform before it is batched.
transform = transforms.ToTensor()

# MNIST test split (train=False) rooted at ../data; download=True allows
# torchvision to fetch the files into that directory.
test_dataset = torchvision.datasets.MNIST(
    "../data", train=False, transform=transform, download=True
)

# Batches of 128 in a fixed order (shuffle=False).
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=128)


In [32]:
class SimpleCNN(nn.Module):
    """Small CNN classifier: two 3x3 convolutions, one 2x2 max-pool, two linear layers.

    The first linear layer expects ``64 * 14 * 14`` features, i.e. 64 pooled
    14x14 maps, which corresponds to single-channel 28x28 inputs.

    The attribute names ``conv`` / ``fc`` and the position of each layer inside
    its ``nn.Sequential`` define the ``state_dict`` keys (``conv.0.weight``,
    ``fc.2.bias``, ...), so they must not change if saved weights are to load.
    """

    def __init__(self):
        super().__init__()
        # Layers are instantiated in the same order as they appear in the
        # network: conv stack first, then the classifier head.
        feature_layers = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        ]
        head_layers = [
            nn.Linear(in_features=64 * 14 * 14, out_features=128),  # penultimate
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=10),
        ]
        self.conv = nn.Sequential(*feature_layers)
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the 10 raw class scores for each sample in the batch ``x``."""
        feature_maps = self.conv(x)
        # Collapse everything except the batch dimension into one feature vector.
        flat = feature_maps.view(feature_maps.shape[0], -1)
        return self.fc(flat)


In [33]:
# Rebuild the architecture on `device`, then restore the saved weights into it.
model = SimpleCNN().to(device)

# torch.load unpickles the file. weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered checkpoint cannot run arbitrary
# code while being loaded. The result is passed straight to load_state_dict,
# so nothing beyond a state_dict is needed from the file.
# NOTE(review): the weights_only argument requires PyTorch >= 1.13.
state_dict = torch.load(
    "../models/backdoored_model.pth",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()
print("✅ Backdoored model loaded")


✅ Backdoored model loaded


In [34]:
# Record the output of the first fully connected layer (model.fc[0]) for every
# test batch, then average over samples to get one mean activation per unit.
# The hook is attached to the Linear layer itself, so the recorded values are
# its raw outputs, before the ReLU that follows it in model.fc.
activation_batches = []


def hook_fn(module, input, output):
    """Forward hook: keep a detached CPU copy of the hooked layer's output."""
    activation_batches.append(output.detach().cpu())


hook = model.fc[0].register_forward_hook(hook_fn)
try:
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            model(images)  # only the hook's side effect is needed
finally:
    # Remove the hook even if the loop raises or is interrupted; a leftover
    # hook would keep firing on every later forward pass of `model`.
    hook.remove()

# One row per test sample, one column per unit of model.fc[0].
activations = torch.cat(activation_batches, dim=0)
mean_activation = activations.mean(dim=0)


In [35]:
# Zero out the units of model.fc[0] with the lowest mean activation.
prune_ratio = 0.2
num_prune = int(prune_ratio * mean_activation.numel())  # fraction rounded down

# Ascending sort: the first `num_prune` indices belong to the smallest means.
prune_idx = mean_activation.argsort()[:num_prune]

dense_layer = model.fc[0]
with torch.no_grad():
    # Row i of the weight matrix and entry i of the bias produce unit i, so
    # clearing both makes that unit output exactly 0 for any input.
    dense_layer.weight[prune_idx] = 0
    dense_layer.bias[prune_idx] = 0

print(f"✂️ Pruned {num_prune}/{mean_activation.numel()} neurons")


✂️ Pruned 25/128 neurons


In [36]:
# Fine-tune the model on the MNIST training split.
train_dataset = torchvision.datasets.MNIST(
    "../data", train=True, transform=transform, download=True
)
# Reshuffled every epoch, 128 samples per batch.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=128)

epochs = 2
criterion = nn.CrossEntropyLoss()
# NOTE(review): every parameter is handed to the optimizer and no mask is
# re-applied after each step; presumably the zeroed units stay at zero because
# they receive no gradient through the following ReLU — confirm.
optimizer = optim.Adam(model.parameters(), lr=5e-4)

model.train()
for epoch in range(epochs):
    total_loss = 0  # sum of the per-batch losses of this epoch, not their mean
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}] Loss: {total_loss:.4f}")


Epoch [1/2] Loss: 9.5604
Epoch [2/2] Loss: 5.7660


In [37]:
# Accuracy of the current model on the test loader, as a percentage.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest score in each row.
        predicted = torch.max(outputs, dim=1).indices
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

clean_acc = 100 * correct / total
print("✅ Clean Accuracy after defense:", clean_acc)


✅ Clean Accuracy after defense: 98.91
