In [21]:
!pip install opacus



In [22]:
!pip install scikit-learn-extra



In [23]:
!pip install apricot-select numpy scikit-learn



In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split, Subset
import numpy as np
from opacus import PrivacyEngine

In [69]:
import heapq
from sklearn.metrics import pairwise_distances
from apricot import FacilityLocationSelection
import time

In [56]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters read by the data-loading and training cells.
BATCH_SIZE = 128  # batch size of the training DataLoader
LR = 0.01  # learning rate handed to optim.SGD
EPOCHS = 20  # maximum number of training epochs (early stopping may end sooner)
SEED = 42  # seed for the global torch and NumPy RNGs (see below)
MAX_GRAD_NORM = 1.0  # passed as max_grad_norm to PrivacyEngine.make_private
NOISE_MULTIPLIER = 1.0  # passed as noise_multiplier to PrivacyEngine.make_private
DELTA = 1e-5  # delta passed to privacy_engine.get_epsilon
PATIENCE = 5  # epochs without a validation-accuracy improvement before stopping

# Seed the global RNGs of torch and NumPy.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [70]:
# Convert images to tensors and normalise them with mean 0.1307 / std 0.3081.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

full_train = datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform
)

# train_size = 50000
# val_size = len(full_train) - train_size
# train_dataset, val_dataset = random_split(full_train, [train_size, val_size])

# Work on a small slice of MNIST: 5000 candidate training images and 1000
# validation images drawn from the remaining ones.
train_size = 5000
val_size = 1000
train_dataset, subset_rest = random_split(full_train, [train_size, len(full_train) - train_size])
val_dataset, _ = random_split(subset_rest, [val_size, len(subset_rest) - val_size])

test_dataset = datasets.MNIST(
    root="./data",
    train=False,
    download=True,
    transform=transform
)

# Number of training images kept by the facility-location selection.
CORESET_SIZE = 300

# Raw pixel values of the candidate training images, one flattened row per image.
train_data_array = train_dataset.dataset.data[train_dataset.indices].numpy()
train_data_array = train_data_array.reshape(train_data_array.shape[0], -1)

selector = FacilityLocationSelection(n_samples=CORESET_SIZE, metric='euclidean', optimizer='lazy', verbose=True)

# fit_transform already fits the selector, so a second fit() would only
# repeat the whole selection (this is where the duplicated progress bar came from).
selected_subset_data = selector.fit_transform(train_data_array)

# Positions (within train_dataset) of the selected images.
selected_indices = selector.ranking

selected_train_dataset = Subset(train_dataset, selected_indices)

train_loader = DataLoader(selected_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=256, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=256, shuffle=False)

100%|██████████| 300/300 [00:00<00:00, 428it/s]
100%|██████████| 300/300 [00:00<00:00, 427it/s]


In [71]:
# baseline model
class CNN(nn.Module):
    """Small convolutional classifier: two 3x3 convolutions, a 2x2 max-pool and two linear layers.

    The first linear layer expects 9216 (= 64 * 12 * 12) flattened features,
    which is what a single-channel 28x28 input produces after the two
    unpadded convolutions and the pooling step. The network returns 10 raw
    (un-normalised) scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order and under the same attribute
        # names as before, so initialisation and state_dict keys are unchanged.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a batch of images to a (batch, 10) tensor of class scores."""
        conv_out = torch.relu(self.conv2(torch.relu(self.conv1(x))))
        pooled = F.max_pool2d(conv_out, kernel_size=2)
        flat = pooled.flatten(start_dim=1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

In [65]:
def train_one_epoch(model, loader, optimizer):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Each batch is moved to the notebook-level `device`, scored with
    cross-entropy, back-propagated and followed by one optimizer step.
    The returned value is the sum of the per-batch losses divided by
    `len(loader)`.
    """
    model.train()
    running_loss = 0
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        batch_loss = F.cross_entropy(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [66]:
@torch.no_grad()
def evaluate(model, loader):
    """Score `model` on `loader` without tracking gradients.

    Returns a `(loss, accuracy)` pair: the summed cross-entropy divided by
    `len(loader.dataset)`, and the percentage (0-100) of samples whose
    arg-max prediction equals the target.
    """
    model.eval()
    summed_loss = 0
    n_correct = 0
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        scores = model(inputs)
        summed_loss += F.cross_entropy(scores, labels, reduction="sum").item()
        n_correct += (scores.argmax(1) == labels).sum().item()
    n_examples = len(loader.dataset)
    return summed_loss / n_examples, 100. * n_correct / n_examples

In [None]:
# @title
# CODE GOING HERE

# GOALS: UTILIZE THE CREST ALGORITHM TO TRAIN THE MODEL
#   1 - Load MNIST
#   2 - Train the CNN model
#   3 - Extract features
#   4 - Apply k-medoids clustering
#   5 - Retrain the model on the new subset (probably labeled easy/medium/hard)
#   6 - Get the new scores and wrap up

def extractFeatures(model, loader):
    """Collect the model outputs and targets for every batch in `loader`.

    The model is put in eval mode and run without gradient tracking. The
    "features" are the values returned by `model(data)` itself. Returns a
    `(features, labels)` pair of NumPy arrays concatenated over all batches.
    """
    model.eval()
    outputs_per_batch, targets_per_batch = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs_per_batch.append(model(inputs).cpu().numpy())
            targets_per_batch.append(targets.cpu().numpy())
    stacked_outputs = np.concatenate(outputs_per_batch)
    stacked_targets = np.concatenate(targets_per_batch)
    return stacked_outputs, stacked_targets

def _pairwise_euclidean(a, b):
    """Return the (len(a), len(b)) matrix of Euclidean distances between rows of `a` and `b`."""
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y  -- avoids building an (n, m, d) array.
    squared = (a * a).sum(axis=1)[:, np.newaxis] + (b * b).sum(axis=1)[np.newaxis, :] - 2.0 * (a @ b.T)
    # Rounding can leave tiny negative values; clamp them before the square root.
    np.maximum(squared, 0.0, out=squared)
    return np.sqrt(squared)


def kMedoids(data, k, max_iter=100):
    """Cluster the rows of `data` into `k` groups with alternating k-medoids.

    Unlike the previous implementation (which took coordinate-wise medians,
    i.e. k-medians), every returned medoid is an actual row of `data`: the
    cluster member with the smallest total Euclidean distance to the other
    members of its cluster.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: number of clusters; must not exceed n_samples.
        max_iter: maximum number of assign/update rounds.

    Returns:
        (medoids, labels): `medoids` is a (k, n_features) array of rows taken
        from `data`; `labels` is an (n_samples,) integer array giving, for
        each row, the index of its nearest returned medoid.

    Raises:
        ValueError: if `data` is not 2-D, or (from `np.random.choice`) if
            `k` is larger than the number of rows.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")
    n = data.shape[0]

    # Initial medoids are drawn from NumPy's global RNG (seed it for reproducibility).
    medoid_indices = np.random.choice(n, k, replace=False)

    for _ in range(max_iter):
        labels = np.argmin(_pairwise_euclidean(data, data[medoid_indices]), axis=1)
        new_indices = medoid_indices.copy()
        for i in range(k):
            members = np.flatnonzero(labels == i)
            if members.size == 0:
                # Only possible with duplicated rows; keep the previous medoid
                # instead of producing NaNs.
                continue
            within = _pairwise_euclidean(data[members], data[members])
            new_indices[i] = members[np.argmin(within.sum(axis=1))]
        if np.array_equal(new_indices, medoid_indices):
            break
        medoid_indices = new_indices

    medoids = data[medoid_indices]
    # Recompute the assignment against the final medoids so that the labels are
    # always defined (even for max_iter=0) and consistent with what is returned.
    labels = np.argmin(_pairwise_euclidean(data, medoids), axis=1)
    return medoids, labels

def graphMedoids(medoids, labels, data=None):
    """Scatter-plot the first two columns of `data`, coloured by cluster, with the medoids marked.

    Args:
        medoids: 2-D array whose first two columns are plotted as red crosses.
        labels: one cluster label per row of `data`, used as the point colours.
        data: 2-D array of the clustered points. When omitted, a module-level
            variable named `data` is used if one exists (the original body
            read that name directly).

    Returns:
        The `(fig, ax)` pair of the created matplotlib figure.

    Raises:
        ValueError: if `data` is not given and no module-level `data` exists.
    """
    if data is None:
        # Legacy fallback: the original function referenced a global `data`.
        data = globals().get("data")
        if data is None:
            raise ValueError(
                "graphMedoids needs the clustered points: pass them as data=..."
            )

    # Imported here because the notebook's import cells do not import matplotlib.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.scatter(data[:, 0], data[:, 1], c=labels)
    ax.scatter(medoids[:, 0], medoids[:, 1], marker='x', color='red')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    plt.show()
    return fig, ax


In [72]:
def run_experiment(use_dp=False):
    """Train a fresh CNN (optionally with DP-SGD) and evaluate its best checkpoint.

    The model is trained for up to EPOCHS epochs on the notebook-level
    `train_loader`, validated on `val_loader` after every epoch, and the
    weights with the best validation accuracy are saved to
    ``best_model.pt`` / ``best_model_dp.pt``. Training stops early after
    PATIENCE consecutive epochs without improvement. The best checkpoint is
    then reloaded and scored on `test_loader`.

    Args:
        use_dp: when True, wrap model, optimizer and loader with Opacus'
            PrivacyEngine (MAX_GRAD_NORM, NOISE_MULTIPLIER) and report the
            epsilon spent for DELTA.

    Returns:
        (best_val_acc, test_acc, final_eps); `final_eps` is None when
        `use_dp` is False.
    """
    # setup model and optimizer
    model = CNN().to(device)
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9)

    privacy_engine = None
    if use_dp:
        privacy_engine = PrivacyEngine()
        model, optimizer, train_loader_dp = privacy_engine.make_private(
            module=model,
            optimizer=optimizer,
            data_loader=train_loader,
            max_grad_norm=MAX_GRAD_NORM,
            noise_multiplier=NOISE_MULTIPLIER,
        )
    else:
        train_loader_dp = train_loader

    # train
    best_val_acc = 0
    epochs_no_improve = 0
    best_model_path = f"best_model{'_dp' if use_dp else ''}.pt"

    # TIMING ADDED IN (the measured span also covers the per-epoch validation)
    train_start_time = time.time()
    for epoch in range(1, EPOCHS + 1):
        train_loss = train_one_epoch(model, train_loader_dp, optimizer)
        val_loss, val_acc = evaluate(model, val_loader)

        eps = privacy_engine.get_epsilon(DELTA) if use_dp else None

        print(f"[{'DP-SGD' if use_dp else 'Standard SGD'}] Epoch {epoch}: "
              + f"train_loss={train_loss:.4f}, "
              + f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}%"
              + (f", ε={eps:.4f}" if use_dp else ""))

        if val_acc > best_val_acc:
            # New best: remember it and checkpoint the weights.
            best_val_acc = val_acc
            epochs_no_improve = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= PATIENCE:
            print(f"\nEarly stopping at epoch {epoch} (no improvement for {PATIENCE} epochs).")
            break

    # END TRAIN TIME
    train_end_time = time.time()
    train_duration = train_end_time - train_start_time
    print(f"Training duration: {train_duration:.2f} seconds")

    #---------------------------------------------------------------------------
    # DETERMINE HOW THE DATA LOOKS AND HOW THE MODEL DID RIGHT HERE:
    # (extractFeatures returns a (features, labels) tuple, hence the unpacking)
    #features, _ = extractFeatures(model, train_loader)
    #medoids, labels = kMedoids(features, k=10)
    #fig, ax = graphMedoids(medoids, labels)
    #plt.savefig(f"graph{'_dp' if use_dp else ''}.png")


    #FROM LABEL, RETRAIN MODEL ON THE LABELED DATA

    #---------------------------------------------------------------------------
    # test on best model
    # map_location keeps the reload working when the checkpoint was written on
    # a different device than the one currently in use.
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    test_loss, test_acc = evaluate(model, test_loader)
    final_eps = privacy_engine.get_epsilon(DELTA) if use_dp else None

    print(f"\n[{ 'DP-SGD' if use_dp else 'Standard-SGD'}]")
    # The epsilon suffix needs its own separator; previously it was glued to
    # the '%' of the test accuracy.
    print(f"Best val acc: {best_val_acc:.2f}%, Test acc: {test_acc:.2f}%"
          + (f", ε={final_eps:.2f}" if use_dp else ""))
    return best_val_acc, test_acc, final_eps

# Baselines: standard SGD (no differential privacy), followed by DP-SGD

In [73]:
# Train and evaluate the non-private baseline first.
print("Running standard SGD baseline")
run_experiment(use_dp=False)

# Then repeat the run with DP-SGD. As the last expression of the cell, the
# returned (best_val_acc, test_acc, final_eps) tuple is what the cell displays.
print("\nRunning DP-SGD baseline")
run_experiment(use_dp=True)

Running standard SGD baseline
[Standard SGD] Epoch 1: train_loss=2.2910, val_loss=2.2358, val_acc=14.9000%
[Standard SGD] Epoch 2: train_loss=2.1612, val_loss=2.1134, val_acc=16.3000%
[Standard SGD] Epoch 3: train_loss=1.9558, val_loss=1.8835, val_acc=39.4000%
[Standard SGD] Epoch 4: train_loss=1.5724, val_loss=1.4586, val_acc=55.9000%
[Standard SGD] Epoch 5: train_loss=1.0632, val_loss=0.9295, val_acc=76.7000%
[Standard SGD] Epoch 6: train_loss=0.5405, val_loss=0.6670, val_acc=78.8000%
[Standard SGD] Epoch 7: train_loss=0.3623, val_loss=0.6867, val_acc=77.7000%
[Standard SGD] Epoch 8: train_loss=0.2711, val_loss=0.7048, val_acc=80.8000%
[Standard SGD] Epoch 9: train_loss=0.2387, val_loss=0.7650, val_acc=78.4000%
[Standard SGD] Epoch 10: train_loss=0.2642, val_loss=1.2854, val_acc=76.0000%
[Standard SGD] Epoch 11: train_loss=0.3460, val_loss=0.8693, val_acc=79.7000%
[Standard SGD] Epoch 12: train_loss=0.1702, val_loss=1.0226, val_acc=79.1000%
[Standard SGD] Epoch 13: train_loss=0.2151,



[DP-SGD] Epoch 1: train_loss=2.3131, val_loss=2.2999, val_acc=14.8000%, ε=4.5824
[DP-SGD] Epoch 2: train_loss=2.3042, val_loss=2.2871, val_acc=19.6000%, ε=6.0506
[DP-SGD] Epoch 3: train_loss=2.2878, val_loss=2.2695, val_acc=26.8000%, ε=7.2074
[DP-SGD] Epoch 4: train_loss=2.2576, val_loss=2.2508, val_acc=33.5000%, ε=8.2092
[DP-SGD] Epoch 5: train_loss=2.2339, val_loss=2.2301, val_acc=37.6000%, ε=9.1137
[DP-SGD] Epoch 6: train_loss=2.2104, val_loss=2.2087, val_acc=40.1000%, ε=9.9496
[DP-SGD] Epoch 7: train_loss=2.1851, val_loss=2.1861, val_acc=43.6000%, ε=10.7339
[DP-SGD] Epoch 8: train_loss=2.1480, val_loss=2.1600, val_acc=45.1000%, ε=11.4774
[DP-SGD] Epoch 9: train_loss=2.1005, val_loss=2.1298, val_acc=47.3000%, ε=12.1878
[DP-SGD] Epoch 10: train_loss=2.0809, val_loss=2.0968, val_acc=50.5000%, ε=12.8705
[DP-SGD] Epoch 11: train_loss=2.0353, val_loss=2.0582, val_acc=55.5000%, ε=13.5296
[DP-SGD] Epoch 12: train_loss=1.9866, val_loss=2.0153, val_acc=59.7000%, ε=14.1682
[DP-SGD] Epoch 13: 

(61.3, 58.56, np.float64(18.230446378444586))