# CIFAR DP-sgd example

This notebook trains and audits CIFAR10 and CIFAR100 models under DP-SGD. To switch between the two datasets, update the `dataset` field in `train_config.yaml` and the `data_path` field in `audit_dpsgd.yaml` accordingly.


##### Make sure opacus is installed

In [1]:
!pip install opacus

[0m

In [2]:
import os
import sys
import yaml

# Make the repository root (three levels above the current working directory)
# importable so that the project packages can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
# Guard against duplicate entries when the cell is executed more than once.
if project_root not in sys.path:
    sys.path.append(project_root)

Next, we create the population dataset by concatenating the train and test data. To create the population, we use the `UserDataset` class provided by `CifarInputHandler`.

In [3]:

from torchvision.datasets import CIFAR10, CIFAR100
from torch import cat, tensor
import pickle
from cifar_handler import CifarInputHandler

# Load the training configuration from train_config.yaml
with open("train_config.yaml", "r") as file:
    train_config = yaml.safe_load(file)

root = train_config["data"]["data_dir"]
path = os.path.join(os.getcwd(), root)  # data directory joined onto the current working directory

# Pick the torchvision dataset class matching the configured dataset name
dataset_name = train_config["data"]["dataset"]
if dataset_name == "cifar10":
    dataset_cls = CIFAR10
elif dataset_name == "cifar100":
    dataset_cls = CIFAR100
else:
    raise ValueError("Unknown dataset type")

# Load (and download if necessary) the CIFAR train and test splits
trainset = dataset_cls(root=root, train=True, download=True)
testset = dataset_cls(root=root, train=False, download=True)

# Convert the raw images from (N, H, W, C) to float tensors in (N, C, H, W), scaled to [0, 1]
train_data = tensor(trainset.data).permute(0, 3, 1, 2).float() / 255
test_data = tensor(testset.data).permute(0, 3, 1, 2).float() / 255

# Ensure train and test data look correct
assert train_data.shape[0] == 50000, "Train data should have 50000 samples"
assert test_data.shape[0] == 10000, "Test data should have 10000 samples"
assert train_data.shape[1] == 3, "Data should have 3 channels"
assert test_data.shape[1] == 3, "Data should have 3 channels"
assert train_data.max() <= 1 and train_data.min() >= 0, "Data should be normalized"
assert test_data.max() <= 1 and test_data.min() >= 0, "Data should be normalized"

# Concatenate train and test data into the population.
# `cat` already allocates a new tensor, so no extra clone/detach is needed.
data = cat([train_data, test_data], dim=0)
targets = cat([tensor(trainset.targets), tensor(testset.targets)], dim=0)

# Create UserDataset object
population_dataset = CifarInputHandler.UserDataset(data, targets)

assert len(population_dataset) == 60000, "Population dataset should have 60000 samples"

# Store the population dataset to be used by LeakPro.
# An existing pickle is left untouched.
file_path = "data/" + dataset_name + ".pkl"
# The output directory is hard-coded and may differ from the configured
# data_dir, so make sure it exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
if not os.path.exists(file_path):
    with open(file_path, "wb") as file:
        pickle.dump(population_dataset, file)
    print(f"Save data to {file_path}")

  from .autonotebook import tqdm as notebook_tqdm


Files already downloaded and verified
Files already downloaded and verified


With the population dataset stored, we next create the train and test sets that will go into training the target model.

In [4]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import numpy as np

# Read the split fractions and the batch size from the training configuration
data_settings = train_config["data"]
train_fraction, test_fraction = data_settings["f_train"], data_settings["f_test"]
batch_size = train_config["train"]["batch_size"]

# Translate the fractions into absolute sample counts (truncated towards zero)
dataset_size = len(population_dataset)
train_size = int(train_fraction * dataset_size)
test_size = int(test_fraction * dataset_size)

# Draw train_size + test_size distinct population indices at random,
# then divide them into the train and test index sets
all_indices = np.arange(dataset_size)
selected_index = np.random.choice(all_indices, train_size + test_size, replace=False)
train_indices, test_indices = train_test_split(selected_index, test_size=test_size)

# Build the subsets; the test subset is constructed with the parameters
# returned by the train subset's return_params()
train_subset = CifarInputHandler.UserDataset(data[train_indices], targets[train_indices])
test_subset = CifarInputHandler.UserDataset(
    data[test_indices],
    targets[test_indices],
    **train_subset.return_params(),
)

# Only the training loader is shuffled
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

# Report the mean and std stored on the train subset
train_mean, train_std = train_subset.mean, train_subset.std
print(f"Train mean: {train_mean}, Train std: {train_std}")

Train mean: tensor([[[0.4964]],

        [[0.4857]],

        [[0.4499]]]), Train std: tensor([[[0.2453]],

        [[0.2419]],

        [[0.2617]]])



## Noise Multiplier Configuration for Privacy Analysis

In this code block, we configure the parameters needed to calculate the noise multiplier with the **Opacus** library, which we use for the differential privacy analysis.

- **`target_epsilon`**: The desired epsilon value, i.e. the privacy budget the training should not exceed.
- **`target_delta`**: The delta value, i.e. the probability with which the epsilon guarantee is allowed to fail.
- **`sample_rate`**: The probability that a given data point is included in a batch (here `1 / len(train_loader)`).
- **`epochs`**: The number of training epochs for the model.
- **`epsilon_tolerance`**: The acceptable deviation from `target_epsilon` when searching for the noise multiplier.
- **`accountant`**: Specifies the method of tracking privacy loss, with "prv" selecting the PRV (privacy loss random variable) accountant.
- **`eps_error`**: The allowable error in the accountant's epsilon calculations.
- **`max_grad_norm`**: The clipping threshold applied to per-sample gradient norms, which bounds the influence of any single sample on an update.

The most common hyperparameters to tune are `target_epsilon`, `sample_rate`, `noise_multiplier`, and `max_grad_norm`. These parameters should be set by the user based on their need to balance privacy and utility.


In [5]:
# Directory that receives the DP-SGD metadata written below
target_model_dir = "./target_dpsgd"
delta = 1e-5
target_epsilon = 3.5
# One batch per step: len(train_loader) is the number of batches per epoch,
# so this already incorporates the batch size.
sample_rate = 1 / len(train_loader)

# Parameters used to derive the noise multiplier for the privacy engine
noise_multiplier_dict = {
    "target_epsilon": target_epsilon,
    "target_delta": delta,
    "sample_rate": sample_rate,
    # NOTE(review): hard-coded, while the training cell reads its epoch count
    # from train_config["train"]["epochs"] - confirm the two are meant to differ.
    "epochs": 21,
    "epsilon_tolerance": 0.01,
    "accountant": "prv",
    "eps_error": 0.01,
    "max_grad_norm": 1,
}

# Create metadata for privacy engine.
# The target directory may not exist yet on a fresh checkout, so create it first.
os.makedirs(target_model_dir, exist_ok=True)
with open(os.path.join(target_model_dir, "dpsgd_dic.pkl"), "wb") as f:
    pickle.dump(noise_multiplier_dict, f)

In [6]:
from torch import save, optim, nn
from cifar_handler_dpsgd import CifarInputHandlerDPsgd
from target_model_class import ResNet18

# Train the DP-SGD target model, evaluate it, and store the weights and
# the metadata that LeakPro needs for the audit.

# Make sure every directory this cell writes to exists. "target" is kept
# because the original cell created it; the artefacts themselves are written
# to `target_model_dir` and to the configured log directory.
log_dir = train_config["run"]["log_dir"]
for out_dir in ("target", target_model_dir, log_dir):
    os.makedirs(out_dir, exist_ok=True)

if train_config["data"]["dataset"] == "cifar10":
    num_classes = 10
elif train_config["data"]["dataset"] == "cifar100":
    num_classes = 100
else:
    raise ValueError("Invalid dataset name")

# Create instance of target model
model = ResNet18(num_classes=num_classes)

# Read out the relevant parameters for training
lr = train_config["train"]["learning_rate"]
momentum = train_config["train"]["momentum"]
epochs = train_config["train"]["epochs"]

# Create optimizer and loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

# Train target model with DP-SGD enabled
train_result = CifarInputHandlerDPsgd().train(
    dataloader=train_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
    do_dpsgd=True,
)

# Get the trained DP-sgd model.
# The former `_ = ModuleValidator.fix(model)` calls were dropped: `fix` returns
# a fixed copy, and that return value was discarded, so they had no effect.
model = train_result.model

# Evaluate on test set
test_result = CifarInputHandlerDPsgd().eval(test_loader, model, criterion)

# Move the model to the CPU before extracting the weights
model.to("cpu")
state_dict = model.state_dict()

# The trained model's parameter names carry a "_module." prefix (see the keys
# printed below); strip it so the names line up with the plain ResNet18 names.
wrapper_prefix = "_module."
cleaned_state_dict = {
    (key[len(wrapper_prefix):] if key.startswith(wrapper_prefix) else key): value
    for key, value in state_dict.items()
}

# Debug output: parameter names of the trained model ...
for key in state_dict:
    print(key)

# ... and of a freshly constructed ResNet18 for comparison.
# NOTE(review): the key comparison cell further down reports BatchNorm
# running-stat entries only for the fresh model, so the saved weights may not
# load into an unmodified ResNet18 - confirm before loading them.
m_ = ResNet18(num_classes=num_classes)
for key in m_.state_dict():
    print(key)

# Store the cleaned weights in the configured log directory
with open(os.path.join(log_dir, "target_model.pkl"), "wb") as f:
    save(cleaned_state_dict, f)

# Create metadata to be used by LeakPro
from leakpro import LeakPro
meta_data = LeakPro.make_mia_metadata(
    train_result=train_result,
    optimizer=optimizer,
    loss_fn=criterion,
    dataloader=train_loader,
    test_result=test_result,
    epochs=epochs,
    train_indices=train_indices,
    test_indices=test_indices,
    dataset_name=dataset_name,
)

with open(os.path.join(target_model_dir, "model_metadata.pkl"), "wb") as f:
    pickle.dump(meta_data, f)
    



Model has privacy violations. Fixing...
Model fixed and SGD re-instantiated.
Training with DP-SGD
Pickle file loaded successfully!
Data: {'target_epsilon': 3.5, 'target_delta': 1e-05, 'sample_rate': 0.02631578947368421, 'epochs': 21, 'epsilon_tolerance': 0.01, 'accountant': 'prv', 'eps_error': 0.01, 'max_grad_norm': 1}


Epoch 1/2: 100%|██████████| 38/38 [00:02<00:00, 13.04it/s]
Epoch 2/2: 100%|██████████| 38/38 [00:02<00:00, 14.78it/s]


_module.model.conv1.weight
_module.model.bn1.weight
_module.model.bn1.bias
_module.model.layer1.0.conv1.weight
_module.model.layer1.0.bn1.weight
_module.model.layer1.0.bn1.bias
_module.model.layer1.0.conv2.weight
_module.model.layer1.0.bn2.weight
_module.model.layer1.0.bn2.bias
_module.model.layer1.1.conv1.weight
_module.model.layer1.1.bn1.weight
_module.model.layer1.1.bn1.bias
_module.model.layer1.1.conv2.weight
_module.model.layer1.1.bn2.weight
_module.model.layer1.1.bn2.bias
_module.model.layer2.0.conv1.weight
_module.model.layer2.0.bn1.weight
_module.model.layer2.0.bn1.bias
_module.model.layer2.0.conv2.weight
_module.model.layer2.0.bn2.weight
_module.model.layer2.0.bn2.bias
_module.model.layer2.0.downsample.0.weight
_module.model.layer2.0.downsample.1.weight
_module.model.layer2.0.downsample.1.bias
_module.model.layer2.1.conv1.weight
_module.model.layer2.1.bn1.weight
_module.model.layer2.1.bn1.bias
_module.model.layer2.1.conv2.weight
_module.model.layer2.1.bn2.weight
_module.model.

In [9]:
# Print the parameter names of the cleaned DP-SGD state dict next to those of
# a fresh ResNet18, pairing them by position (zip stops at the shorter one).
for key1, key2 in zip(cleaned_state_dict, m_.state_dict()):
    print(key1, key2, key1 == key2)

# True only if both state dicts contain exactly the same parameter names.
# (Comparing the dict itself against an `.items()` view is always False.)
cleaned_state_dict.keys() == m_.state_dict().keys()

model.conv1.weight model.conv1.weight True
model.bn1.weight model.bn1.weight True
model.bn1.bias model.bn1.bias True
model.layer1.0.conv1.weight model.bn1.running_mean False
model.layer1.0.bn1.weight model.bn1.running_var False
model.layer1.0.bn1.bias model.bn1.num_batches_tracked False
model.layer1.0.conv2.weight model.layer1.0.conv1.weight False
model.layer1.0.bn2.weight model.layer1.0.bn1.weight False
model.layer1.0.bn2.bias model.layer1.0.bn1.bias False
model.layer1.1.conv1.weight model.layer1.0.bn1.running_mean False
model.layer1.1.bn1.weight model.layer1.0.bn1.running_var False
model.layer1.1.bn1.bias model.layer1.0.bn1.num_batches_tracked False
model.layer1.1.conv2.weight model.layer1.0.conv2.weight False
model.layer1.1.bn2.weight model.layer1.0.bn2.weight False
model.layer1.1.bn2.bias model.layer1.0.bn2.bias False
model.layer2.0.conv1.weight model.layer1.0.bn2.running_mean False
model.layer2.0.bn1.weight model.layer1.0.bn2.running_var False
model.layer2.0.bn1.bias model.layer1.

False

In [None]:
import torch

# Compare a freshly constructed ResNet18 with the model returned by DP-SGD training.
from opacus import GradSampleModule

model_ = ResNet18(num_classes=num_classes)
model_priv_ = train_result.model

# Print model types to understand what we're comparing
print(f"Original model type: {type(model_)}")
print(f"DP-SGD model type: {type(model_priv_)}")

# Check if model_priv_ is wrapped by opacus.
# Testing against torch.nn.Module would be True for any model, so test
# against the opacus wrapper class itself.
print(f"Is model_priv_ wrapped by opacus GradSampleModule? {isinstance(model_priv_, GradSampleModule)}")

# Compare parameter shapes and counts
orig_params = sum(p.numel() for p in model_.parameters())
priv_params = sum(p.numel() for p in model_priv_.parameters())
print(f"\nOriginal model parameters: {orig_params:,}")
print(f"DP-SGD model parameters: {priv_params:,}")
print(f"Parameter count match: {orig_params == priv_params}")

# Dump both architectures for a visual side-by-side check
print(model_priv_)
print(model_)


In [None]:
import matplotlib.pyplot as plt

# Per-epoch training histories and the single test-set evaluation
train_acc = train_result.metrics.extra["accuracy_history"]
train_loss = train_result.metrics.extra["loss_history"]
test_acc = test_result.accuracy
test_loss = test_result.loss

plt.figure(figsize=(8, 4))

# Left panel: training accuracy per epoch, with the test accuracy drawn as a
# single red marker at the last epoch. The marker position is taken from the
# accuracy history itself, not from the loss history.
plt.subplot(1, 2, 1)
plt.plot(train_acc, label='Train Accuracy')
plt.plot(len(train_acc) - 1, test_acc, 'ro', label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy over Epochs')
plt.legend()

# Right panel: training loss per epoch, with the test loss at the last epoch
plt.subplot(1, 2, 2)
plt.plot(train_loss, label='Train Loss')
plt.plot(len(train_loss) - 1, test_loss, 'ro', label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
from cifar_handler import CifarInputHandler
# Imported here as well so the cell does not depend on the training cell
# having been executed in the same kernel session.
from cifar_handler_dpsgd import CifarInputHandlerDPsgd

from leakpro import LeakPro

# Read the DPsgd config file and prepare LeakPro object for DPsgd.
# (An earlier, commented-out variant passed CifarInputHandler instead of the
# DP-SGD handler with the same config file.)
config_path = "audit_dpsgd.yaml"
leakpro = LeakPro(CifarInputHandlerDPsgd, config_path)

# Run the audit and keep the returned results for the report cell
mia_results_optuna = leakpro.run_audit(return_results=True, use_optuna=False)

## Generate report

In [None]:
# Import and initialize ReportHandler
from leakpro.reporting.report_handler import ReportHandler

# Set up the report handler with ./leakpro_output/results as its report directory
report_handler = ReportHandler(report_dir="./leakpro_output/results")

# Save each MIA result through the report handler, one call per result
for attack_result in mia_results_optuna:
    report_handler.save_results(
        attack_name=attack_result.attack_name,
        result_data=attack_result,
        config=attack_result.configs,
    )

# Create the report from the saved results
report_handler.create_report()