#### Face Identity Classification
This example illustrates MIA attacks on a face identity classifier model. The classifier is trained on the [CelebA Mask HQ](https://github.com/switchablenorms/CelebAMask-HQ) dataset, which should be downloaded, decompressed, and placed in the ```data``` directory as follows:

directory_structure:

```
data/
    ├── train/
    │   ├── identity 1/
    │   ├── identity 2/
    │   └── ...
    └── test/
        ├── identity 1/
        ├── identity 2/
        └── ...
```


In [None]:
import os

# Path to the dataset zip file
zip_path = "./data/CelebA_HQ_facial_identity_dataset.zip"
data_folder = "./data"

# Check if the 'data' folder exists, if not, create it
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
    print("'data' folder created.")

# Check if the file already exists
if not os.path.exists(zip_path):
    print("Dataset not found. Downloading...")
    !wget -O {zip_path} https://postechackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/ES-jbCNC6mNHhCyR4Nl1QpYBlxVOJ5YiVerhDpzmoS9ezA?download=1 
    !unzip {zip_path} -d ./data
    print("Download and extraction completed.")
else:
    print("Dataset already exists.")

### Train the Identity Classifier Model

In [None]:
import os
import sys
import yaml

# Resolve the directory three levels above the current working directory
# (treated as the project root) and make it importable.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
sys.path.append(project_root)

In [None]:
# Parse the training configuration from train_config.yaml
with open('train_config.yaml', 'r') as config_file:
    train_config = yaml.safe_load(config_file)

# Data directory from the configuration, joined onto the current working directory
path = os.path.join(os.getcwd(), train_config["data"]["data_dir"])

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from torch.utils.data import DataLoader, Subset
from celebA_hq_dpsgd_handler import CelebAInputHandlerDPsgd

# Settings read from the training configuration
dataset_name = train_config["data"]["dataset"]
log_dir = train_config["run"]["log_dir"]
batch_size = train_config["train"]["batch_size"]
train_fraction = train_config["data"]["f_train"]
test_fraction = train_config["data"]["f_test"]
# NOTE: despite its name, data_dir is the path of the pickled dataset file
data_dir = train_config["data"]["data_dir"] + "/celeba_hq_data.pkl"

# Make sure the log directory is available
os.makedirs(log_dir, exist_ok=True)

# Reuse the cached dataset pickle when present; otherwise build it and cache it
if os.path.exists(data_dir):
    with open(data_dir, "rb") as pkl_file:
        population_dataset = pickle.load(pkl_file)
        print(f"Load data from {data_dir}")
else:
    population_dataset = CelebAInputHandlerDPsgd.UserDataset.from_celebHq(config=train_config)
    with open(data_dir, "wb") as pkl_file:
        pickle.dump(population_dataset, pkl_file)
        print(f"Save data to {data_dir}")

# Number of samples assigned to each split, as fractions of the full dataset
dataset_size = len(population_dataset)
train_size = int(train_fraction * dataset_size)
test_size = int(test_fraction * dataset_size)

# Draw train_size + test_size distinct indices at random, then let sklearn's
# train_test_split partition them into the train and test index sets
selected_index = np.random.choice(dataset_size, size=train_size + test_size, replace=False)
train_indices, test_indices = train_test_split(selected_index, test_size=test_size)

# Index-restricted views of the population dataset
train_subset = Subset(population_dataset, train_indices)
test_subset = Subset(population_dataset, test_indices)

# Only the training loader shuffles its samples
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)


## Noise Multiplier Configuration for Privacy Analysis

In this code block, we configure the parameters necessary for calculating the noise multiplier using the **Opacus** library, which we use for differential privacy analysis.

- **`target_epsilon`**: The desired epsilon value.
- **`target_delta`**: The delta value indicating the risk of privacy loss.
- **`sample_rate`**: The rate at which data points are used in training.
- **`epochs`**: The number of training epochs for the model.
- **`epsilon_tolerance`**: A small margin of tolerance allowed around the target epsilon value.
- **`accountant`**: Specifies the method of tracking privacy loss, with "prv" referring to the PRV (privacy random variable) accountant used for DP-SGD.
- **`eps_error`**: The allowable error in epsilon calculations.
- **`max_grad_norm`**: A limit (clipping threshold) on the gradient norm, which bounds the influence of each sample and ensures the gradients do not explode during training.

The most common hyperparameters to tune are `target_epsilon`, `sample_rate`, `noise_multiplier`, and `max_grad_norm`. These parameters should be set by the user based on their need to balance privacy and utility.


In [None]:
# Privacy budget: target epsilon and delta
target_epsilon = 3.5
delta = 1e-5
# One over the number of batches in the training loader, so the batch size
# is already incorporated
sample_rate = 1 / len(train_loader)

# Parameters describing the noise-multiplier calculation
noise_multiplier_dict = dict(
    target_epsilon=target_epsilon,
    target_delta=delta,
    sample_rate=sample_rate,
    epochs=train_config["train"]["epochs"],
    epsilon_tolerance=0.01,
    accountant="prv",
    eps_error=0.01,
    max_grad_norm=1.0,
)

# Store the parameters in the log directory as metadata for the privacy engine
with open(f"{log_dir}/dpsgd_dic.pkl", "wb") as f:
    pickle.dump(noise_multiplier_dict, f)

In [None]:
from torch import save, optim, nn
from target_model_class import ResNet18_DPsgd

# Train the DP-SGD target model, evaluate it, and write the model weights and
# the LeakPro metadata into the log directory.

# Number of output classes passed to the classifier (hard-coded; presumably
# the number of identities in the dataset -- TODO confirm)
num_classes = 307

# Create instance of target model
model = ResNet18_DPsgd(
                    num_classes = num_classes,
                    dpsgd=True,
                    validate=False,
                    )

# Read out the relevant parameters for training
lr = train_config["train"]["learning_rate"]
weight_decay = train_config["train"]["weight_decay"]
epochs = train_config["train"]["epochs"]
    
# Create optimizer and loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

# Train the target model
# (a fresh handler instance is created for this call and again for eval below)
train_result = CelebAInputHandlerDPsgd().train(dataloader=train_loader,
                        model=model,
                        criterion=criterion,
                        optimizer=optimizer,
                        epochs=epochs)

# Evaluate on test set
# NOTE(review): this evaluates the local `model` before it is rebound to
# train_result.model below; presumably both refer to the same trained
# network -- confirm against the handler's train() implementation.
test_result = CelebAInputHandlerDPsgd().eval(test_loader, model, criterion)

# Store the model and metadata
model = train_result.model
# Move the model to CPU before its state dict is written to disk
model.to("cpu")
with open(f"{log_dir}/target_model.pkl", "wb") as f:
    # `save` is torch.save; only the state dict is written, not the module
    save(model.state_dict(), f)

# Create metadata to be used by LeakPro
from leakpro import LeakPro
# The metadata bundles the training/test results, the optimizer and loss, the
# training dataloader, the epoch count, the split indices and the dataset name
meta_data = LeakPro.make_mia_metadata(train_result = train_result,
                                      optimizer = optimizer,
                                      loss_fn = criterion,
                                      dataloader = train_loader,
                                      test_result = test_result,
                                      epochs = epochs,
                                      train_indices = train_indices,
                                      test_indices = test_indices,
                                      dataset_name = dataset_name)

# Pickle the metadata next to the saved model in the log directory
with open(f"{log_dir}/model_metadata.pkl", "wb") as f:
    pickle.dump(meta_data, f)

In [None]:
import matplotlib.pyplot as plt

# Training histories and the test metrics produced by the evaluation step.
# NOTE(review): test_result.accuracy / test_result.loss look like single values
# from one final evaluation rather than per-epoch histories -- confirm.
train_acc = train_result.metrics.extra["accuracy_history"]
train_loss = train_result.metrics.extra["loss_history"]
test_acc = test_result.accuracy
test_loss = test_result.loss


def _plot_test_metric(value, label):
    """Draw a test metric on the current matplotlib axes.

    A sequence with more than one entry is drawn as a curve. A single value
    (a scalar or a one-element sequence) is drawn as a dashed horizontal
    reference line, because ``plt.plot`` on a lone value yields a one-point
    line that is invisible without a marker. An empty sequence draws nothing.
    """
    try:
        n_points = len(value)
    except TypeError:
        # Plain numbers and 0-d arrays/tensors have no usable length
        n_points = None

    if n_points is None:
        plt.axhline(y=float(value), color="tab:orange", linestyle="--", label=label)
    elif n_points == 1:
        plt.axhline(y=float(value[0]), color="tab:orange", linestyle="--", label=label)
    elif n_points > 1:
        plt.plot(value, label=label)


# Wide enough for two side-by-side panels
plt.figure(figsize=(10, 4))

# Left panel: training and test accuracy
plt.subplot(1, 2, 1)
plt.plot(train_acc, label='Train Accuracy')
_plot_test_metric(test_acc, 'Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy over Epochs')
plt.legend()

# Right panel: training and test loss
plt.subplot(1, 2, 2)
plt.plot(train_loss, label='Train Loss')
_plot_test_metric(test_loss, 'Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.legend()

plt.tight_layout()
plt.show()

### MIA Attacks on Identifier Model 

In [None]:
from leakpro import LeakPro

# Read the config file
config_path = "audit_dpsgd.yaml"

# Prepare leakpro object
leakpro = LeakPro(CelebAInputHandlerDPsgd, config_path)

# Run the audit 
mia_results = leakpro.run_audit(create_pdf=True)