In [1]:
import numpy as np
import pandas as pd
from unittest import TestCase
import matplotlib.pyplot as plt
import cv2
import os

from diffprivlib.mechanisms import ExponentialCategorical
from diffprivlib.mechanisms import Laplace

import time
import pickle
from tqdm.notebook import tqdm as tqdm

import torch
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from torch.utils.data import DataLoader
from torchvision.utils import save_image
from dp_pix import*
from load_dp_cifar10_dataset import*

In [2]:
# Function to display a batch of original and DP-obfuscated images
def show_images(original, dp_images, epsilons):
    """Plot an original image next to its DP-obfuscated versions, one panel per epsilon.

    Parameters
    ----------
    original : tensor-like exposing ``.numpy()``
        Image to show in the first panel. Its axes are reordered with
        ``(1, 2, 0)`` before plotting (presumably a channels-first image as
        produced by ``transforms.ToTensor`` -- TODO confirm against callers).
    dp_images : mapping
        Indexed by every value in ``epsilons``; each entry is plotted the same
        way as ``original``.
    epsilons : sequence
        Epsilon values to display, in panel order. May be empty, in which case
        only the original image is shown.
    """
    n_panels = len(epsilons) + 1

    # squeeze=False always returns a 2-D array of Axes. Without it, a
    # single-panel figure (empty `epsilons`) yields a bare Axes object and the
    # `axes[0]` lookup below fails.
    fig, axes = plt.subplots(1, n_panels, figsize=(12, 6), squeeze=False)
    axes = axes[0]  # the single row of panels

    # Original image
    axes[0].imshow(np.transpose(original.numpy(), (1, 2, 0)))
    axes[0].set_title("Original")

    # DP-Pix images for each epsilon, starting at the second panel
    for ax, epsilon in zip(axes[1:], epsilons):
        ax.imshow(np.transpose(dp_images[epsilon].numpy(), (1, 2, 0)))
        ax.set_title(f"eps={epsilon}")

    plt.show()


# Function to save the DP-Pix dataset for each epsilon
def save_dp_dataset(trainloader, epsilons, block_size, m):
    """Obfuscate every image yielded by ``trainloader`` with ``dp_pix`` and save it as a PNG.

    For each epsilon a directory ``cifar10/dp_cifar10_eps_{epsilon}`` is created
    (if missing) and images are written there as ``0.png``, ``1.png``, ... in
    the order the loader yields them.

    Parameters
    ----------
    trainloader : iterable of ``(images, labels)`` batches
        Every image of every batch is processed, so any batch size works.
        The loader should not shuffle if file indices are meant to line up
        with the dataset order.
    epsilons : iterable
        Epsilon values; one output directory is produced per value.
    block_size, m :
        Passed straight through to ``dp_pix`` together with the epsilon.
    """
    for epsilon in epsilons:
        # Create a directory for each epsilon value (no error if it already exists)
        output_dir = f'cifar10/dp_cifar10_eps_{epsilon}'
        os.makedirs(output_dir, exist_ok=True)

        print(f"Processing DP-Pix with epsilon = {epsilon}")

        # Running index over individual images (not batches), so file names stay
        # contiguous and per-image regardless of the loader's batch size.
        image_idx = 0

        # Wrap the loader itself (not enumerate(...)) so tqdm can read its
        # length and show a total / ETA.
        for images, _ in tqdm(trainloader):
            for image in images:
                dp_image = dp_pix(image, block_size, m, epsilon)

                # Save the DP image to the corresponding directory
                save_path = os.path.join(output_dir, f'{image_idx}.png')
                save_image(dp_image, save_path)
                image_idx += 1
            

In [3]:
# # Adult dataset
# times = []
# columns = ["age", "workClass", "fnlwgt", "education", "education-num",
#            "marital-status", "occupation", "relationship", "race", "sex", 
#            "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

# data = pd.read_csv('adult/adult.data', names=columns, sep=r' *, *', engine='python', na_values='?')
# # Drop rows with missing values
# data.dropna(inplace=True)
# # Reset the index
# data.reset_index(drop=True, inplace=True)
# # Load utilities
# file_path = 'adult/utilities.pkl'
# with open(file_path, 'rb') as file:
#     utility_dict = pickle.load(file)
    
# EPS = [0.5, 2.5, 5., 25., 50., 100.]
# for eps in EPS:
#     t0 = time.time()
#     data_copy = data.copy()
#     for attribute in data_copy.columns:
#         if attribute in utility_dict.keys():
#             utility_list = utility_dict[attribute]
#             try:
#                 mech = ExponentialCategorical(epsilon = eps/(len(data_copy.columns)-3), utility_list = utility_list)
#             except:
#                 utility_list = [[str(key1), str(key2), utility_value] for key1, key2, utility_value in utility_list]
#                 mech = ExponentialCategorical(epsilon = eps/(len(data_copy.columns)-3), utility_list = utility_list)
#             data_copy[attribute] = data_copy[attribute].apply(lambda x: mech.randomise(str(x))).astype(data_copy[attribute].dtype)
    
#     data_copy.to_csv('adult/dp_adult_eps={}.csv'.format(eps), index=False)
#     times.append(time.time() - t0)
# mean_time = np.mean(times)
# std_time = np.std(times)
# print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))
# print(times)

In [4]:
# Embedding and utility calculation runtime: 167.43188667297363
# Anonymizing D time:19.52(±3.52)

In [5]:
# # Heart dataset
# times = []
# data = pd.read_csv('heart/cardio_train.csv', sep=';')
# data.drop(columns=['id'], inplace=True)
# data.dropna(inplace=True)
# # Reset the index
# data.reset_index(drop=True, inplace=True)
    
# EPS = [0.5, 2.5, 5., 25., 50., 100., 250, 500, 1000]
# for eps in EPS:
#     t0 = time.time()
#     data_copy = data.copy()
#     for attribute in data_copy.columns:
#         if 'cardio' in attribute:continue
#         a, b = data_copy[attribute].min(), data_copy[attribute].max()
#         sensitivity = b - a
#         mech = Laplace(epsilon = eps/(len(data_copy.columns)-1), sensitivity=sensitivity)
#         data_copy[attribute] = data_copy[attribute].apply(lambda x: mech.randomise(x)).astype(data_copy[attribute].dtype)
#         data_copy[attribute] = data_copy[attribute].apply(lambda x: np.clip(x, a, b)).astype(data_copy[attribute].dtype)
    
#     data_copy.to_csv('heart/dp_heart_eps={}.csv'.format(eps), index=False)
#     times.append(time.time() - t0)
    
# mean_time = np.mean(times)
# std_time = np.std(times)
# print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))

In [6]:
# # Credit dataset
# times = []
# data=pd.read_csv('GiveMeSomeCredit/cs-training.csv')
# data.drop(columns=['Unnamed: 0'], inplace=True)
# data.dropna(inplace=True)
# # Reset the index
# data.reset_index(drop=True, inplace=True)

# EPS = [0.5, 2.5, 5., 25., 50., 100.]
# for eps in EPS:
#     t0 = time.time()
#     data_copy = data.copy()
#     for attribute in data_copy.columns:
#         if 'SeriousDlqin2yrs' in attribute:continue
#         a, b = data_copy[attribute].min(), data_copy[attribute].max()
#         sensitivity = b - a
#         mech = Laplace(epsilon = eps/(len(data_copy.columns)-1), sensitivity=sensitivity)
#         data_copy[attribute] = data_copy[attribute].apply(lambda x: mech.randomise(x)).astype(data_copy[attribute].dtype)
#         data_copy[attribute] = data_copy[attribute].apply(lambda x: np.clip(x, a, b)).astype(data_copy[attribute].dtype)
    
#     data_copy.to_csv('GiveMeSomeCredit/dp_credit_eps={}.csv'.format(eps), index=False)
#     times.append(time.time() - t0)
    
# mean_time = np.mean(times)
# std_time = np.std(times)
# print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))
# print(times)

In [3]:
# CIFAR10 dataset

# Load the CIFAR-10 training split; the only preprocessing is conversion to tensors
transform = transforms.Compose([transforms.ToTensor()])

trainset = datasets.CIFAR10(
    root='../data',
    train=True,
    download=False,  # no download is attempted; the data is read from `root`
    transform=transform,
)
labels = trainset.targets  # labels of the training split, in dataset order
# One image per batch, in dataset order
trainloader = DataLoader(trainset, shuffle=False, batch_size=1)

# # Parameters
# epsilons = [0.5, 2.5, 5., 25., 50., 100.]  # Epsilon values
# m = 16          # Maximum number of different pixels in the image for DP
# block_size = 4  # Size of pixelation block (b)

# for eps in epsilons:
#     t0 = time.time()
#     # Process the trainset and save DP obfuscated datasets for each epsilon
#     save_dp_dataset(trainloader, [eps], block_size, m)
#     print('Anonymizing D time:{:0.2f}'.format((time.time() - t0)/len([eps])))

In [8]:
# m=16, b=16: Anonymizing D time:74.36
# m=16, b=8: Anonymizing D time:111.13
# m=16, b=6: Anonymizing D time:137.70
# m=16, b=4: Anonymizing D time:158.96
# m=8, b=4: Anonymizing D time:172.49

In [None]:
def _show_and_save_first_batch(loader, save_path, nrow=5, dpi=600):
    """Plot the first batch of ``loader`` as an image grid, save it, and show it.

    Parameters
    ----------
    loader : iterable of ``(images, labels)`` batches
        Only the first batch is used; its labels are ignored.
    save_path : str
        Destination of the saved figure. Missing parent directories are created.
    nrow : int
        Forwarded to ``make_grid`` (number of images per grid row).
    dpi : int
        Resolution passed to ``savefig``.

    Does nothing if ``loader`` yields no batches.
    """
    first_batch = next(iter(loader), None)
    if first_batch is None:
        return
    images, _ = first_batch  # batch labels are not needed for the figure
    grid_img = make_grid(images, nrow=nrow)

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots()
    ax.imshow(grid_img.permute(1, 2, 0))  # move axis 0 last before plotting
    ax.axis('off')  # Hide the axes
    fig.tight_layout()
    fig.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=dpi)
    plt.show()


# Display and save the first 10 original CIFAR-10 images (2 rows x 5 columns)
train_loader = DataLoader(trainset, batch_size=10, shuffle=False)
_show_and_save_first_batch(train_loader, 'cifar_dp_samples/original_images.png')

# Load DP-obfuscated CIFAR-10 dataset with a specific epsilon value
epsilon = 100.0  # Specify the epsilon value

# Define the transform to convert images to tensors
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert image to PyTorch tensor
])

# Directory path for the corresponding epsilon dataset
# NOTE(review): save_dp_dataset writes to 'cifar10/dp_cifar10_eps_{epsilon}';
# this path has an extra 'm16_b4/' level -- presumably the generated folders
# were moved there by hand. Confirm before relying on a fresh run.
dataset_dir = f'./cifar10/m16_b4/dp_cifar10_eps_{epsilon}'

# Use the original CIFAR-10 labels for DP-obfuscated images
labels = trainset.targets

# Create a dataset object for the DP-obfuscated dataset
dp_dataset = DPCIFAR10Dataset(root_dir=dataset_dir, labels=labels, transform=transform)

# Create a DataLoader for the DP-obfuscated dataset
dp_loader = DataLoader(dp_dataset, batch_size=10, shuffle=False)

# Display and save the first 10 DP-obfuscated images (2 rows x 5 columns)
_show_and_save_first_batch(
    dp_loader,
    'cifar_dp_samples/dp_images_eps={}.png'.format(epsilon),
)