In [None]:
import numpy as np
import pandas as pd
from unittest import TestCase

from diffprivlib.mechanisms import ExponentialCategorical
from diffprivlib.mechanisms import Laplace

import time
import pickle

In [None]:
# Anonymise the Adult dataset with the exponential mechanism, once per privacy
# budget in EPS, write each result to CSV, and report the mean/std wall-clock
# time of a run.
times = []
columns = ["age", "workClass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

data = pd.read_csv('data/adult/adult.data', names=columns, sep=r' *, *', engine='python', na_values='?')
# Drop rows that contain missing values ('?' is parsed as NaN by read_csv above)
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)
# Load utilities: a dict mapping attribute name -> list of (key1, key2, utility) triples.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a utilities file that comes from a trusted source.
file_path = 'data/adult/utilities.pkl'
with open(file_path, 'rb') as file:
    utility_dict = pickle.load(file)

EPS = [0.5, 2.5, 5., 25., 50., 100.]
for eps in EPS:
    t0 = time.time()
    data_copy = data.copy()
    # The total budget is divided by (number of columns - 1); the value is the
    # same for every attribute, so compute it once per run.
    attribute_eps = eps / (len(data_copy.columns) - 1)
    for attribute in data_copy.columns:
        # Only attributes that have a utility list are randomised.
        if attribute not in utility_dict:
            continue
        # Values are passed to the mechanism as str(x) below, so the utility
        # keys are normalised to str as well. This replaces the former
        # construct-then-retry under a bare `except:`, which also swallowed
        # unrelated errors (including KeyboardInterrupt).
        utility_list = [[str(key1), str(key2), utility_value]
                        for key1, key2, utility_value in utility_dict[attribute]]
        mech = ExponentialCategorical(epsilon=attribute_eps, utility_list=utility_list)
        original_dtype = data_copy[attribute].dtype
        # Randomise every cell independently, then restore the column's dtype.
        data_copy[attribute] = (
            data_copy[attribute]
            .apply(lambda x: mech.randomise(str(x)))
            .astype(original_dtype)
        )

    data_copy.to_csv('data/adult/dp_adult_eps={}.csv'.format(eps), index=False)
    times.append(time.time() - t0)
mean_time = np.mean(times)
std_time = np.std(times)
print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))

In [None]:
# Embedding and utility calculation runtime: 167.43188667297363
# Anonymizing D time:19.52(±3.52)

In [None]:
# Anonymise the cardio dataset with the Laplace mechanism, once per privacy
# budget in EPS, write each result to CSV, and report the mean/std wall-clock
# time of a run.
times = []
data = pd.read_csv('data/heart/cardio_train.csv', sep=';')
data.drop(columns=['id'], inplace=True)
# Drop rows that contain missing values
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)

EPS = [0.5, 2.5, 5., 25., 50., 100.]
for eps in EPS:
    t0 = time.time()
    data_copy = data.copy()
    # The total budget is divided by (number of columns - 1); one column
    # (matched by 'cardio' below) is left unperturbed.
    attribute_eps = eps / (len(data_copy.columns) - 1)
    for attribute in data_copy.columns:
        # Presumably the target label; it is written out unchanged.
        if 'cardio' in attribute:
            continue
        column = data_copy[attribute]
        # NOTE(review): the bounds (and hence the sensitivity) are taken from
        # the data itself -- confirm this is acceptable for the intended
        # privacy analysis.
        a, b = column.min(), column.max()
        sensitivity = b - a
        mech = Laplace(epsilon=attribute_eps, sensitivity=sensitivity)
        # Add noise per cell, clamp back into the observed range with one
        # vectorised clip (instead of a per-element np.clip), and only then
        # cast to the original dtype so out-of-range noise never reaches an
        # integer cast.
        data_copy[attribute] = column.apply(mech.randomise).clip(a, b).astype(column.dtype)

    data_copy.to_csv('data/heart/dp_heart_eps={}.csv'.format(eps), index=False)
    times.append(time.time() - t0)

mean_time = np.mean(times)
std_time = np.std(times)
print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))

In [None]:
# Anonymizing D time:28.63(±1.51)

In [None]:
# Anonymise the GiveMeSomeCredit training set with the Laplace mechanism, once
# per privacy budget in EPS, write each result to CSV, and report the mean/std
# wall-clock time of a run.
times = []
data = pd.read_csv('data/GiveMeSomeCredit/cs-training.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
# Drop rows that contain missing values
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)

EPS = [0.5, 2.5, 5., 25., 50., 100.]
for eps in EPS:
    t0 = time.time()
    data_copy = data.copy()
    # The total budget is divided by (number of columns - 1); one column
    # (matched by 'SeriousDlqin2yrs' below) is left unperturbed.
    attribute_eps = eps / (len(data_copy.columns) - 1)
    for attribute in data_copy.columns:
        # Presumably the target label; it is written out unchanged.
        if 'SeriousDlqin2yrs' in attribute:
            continue
        column = data_copy[attribute]
        # NOTE(review): the bounds (and hence the sensitivity) are taken from
        # the data itself -- confirm this is acceptable for the intended
        # privacy analysis.
        a, b = column.min(), column.max()
        sensitivity = b - a
        mech = Laplace(epsilon=attribute_eps, sensitivity=sensitivity)
        # Add noise per cell, clamp back into the observed range with one
        # vectorised clip (instead of a per-element np.clip), and only then
        # cast to the original dtype so out-of-range noise never reaches an
        # integer cast.
        data_copy[attribute] = column.apply(mech.randomise).clip(a, b).astype(column.dtype)

    data_copy.to_csv('data/GiveMeSomeCredit/dp_credit_eps={}.csv'.format(eps), index=False)
    times.append(time.time() - t0)

mean_time = np.mean(times)
std_time = np.std(times)
print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))

In [None]:
# Anonymizing D time:44.09(±0.24)