In [3]:
import numpy as np
import pandas as pd
from unittest import TestCase

from diffprivlib.mechanisms import ExponentialCategorical
from diffprivlib.mechanisms import Laplace

import time
import pickle

In [2]:
# Anonymize the Adult dataset with the exponential mechanism: for every privacy
# budget in EPS, perturb each attribute that has an entry in utility_dict,
# write the perturbed table to CSV, and record how long the whole pass took.
times = []
columns = ["age", "workClass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

# The separator is a regex that also swallows the spaces around each comma
# (hence engine='python'); '?' is read as a missing value.
data = pd.read_csv('adult/adult.data', names=columns, sep=r' *, *', engine='python', na_values='?')
# Drop rows that contain a missing value
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)
# Load utilities
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a utilities file produced by a trusted source.
file_path = 'adult/utilities.pkl'
with open(file_path, 'rb') as file:
    utility_dict = pickle.load(file)

EPS = [0.5, 2.5, 5., 25., 50., 100.]
# The total budget eps is split evenly; the divisor is the column count minus 3.
# NOTE(review): presumably three columns have no entry in utility_dict and are
# therefore never perturbed - confirm that the 3 matches the pickle's contents.
budget_shares = len(data.columns) - 3
for eps in EPS:
    t0 = time.perf_counter()
    data_copy = data.copy()
    for attribute in data_copy.columns:
        if attribute not in utility_dict:
            continue
        # The mechanism is queried with str(x) below, so both keys of every
        # utility triple are stringified up front. str() leaves keys that are
        # already strings unchanged, which makes this equivalent to the former
        # "try raw list, on any failure retry with string keys" logic without
        # a bare `except:` that would also hide unrelated errors.
        utility_list = [[str(key1), str(key2), utility_value]
                        for key1, key2, utility_value in utility_dict[attribute]]
        mech = ExponentialCategorical(epsilon=eps / budget_shares, utility_list=utility_list)
        original_dtype = data_copy[attribute].dtype
        # randomise() works on strings; cast back so the column keeps its dtype.
        data_copy[attribute] = data_copy[attribute].apply(lambda x: mech.randomise(str(x))).astype(original_dtype)

    data_copy.to_csv('adult/dp_adult_eps={}.csv'.format(eps), index=False)
    times.append(time.perf_counter() - t0)
mean_time = np.mean(times)
std_time = np.std(times)
print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))
print(times)

Anonymizing D time:20.82(±0.43)
[21.166807174682617, 20.923747539520264, 20.989782571792603, 21.054184198379517, 20.892335414886475, 19.878856897354126]


In [None]:
# Embedding and utility calculation runtime: 167.43188667297363
# Anonymizing D time:19.52(±3.52)

In [4]:
# Anonymize the cardio dataset with the Laplace mechanism: for every privacy
# budget in EPS, add noise to each column except the 'cardio' one, clamp the
# result to the column's observed range, write the table to CSV, and record
# how long the whole pass took.
times = []
data = pd.read_csv('heart/cardio_train.csv', sep=';')
data.drop(columns=['id'], inplace=True)
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)

EPS = [0.5, 2.5, 5., 25., 50., 100., 250, 500, 1000]
# The total budget eps is split evenly; the divisor is the column count minus
# one (assumes exactly one column is skipped by the 'cardio' check below).
budget_shares = len(data.columns) - 1
for eps in EPS:
    t0 = time.perf_counter()
    data_copy = data.copy()
    for attribute in data_copy.columns:
        if 'cardio' in attribute:
            continue
        column = data_copy[attribute]
        # NOTE(review): the bounds (and so the sensitivity) are read from the
        # data itself - confirm that data-dependent bounds are acceptable for
        # the privacy claim being made.
        a, b = column.min(), column.max()
        mech = Laplace(epsilon=eps / budget_shares, sensitivity=b - a)
        # Clamp while the values are still floats (vectorized, and before any
        # integer cast so extreme noise cannot overflow the conversion).
        noisy = column.apply(mech.randomise).clip(lower=a, upper=b)
        if pd.api.types.is_integer_dtype(column.dtype):
            # A plain float->int cast truncates toward zero, so any negative
            # noise - however small - would decrement a positive value (a
            # binary 1 became 0 about half the time even for huge eps).
            # Round to the nearest integer before restoring the dtype.
            noisy = noisy.round()
        data_copy[attribute] = noisy.astype(column.dtype)

    data_copy.to_csv('heart/dp_heart_eps={}.csv'.format(eps), index=False)
    times.append(time.perf_counter() - t0)

mean_time = np.mean(times)
std_time = np.std(times)
print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))

Anonymizing D time:29.73(±0.20)


In [5]:
# Show the recorded wall-clock time next to the privacy budget it belongs to,
# one "<eps> : <seconds>" line per run.
for budget_and_seconds in zip(EPS, times):
    print(*budget_and_seconds, sep=' : ')

0.5 : 29.543476343154907
2.5 : 29.59823989868164
5.0 : 29.734660625457764
25.0 : 29.40824007987976
50.0 : 30.041175365447998
100.0 : 29.871707439422607
250 : 29.984375
500 : 29.705463409423828
1000 : 29.64196276664734


In [6]:
# Anonymize the GiveMeSomeCredit training set with the Laplace mechanism: for
# every privacy budget in EPS, add noise to each column except the
# 'SeriousDlqin2yrs' one, clamp the result to the column's observed range,
# write the table to CSV, and record how long the whole pass took.
times = []
data=pd.read_csv('GiveMeSomeCredit/cs-training.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
data.dropna(inplace=True)
# Reset the index
data.reset_index(drop=True, inplace=True)

EPS = [0.5, 2.5, 5., 25., 50., 100.]
# The total budget eps is split evenly; the divisor is the column count minus
# one (assumes exactly one column is skipped by the label check below).
budget_shares = len(data.columns) - 1
for eps in EPS:
    t0 = time.perf_counter()
    data_copy = data.copy()
    for attribute in data_copy.columns:
        if 'SeriousDlqin2yrs' in attribute:
            continue
        column = data_copy[attribute]
        # NOTE(review): the bounds (and so the sensitivity) are read from the
        # data itself - confirm that data-dependent bounds are acceptable for
        # the privacy claim being made.
        a, b = column.min(), column.max()
        mech = Laplace(epsilon=eps / budget_shares, sensitivity=b - a)
        # Clamp while the values are still floats (vectorized, and before any
        # integer cast so extreme noise cannot overflow the conversion).
        noisy = column.apply(mech.randomise).clip(lower=a, upper=b)
        if pd.api.types.is_integer_dtype(column.dtype):
            # A plain float->int cast truncates toward zero, so any negative
            # noise - however small - would decrement a positive value.
            # Round to the nearest integer before restoring the dtype.
            noisy = noisy.round()
        data_copy[attribute] = noisy.astype(column.dtype)

    data_copy.to_csv('GiveMeSomeCredit/dp_credit_eps={}.csv'.format(eps), index=False)
    times.append(time.perf_counter() - t0)

mean_time = np.mean(times)
std_time = np.std(times)
print('Anonymizing D time:{:0.2f}(±{:0.2f})'.format(mean_time, std_time))
print(times)

Anonymizing D time:43.49(±0.23)
[43.428497076034546, 43.209702491760254, 43.292699098587036, 43.907886266708374, 43.61999177932739, 43.48688077926636]
