In [8]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import opendp.prelude as dp
import pandas as pd

dp.enable_features("contrib")
dp.enable_features("floating-point")
from utilities import * 
from pure_ldp.frequency_oracles import *
from pure_ldp.heavy_hitters import *

In [9]:
from StabilityHist import *
from Laplace import *
from Randomised_Response import *
from Unary_Encoding import *
from OLH import *
from Hadamard import *
from postprocessing import *

### Define Parameters

In [24]:
# Geographic aggregation level of the commute data: "county" or "ed".
level = "ed"

# Mechanism to run. Values accepted by the validation cell:
# laplace, stabilityhist, unaryencoding, randresponse, olh, hadamard, rappor
# NOTE(review): "rappor" is accepted, but no mechanism implementation is wired up for it below.
#Mechanism = "laplace"
#Mechanism_name = "Laplace Mechanism"

Mechanism = "randresponse"

# Directory holding the input CSV files (written with a trailing separator).
path = "./"
# When True, the post-processed table and the RMSE/timing summary are written under Data/.
save = True

# Only passed to the Laplace and Stability-Histogram mechanisms.
# presumably the maximum number of records one individual can contribute — TODO confirm
max_influence = 2
# Privacy budgets to sweep: 0.5, 1.0, ..., 5.0 (the stop value 5.5 is excluded).
epsilon = np.arange(0.5,5.5, 0.5)


In [25]:
# Fail fast on a mistyped configuration, before any data is loaded.
if level not in ("county", "ed"):
    raise ValueError(f"The level does not equal county or ed. The current input is {level=}")

# NOTE(review): "rappor" is accepted here, but no mechanism implementation
# is wired up for it in this notebook.
if Mechanism not in ("laplace", "stabilityhist", "unaryencoding", "randresponse", "rappor", "olh", "hadamard"):
    raise ValueError(
        "The Mechanism is not supported or there is a typo with the input. "
        "Please check the available Mechanisms and try again.\n"
        f"The current input is {Mechanism=}"
    )

# Prefix used in the CSV column names: "County" for county level, "ED" for ed level.
Level = level.capitalize() if level == "county" else level.upper()

def get_variables(path, level, Level):
    """Load the commute data for one geographic level.

    Two CSV files are read from the directory ``path``:

    * ``Dagg_commute_{level}_level_all.csv`` - the aggregated table. The unique
      values of its ``{Level}_commute`` column become the category list. Per
      the original author's note, categories must come from this file because
      it also lists categories whose count is 0.
    * ``Dcommute_{level}_level_all.csv`` - one row per individual. Its columns
      are renamed to ``col_names``.

    Parameters
    ----------
    path : str
        Directory containing the CSV files (with or without a trailing separator).
    level : str
        Level as used in the file names (e.g. ``"ed"``).
    Level : str
        Level as used in the column names (e.g. ``"ED"``).

    Returns
    -------
    tuple
        ``(size, categories, col_names, data_df, commutes)`` where ``size`` is
        the number of rows in the individual-level file, ``categories`` is the
        list of possible commutes, ``col_names`` are the column names applied
        to ``data_df``, ``data_df`` is the individual-level DataFrame and
        ``commutes`` is its ``{Level}_commute`` column as a list.
    """
    agg_data_df = pd.read_csv(os.path.join(path, f"Dagg_commute_{level}_level_all.csv"))
    data_df = pd.read_csv(os.path.join(path, f"Dcommute_{level}_level_all.csv"))

    col_names = [f"{Level}_commute", f"{Level}_Origin", f"{Level}_Destination"]
    size = len(data_df)  # number of individuals in the dataset

    # Every possible category, including those no individual reports.
    categories = list(agg_data_df[f"{Level}_commute"].unique())

    # The individual-level file's own header is replaced by the canonical names.
    data_df.columns = col_names
    commutes = data_df[f"{Level}_commute"].tolist()

    return (size, categories, col_names, data_df, commutes)

# Load the dataset:
#   size       - number of individuals
#   categories - list of all possible commutes
#   col_names  - column names applied to data_df
#   data_df    - one row per individual
#   commutes   - each individual's commute, e.g. commutes[1] is the commute of individual 1
size, categories, col_names, data_df, commutes = get_variables(path, level, Level)

# Only the stability histogram is run with a non-zero delta.
delta = 1/(2*size) if Mechanism == "stabilityhist" else 0

# One (epsilon, delta) pair per privacy level in the sweep.
budget = list(zip(epsilon, [delta] * len(epsilon)))
d = len(categories)


### Compute Sensitive Counts

In [26]:
# Read the raw CSV as a single string; the OpenDP pipeline below parses it.
# The file is resolved against the configured `path`, like the files loaded
# by get_variables.
# NOTE(review): this file name has no "D" prefix, unlike the Dcommute_/Dagg_commute_
# files read by get_variables — confirm both describe the same population.
with open(os.path.join(path, f'commute_{level}_level_all.csv')) as input_data:
    data_all = input_data.read()

# Non-private histogram: split the CSV text into named columns, select the
# commute column as strings, then count occurrences of each known category.
histogram = (
    dp.t.make_split_dataframe(separator=",", col_names=col_names) >>
    dp.t.make_select_column(key=f"{Level}_commute", TOA=str) >>
    dp.t.then_count_by_categories(categories=categories)
)

# With its default null_category=True, count_by_categories appends one extra
# bin for values outside `categories`; drop it so the result lines up with
# `categories` one-to-one.
sensitive_counts = histogram(data_all)[:-1]

In [27]:
# Map each category to its position in `categories`, then encode every
# individual's commute as that integer index.
datadict = {category: position for position, category in enumerate(categories)}
data = [datadict[commute] for commute in commutes]

In [28]:
# Number of possible commute categories (the value stored in `d`).
len(categories)

26244

### Run Mechanisms

In [29]:
# Run the selected mechanism over every (epsilon, delta) pair in `budget`.
# Each branch must define `released_counts` and `total_elapsed_time`, which
# the post-processing and save cells read.
if Mechanism == "laplace":
    # Takes the raw CSV text already loaded into `data_all`; re-reading the file
    # here would also overwrite the integer-encoded `data` list.
    released_counts, total_elapsed_time, all_rmse = Laplace_Mechamism(budget, max_influence, data_all, histogram, sensitive_counts)
elif Mechanism == "stabilityhist":
    released_counts, total_elapsed_time, all_rmse = Stability_Hist(col_names, Level, budget, max_influence, size, data_all, histogram, categories, sensitive_counts)
elif Mechanism == "randresponse":
    released_counts_client, elapsed_time_client = run_client(Mechanism, Level, budget, size, categories, commutes, sensitive_counts)
    released_counts, elapsed_time_server = Randomised_Response_Server(released_counts_client, sensitive_counts,  size, budget, categories)
    # Total time per budget entry = client time + server time.
    total_elapsed_time = [sum(element) for element in zip(elapsed_time_client, elapsed_time_server)]
elif Mechanism == "unaryencoding":
    released_counts_client, elapsed_time_client = Unary_Encoding_Client(budget, size, categories, commutes, sensitive_counts)
    released_counts, elapsed_time_server = Unary_Encoding_Server(released_counts_client, sensitive_counts,  size, budget)
    total_elapsed_time = [sum(element) for element in zip(elapsed_time_client, elapsed_time_server)]
elif Mechanism == "olh":
    # These two mechanisms take the integer-encoded commutes and the domain size d.
    released_counts, total_elapsed_time = OLH(budget, data, d)
elif Mechanism == "hadamard":
    released_counts, total_elapsed_time = Hadamard(budget, data, d)
else:
    # Without this, an unhandled mechanism (e.g. "rappor") leaves
    # `released_counts` undefined and a later cell fails with a NameError.
    raise NotImplementedError(f"No mechanism implementation is wired up for {Mechanism=}")

Starting Randomised Response with an epsilon value of  0.5
Finished Randomised Response with an epsilon value of  0.5
Starting Randomised Response with an epsilon value of  1.0
Finished Randomised Response with an epsilon value of  1.0
Starting Randomised Response with an epsilon value of  1.5
Finished Randomised Response with an epsilon value of  1.5
Starting Randomised Response with an epsilon value of  2.0
Finished Randomised Response with an epsilon value of  2.0
Starting Randomised Response with an epsilon value of  2.5
Finished Randomised Response with an epsilon value of  2.5
Starting Randomised Response with an epsilon value of  3.0
Finished Randomised Response with an epsilon value of  3.0
Starting Randomised Response with an epsilon value of  3.5
Finished Randomised Response with an epsilon value of  3.5
Starting Randomised Response with an epsilon value of  4.0
Finished Randomised Response with an epsilon value of  4.0
Starting Randomised Response with an epsilon value of  4

In [30]:
# Peek at the raw estimates before post-processing (presumably those for budget[0]).
# NOTE(review): the slice starts at index 1, so the first category is not shown — confirm intended.
released_counts[0][1:10]

array([ -13722.08779316, -135089.99989944,   26733.88290894,
        -13722.08779316,  -54178.05849525,  107645.82431313,
        107645.82431313,  107645.82431313,  188557.76571732])

### Postprocessing

In [31]:
# Post-process the released counts. Only the Base_Pros variant is active; the
# Base and Base_Cut calls are kept commented out for reference.
# The three results are indexed per budget entry by the cells below and saved
# as the processed counts, the "postprocessing_time" column and the "rmse" column.
#base_counts, base_time, base_rmse = Base(released_counts, sensitive_counts, budget)
base_pros_counts, base_pros_time, base_pros_rmse = Base_Pros(released_counts, sensitive_counts, budget)
#base_cut_counts, base_cut_time, base_cut_rmse = Base_Cut(released_counts, sensitive_counts, budget, size)

In [32]:
# Release table for the Base_Pros post-processing: one row per commute
# category, its true count, and one "Privacy <epsilon>" column per budget entry.
# (The Base and Base_Cut tables, private_dataset1_df / private_dataset3_df, were
# built the same way from base_counts / base_cut_counts and are currently disabled.)
private_dataset2_df = pd.DataFrame(categories, columns = [f'{Level} Level Commute'])
private_dataset2_df['True Count'] = sensitive_counts
for idx, (eps, _delta) in enumerate(budget):
    private_dataset2_df[f'Privacy {eps}'] = base_pros_counts[idx]

In [33]:
# Display the post-processed release table (pandas elides the middle rows).
private_dataset2_df

Unnamed: 0,ED Level Commute,True Count,Privacy 0.5,Privacy 1.0,Privacy 1.5,Privacy 2.0,Privacy 2.5,Privacy 3.0,Privacy 3.5,Privacy 4.0,Privacy 4.5,Privacy 5.0
0,Arran Quay A-Arran Quay A,1,229014,0,0,0,0,2293,0,823,0,306
1,Arran Quay A-Arran Quay B,17,0,55922,0,0,0,3670,0,0,0,0
2,Arran Quay A-Arran Quay C,7,0,0,4988,0,0,0,0,0,204,127
3,Arran Quay A-Arran Quay D,1,26734,0,0,2723,1559,917,1367,1804,0,1022
4,Arran Quay A-Arran Quay E,0,0,86471,0,0,0,0,3822,1314,204,306
...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Wood Quay B-Walkinstown A,4,67190,0,0,0,6255,7798,0,2295,0,0
26240,Wood Quay B-Walkinstown B,0,0,0,20066,0,1559,9174,549,1314,0,0
26241,Wood Quay B-Walkinstown C,2,107646,0,20066,6831,8603,2293,0,0,0,0
26242,Wood Quay B-Wood Quay A,22,26734,25373,4988,2723,6255,0,0,2295,0,0


In [34]:
# Persist the release table and a per-budget summary (epsilon, delta, RMSE,
# mechanism run time, post-processing time).
if save:
    # Neither to_csv nor savetxt creates missing directories.
    os.makedirs('Data', exist_ok=True)
    private_dataset2_df.to_csv(f'Data/D_{level}_{Mechanism}_dp_df.csv', sep=',', index=False, encoding='utf-8', mode='w')
    names = ("epsilon, delta, rmse, total_elapsed_time, postprocessing_time")
    np.savetxt(f'Data/D_rmse_{Mechanism}_{Level}.csv', [(budget[i][0], budget[i][1], base_pros_rmse[i], total_elapsed_time[i], base_pros_time[i]) for i in range(len(budget))] , header = names, delimiter=',')

### Save File for Analysis

In [None]:
# if save:
#     private_dataset1_df.to_csv(f'Data/{level}_{Mechanism}_dp1_df.csv', sep=',', index=False, encoding='utf-8', mode='w')
#     names = ("epsilon, delta, rmse, total_elapsed_time, postprocessing_time")
#     np.savetxt(f'Data/rmse1_{Mechanism}_{Level}.csv', [(budget[i][0], budget[i][1], base_rmse[i], total_elapsed_time[i], base_time[i]) for i in range(len(budget))] , header = names, delimiter=',')

#     private_dataset2_df.to_csv(f'Data/{level}_{Mechanism}_dp2_df.csv', sep=',', index=False, encoding='utf-8', mode='w')
#     names = ("epsilon, delta, rmse, total_elapsed_time, postprocessing_time")
#     np.savetxt(f'Data/rmse2_{Mechanism}_{Level}.csv', [(budget[i][0], budget[i][1], base_pros_rmse[i], total_elapsed_time[i], base_pros_time[i]) for i in range(len(budget))] , header = names, delimiter=',')

#     private_dataset3_df.to_csv(f'Data/{level}_{Mechanism}_dp3_df.csv', sep=',', index=False, encoding='utf-8', mode='w')
#     names = ("epsilon, delta, rmse, total_elapsed_time, postprocessing_time")
#     np.savetxt(f'Data/rmse3_{Mechanism}_{Level}.csv', [(budget[i][0], budget[i][1], base_cut_rmse[i], total_elapsed_time[i], base_cut_time[i]) for i in range(len(budget))] , header = names, delimiter=',')


In [117]:
# NOTE(review): this cell repeats the whole pipeline from the cells above with
# Mechanism = "olh"; it relies on get_variables having been defined earlier.

# Geographic aggregation level of the commute data: "county" or "ed".
level = "ed"

# Mechanism to run. Values accepted by the validation below:
# laplace, stabilityhist, unaryencoding, randresponse, olh, hadamard, rappor
#Mechanism = "laplace"
#Mechanism_name = "Laplace Mechanism"

Mechanism = "olh"

# Directory holding the input CSV files (written with a trailing separator).
path = "./"
# When True, the post-processed table and the RMSE/timing summary are written under Data/.
save = True

# Only passed to the Laplace and Stability-Histogram mechanisms.
# presumably the maximum number of records one individual can contribute — TODO confirm
max_influence = 2
# Privacy budgets to sweep: 0.5, 1.0, ..., 5.0 (the stop value 5.5 is excluded).
epsilon = np.arange(0.5,5.5, 0.5)

# Fail fast on a mistyped configuration, before any data is loaded.
if level not in ("county", "ed"):
    raise ValueError(f"The level does not equal county or ed. The current input is {level=}")

# NOTE(review): "rappor" is accepted here, but no mechanism implementation
# is wired up for it in this notebook.
if Mechanism not in ("laplace", "stabilityhist", "unaryencoding", "randresponse", "rappor", "olh", "hadamard"):
    raise ValueError(
        "The Mechanism is not supported or there is a typo with the input. "
        "Please check the available Mechanisms and try again.\n"
        f"The current input is {Mechanism=}"
    )

# Prefix used in the CSV column names: "County" for county level, "ED" for ed level.
Level = level.capitalize() if level == "county" else level.upper()



# Load the dataset:
#   size       - number of individuals
#   categories - list of all possible commutes
#   col_names  - column names applied to data_df
#   data_df    - one row per individual
#   commutes   - each individual's commute, e.g. commutes[1] is the commute of individual 1
size, categories, col_names, data_df, commutes = get_variables(path, level, Level)

# Only the stability histogram is run with a non-zero delta.
delta = 1/(2*size) if Mechanism == "stabilityhist" else 0

# One (epsilon, delta) pair per privacy level in the sweep.
budget = list(zip(epsilon, [delta] * len(epsilon)))
d = len(categories)

# Read the raw CSV as a single string; the OpenDP pipeline below parses it.
# The file is resolved against the configured `path`, like the files loaded
# by get_variables.
# NOTE(review): this file name has no "D" prefix, unlike the Dcommute_/Dagg_commute_
# files read by get_variables — confirm both describe the same population.
with open(os.path.join(path, f'commute_{level}_level_all.csv')) as input_data:
    data_all = input_data.read()

# Non-private histogram: split the CSV text into named columns, select the
# commute column as strings, then count occurrences of each known category.
histogram = (
    dp.t.make_split_dataframe(separator=",", col_names=col_names) >>
    dp.t.make_select_column(key=f"{Level}_commute", TOA=str) >>
    dp.t.then_count_by_categories(categories=categories)
)

# With its default null_category=True, count_by_categories appends one extra
# bin for values outside `categories`; drop it so the result lines up with
# `categories` one-to-one.
sensitive_counts = histogram(data_all)[:-1]


# Map each category to its position in `categories`, then encode every
# individual's commute as that integer index.
datadict = {category: position for position, category in enumerate(categories)}
data = [datadict[commute] for commute in commutes]

# Run the selected mechanism over every (epsilon, delta) pair in `budget`.
# Each branch must define `released_counts` and `total_elapsed_time`, which
# the post-processing and save steps read.
if Mechanism == "laplace":
    # Takes the raw CSV text already loaded into `data_all`; re-reading the file
    # here would also overwrite the integer-encoded `data` list.
    released_counts, total_elapsed_time, all_rmse = Laplace_Mechamism(budget, max_influence, data_all, histogram, sensitive_counts)
elif Mechanism == "stabilityhist":
    released_counts, total_elapsed_time, all_rmse = Stability_Hist(col_names, Level, budget, max_influence, size, data_all, histogram, categories, sensitive_counts)
elif Mechanism == "randresponse":
    released_counts_client, elapsed_time_client = run_client(Mechanism, Level, budget, size, categories, commutes, sensitive_counts)
    released_counts, elapsed_time_server = Randomised_Response_Server(released_counts_client, sensitive_counts,  size, budget, categories)
    # Total time per budget entry = client time + server time.
    total_elapsed_time = [sum(element) for element in zip(elapsed_time_client, elapsed_time_server)]
elif Mechanism == "unaryencoding":
    released_counts_client, elapsed_time_client = Unary_Encoding_Client(budget, size, categories, commutes, sensitive_counts)
    released_counts, elapsed_time_server = Unary_Encoding_Server(released_counts_client, sensitive_counts,  size, budget)
    total_elapsed_time = [sum(element) for element in zip(elapsed_time_client, elapsed_time_server)]
elif Mechanism == "olh":
    # These two mechanisms take the integer-encoded commutes and the domain size d.
    released_counts, total_elapsed_time = OLH(budget, data, d)
elif Mechanism == "hadamard":
    released_counts, total_elapsed_time = Hadamard(budget, data, d)
else:
    # Without this, an unhandled mechanism (e.g. "rappor") leaves
    # `released_counts` undefined and the next step fails with a NameError.
    raise NotImplementedError(f"No mechanism implementation is wired up for {Mechanism=}")

# Post-process the released counts. Only the Base_Pros variant is active; the
# Base and Base_Cut variants (base_counts / base_cut_counts, feeding
# private_dataset1_df / private_dataset3_df) are currently disabled.
base_pros_counts, base_pros_time, base_pros_rmse = Base_Pros(released_counts, sensitive_counts, budget)

# Release table: one row per commute category, its true count, and one
# "Privacy <epsilon>" column per budget entry.
private_dataset2_df = pd.DataFrame(categories, columns = [f'{Level} Level Commute'])
private_dataset2_df['True Count'] = sensitive_counts
for idx, (eps, _delta) in enumerate(budget):
    private_dataset2_df[f'Privacy {eps}'] = base_pros_counts[idx]


# Persist the release table and a per-budget summary (epsilon, delta, RMSE,
# mechanism run time, post-processing time).
if save:
    # Neither to_csv nor savetxt creates missing directories.
    os.makedirs('Data', exist_ok=True)
    private_dataset2_df.to_csv(f'Data/D_{level}_{Mechanism}_dp_df.csv', sep=',', index=False, encoding='utf-8', mode='w')
    names = ("epsilon, delta, rmse, total_elapsed_time, postprocessing_time")
    np.savetxt(f'Data/D_rmse_{Mechanism}_{Level}.csv', [(budget[i][0], budget[i][1], base_pros_rmse[i], total_elapsed_time[i], base_pros_time[i]) for i in range(len(budget))] , header = names, delimiter=',')

Starting OLH with an epsilon value of  0.5




Finished OLH with an epsilon value of  0.5
Starting OLH with an epsilon value of  1.0


KeyboardInterrupt: 