In [2]:
import numpy as np
import pandas as pd


In [3]:
file = pd.read_csv('file.csv', header=0)
file

Unnamed: 0,id,age,income,visits,diagnosis,housing_bill,hospital_bill,medication_bill,insurance,total_bills
0,1,25,50000,3,Diabetes,1200,3500,600,200,5500
1,2,32,75000,5,Healthy,1500,0,0,150,1650
2,3,45,60000,2,Cancer,1800,12500,1200,300,15800
3,4,28,48000,4,Healthy,1100,0,0,120,1220
4,5,50,90000,6,Diabetes,2200,4200,800,250,7450
5,6,22,30000,1,Healthy,900,0,0,100,1000
6,7,35,65000,7,Cancer,1600,18700,1500,320,21820
7,8,40,80000,3,Diabetes,2000,3800,700,230,6730
8,9,29,55000,2,Healthy,1300,0,0,140,1440
9,10,60,95000,5,Cancer,2500,15400,1800,350,20050


In [4]:
#in this code, we will be using both expoential, laplace and gaussian mechanism to add noise to the data
#for comparison

#starting off with laplace mechanism (for ages)

#convert into numpy array
age = np.array(file['age'],dtype=float)
true_avg_age = np.mean(age)
print("True average age: ",true_avg_age)

#compute the sensitivity of the data
total_age = np.sum(age)
num_of_age = len(age)
change1 = (total_age -np.max(age))/(num_of_age-1)
change2 = (total_age-np.min(age))/(num_of_age-1)
sensitivity = max(np.abs(true_avg_age-change1),np.abs(true_avg_age-change2))

print("Sensitivity of the data is: ",sensitivity)

epsilon = 0.001
noise_scale = sensitivity/epsilon
#adding noise to the data
laplace_noise = np.random.laplace(0,noise_scale)
print(laplace_noise)

laplace_mechanism = true_avg_age + laplace_noise
print("Laplace mechanism: ",laplace_mechanism)


True average age:  44.5
Sensitivity of the data is:  0.9285714285714306
627.4696969220646
Laplace mechanism:  671.9696969220646


In [32]:
#doing the exponential mechanism (for the diagnosis)

diagnosis_column = file['diagnosis']

#exponential mechanism
def exponential_mechanism(full_count,epsilon,sensitivity):
    probabilities = np.exp(epsilon * np.array(full_count) / (2 * sensitivity))
    probabilities /= np.sum(probabilities)
    
    #select a movie based on the probabilities
    selected_dianosis = np.random.choice(diagnosises, p=probabilities)
    
    return selected_dianosis



#counting the number of diabetes patients
diabetes_count = np.sum(diagnosis_column == 'Diabetes')
healthy_count = np.sum(diagnosis_column == 'Healthy')
cancer_count = np.sum(diagnosis_column == 'Cancer')
hypertension_count = np.sum(diagnosis_column == 'Hypertension')

full_count = np.array([diabetes_count,healthy_count,cancer_count,hypertension_count])
diagnosises = np.array(['Diabetes','Healthy','Cancer','Hypertension'])
epsilon = 0.1
sensitivity = 1  # sensitivity of the data

selected_diagnosis = exponential_mechanism(full_count, epsilon, sensitivity)
print("Selected diagnosis: ", selected_diagnosis)
print((np.exp(epsilon * np.array(full_count) / (2 * sensitivity)) / np.sum(np.exp(epsilon * np.array(full_count) / (2 * sensitivity)))) * 100)



Selected diagnosis:  Cancer
[22.57835737 35.40991299 22.57835737 19.43337228]


In [39]:
#doing the gaussian mechanism (for the bills)
bill_names = ["Housing Bills","Medication Bills","Insurance Bills"]
bill_amounts = {
    "Housing Bills": file["housing_bill"],
    "Medication Bills": file["medication_bill"],
    "Insurance Bills": file["insurance"]
}

#calculating the true averages salaries for each department
avg_bill = {} 
sum_bill = {}
min_bill = {}
max_bill = {}
total_bill = {}
for name in bill_names:
    bills = bill_amounts[name]
    avg_bill[name] = np.mean(bills)
    sum_bill[name] = np.sum(bills)
    min_bill[name] = np.min(bills)
    max_bill[name] = np.max(bills)
    total_bill[name] = len(bills)


#computing the sensitivity
sensitivity = {}
change1 = {}
change2 = {}


for name in bill_names:
    change1[name] = (sum_bill[name]-max_bill[name])/(total_bill[name]-1)
    change2[name] = (sum_bill[name]-min_bill[name])/(total_bill[name]-1)
    sensitivity[name] = max(np.abs(avg_bill[name]-change1[name]),np.abs(avg_bill[name]-change2[name]))


#calculating global sensitivity
global_sensitivity = 0
for name in bill_names:
    global_sensitivity += sensitivity[name]**2
global_sensitivity = np.sqrt(global_sensitivity)


epsilon = 0.5
delta = 0.01
#calculating the gaussian noise standard deviation
for i in range(20): 
    sigma = (global_sensitivity)*np.sqrt(2*np.log(1.25/delta))/epsilon



    #adding noise to eacu department averages
    noisy_avg_bills = {}
    for name in bill_names:
        noise = np.random.normal(0,sigma)
        noisy_avg_bills[name] = avg_bill[name] + noise



    #final output
    print("Final Noisy Output:")
    print(noisy_avg_bills)
    print("vs")
    print(avg_bill)
    print()


Final Noisy Output:
{'Housing Bills': np.float64(1889.336467307169), 'Medication Bills': np.float64(270.1300740049962), 'Insurance Bills': np.float64(-377.5462834475444)}
vs
{'Housing Bills': np.float64(1959.2), 'Medication Bills': np.float64(649.0), 'Insurance Bills': np.float64(233.4)}

Final Noisy Output:
{'Housing Bills': np.float64(1657.8153673049371), 'Medication Bills': np.float64(811.7891818032281), 'Insurance Bills': np.float64(-57.83358394143127)}
vs
{'Housing Bills': np.float64(1959.2), 'Medication Bills': np.float64(649.0), 'Insurance Bills': np.float64(233.4)}

Final Noisy Output:
{'Housing Bills': np.float64(1844.2963036907847), 'Medication Bills': np.float64(960.0038940525551), 'Insurance Bills': np.float64(188.04253419632522)}
vs
{'Housing Bills': np.float64(1959.2), 'Medication Bills': np.float64(649.0), 'Insurance Bills': np.float64(233.4)}

Final Noisy Output:
{'Housing Bills': np.float64(1767.62594965619), 'Medication Bills': np.float64(117.31815266215779), 'Insuran

In [None]:
#to make a table of all of this, i need to ignore the fact that the random vairables are random and just used iterations in terms of the value of epsilons 
# and deltas and the number of iterations to get the final output
# giving a table of final outputs of data for me to get a conclusion of which mechanism is better for the data