# CS 3110/5110: Data Privacy
## Homework 10

In [2]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` perturbed by a single draw of zero-centred Laplace noise.

    The noise scale is `sensitivity / epsilon`. One noise sample is drawn per
    call, so if `v` is an array the same sample is added to every element.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` perturbed by a single draw of zero-mean Gaussian noise.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each element of `vec`.

    Every element gets its own noise draw with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    def add_noise(value):
        # Computed per element (as before) so an empty `vec` never evaluates it.
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        return value + np.random.normal(loc=0, scale=sigma)

    return list(map(add_noise, vec))

def pct_error(orig, priv):
    """Return the absolute percent error of `priv` relative to `orig`.

    The result is `|orig - priv| / |orig| * 100`. Dividing by the magnitude of
    `orig` keeps the error non-negative when the true value is negative.
    No special handling is done for `orig == 0`.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Adult dataset (the "with PII" CSV from the course repository); the functions
# below read it through this module-level `adult` DataFrame.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

## Question 1 (10 points)

Implement a function `dp_marginal` that calculates a differentially private one-way marginal for a given column of the adult dataset.

In [3]:
def dp_marginal(col, epsilon):
    """Return a noisy one-way marginal (probability per value) for `adult[col]`.

    Args:
        col: name of a column of the module-level `adult` DataFrame.
        epsilon: privacy parameter passed to `laplace_mech` for every count.

    Returns:
        A Series indexed by the values observed in `col` whose entries are
        non-negative and sum to 1.
    """
    # Step 1: true histogram. NOTE(review): the index only contains values
    # that occur in the data, so the released domain is data-dependent.
    counts = adult[col].value_counts()

    # Step 2: add independent Laplace noise (sensitivity 1) to each count.
    noisy_counts = counts.apply(lambda count: laplace_mech(count, sensitivity=1, epsilon=epsilon))

    # Step 3: clip negatives, then normalize to probabilities.
    noisy_counts = noisy_counts.clip(lower=0)
    total = noisy_counts.sum()
    if total > 0:
        return noisy_counts / total

    # Every noisy count was clipped to zero: dividing would give all-NaN
    # "probabilities". Fall back to a uniform distribution over the values.
    return pd.Series(1.0 / max(len(noisy_counts), 1),
                     index=noisy_counts.index,
                     name=noisy_counts.name,
                     dtype=float)

# Example: noisy marginal of the Occupation column with epsilon = 1.0.
dp_marginal('Occupation', 1.0)

Prof-specialty       0.134687
Craft-repair         0.133392
Exec-managerial      0.132338
Adm-clerical         0.122701
Sales                0.118802
Other-service        0.107250
Machine-op-inspct    0.065206
Transport-moving     0.052036
Handlers-cleaners    0.044643
Farming-fishing      0.032348
Tech-support         0.030299
Protective-serv      0.021139
Priv-house-serv      0.004862
Armed-Forces         0.000296
Name: Occupation, dtype: float64

In [4]:
# TEST CASE
# dp_marginal is randomized, so each check accepts a range rather than an exact value.
marginal = dp_marginal('Age', 1.0)
assert marginal[36] > 0.02 and marginal[36] < 0.03
assert marginal[85] > 0.00005 and marginal[85] < 0.0005

marginal = dp_marginal('Occupation', 1.0)
assert marginal['Prof-specialty'] > 0.13 and marginal['Prof-specialty'] < 0.14
assert marginal['Protective-serv'] > 0.02 and marginal['Protective-serv'] < 0.03

## Question 2 (10 points)

Implement a function `dp_synthetic_data` that generates `n` samples of synthetic data for the given columns, by creating one-way marginals for *each column* and then sampling synthetic data elements for each column separately.

In [5]:
def dp_synthetic_data(cols, n, epsilon):
    """Generate `n` synthetic rows by sampling each column independently.

    `epsilon` is divided evenly across `cols`; each column's values are drawn
    with replacement from its `dp_marginal` distribution. Columns are sampled
    separately, so no relationship between columns is carried over.
    """
    # Even share of the budget for every requested column.
    per_col_epsilon = epsilon / len(cols)

    sampled_columns = {}
    for name in cols:
        # Noisy distribution over this column's values.
        dist = dp_marginal(name, per_col_epsilon)

        # Draw n labels from the distribution, weighted by probability.
        draws = dist.sample(n=n, replace=True, weights=dist.values)
        sampled_columns[name] = draws.index.to_numpy()

    return pd.DataFrame(sampled_columns)

# Example: 100 synthetic rows over five columns with epsilon = 1.0.
dp_synthetic_data(['Age', 'Occupation', 'Marital Status', 'Education', 'Relationship'], 100, 1.0)

Unnamed: 0,Age,Occupation,Marital Status,Education,Relationship
0,28,Sales,Never-married,7th-8th,Not-in-family
1,39,Tech-support,Married-civ-spouse,HS-grad,Other-relative
2,21,Prof-specialty,Married-civ-spouse,Some-college,Unmarried
3,20,Machine-op-inspct,Married-civ-spouse,HS-grad,Wife
4,57,Transport-moving,Never-married,Bachelors,Wife
...,...,...,...,...,...
95,26,Sales,Married-civ-spouse,HS-grad,Husband
96,55,Exec-managerial,Married-civ-spouse,HS-grad,Unmarried
97,19,Sales,Married-civ-spouse,Bachelors,Not-in-family
98,44,Exec-managerial,Never-married,11th,Own-child


In [6]:
# TEST CASE
# Each synthetic column must be close (in Wasserstein distance) to the real column.
assert stats.wasserstein_distance(dp_synthetic_data(['Age'], len(adult), 1.0)['Age'], adult['Age']) < 0.2
assert stats.wasserstein_distance(dp_synthetic_data(['Education-Num'], len(adult), 1.0)['Education-Num'], 
                                  adult['Education-Num']) < 0.03

## Question 3 (10 points)

Implement a function `dp_two_marginal` that builds a 2-way marginal with differential privacy.

In [7]:
def two_way_hist(col1, col2, epsilon):
    """Return noisy counts for every (col1, col2) value pair present in `adult`.

    Each count gets its own Laplace noise draw with sensitivity 1 and the
    given `epsilon`. Noisy counts are returned as-is and may be negative.
    """
    pair_counts = adult[[col1, col2]].value_counts()

    def add_noise(count):
        return laplace_mech(count, 1, epsilon)

    return pair_counts.apply(add_noise)

def dp_two_marginal(col1, col2, epsilon):
    """Return a noisy two-way marginal for `col1` x `col2` as a DataFrame.

    Args:
        col1, col2: names of columns of the module-level `adult` DataFrame.
        epsilon: privacy parameter forwarded to `two_way_hist`.

    Returns:
        A DataFrame with columns `col1`, `col2` and `probability`; the
        probabilities are non-negative and sum to 1.
    """
    # Noisy counts can be negative; clip before normalizing.
    noisy_counts = two_way_hist(col1, col2, epsilon).clip(lower=0)
    total = noisy_counts.sum()

    if total > 0:
        probabilities = noisy_counts / total
    else:
        # All counts were clipped to zero: dividing would give NaN
        # everywhere, so fall back to a uniform distribution over the pairs.
        probabilities = pd.Series(1.0 / max(len(noisy_counts), 1),
                                  index=noisy_counts.index,
                                  dtype=float)

    return probabilities.to_frame(name='probability').reset_index()

# Example: noisy joint distribution of Education and Workclass with epsilon = 1.0.
dp_two_marginal('Education', 'Workclass', 1.0)

Unnamed: 0,Education,Workclass,probability
0,HS-grad,Private,0.253296
1,Some-college,Private,0.165840
2,Bachelors,Private,0.115558
3,Assoc-voc,Private,0.032737
4,11th,Private,0.030053
...,...,...,...
96,5th-6th,Federal-gov,0.000002
97,1st-4th,State-gov,0.000000
98,Preschool,State-gov,0.000021
99,11th,Never-worked,0.000023


In [8]:
# TEST CASE
# Look up two (Education, Workclass) cells and check their noisy probabilities
# fall inside the expected ranges.
marginal = dp_two_marginal('Education', 'Workclass', 1.0)
m1 = marginal[(marginal['Education'] == 'HS-grad') & (marginal['Workclass'] == 'Private')]['probability'].values[0]
m2 = marginal[(marginal['Education'] == 'Bachelors') & (marginal['Workclass'] == 'Federal-gov')]['probability'].values[0]
print(m1, m2)
assert m1 > 0.24 and m1 < 0.26
assert m2 > 0.005 and m2 < 0.007

0.2529930783645361 0.006913061300177663


## Question 4 (30 points)

Implement a function `dp_synthetic_data_two_marginal` that generates synthetic data for the `Age`, `Workclass`, `Occupation`, and `Education` columns *while preserving correlations between them* by using overlapping 2-way marginals.

In [9]:
# NOTE: this cell reuses `dp_two_marginal` (and, through it, `two_way_hist`)
# from the Question 3 cell instead of redefining `two_way_hist` here.

def _sample_next_column(marginal, given_col, new_col, given_values):
    """Draw one `new_col` value per entry of `given_values`, conditioned on it.

    `marginal` is a two-way marginal DataFrame with columns `given_col`,
    `new_col` and `probability`. For each distinct conditioning value, rows of
    `marginal` matching that value are sampled with their probabilities as
    weights. If a value has no matching rows or zero total probability, the
    overall distribution of `new_col` in `marginal` is used instead.
    """
    # Distribution of new_col alone, used when conditioning is impossible.
    overall = marginal.groupby(new_col, as_index=False)['probability'].sum()

    drawn = pd.Series(index=given_values.index, dtype=object)
    for value in given_values.unique():
        mask = given_values == value
        pool = marginal[marginal[given_col] == value]
        if pool['probability'].sum() <= 0:
            pool = overall
        # weights=None makes pandas sample uniformly (everything clipped to 0).
        weights = pool['probability'] if pool['probability'].sum() > 0 else None
        picks = pool.sample(n=int(mask.sum()), replace=True, weights=weights)
        drawn[mask] = picks[new_col].to_numpy()
    return drawn


def dp_synthetic_data_two_marginal(n, epsilon):
    """Generate `n` synthetic rows of Age, Workclass, Occupation and Education.

    Three overlapping two-way marginals are built: (Age, Workclass),
    (Workclass, Occupation) and (Occupation, Education). The first pair is
    sampled jointly; each later column is sampled conditioned on the column it
    shares with the previous marginal, so adjacent columns stay correlated.

    `epsilon` is split evenly across the three marginals so that, by
    sequential composition, the whole release uses a total budget of
    `epsilon`.

    Returns:
        A DataFrame with `n` rows and the four columns in the order above.
    """
    cols = ['Age', 'Workclass', 'Occupation', 'Education']

    # Adjacent, overlapping pairs: each shares one column with the next.
    pairs = list(zip(cols[:-1], cols[1:]))

    epsilon_per_marginal = epsilon / len(pairs)
    marginals = [dp_two_marginal(left, right, epsilon_per_marginal)
                 for left, right in pairs]

    # Start the chain by sampling the first pair jointly, all n rows at once.
    first = marginals[0]
    seed_rows = first.sample(n=n, replace=True, weights=first['probability'])
    synthetic = seed_rows[[cols[0], cols[1]]].reset_index(drop=True)

    # Extend one column at a time, conditioning on the shared column.
    for (given_col, new_col), marginal in zip(pairs[1:], marginals[1:]):
        synthetic[new_col] = _sample_next_column(
            marginal, given_col, new_col, synthetic[given_col])

    return synthetic[cols]

# Smoke run: generate 100 synthetic rows with epsilon = 1.0.
synthetic_data = dp_synthetic_data_two_marginal(100, 1.0)

In [11]:
# TEST CASE
# Sanity-check summary statistics of 100 synthetic rows; ranges are wide
# because both the marginals and the sampling are random.
synthetic_data = dp_synthetic_data_two_marginal(100, 1.0)

s1 = synthetic_data['Age'].mean()
s2 = len(synthetic_data[synthetic_data['Workclass'] == 'Private'])
s3 = len(synthetic_data[synthetic_data['Occupation'] == 'Adm-clerical'])
s4 = len(synthetic_data[synthetic_data['Education'] == 'Bachelors'])

print(s1, s2, s3, s4)

assert s1 > 35 and s1 < 45
assert s2 > 65 and s2 < 90
assert s3 > 5 and s3 < 25
assert s4 > 5 and s4 < 35

39.37 78 10 18
