# CS 3110/5110: Data Privacy
## In-Class Exercise, week of 10/28/2024

In [57]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

# adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

In [58]:
# Load data files
import numpy as np
import urllib.request
import io

url_x = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_x.npy'
url_y = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_y.npy'

with urllib.request.urlopen(url_x) as url:
    f = io.BytesIO(url.read())
X = np.load(f)

with urllib.request.urlopen(url_y) as url:
    f = io.BytesIO(url.read())
y = np.load(f)

In [59]:
# Split data into training and test sets
training_size = int(X.shape[0] * 0.8)

X_train = X[:training_size]
X_test = X[training_size:]

y_train = y[:training_size]
y_test = y[training_size:]

print('Train and test set sizes:', len(y_train), len(y_test))

Train and test set sizes: 36176 9044


## Question 1

Using scikit-learn, train a logistic regression model on the training data loaded above.

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
def train_model():
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model

model = train_model()
print('Model coefficients:', model.coef_[0])
print('Model accuracy:', np.sum(model.predict(X_test) == y_test)/X_test.shape[0])

Model coefficients: [ 4.78327308e-01 -1.79806028e-01 -3.39637532e-02  6.86128365e-02
 -5.90903592e-01 -3.66078081e-01 -1.06091721e+00 -6.69647736e-01
 -6.36797526e-01 -4.60550239e-01 -5.10404525e-01 -3.97748916e-01
 -7.57623491e-01 -7.81058902e-01  6.35961618e-02  1.18095574e-01
  5.10802728e-01  1.00300755e+00 -1.12204317e-01  7.38182516e-01
 -1.18480223e+00  1.28199853e+00  1.10426298e-01 -8.97809166e-01
  1.50211124e+00  1.25219604e+00 -5.83417398e-01 -1.31365464e+00
 -8.89699764e-01 -7.54454833e-01 -5.67112384e-02  5.14967749e-02
 -7.26307020e-04  7.21919618e-01 -8.96583904e-01 -6.94331444e-01
 -3.23904746e-01 -9.41122244e-01 -1.09675692e+00  4.72945528e-01
  4.76971382e-01  2.21660397e-01  5.09693212e-01 -1.29278625e-01
 -3.74347246e-01  1.91420884e-03 -7.74112146e-01 -1.05078670e+00
 -1.90500912e-01  7.03104276e-01 -5.02404187e-01 -4.49306974e-02
 -4.87959500e-01 -4.17916902e-01 -2.31517233e-01 -1.17503594e+00
 -5.09692581e-01  8.94653030e-01  6.12652401e-01 -2.84243265e-01
 -1.2

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Question 2

Implement the *average gradient* of the loss below.

In [62]:
theta = [0 for _ in range(104)]
i=1
gradient(theta, X_train[i], y_train[i]).shape

(104,)

In [63]:
# The loss function measures how good our model is. The training goal is to minimize the loss.
# This is the logistic loss function.
def loss(theta, xi, yi):
    exponent = - yi * (xi.dot(theta))
    return np.log(1 + np.exp(exponent))

# This is the gradient of the logistic loss
# The gradient is a vector that indicates the rate of change of the loss in each direction
def gradient(theta, xi, yi):
    exponent = yi * (xi.dot(theta))
    return - (yi*xi) / (1+np.exp(exponent))

In [64]:
def avg_grad(theta, X, y):
    #list of vectors, each vector has length of 104
    all_grads = [gradient(theta, X[i], y[i]) for i in range(len(X))]
    avg_grad = np.mean(all_grads, axis=0)
    return avg_grad

In [65]:
avg_grad(theta, X_train, y_train).shape

(104,)

## Question 3

Use the average gradient from above to implement a gradient descent algorithm.

In [66]:
def gradient_descent(iterations):
    theta = [0  for _ in range(104)]
    for _ in range(iterations):
        theta = theta - avg_grad(theta, X_train, y_train)
    return theta

theta = gradient_descent(10)
theta

array([ 1.63933476e-02, -2.62737908e-02, -3.76703876e-01,  5.75414219e-02,
       -6.45755008e-02, -2.73486282e-02, -1.30672661e-03, -5.91543132e-02,
       -7.64880798e-02, -2.62061378e-02, -1.57561761e-02, -2.86662073e-02,
       -4.72301785e-02, -3.76260473e-02, -9.90166218e-03, -1.97245917e-02,
        1.55177596e-01,  4.49929106e-02, -3.29796604e-01,  1.19518106e-01,
       -4.89454322e-03,  6.88790663e-02, -1.55396891e-01, -1.80599005e-01,
        1.54260151e-03,  3.89119966e-01, -1.97894731e-02, -5.15748994e-01,
       -5.58626968e-02, -4.09361505e-02, -1.28065226e-01, -1.62203385e-04,
       -1.08119029e-01,  1.88498244e-01, -6.47481586e-02, -8.66006277e-02,
       -9.99400278e-02, -2.05565204e-01, -1.11388992e-02,  1.61706332e-01,
       -3.59862313e-03, -1.50012146e-02,  1.68869178e-03, -5.12278071e-02,
        3.20211424e-01, -2.99730645e-01, -6.21807885e-02, -2.79987375e-01,
       -1.81265739e-01,  8.06793703e-02, -1.62147320e-02, -2.28589562e-02,
       -1.37829430e-01, -

In [67]:
# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(xi, theta, bias=0):
    label = np.sign(xi @ theta + bias)
    return label

def accuracy(theta):
    return np.sum(predict(X_test, theta) == y_test)/X_test.shape[0]

accuracy(theta)

0.7787483414418399

## Question 4

Implement a *noisy gradient descent* algorithm.

1. Calculate gradients for each example
2. Clip the gradients to have bounded $L2$ norm
3. Sum the clipped gradients
4. Use the Gaussian mechanism to add noise to the sum of gradients

In [68]:
adult['Age'].clip(lower=0, upper=100).sum()

1256257

In [69]:
def L2_clip(v, b):
    norm = np.linalg.norm(v, ord=2)
    
    if norm > b:
        return b * (v / norm)
    else:
        return v

def noisy_gradient_descent(iterations, epsilon, delta):
    theta = np.array([0 for _ in range(104)]) # initial model
    epsilon_i = epsilon/iterations
    delta_i = delta/ iterations
    for _ in range(iterations):
        #grad = avg_grad(theta, X_train, y_train)
        #goal
        #1. compute one gradient per example in X_train
        all_grads = [gradient(theta, X_train[i], y_train[i]) for i in range(len(X_train))]
        
        #2. calll L2_clip on each gradient
        b = 3
        clipped_grads = [L2_clip(g, b) for g in all_grads]
    
        #3. take the sum of the clipped gradients and add noise
        grad_sum = np.sum(clipped_grads, axis=0)
        #TODO: sensitivity is correct by clipping 
        noisy_grad_sum = gaussian_mech_vec(grad_sum, sensitivity=5, epsilon=epsilon_i, delta=delta_i)
        #Danger: reveals the size of the training data 
        #this is not a big deal to reveal 
        noisy_grad = np.array(noisy_grad_sum)/ len(X_train)
        theta = theta - noisy_grad
    return theta

theta = noisy_gradient_descent(10, 10000000, 1e-5)
print('Final accuracy:', accuracy(theta))

Final accuracy: 0.7787483414418399


In [70]:
# TEST CASE

assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70

## Question 5

What is the *total privacy cost* of the noisy gradient descent algorithm above, and why? Argue informally that the algorithm satisfies this privacy cost. Use sequential composition.

1. Sensitivity:
   - I use a sensitivity of b, the l2 clipping parameter, when calling 'gaussian_mech_vec'
   - I enforce an L2 global  sensitivity of 'b' by performing L2 clipping on each gradient
3. Composition:
   -I run the 'gaussian_mech_vec' 'iterations' times, with 'epsilon_i', 'detla_i' each time
   - Total cost is 'iterations * epsilon_i = epsilon', 'iterations * delta_i = delta'- differential privacy by sequential composition of (epsilon,delta)-DP
5. post-processing:
   - Theta starts as all 0s, so it has no info from the training data
   -  Every iteration of the loop updates theta, but only using differentiall private results
   -   Therefore, its ok to return theta by post-processing
   -   EXCEPT: the length of the training data does leak into theta


## Question 6

Repeat the above, but using advanced composition.

In [71]:
# goal: total privacy cost is (epsilon, delta)-DP by advanced composition
iterations = 10
target_epsilon =1.0
target_delta= 1e-5

delta_prime = target_delta/2
epsilon_i = target_epsilon/(2*np.sqrt(2*iterations*np.log(1/delta_prime)))
delta_i = (target_delta/2)/iterations


print(2*epsilon_i*np.sqrt(2*iterations*np.log(1/delta_prime)))
print(iterations*delta_i + delta_prime)


0.9999999999999999
1.0000000000000003e-05


I would set:
- $\epsilon_i$ = `target_epsilon/(2*np.sqrt(2*iterations*np.log(1/delta_prime)))`
- $\delta_i$  = `(target_delta/2)/iterations`

## Question 7

Implement a version of noisy gradient descent that satisfies a *total* of $(\epsilon, \delta)$-differential privacy. Use sequential composition.

In [72]:
# WE DID THIS IS QUESTION 4

In [52]:
# TEST CASE

assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70

AssertionError: 