# CS 3110/5110: Data Privacy
## In-Class Exercise, week of 10/28/2024

In [2]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
!pip install scikit-learn

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

# adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Load data files
import numpy as np
import urllib.request
import io

url_x = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_x.npy'
url_y = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_y.npy'

with urllib.request.urlopen(url_x) as url:
    f = io.BytesIO(url.read())
X = np.load(f)

with urllib.request.urlopen(url_y) as url:
    f = io.BytesIO(url.read())
y = np.load(f)

In [4]:
# Split data into training and test sets
training_size = int(X.shape[0] * 0.8)

X_train = X[:training_size]
X_test = X[training_size:]

y_train = y[:training_size]
y_test = y[training_size:]

print('Train and test set sizes:', len(y_train), len(y_test))

Train and test set sizes: 36176 9044


## Question 1

Using scikit-learn, train a logistic regression model on the training data loaded above.

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
def train_model():
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model

model = train_model()
print('Model coefficients:', model.coef_[0])
print('Model accuracy:', np.sum(model.predict(X_test) == y_test)/X_test.shape[0])

Model coefficients: [ 4.78327308e-01 -1.79806028e-01 -3.39637532e-02  6.86128365e-02
 -5.90903592e-01 -3.66078081e-01 -1.06091721e+00 -6.69647736e-01
 -6.36797526e-01 -4.60550239e-01 -5.10404525e-01 -3.97748916e-01
 -7.57623491e-01 -7.81058902e-01  6.35961618e-02  1.18095574e-01
  5.10802728e-01  1.00300755e+00 -1.12204317e-01  7.38182516e-01
 -1.18480223e+00  1.28199853e+00  1.10426298e-01 -8.97809166e-01
  1.50211124e+00  1.25219604e+00 -5.83417398e-01 -1.31365464e+00
 -8.89699764e-01 -7.54454833e-01 -5.67112384e-02  5.14967749e-02
 -7.26307020e-04  7.21919618e-01 -8.96583904e-01 -6.94331444e-01
 -3.23904746e-01 -9.41122244e-01 -1.09675692e+00  4.72945528e-01
  4.76971382e-01  2.21660397e-01  5.09693212e-01 -1.29278625e-01
 -3.74347246e-01  1.91420884e-03 -7.74112146e-01 -1.05078670e+00
 -1.90500912e-01  7.03104276e-01 -5.02404187e-01 -4.49306974e-02
 -4.87959500e-01 -4.17916902e-01 -2.31517233e-01 -1.17503594e+00
 -5.09692581e-01  8.94653030e-01  6.12652401e-01 -2.84243265e-01
 -1.2

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Question 2

Implement the *average gradient* of the loss below.

In [7]:
# The loss function measures how good our model is. The training goal is to minimize the loss.
# This is the logistic loss function.
def loss(theta, xi, yi):
    exponent = - yi * (xi.dot(theta))
    return np.log(1 + np.exp(exponent))

# This is the gradient of the logistic loss
# The gradient is a vector that indicates the rate of change of the loss in each direction
def gradient(theta, xi, yi):
    exponent = yi * (xi.dot(theta))
    return - (yi*xi) / (1+np.exp(exponent))

In [None]:
def avg_grad(theta, X, y):
    # YOUR CODE HERE
    raise NotImplementedError()

## Question 3

Use the average gradient from above to implement a gradient descent algorithm.

In [None]:
def gradient_descent(iterations):
    # YOUR CODE HERE
    raise NotImplementedError()

theta = gradient_descent(10)
theta

In [None]:
# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(xi, theta, bias=0):
    label = np.sign(xi @ theta + bias)
    return label

def accuracy(theta):
    return np.sum(predict(X_test, theta) == y_test)/X_test.shape[0]

accuracy(theta)

## Question 4

Implement a *noisy gradient descent* algorithm.

1. Calculate gradients for each example
2. Clip the gradients to have bounded $L2$ norm
3. Sum the clipped gradients
4. Use the Gaussian mechanism to add noise to the sum of gradients

In [None]:
def L2_clip(v, b):
    norm = np.linalg.norm(v, ord=2)
    
    if norm > b:
        return b * (v / norm)
    else:
        return v

def noisy_gradient_descent(iterations, epsilon, delta):
    # YOUR CODE HERE
    raise NotImplementedError()

theta = noisy_gradient_descent(10, 1.0, 1e-5)
print('Final accuracy:', accuracy(theta))

In [None]:
# TEST CASE

assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70

## Question 5

What is the *total privacy cost* of the noisy gradient descent algorithm above, and why? Argue informally that the algorithm satisfies this privacy cost. Use sequential composition.

YOUR ANSWER HERE

## Question 6

Repeat the above, but using advanced composition.

YOUR ANSWER HERE

## Question 7

Implement a version of noisy gradient descent that satisfies a *total* of $(\epsilon, \delta)$-differential privacy. Use sequential composition.

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# TEST CASE

assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70