# MLOps Assignment 1: Differential Privacy Model

In [10]:
import tensorflow as tf

In [2]:
import numpy as np
import pandas as pd
from tensorflow import keras

In [1]:
import tensorflow_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy

In [None]:
from sklearn.preprocessing import StandardScaler

## Load Data and Prepare it for Training

In [3]:
# Read the pre-split athlete train and test CSVs into dataframes.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/athletes_v2_train.csv", "/content/athletes_v2_test.csv")
)

In [None]:
# Split each dataframe into model inputs (x) and the regression target (y).
target_column = 'total_lift'
features = ['age', 'weight', 'height', 'gender']

y_train = train_df[target_column].values
y_test = test_df[target_column].values

# Select the feature columns and one-hot encode the categorical gender column.
x_train = pd.get_dummies(train_df[features], columns=['gender'])
x_test = pd.get_dummies(test_df[features], columns=['gender'])

# Align the test columns to the training layout: a dummy column that is absent
# from the test split is added and filled with 0.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [None]:
# Standardize the numeric features. The scaler is fitted on the training split
# only, and the same fitted scaler is then applied to both splits.
numerical_cols = ['age', 'weight', 'height']
scaler = StandardScaler().fit(x_train[numerical_cols])

x_train[numerical_cols] = scaler.transform(x_train[numerical_cols])
x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

# The model is fed plain float32 numpy arrays rather than dataframes.
x_train, x_test = (frame.values.astype(np.float32) for frame in (x_train, x_test))
y_train, y_test = (labels.astype(np.float32) for labels in (y_train, y_test))

## DP Model Training 

In [None]:
# DP-SGD training hyperparameters. These are the single source of truth for
# the optimizer and the batching below.
learning_rate = 0.15
noise_multiplier = 1.1  # handed to the DP optimizer as noise_multiplier
l2_norm_clip = 1.0      # handed to the DP optimizer as l2_norm_clip
batch_size = 64
epochs = 10
num_microbatches = batch_size  # one example per microbatch

# Training pipeline: shuffle with a buffer as large as the training set, then
# emit fixed-size batches. drop_remainder=True keeps every training batch at
# exactly batch_size examples so it always matches num_microbatches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(x_train))
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

# Evaluation pipeline: keep the final partial batch. The DP optimizer is not
# applied during evaluation, so the fixed batch size is not needed here, and
# dropping the remainder would silently exclude up to batch_size - 1 test
# examples from the validation metrics reported by model.fit.
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.batch(batch_size)

In [None]:
# Small feed-forward regressor: one 64-unit ReLU hidden layer followed by a
# single-unit linear output.
layer_stack = [
    tf.keras.layers.Input(shape=(x_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
]
model = tf.keras.Sequential(layer_stack)

# Differentially private SGD optimizer, configured from the DP training
# parameters defined earlier in the notebook.
optimizer = DPKerasSGDOptimizer(
    learning_rate=learning_rate,
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=num_microbatches,
)

# Unreduced loss: the DP optimizer is given one loss value per example
# instead of a single batch average.
loss = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.NONE
)

model.compile(loss=loss, optimizer=optimizer, metrics=['mae', 'mse'])

In [None]:
# Fit the DP model on the training batches; the test batches are passed as
# validation data so validation metrics are reported after every epoch.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the test inputs and report the root-mean-squared error.
predictions = model.predict(x_test).flatten()

rmse = np.sqrt(np.square(predictions - y_test).mean())
print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 179.0622


In [None]:
# Coefficient of determination: R^2 = 1 - SS_res / SS_tot
ss_res = np.square(y_test - predictions).sum()    # residual sum of squares
ss_tot = np.square(y_test - y_test.mean()).sum()  # total sum of squares about the mean
r2 = 1 - ss_res / ss_tot

print(f"Test R^2: {r2:.4f}")

Test R^2: 0.5846


The DP model performs very similarly to the non-DP model.

In [None]:
# Compute the (epsilon, delta) privacy guarantee for the training run above.
#
# batch_size, noise_multiplier and epochs are deliberately NOT re-declared
# here: the accounting reuses the values from the DP training parameters cell,
# so the reported epsilon always corresponds to the configuration the model
# was actually trained with.
num_train_examples = len(x_train)
delta = 1 / num_train_examples  # target delta, set to the inverse of the training-set size

epsilon, _ = compute_dp_sgd_privacy(
    n=num_train_examples,
    batch_size=batch_size,
    noise_multiplier=noise_multiplier,
    epochs=epochs,
    delta=delta
)

print(f"DP epsilon after training: {epsilon:.3f} for delta={delta}")



DP epsilon after training: 0.784 for delta=4.164584374479427e-05


The DP epsilon is 0.784 (for delta ≈ 4.16e-05), indicating a moderate level of privacy protection for this model. There is still room to lower epsilon and strengthen the privacy guarantee, for example by increasing the noise multiplier or training for fewer epochs.