variables: k, epsilon, n_iter, privacy budget

- epsilon = 0.5,1,2,5
- k depends on dataset, can try around 10 values close to true number of groups. 
- n_iter = 3,4,5,6,7,8 maybe? 

Privacy Budget Choices
- Uniform
- Dichotomy
- Series Sum

Most basic analysis required: for epsilon=1 vary the number of centers, number of iterations and try different privacy budget allocation

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from lloyd import PrivacyBudget, dplloyd
from evaluation_utils import kmeans_loss

master_rng = np.random.default_rng(42)

## Small Synthetic Gaussian 

In [16]:
data = np.load("datasets/synthetic-gaussian.npy")

# lets vary number of iterations for e=1, k=4 and series sum privacy budget

for total_iterations in range(3,9):
    p = PrivacyBudget(epsilon=1, method="series sum", total_iter=total_iterations)
    # do 20 randomised trials
    base_seed = master_rng.integers(low=0, high=100000)
    trials = [dplloyd(k=4, X=data, n_iter=total_iterations, priv=p, seed=base_seed + x) for x in range(20)]
    losses = [kmeans_loss(centers, data) for centers in trials]
    print(f"base seed={base_seed}, iterations={total_iterations}, average loss={np.mean(losses)}")

base seed=85859, iterations=3, average loss=1.0673327972781075
base seed=8594, iterations=4, average loss=1.0707795605208348
base seed=69736, iterations=5, average loss=1.3624696245680301
base seed=20146, iterations=6, average loss=2.5278038733081623
base seed=9417, iterations=7, average loss=0.9026956926190959
base seed=52647, iterations=8, average loss=0.8145412443248888


In [None]:
# dichotomy privacy budget

for total_iterations in range(3,9):
    p = PrivacyBudget(epsilon=1, method="dichotomy", total_iter=total_iterations)
    # do 20 randomised trials
    base_seed = master_rng.integers(low=0, high=100000)
    trials = [dplloyd(k=4, X=data, n_iter=total_iterations, priv=p, seed=base_seed + x) for x in range(20)]
    losses = [kmeans_loss(centers, data) for centers in trials]
    print(f"iterations={total_iterations}, average loss={np.mean(losses)}")

iterations=3, average loss=0.5572632777117139
iterations=4, average loss=0.6896677174703065
iterations=5, average loss=0.8319244506295668
iterations=6, average loss=0.9777989294706126
iterations=7, average loss=0.905163419383317
iterations=8, average loss=0.8882143467764981


In [None]:
# uniform privacy budget

for total_iterations in range(3,9):
    p = PrivacyBudget(epsilon=1, method="uniform", total_iter=total_iterations)
    # do 20 randomised trials
    base_seed = master_rng.integers(low=0, high=100000)
    trials = [dplloyd(k=4, X=data, n_iter=total_iterations, priv=p, seed=base_seed + x) for x in range(20)]
    losses = [kmeans_loss(centers, data) for centers in trials]
    print(f"iterations={total_iterations}, average loss={np.mean(losses)}")

iterations=3, average loss=0.5860102970092427
iterations=4, average loss=0.7786956049666095
iterations=5, average loss=1.0364941781415793
iterations=6, average loss=1.4731987195655283
iterations=7, average loss=2.1135102417859475
iterations=8, average loss=3.127165028611941


### Based on these initial results, dichotomy with iterations between 3 and 5 may be a good choice. 

In [None]:
# Varying K 

for k in range(2, 9):
    p = PrivacyBudget(epsilon=1, method="uniform", total_iter=4)

    base_seed = master_rng.integers(low=0, high=100000)
    trials = [dplloyd(k=k, X=data, n_iter=total_iterations, priv=p, seed=base_seed + x) for x in range(20)]
    losses = [kmeans_loss(centers, data) for centers in trials]

    print(f"k={k}, average loss={np.mean(losses)}")



[np.float64(1.1298866177907585), np.float64(0.7685154008052362), np.float64(4.6510554920305545), np.float64(3.0756037053082363), np.float64(13.42416340469369), np.float64(4.752333094577605), np.float64(3.313769231176553), np.float64(0.6922898224832911), np.float64(1.8675141486695674), np.float64(1.4466064411941704), np.float64(2.3200590781497237), np.float64(1.639717702761888), np.float64(0.6991628555203583), np.float64(0.6755882807119039), np.float64(1.2199034870976921), np.float64(0.8430810394170778), np.float64(1.9394909288172366), np.float64(2.6171562786580083), np.float64(1.2347016505819888), np.float64(2.4543488128114874)]
k=2, average loss=2.5382473736628515
[np.float64(1.1298866177907585), np.float64(0.7685154008052362), np.float64(4.6510554920305545), np.float64(3.0756037053082363), np.float64(13.42416340469369), np.float64(4.752333094577605), np.float64(3.313769231176553), np.float64(0.6922898224832911), np.float64(1.8675141486695674), np.float64(1.4466064411941704), np.float

KeyboardInterrupt: 