In [None]:
import numpy as np 
import matplotlib.pyplot as plt
from bucket import create_bucket_synopsis, bucket_using_privacy_accountant, Params
from evaluation_utils import kmeans_loss
from lloyd import lloyd_with_weights, dplloyd, PrivacyBudget
from grid import create_grid_synopsis
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Shared generator with a fixed seed (42); each experiment function below draws
# its base seed from it, so results depend on the order in which they are called.
master_rng = np.random.default_rng(42)

def lsh_experiment(algo: int, data: np.ndarray, p: Params, n_trials: int = 20):
    """Average the k-means loss of bucket-synopsis clustering over several trials.

    Each trial builds a synopsis with ``create_bucket_synopsis`` (seeded with
    ``s + x``), derives centers from it, and scores those centers against
    ``data`` with ``kmeans_loss``. Trials whose loss evaluation raises are
    reported and excluded from the average.

    Args:
        algo: Variant selector. NOTE(review): every value currently runs the
            same ``create_bucket_synopsis`` call; ``algo == 1`` only adds a
            progress print. ``bucket_using_privacy_accountant`` is imported by
            the file but never used -- confirm whether the non-1 variant was
            meant to call it.
        data: Points to summarise and to evaluate the loss on.
        p: Parameters forwarded to ``create_bucket_synopsis``; ``p.k`` is the
            number of centers requested.
        n_trials: Number of trials to run.

    Returns:
        The mean loss over the trials whose loss could be computed, or
        ``nan`` when no trial succeeded (including ``n_trials == 0``).
    """
    s = master_rng.integers(low=0, high=100000)
    total_loss = 0
    n_successful_trials = 0
    for x in range(n_trials):
        if algo == 1:
            print("starting synopsis... ")
        private_points, private_weights = create_bucket_synopsis(data, p, s + x)
        if private_points.shape[0] <= p.k:
            # Not more synopsis points than requested centers: use them as-is.
            centers = private_points
        else:
            centers = lloyd_with_weights(
                k=p.k, X=private_points, weights=private_weights, n_iter=5, rs=s + x
            )
        try:
            loss = kmeans_loss(centers, data)
        except Exception as exc:
            # Deliberately best-effort: a failed evaluation drops the trial, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Trial {x+1} failed: {exc!r}")
        else:
            total_loss += loss
            n_successful_trials += 1
        print(f"Trial {x+1} done")
    print("Number completed trials: ", n_successful_trials)
    if n_successful_trials == 0:
        # Mean of zero samples is undefined; avoid a ZeroDivisionError.
        return float("nan")
    return total_loss / n_successful_trials

# one function to apply grid synopsis and then non-private kmeans
def cluster_grid(data: np.ndarray, k: int, e:float, M:int, seed:int) -> np.ndarray:
    """Build a grid synopsis of ``data`` and cluster it with weighted Lloyd.

    The synopsis returned by ``create_grid_synopsis`` is split column-wise:
    every column but the last is passed as the points, and the last column is
    passed as their weights. Lloyd is run for 10 iterations with ``seed``.

    Args:
        data: Input array; ``data.shape[1]`` is forwarded to the synopsis builder.
        k: Number of centers to request.
        e: Forwarded to ``create_grid_synopsis`` (presumably the privacy
            budget epsilon -- confirm against that function).
        M: Forwarded to ``create_grid_synopsis``.
        seed: Seed used for both the synopsis and the clustering.

    Returns:
        Whatever ``lloyd_with_weights`` returns for the synopsis.
    """
    n_columns = data.shape[1]
    synopsis = create_grid_synopsis(data, e, n_columns, M, seed)
    points, weights = synopsis[:, :-1], synopsis[:, -1]
    return lloyd_with_weights(k=k, X=points, weights=weights, n_iter=10, rs=seed)

def grid_experiment(data: np.ndarray, k: int, e:float, M:int, n_trials: int) -> float:
    """Return the mean k-means loss of ``cluster_grid`` over ``n_trials`` runs.

    A base seed is drawn once from ``master_rng``; trial ``i`` (starting at 0)
    runs ``cluster_grid`` with seed ``base + i`` and is scored against ``data``
    with ``kmeans_loss``.
    """
    base_seed = master_rng.integers(low=0, high=100000)
    trial_losses = (
        kmeans_loss(cluster_grid(data, k, e, M, base_seed + trial), data)
        for trial in range(n_trials)
    )
    return sum(trial_losses) / n_trials

def dplloyd_experiment(X, k, epsilon, method, iterations, trials=20, output=False):
    """Average the k-means loss of ``dplloyd`` over repeated seeded trials.

    Args:
        X: Data passed to ``dplloyd`` and used to evaluate the loss.
        k: Number of centers to request.
        epsilon: Passed to ``PrivacyBudget`` as ``epsilon``.
        method: Passed to ``PrivacyBudget`` as ``method``.
        iterations: Used both as ``PrivacyBudget.total_iter`` and as the
            ``n_iter`` argument of ``dplloyd``.
        trials: Number of randomised trials to run (previously ignored in
            favour of a hard-coded 20).
        output: When true, print the base seed and the average loss.

    Returns:
        ``np.mean`` of the per-trial losses.
    """
    p = PrivacyBudget(epsilon=epsilon, method=method, total_iter=iterations)
    # One base seed per experiment; trial i uses base_seed + i.
    base_seed = master_rng.integers(low=0, high=100000)
    # Separate name so the ``trials`` count is not overwritten by the results.
    trial_centers = [
        dplloyd(k=k, X=X, n_iter=iterations, priv=p, seed=base_seed + x)
        for x in range(trials)
    ]
    losses = [kmeans_loss(centers, X) for centers in trial_centers]
    avg_loss = np.mean(losses)
    if output:
        print(f"base seed={base_seed}, average loss={avg_loss}")
    return avg_loss

def non_private_radius(x: np.ndarray) -> float:
    """Return the largest Euclidean norm taken along the last axis of ``x``."""
    norms = np.linalg.norm(x, axis=-1)
    return norms.max()

# Use matplotlib's 'seaborn-v0_8' style sheet for the plots in this notebook.
plt.style.use('seaborn-v0_8') 

For each dataset, plot the k-means loss of each algorithm over a range of values of k, using the best parameters found in the previous analysis.

- Blue: DPLloyd
- Orange: Grid
- Green: LSH 

## Plot 1: Small Synthetic (k=1,2,3,4,5)

## Plot 2: Airports (k=3,4,5,6,7,8,9,10)

## Plot 3: Iris (k=2,3,4,5,6)

## Plot 4: Large Synthetic (k=7,8,9,10,11,12,13)

## Plot 5: Concrete (k=3,4,5,6,7,8)

## Plot 6: Forest (k=1,2,3,4,5)

