In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from evaluation_utils import kmeans_loss

def non_private_scores(dataset, num_trials=20, max_k=10, random_state=None):
    """Print the non-private k-means loss of ``dataset`` for k = 1..max_k.

    For every k a scikit-learn ``KMeans`` model is fitted and the resulting
    centres are scored with ``kmeans_loss`` (imported from
    ``evaluation_utils``; its exact definition is not visible here).

    Parameters
    ----------
    dataset : array-like
        Data passed unchanged to ``KMeans.fit`` and to ``kmeans_loss``.
    num_trials : int, default 20
        Forwarded to ``KMeans`` as ``n_init``.
    max_k : int, default 10
        Largest number of clusters to evaluate (inclusive). The default
        reproduces the original fixed sweep over k = 1..10.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans``. Pass an integer for reproducible losses;
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    None
        Results are only printed, one ``k=..., loss=...`` line per k.
    """
    for k in range(1, max_k + 1):
        kmeans = KMeans(
            n_clusters=k,
            n_init=num_trials,
            random_state=random_state,
        ).fit(dataset)
        loss = kmeans_loss(kmeans.cluster_centers_, dataset)
        print(f"k={k}, loss={loss}")

In [2]:
# Generate the small synthetic Gaussian dataset:
# 2 dimensions, 3 centers, 100 points per center (300 points in total).
# NOTE(review): this header previously read "dimension 3, centers 4, size 400",
# which does not match the code below — confirm which one was intended.

generator = np.random.default_rng(42)

# One row per cluster centre, drawn uniformly from [0, 10).
centers = generator.uniform(low=0, high=10, size=(3, 2))
# np.concatenate instead of np.concat: the latter only exists in NumPy >= 2.0.
dataset = np.concatenate(
    [generator.multivariate_normal(mean=c, cov=np.identity(2), size=100) for c in centers],
    axis=0,
)

# Rescale with MinMaxScaler to the (-1, 1) feature range before saving.
scaler = MinMaxScaler((-1,1))
normalised_dataset = scaler.fit_transform(dataset)

np.save("datasets/synthetic-gaussian.npy", normalised_dataset)

In [8]:
# Generate the high-dimensional synthetic Gaussian dataset:
# 100 dimensions, 4 centers, 25,000 points per center (100,000 in total).

generator = np.random.default_rng(42)

# One row per cluster centre, drawn uniformly from [0, 10).
centers = generator.uniform(low=0, high=10, size=(4, 100))
# np.concatenate instead of np.concat: the latter only exists in NumPy >= 2.0.
dataset = np.concatenate(
    [generator.multivariate_normal(mean=c, cov=np.identity(100), size=25000) for c in centers],
    axis=0,
)

# Rescale with MinMaxScaler to the (-1, 1) feature range before saving.
scaler = MinMaxScaler((-1,1))
normalised_dataset = scaler.fit_transform(dataset)

np.save("datasets/large-synthetic.npy", normalised_dataset)

In [9]:
# Build the airports dataset: keep the X/Y columns of large and medium
# airports, drop rows with missing values, rescale and save.
airports_table = pd.read_csv("datasets/World_Airports.csv")
is_large_or_medium = airports_table["type"].isin(["large_airport", "medium_airport"])
airports = (
    airports_table.loc[is_large_or_medium, ["X", "Y"]]
    .dropna()
    .to_numpy()
)

# Rescale with MinMaxScaler to the (-1, 1) feature range.
scaler = MinMaxScaler((-1, 1))
normalised_dataset = scaler.fit_transform(airports)

np.save("datasets/airports.npy", normalised_dataset)

In [13]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI repository dataset id 165 (bound to the name
# concrete_compressive_strength, as before).
concrete_compressive_strength = fetch_ucirepo(id=165)

# Features and targets (pandas dataframes); only the features are saved.
X, y = (
    concrete_compressive_strength.data.features,
    concrete_compressive_strength.data.targets,
)

np.save("datasets/concrete.npy", X.to_numpy())


In [17]:
from sklearn.datasets import load_iris

# Save the feature array of scikit-learn's bundled iris dataset as-is.
iris_bunch = load_iris()
n = iris_bunch.data
np.save("datasets/iris.npy", n)


In [18]:
# Reload every saved dataset from disk so this cell does not depend on
# variables left over from the generation cells.
synthetic_gaussian = np.load("datasets/synthetic-gaussian.npy")
large_gaussian = np.load("datasets/large-synthetic.npy")
airports = np.load("datasets/airports.npy")
iris = np.load("datasets/iris.npy")
concrete = np.load("datasets/concrete.npy")

# NOTE(review): the saved output of this cell shows the same losses for
# several different datasets — re-run on a fresh kernel and check
# kmeans_loss before trusting those numbers.
labelled_datasets = (
    ("\tSynthetic Gaussian", synthetic_gaussian),
    ("\tLarge Synthetic Gaussian", large_gaussian),
    ("\tAirports", airports),
    ("\tIris", iris),
    ("\tConcrete", concrete),
)

# Print a heading followed by the k = 1..10 loss table for each dataset.
for heading, points in labelled_datasets:
    print(heading)
    non_private_scores(points)


	Synthetic Gaussian
k=1, loss=4.5424706666666665
k=2, loss=1.0156530117357194
k=3, loss=0.5256762761743068
k=4, loss=0.38152315476190474
k=5, loss=0.3096412136752137
k=6, loss=0.26026658164058164
k=7, loss=0.2286548644338118
k=8, loss=0.19992629300524045
k=9, loss=0.18525829763803456
k=10, loss=0.1743984350290233
	Large Synthetic Gaussian
k=1, loss=4.5424706666666665
k=2, loss=1.0156530117357194
k=3, loss=0.5256762761743068
k=4, loss=0.38152315476190474
k=5, loss=0.3098148677248677
k=6, loss=0.26026658164058164
k=7, loss=0.22939339826839833
k=8, loss=0.20043062049062055
k=9, loss=0.18547551986025676
k=10, loss=0.17308632506244298
	Airports
k=1, loss=4.5424706666666665
k=2, loss=1.0156530117357194
k=3, loss=0.5256762761743068
k=4, loss=0.38152315476190474
k=5, loss=0.3096412136752137
k=6, loss=0.2604402356902357
k=7, loss=0.2286548644338118
k=8, loss=0.20088293703076318
k=9, loss=0.18683317875107353
k=10, loss=0.17426051107818913
	Iris
k=1, loss=4.5424706666666665
k=2, loss=1.0156530117