In [18]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from evaluation_utils import kmeans_loss

In [9]:
# Generate the synthetic Gaussian dataset:
# 3 dimensions, 4 cluster centres, 125 points per centre (500 points in total).

generator = np.random.default_rng(42)  # fixed seed so the dataset is reproducible

# Cluster centres are drawn uniformly from [0, 10) in each of the 3 dimensions.
centers = generator.uniform(low=0, high=10, size=(4, 3))

# One unit-covariance Gaussian blob per centre, stacked row-wise.
# np.concatenate is used rather than np.concat: the latter alias only exists
# in NumPy >= 2.0 and raises AttributeError on older versions.
dataset = np.concatenate(
    [generator.multivariate_normal(mean=c, cov=np.identity(3), size=125) for c in centers],
    axis=0,
)

# Rescale every feature to the range [-1, 1] before persisting.
scaler = MinMaxScaler((-1, 1))
normalised_dataset = scaler.fit_transform(dataset)

np.save("synthetic-gaussian.npy", normalised_dataset)

In [16]:
synthetic_gaussian = np.load("synthetic-gaussian.npy")

# Vary the target number of clusters (k = 1..10) and print the value returned
# by kmeans_loss for the fitted centres.
# random_state is fixed because KMeans initialises its centres randomly;
# without it the printed losses change from one run to the next.

for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(synthetic_gaussian)
    loss = kmeans_loss(kmeans.cluster_centers_, synthetic_gaussian)
    print(f"k={k}, loss={loss}")

k=1, loss=0.6279472678265365
k=2, loss=0.22728119707756012
k=3, loss=0.14904959626374673
k=4, loss=0.09404854544826846
k=5, loss=0.08623461851096122
k=6, loss=0.07606246804934169
k=7, loss=0.07145971910410755
k=8, loss=0.06600693803712546
k=9, loss=0.06066679564999772
k=10, loss=0.05669602332745931


In [25]:
# Read the airports table, keep only rows whose "type" is large_airport or
# medium_airport, take the X/Y columns, drop rows with missing values and
# convert to a NumPy array.
# NOTE(review): X and Y are presumably coordinates - confirm against the CSV schema.
airports = (
    pd.read_csv("datasets/World_Airports.csv")
    .loc[lambda table: table["type"].isin(["large_airport", "medium_airport"]), ["X", "Y"]]
    .dropna()
    .to_numpy()
)

# Rescale both columns to the range [-1, 1] and persist the result.
scaler = MinMaxScaler(feature_range=(-1, 1))
normalised_dataset = scaler.fit_transform(airports)

np.save("datasets/airports.npy", normalised_dataset)

In [26]:
airports = np.load("datasets/airports.npy")

# Fit KMeans for k = 1..10 and print the value returned by kmeans_loss for the
# fitted centres.
# random_state is fixed because KMeans initialises its centres randomly;
# without it the printed losses change from one run to the next.
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(airports)
    loss = kmeans_loss(kmeans.cluster_centers_, airports)
    print(f"k={k}, loss={loss}")

k=1, loss=0.2809400945640919
k=2, loss=0.11174172858736868
k=3, loss=0.0650399484829071
k=4, loss=0.050561565371525515
k=5, loss=0.03994199410853103
k=6, loss=0.03050875532795339
k=7, loss=0.026943350656562143
k=8, loss=0.024860611634793053
k=9, loss=0.020575179769925827
k=10, loss=0.01973840819033864
