In [None]:
import numpy as np
import time
import csv

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from sklearn.datasets import fetch_olivetti_faces

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

# Datasets

## AT&T Face

In [None]:
# Load the Olivetti (AT&T) faces feature matrix into X.
# return_X_y=True makes the call return a (data, target) pair; the target
# labels are discarded because the experiment below is unsupervised.
X, _ = fetch_olivetti_faces(return_X_y=True)

## EMNIST

In [None]:
emnist = datasets.EMNIST(
    root = 'data',
    split = 'byclass',
    train = True,
    download = True,
    transform = transforms.ToTensor()
)

# Labels 10..35 of the 'byclass' split are used as the 26 classes
# (presumably the upper-case letters -- confirm against the EMNIST mapping).
class_labels = range(10, 36)
samples_per_class = 80

# Pick the first `samples_per_class` indices of every class directly from the
# label tensor, so only the images that are kept get transformed and collated
# instead of materialising the whole split in one giant batch.
selected_indices = torch.cat([
    torch.where(emnist.targets == label)[0][:samples_per_class]
    for label in class_labels
])
emnist_subset = torch.utils.data.Subset(emnist, selected_indices.tolist())

# One batch containing the whole subset; shuffle=False keeps the rows grouped
# by class in ascending label order.
dataloader = DataLoader(emnist_subset, batch_size=len(emnist_subset), shuffle=False)
X, y = next(iter(dataloader))   # y now holds the labels of the selected rows only

# Flatten each image to one feature vector and hand numpy to sklearn.
X = X.flatten(start_dim=1).numpy()

## Coil20

In [None]:
!mkdir data
%cd ./data
!wget http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip
!unzip coil-20-proc.zip
%cd ..

coil20 = datasets.ImageFolder(
    './data',
    transform = transforms.Compose([
        transforms.Grayscale(1),
        transforms.ToTensor()
    ])
)
dataloader = DataLoader(coil20, batch_size=1440, shuffle=False)
X, _ = next(iter(dataloader))
X = X.flatten(start_dim=1).numpy()

# Initializations

In [None]:
def centers_from_pca(X, k):
    """Derive k initial centers for X from a clustering of its PCA projection.

    X is projected onto k principal components, the projection is clustered
    with Lloyd's k-means (seeded by ``centers_for_kmeanspp``), and each
    resulting cluster is mapped back to the original space by averaging the
    rows of X assigned to it.

    Parameters
    ----------
    X : np.ndarray
        Data matrix, one sample per row.
    k : int
        Number of PCA components and number of clusters.

    Returns
    -------
    np.ndarray
        One row per cluster: the mean of the original samples in that cluster.
    """
    projected = PCA(n_components=k).fit_transform(X)
    seeds = centers_for_kmeanspp(projected, k)
    labels = KMeans(n_clusters=k, init=seeds, n_init=1, algorithm='lloyd').fit_predict(projected)

    def original_space_mean(label):
        # Average the *original* (un-projected) samples of this cluster.
        members = X[labels == label]
        return np.sum(members, axis=0) / members.shape[0]

    return np.array([original_space_mean(label) for label in range(k)])

In [None]:
def centers_for_kmeanspp(X, k):
    """Choose k initial centers from the rows of X with k-means++ seeding.

    The first center is a uniformly random row of X. Every further center is
    a row drawn with probability proportional to its squared Euclidean
    distance to the nearest center chosen so far.

    Parameters
    ----------
    X : np.ndarray
        Data, one sample per leading index.
    k : int
        Number of centers to draw.

    Returns
    -------
    np.ndarray
        The chosen rows of X stacked along axis 0.

    Raises
    ------
    ValueError
        If X has no rows (raised by ``np.random.randint``).
    """
    n_samples = X.shape[0]

    # First center: uniform draw over the samples.
    first_index = np.random.randint(n_samples)
    centers = [X[first_index]]

    # Work on a flattened float64 copy so distances are accumulated in double
    # precision regardless of the dtype or trailing shape of X.
    points = X.reshape(n_samples, -1).astype(np.float64)

    # Squared distance of every sample to its nearest chosen center, kept as
    # a running minimum instead of being recomputed against all centers.
    closest_sq_dist = np.sum((points - points[first_index]) ** 2, axis=1)

    for _ in range(1, k):
        total = closest_sq_dist.sum()
        if total == 0:
            # Every sample coincides with an already chosen center (duplicate
            # points): the weights would be 0/0, so draw uniformly instead.
            proba = None
        else:
            proba = closest_sq_dist / total

        index = np.random.choice(n_samples, p=proba)
        centers.append(X[index])

        # Fold the new center into the running minimum.
        np.minimum(
            closest_sq_dist,
            np.sum((points - points[index]) ** 2, axis=1),
            out=closest_sq_dist,
        )

    return np.array(centers)

# Experiment

In [None]:
# Select which dataset the run is for; this only determines the cluster count.
# NOTE(review): X is whatever the most recently executed dataset cell left
# behind -- make sure that cell matches dataset_name before running this one.
dataset_name = 'face'   # face | emnist | coil20

# Number of independent k-means runs.
trial = 1000
dataset_name_to_n_clusters = {'face': 40, 'emnist': 26, 'coil20': 20}
n_clusters = dataset_name_to_n_clusters[dataset_name]

# The timed region covers both the initialisation and the k-means fit of
# every trial.
s = time.time()

# init = 'random' | centers_for_kmeanspp(X, n_clusters) | centers_from_pca(X, n_clusters)
res = [
    KMeans(n_clusters, init=centers_from_pca(X, n_clusters), n_init=1, algorithm='lloyd').fit(X).inertia_
    for _ in range(trial)
]

total_time = time.time() - s

# Output layout: one CSV row with the inertia of every trial, followed by the
# total wall time wrapped in double brackets.
# newline='' is required for files handed to csv.writer; without it the
# writer's '\r\n' terminator is translated again on platforms whose native
# line ending is '\r\n'.
with open('res', 'w', newline='') as f:
    csv.writer(f).writerow(res)
    f.write('[[' + str(total_time) + ']]')