<h3>1. Import dependencies</h3>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import time

from utils.scores import purity_score as purity

from src import KMeans, MiniBatchKMeans, KMeansDataset

from sklearn.feature_extraction.text import TfidfVectorizer

<h3>2. Import data</h3>

In [2]:
# NOTE(review): X and Y are never assigned in the cells visible here -- the
# only loader call is commented out below and `load_yale` is not among the
# notebook's imports, so this cell raises NameError on a fresh kernel
# (Restart & Run All). Restore the loader and its import before relying on
# this notebook.
#X, Y = load_yale()

# Convert to tensors only when needed so the cell can be re-run safely:
# torch.from_numpy() raises TypeError when it is handed a Tensor.
if not isinstance(X, torch.Tensor):
    X = torch.from_numpy(X)
if not isinstance(Y, torch.Tensor):
    Y = torch.from_numpy(Y)

# Move both tensors to the GPU when one is available.
if torch.cuda.is_available():
    X = X.cuda()
    Y = Y.cuda()

In [3]:
# Run configuration shared by the dataset and clustering cells.
similarity_based = True  # forwarded to KMeansDataset and MiniBatchKMeans
batch_size = 16          # mini-batch size handed to the DataLoader
n_clusters = 15          # number of clusters requested from MiniBatchKMeans

# Seed the global RNGs so the shuffled DataLoader yields the same batch order
# on every Restart & Run All.
# NOTE(review): assumes the project's k-means initialisation draws from the
# torch or NumPy global RNG -- confirm in src.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [4]:
# Wrap the tensor X in the project's dataset, then serve it in shuffled
# mini-batches loaded in the main process (num_workers=0).
dataset = KMeansDataset(
    X,
    similarity_based=similarity_based,
)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=0,
)

<h3>3. Mini-Batch K-Means</h3>

In [5]:
# Fit mini-batch k-means with random initialisation, compute labels for X and
# score them against Y.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
km = MiniBatchKMeans(
    n_clusters=n_clusters,
    n_init=1,
    init='random',
    similarity_based=similarity_based,
)
km.fit(dataloader)
labels = km.transform_tensor(X)
# The timed span covers fitting plus labelling; scoring is excluded.
# NOTE(review): if this work runs on CUDA, kernels may still be in flight when
# the timer stops -- consider synchronising the device first.
km_time = time.perf_counter() - start
acc = purity(Y, labels) * 100  # purity score expressed as a percentage

# "{:.3}" means three significant digits and renders e.g. 1234.5 as
# "1.23e+03"; a fixed-point spec keeps the duration readable at any scale.
print("Clustering finished in {:.2f} seconds.".format(km_time))
print("[Supervised   Performance] Test Accuracy: {:.2f} %".format(acc))

Clustering finished in 2.15 seconds.
[Supervised   Performance] Test Accuracy: 73.33 %


<h3>4. Mini-Batch K-Means++</h3>

In [6]:
# Fit mini-batch k-means with k-means++ initialisation, compute labels for X
# and score them against Y.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
km = MiniBatchKMeans(
    n_clusters=n_clusters,
    n_init=1,
    init='k-means++',
    similarity_based=similarity_based,
)
km.fit(dataloader)
labels = km.transform_tensor(X)
# The timed span covers fitting plus labelling; scoring is excluded.
# NOTE(review): if this work runs on CUDA, kernels may still be in flight when
# the timer stops -- consider synchronising the device first.
km_time = time.perf_counter() - start
acc = purity(Y, labels) * 100  # purity score expressed as a percentage

# "{:.3}" means three significant digits and renders e.g. 1234.5 as
# "1.23e+03"; a fixed-point spec keeps the duration readable at any scale.
print("Clustering finished in {:.2f} seconds.".format(km_time))
print("[Supervised   Performance] Test Accuracy: {:.2f} %".format(acc))

Clustering finished in 2.14 seconds.
[Supervised   Performance] Test Accuracy: 86.67 %
