<h3>1. Import dependencies</h3>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score
from utils.scores import purity_score as purity

from src import KMeans, MiniBatchKMeans, KMeansDataset

from sklearn.feature_extraction.text import TfidfVectorizer

<h3>2. Import data</h3>

In [2]:
# Seed the global NumPy and PyTorch generators so the shuffle below (and hence
# the train/test split) is identical on every "Restart Kernel -> Run All".
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the corpus; the code below reads its 'content' and 'label' columns.
df = pd.read_csv('bbc.csv')

# TF-IDF features as a plain dense ndarray: `toarray()` avoids the legacy
# `np.matrix` type that `todense()` produces, while holding the same values.
vectorizer = TfidfVectorizer()
ds = vectorizer.fit_transform(df['content']).toarray()
classes = df['label'].to_numpy()

# Apply one shared permutation to the rows and to the integer-encoded labels
# so features and targets stay aligned after shuffling.
idx = np.random.permutation(len(classes))
ds = ds[idx, :]
labels = LabelEncoder().fit_transform(np.asarray(classes))[idx]
n_train = int(len(labels) * 0.7)  # first 70 % of the shuffled rows -> train

print(ds.shape)

# Split into train / test tensors (from_numpy shares memory with `ds` / `labels`).
X_train = torch.from_numpy(ds[:n_train, :])
y_train = torch.from_numpy(labels[:n_train])
X_test = torch.from_numpy(ds[n_train:, :])
y_test = torch.from_numpy(labels[n_train:])

# Move everything to the GPU when one is present.
if torch.cuda.is_available():
    X_train = X_train.cuda()
    y_train = y_train.cuda()
    X_test = X_test.cuda()
    y_test = y_test.cuda()

(2127, 29422)


In [3]:
# Settings shared by the dataset, K-Means and Mini-Batch K-Means cells below.
n_clusters = 5            # passed as `n_clusters` to KMeans and MiniBatchKMeans
batch_size = 64           # passed as `batch_size` to the DataLoader
similarity_based = False  # forwarded unchanged to KMeansDataset, KMeans and MiniBatchKMeans

In [4]:
# Wrap the training tensor in the project's dataset class, then serve it in
# shuffled mini-batches loaded in the main process (num_workers=0).
dataset = KMeansDataset(X_train, similarity_based=similarity_based)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    num_workers=0,
    batch_size=batch_size,
)

<h3>3. K-Means</h3>

In [5]:
# Fit full-batch K-Means on the training split, run `transform` on the test
# split, and report the elapsed time together with the purity score.

# CUDA work is queued asynchronously: drain it before reading the clock so the
# measurement covers exactly the fit + transform below.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.perf_counter()  # monotonic clock, meant for measuring intervals

km = KMeans(n_clusters=n_clusters, n_init=1, init='random', similarity_based=similarity_based)
km.fit(X_train)
# NOTE(review): this rebinds `labels`, which the data-loading cell used for the
# encoded ground truth; from here on it holds the output of `km.transform`
# (presumably per-sample cluster assignments -- confirm against src.KMeans).
labels = km.transform(X_test)

if torch.cuda.is_available():
    torch.cuda.synchronize()
km_time = time.perf_counter() - start
acc = purity(y_test, labels) * 100

# Fixed-point format: the previous "{:.3}" falls back to scientific notation
# (e.g. "1.23e+02") as soon as the run takes 100 seconds or more.
print("Clustering finished in {:.2f} seconds.".format(km_time))
print("[Supervised   Performance] Test Accuracy: {:.2f} %".format(acc))

Clustering finished in 4.41 seconds.
[Supervised   Performance] Test Accuracy: 65.73 %


<h3>4. Mini-Batch K-Means</h3>

In [6]:
# Fit Mini-Batch K-Means from the shuffled DataLoader, run `transform_tensor`
# on the test split, and report the elapsed time together with the purity score.

# CUDA work is queued asynchronously: drain it before reading the clock so the
# measurement covers exactly the fit + transform below.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.perf_counter()  # monotonic clock, meant for measuring intervals

km = MiniBatchKMeans(n_clusters=n_clusters, n_init=1, init='random', similarity_based=similarity_based)
km.fit(dataloader)
# NOTE(review): this rebinds `labels`, which the data-loading cell used for the
# encoded ground truth; from here on it holds the output of
# `km.transform_tensor` (presumably per-sample cluster assignments -- confirm
# against src.MiniBatchKMeans).
labels = km.transform_tensor(X_test)

if torch.cuda.is_available():
    torch.cuda.synchronize()
km_time = time.perf_counter() - start
acc = purity(y_test, labels) * 100

# Fixed-point format: the previous "{:.3}" falls back to scientific notation
# (e.g. "1.23e+02") as soon as the run takes 100 seconds or more.
print("Clustering finished in {:.2f} seconds.".format(km_time))
print("[Supervised   Performance] Test Accuracy: {:.2f} %".format(acc))

Clustering finished in 45.6 seconds.
[Supervised   Performance] Test Accuracy: 84.66 %
