In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.sparse import csr_matrix,vstack
from collections import Counter
from random import randrange
import numpy as np
import scipy

In [6]:
# K = no of cluster
# d = distance measurement - 'c' for cosine, 'e' for euclidean
def hard_kmeans(K, d, data, labels):
    max_iterations = 200
    centroids = []        # List of centroids
    cluster = {}          # {index of centroid: [mnist samples]}
    cluster_labels = {}   # {index of centroid: [mnist lables]}
    cluster_index = {}    # {index of centroid: [mnist indices]}
    iterations = 0
    N = data.shape[0]
    
    # Assume K centroids (by random)
    for i in range(K):
        centroids.append(data[randrange(0, N - 1)])

    print('K=%d' % (K))
    
    while(1):
        
        # Convert list of csr matrices to a 2d csr matrix
        centroids = vstack(centroids) if type(data).__name__ == 'csr_matrix' else centroids
        
        # Compute distance matrix
        if d == 'c':
            distance_matrix = cosine_distances(data, centroids)
        else:
            distance_matrix = euclidean_distances(data, centroids)

        # E step - compute memberships given centroids
        cluster = {}
        cluster_labels = {}
        cluster_index = {}
        for i in range(N):
            cluster.setdefault(np.argmin(distance_matrix[i]), []).append(data[i])
            cluster_labels.setdefault(np.argmin(distance_matrix[i]), []).append(labels[i])
            cluster_index.setdefault(np.argmin(distance_matrix[i]), []).append(i)

        # Store the current centroids before the M step
        prev_centroids = []
        for k in cluster:
            prev_centroids.append(centroids[k])

        # M step - compute centroids given memberships
        centroids = []
        for k in cluster:
            # np.mean(vstack(cluster[k]), axis=0) returns a number, csr_matrix() converts it to a csr matrix
            centroids.append(csr_matrix(np.mean(vstack(cluster[k]), axis=0))) if type(data).__name__ == 'csr_matrix' else centroids.append(np.mean(cluster[k], axis=0))

        iterations += 1
        
        # Termination conditions - on convergence, else after a fixed number of iterations 
        if np.array_equal(centroids, prev_centroids): #allclose , atol=1e-2
            print('Iteration', iterations,': CONVERGED!')
            break

        if iterations == max_iterations:
            print('Iteration', iterations,': max reached')
            break
            
    # Store list of labels as a Counter
    for key,value in cluster_labels.items():
        cluster_labels[key] = Counter(value)

    # Calculate purity
    purity = 0
    for cluster in cluster_labels:
        purity += max(cluster_labels[cluster].values())
    purity /= N
    
    # Calculate gini index
    gini_index = 0
    for key,value in cluster_labels.items():
        gini = 0
        for k,v in value.items():
            gini += (v / sum(cluster_labels[key].values())) ** 2
        gini_index += 1 - gini
    gini_index /= K
    
    # Final result
    print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4), '\n')

In [3]:
# Fetch data
ng_all = fetch_20newsgroups(subset='all')

# Data and labels
ng_data = ng_all.data

ng_labels = []
for i in range(len(ng_data)):
    ng_labels.append(ng_all.target_names[ng_all.target[i]])

print(len(ng_data))
print(len(ng_labels))

18846
18846


In [4]:
# Converting text to vectors
tfidf = TfidfVectorizer(stop_words='english')
vect_ng_all = tfidf.fit_transform(ng_all.data)
print(vect_ng_all.shape)

(18846, 173451)


In [None]:
k_list = [10, 20, 40]

print('-------------------------------------------------')
print('Using Euclidean distances ....')
print('-------------------------------------------------')
for k in k_list:
    hard_kmeans(k, d='e', data=vect_ng_all, labels=ng_labels)

print('-------------------------------------------------')
print('Using Cosine distances ....')
print('-------------------------------------------------')
for k in k_list:
    hard_kmeans(k, d='c', data=vect_ng_all, labels=ng_labels)

-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------
K=10




Iteration 200 : max reached
Purity - 0.2896 Gini Index - 0.6675 

K=20
