Source: http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

In [3]:
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s% %(levelname)s% %(message)s%')

In [4]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]

In [5]:
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

In [13]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()
labels = dataset.target
true_k = np.unique(labels).shape[0]

3387 documents
4 categories



In [14]:
vectorizer = TfidfVectorizer(max_df = 0.5, max_features=100, min_df = 2, stop_words='english', use_idf = True)

In [15]:
X = vectorizer.fit_transform(dataset.data)

In [16]:
svd = TruncatedSVD(10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

In [21]:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)
km.fit(X)

Initialization complete
Iteration  0, inertia 2028.114
Iteration  1, inertia 1470.536
Iteration  2, inertia 1402.788
Iteration  3, inertia 1390.444
Iteration  4, inertia 1388.455
Iteration  5, inertia 1386.383
Iteration  6, inertia 1383.604
Iteration  7, inertia 1381.756
Iteration  8, inertia 1380.672
Iteration  9, inertia 1379.920
Iteration 10, inertia 1379.563
Iteration 11, inertia 1379.532
Converged at iteration 11


KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=4, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)

In [22]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.296
Completeness: 0.297
V-measure: 0.297
Adjusted Rand-Index: 0.294
Silhouette Coefficient: 0.197


In [23]:
print("Top terms per cluster:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
        print()

Top terms per cluster:
Cluster 0: com
 article
 sgi
 posting
 don
 nntp
 host
 like
 know
 just
Cluster 1: god
 people
 don
 think
 just
 say
 article
 know
 like
 does
Cluster 2: graphics
 university
 posting
 host
 nntp
 computer
 image
 ca
 know
 cs
Cluster 3: space
 nasa
 gov
 access
 like
 net
 just
 article
 posting
 host
