In [2]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np
import matplotlib.pyplot as plt

In [3]:
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
dataset = fetch_20newsgroups(subset='all', categories=None,
                             shuffle=True, random_state=15)

In [4]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

18846 documents
20 categories



In [5]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

In [6]:
vectorizer = TfidfVectorizer(max_df=0.4, max_features=10,
                                 min_df=3, stop_words='english',
                                 use_idf=True)


X = vectorizer.fit_transform(dataset.data)

print("n_samples: %d, n_features: %d" % X.shape)
print()

n_samples: 18846, n_features: 10



In [7]:
km =  KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=1,
                verbose=False)

print("Clustering sparse data with %s" % km)
km.fit(X)
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=200, n_clusters=20, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

Homogeneity: 0.034
Completeness: 0.035
V-measure: 0.034
Adjusted Rand-Index: 0.013
Silhouette Coefficient: 0.376

Cluster 0: know like just does university time think people don ax
Cluster 1: ax just university think time like does know don people
Cluster 2: university just does people think don like time know ax
Cluster 3: like just don know university time think people does ax
Cluster 4: does know university like don people time just think ax
Cluster 5: people like don just know think time does university ax
Cluster 6: time university just like think don does know people ax
Cluster 7: don just people think like time does know university ax
Cluster 8: time like know don just does think people university ax
Cluster 9: just think don university know time like people does ax
Cluster 10: think

In [8]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [19]:
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

two_components = lsa.fit_transform(X)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

plt.scatter(two_components[:,0], two_components[:,1], c=labels, alpha=0.7)
plt.show()

Explained variance of the SVD step: 26%
