In [1]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np
import matplotlib.pyplot as plt

In [2]:
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
dataset = fetch_20newsgroups(subset='all', categories=None,
                             shuffle=True, random_state=15)

In [3]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

18846 documents
20 categories



In [4]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

In [5]:
vectorizer = TfidfVectorizer(max_df=0.4, max_features=10,
                                 min_df=3, stop_words='english',
                                 use_idf=True)


X = vectorizer.fit_transform(dataset.data)

print("n_samples: %d, n_features: %d" % X.shape)
print()

n_samples: 18846, n_features: 10



In [6]:
km =  KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=1,
                verbose=False)

print("Clustering sparse data with %s" % km)
km.fit(X)
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=200, n_clusters=20, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

Homogeneity: 0.034
Completeness: 0.035
V-measure: 0.034
Adjusted Rand-Index: 0.012
Silhouette Coefficient: 0.397

Cluster 0: think like don just know university does people time ax
Cluster 1: don know university just time people think like does ax
Cluster 2: ax just university think time like does know don people
Cluster 3: does university just like don people time think know ax
Cluster 4: university just does think time don people like know ax
Cluster 5: think don just know university people time does like ax
Cluster 6: people university like don think know does just time ax
Cluster 7: people just don think know like university does time ax
Cluster 8: does know university like don just time think people ax
Cluster 9: just like university time does know don people think ax
Cluster 10: don u

In [7]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

Y = lsa.fit_transform(X)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

plt.plot(Y)
plt.show()
print()

Explained variance of the SVD step: 26%

