In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics

In [2]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]    #The subset of whole dataset

In [3]:
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

In [4]:
labels = dataset.target   #Label against each dataset

In [5]:
print(labels)

[0 1 1 ... 2 1 1]


In [6]:
print(len(dataset.data))
print( len(dataset.target_names))

3387
4


In [7]:
true_k= 5     #No. of cluster you want

In [9]:
vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=2,
                             stop_words='english')

In [10]:
X = vectorizer.fit_transform(dataset.data)

In [11]:
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1)

km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=20, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [12]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]   #centroids sorted position

In [13]:
terms = vectorizer.get_feature_names()

In [15]:
for i in range(true_k):
    print("cluster %d:" % i)
    for ind in order_centroids[i,:20]:
        print('%s' % terms[ind])
    print()

cluster 0:
usc
dean
kaflowitz
caspian
yeh
cbnewsj
convenient
tammy
cb
att
com
abortion
healy
decay
taoism
zyeh
zhenghao
hell
distribution
na

cluster 1:
com
graphics
university
image
posting
host
nntp
thanks
computer
know
ca
3d
program
like
software
file
files
mail
cs
help

cluster 2:
henry
toronto
zoo
spencer
zoology
utzoo
work
svr3
resembles
svr4
dunn
collision
sunos
dick
kipling
umd
high
speed
eng
orbit

cluster 3:
sandvik
kent
apple
newton
com
alink
ksand
cookamunga
tourist
bureau
private
activities
cheers
wrote
jesus
net
god
royalroads
article
christian

cluster 4:
okcforum
ico
vice
tek
bobbe
beauchaine
osrhe
com
conner
bronx
bil
queens
manhattan
sank
robert
blew
bob
sea
stay
beaverton



In [16]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.486
Completeness: 0.289
V-measure: 0.362
Silhouette Coefficient: 0.016
