In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import TruncatedSVD
import matplotlib
import matplotlib.pyplot as plot

In [2]:
newsgroups = fetch_20newsgroups(subset='all')
print('20 news groups contain followong topics:\n')
pprint(list(newsgroups.target_names))

20 news groups contain followong topics:

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [3]:
print ('Number of documents: {}'.format(len(newsgroups.data)))
print ('Number of categories: {}'.format(len(newsgroups.target_names)))

Number of documents: 18846
Number of categories: 20


## Clustering

In [4]:
# compute tf-idf for 20_news_groups
tf_idf = TfidfVectorizer(stop_words="english").fit_transform(newsgroups.data)

In [5]:
# clustering using kmeans (20 clusters)
kmeans_20 = KMeans(n_clusters=len(newsgroups.target_names), n_init=1).fit(tf_idf)

# clustering using kmeans (20 clusters)
kmeans_5 = KMeans(n_clusters=len(newsgroups.target_names), n_init=1).fit(tf_idf)

In [6]:
# dimensionality reduction from 20 to 2 
newsgroups_2d = TruncatedSVD(n_components = 2).fit_transform(tf_idf)

In [7]:
print('Clustering: {0} '.format(kmeans_20.labels_))
print('Number of classes: {0} '.format(len(kmeans_20.labels_)))

Clustering: [15 10  8 ...,  6  6 12] 
Number of classes: 18846 


## Display clustering of 20_news_groups

In [8]:
# display 20 clusters
plot.figure(figsize = (20, 20))
plot.scatter(newsgroups_2d[:,0], newsgroups_2d[:,1], c=kmeans_20.labels_, alpha=0.6)
plot.show()

In [10]:
# display 5 clusters
plot.figure(figsize = (20, 20))
plot.scatter(newsgroups_2d[:,0], newsgroups_2d[:,1], c=kmeans_5.labels_, alpha=0.6)
plot.show()