In [1]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

In [2]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']


In [3]:
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

In [4]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

3387 documents
4 categories



In [5]:
print (type(dataset.data))

<type 'list'>


In [6]:
#dataset.data[0]

In [7]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

In [8]:
dataset.target[3]

3

In [9]:
type(labels)

numpy.ndarray

In [10]:
len(labels)

3387

In [11]:
true_k

4L

In [12]:
vectorizer = TfidfVectorizer(max_df=0.6, min_df=0.05, stop_words='english')

In [13]:
x = vectorizer.fit_transform(dataset.data)

In [14]:
type(x)

scipy.sparse.csr.csr_matrix

In [15]:
x

<3387x268 sparse matrix of type '<type 'numpy.float64'>'
	with 90646 stored elements in Compressed Sparse Row format>

In [16]:
x[0][0]

<1x268 sparse matrix of type '<type 'numpy.float64'>'
	with 29 stored elements in Compressed Sparse Row format>

In [17]:
x.shape

(3387, 268)

In [18]:
km = KMeans(n_clusters=5, init='k-means++', max_iter=100)

In [19]:
km.fit(x)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [20]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(x, km.labels_, sample_size=1000))

Homogeneity: 0.317
Completeness: 0.315
V-measure: 0.316
Adjusted Rand-Index: 0.245
Silhouette Coefficient: 0.028


In [21]:
print("Top terms per cluster:")

Top terms per cluster:


In [22]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: com writes article don people just think like keith posting
Cluster 1: uk ac university writes article cs computer science posting host
Cluster 2: graphics university thanks image posting host nntp file ca program
Cluster 3: space nasa gov orbit sci just like earth writes article
Cluster 4: god jesus people bible believe christian com don say christians


In [35]:
(km.cluster_centers_[0])

array([ 0.01448162,  0.01193248,  0.00829344,  0.00876575,  0.01232331,
        0.01166224,  0.01155249,  0.01008132,  0.01146953,  0.00943948,
        0.0079625 ,  0.01559397,  0.01173588,  0.00863674,  0.00926877,
        0.00722901,  0.01014963,  0.00829203,  0.00950225,  0.00933436,
        0.0019706 ,  0.04181527,  0.02010882,  0.00365756,  0.00926073,
        0.01491213,  0.01174026,  0.01245811,  0.01497054,  0.07418178,
        0.00750647,  0.02573617,  0.02205811,  0.0057554 ,  0.01353889,
        0.0122662 ,  0.01119271,  0.02249281,  0.00963594,  0.01956697,
        0.01567679,  0.01422524,  0.01350942,  0.01671668,  0.01970666,
        0.01230498,  0.00915059,  0.02270722,  0.01749067,  0.00492433,
        0.01073058,  0.02579717,  0.01048312,  0.01706641,  0.00353765,
        0.13738299,  0.01488471,  0.01544344,  0.0108461 ,  0.01100379,
        0.0155294 ,  0.02894705,  0.00883489,  0.02079331,  0.01728104,
        0.01194019,  0.01451742,  0.00620633,  0.00747118,  0.02

In [34]:
(order_centroids[0])

array([ 55, 262,  29,  76, 172, 123, 234, 132, 124, 178,  21, 203, 127,
       161, 108,  73, 154, 236, 257, 195, 245, 202, 157,  61,  94, 200,
        69, 261, 175, 153, 266, 142,  51,  31, 233, 193, 256,  80, 131,
       190,  72, 158, 198, 253,  47,  93,  37,  32, 137, 187, 240, 226,
        63,  77,  22,  84, 264, 232,  44,  70, 134, 263, 120,  39, 192,
       248, 169, 194, 159,  74, 148, 250, 249, 201,  48, 207,  64, 252,
       130, 145,  53, 156, 181, 136,  92,  43, 224, 168, 122,  81, 222,
       196, 106, 109,  99, 147,  85, 173, 241, 235, 225,  40,  11,  71,
        60, 189, 110, 220,  57, 177, 258, 138,  95, 260, 267,  28, 230,
        25,  56, 170, 244,  66, 186,   0, 149, 205,  41, 223, 204, 150,
       140, 228, 103,  34,  42, 197, 100, 265,  91,  75, 176, 182, 146,
       143, 174,  27, 166,  89,   4, 217,  45,  35, 208, 211, 209, 162,
       129, 152,  98, 167, 179,  65,   1,  26,  12, 218,   5, 259, 121,
       229, 237,   6, 165,   8, 255, 215, 115,  83,  36, 128,  5

In [30]:
(order_centroids[0, :10])

array([ 55, 262,  29,  76, 172, 123, 234, 132, 124, 178], dtype=int64)

In [None]:
centroidTweets = [[] for _ in xrange(5)]

In [None]:
centroidTweets