In [1]:
"""
=======================================
Clustering text documents using k-means
=======================================

This is an example showing how the scikit-learn can be used to cluster
documents by topics using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.

Two feature extraction methods can be used in this example:

  - TfidfVectorizer uses a in-memory vocabulary (a python dict) to map the most
    frequent words to features indices and hence compute a word occurrence
    frequency (sparse) matrix. The word frequencies are then reweighted using
    the Inverse Document Frequency (IDF) vector collected feature-wise over
    the corpus.

  - HashingVectorizer hashes word occurrences to a fixed dimensional space,
    possibly with collisions. The word count vectors are then normalized to
    each have l2-norm equal to one (projected to the euclidean unit-ball) which
    seems to be important for k-means to work in high dimensional space.

    HashingVectorizer does not provide IDF weighting as this is a stateless
    model (the fit method does nothing). When IDF weighting is needed it can
    be added by pipelining its output to a TfidfTransformer instance.

Two algorithms are demoed: ordinary k-means and its more scalable cousin
minibatch k-means.

It can be noted that k-means (and minibatch k-means) are very sensitive to
feature scaling and that in this case the IDF weighting helps improve the
quality of the clustering by quite a lot as measured against the "ground truth"
provided by the class label assignments of the 20 newsgroups dataset.

This improvement is not visible in the Silhouette Coefficient which is small
for both as this measure seem to suffer from the phenomenon called
"Concentration of Measure" or "Curse of Dimensionality" for high dimensional
datasets such as text data. Other measures such as V-measure and Adjusted Rand
Index are information theoretic based evaluation scores: as they are only based
on cluster assignments rather than distances, hence not affected by the curse
of dimensionality.

Note: as k-means is optimizing a non-convex objective function, it will likely
end up in a local optimum. Several runs with independent random init might be
necessary to get a good convergence.

"""

# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD 3 clause

from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

In [13]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# parse commandline arguments
op = OptionParser()
op.add_option("--lsa",
              dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions)"
                   " to extract from text.")
op.add_option("--verbose",
              action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm.")

print(__doc__)
op.print_help()


Clustering text documents using k-means

This is an example showing how the scikit-learn can be used to cluster
documents by topics using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.

Two feature extraction methods can be used in this example:

  - TfidfVectorizer uses a in-memory vocabulary (a python dict) to map the most
    frequent words to features indices and hence compute a word occurrence
    frequency (sparse) matrix. The word frequencies are then reweighted using
    the Inverse Document Frequency (IDF) vector collected feature-wise over
    the corpus.

  - HashingVectorizer hashes word occurrences to a fixed dimensional space,
    possibly with collisions. The word count vectors are then normalized to
    each have l2-norm equal to one (projected to the euclidean unit-ball) which
    seems to be important for k-means to work in high dimensional space.

    HashingVectorizer does not provide IDF wei

In [39]:
(opts, args) = op.parse_args([""])
print (len(args))
if len(args) > 1:
    op.error("this script takes no arguments.")
    sys.exit(1)

1


In [40]:
opts

<Values at 0x7f97641a0908: {'minibatch': True, 'use_idf': True, 'n_features': 10000, 'verbose': False, 'n_components': None, 'use_hashing': False}>

In [48]:
###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

Loading 20 newsgroups dataset for categories:
None


In [49]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

18846 documents
20 categories



In [47]:
dataset.data[2]


'Subject: PHIGS User Group Conference\nFrom: hamlin@ug.eds.com (Griff Hamlin)\nReply-To: hamlin@ug.eds.com (Griff Hamlin)\nDistribution: world\nOrganization: EDS Unigraphics, Cypress CA\nNntp-Posting-Host: 134.244.15.158\nLines: 173\n\n\n\n                FIRST ANNUAL PHIGS USER GROUP CONFERENCE\n\n          The First Annual PHIGS User Group Conference was held March 21-24\n          in Orlando, Florida.  The conference was organized by the Rensse-\n          laer Design Research Center in co-operation with  IEEE  and  SIG-\n          GRAPH.   Attendees  came  from five countries spanning three con-\n          tinents.   A  good  cross-section  of  the  PHIGS  community  was\n          represented  at this conference with participants including PHIGS\n          users, workstation vendors, third-party PHIGS implementors, stan-\n          dards  committee  members,  and  researchers  from  industry  and\n          academia.  The opening speaker, Dr. Richard Puk, challenged PHIGS\n       

In [50]:
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', non_negative=True,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       non_negative=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)

Extracting features from the training dataset using a sparse vectorizer


In [112]:
import pickle
#get the title and future work sections and pass that as the data
dataFile = 'PickleCreation/AllDataPickle_e1.pk'
data = pickle.load(open(dataFile,'rb'))
print ("size of data: " + str(len(data)) )
myData = []
for key in data:
    future_work_section = data[key][-1]
    title_section = data[key][0]
    if not (future_work_section == ""):
        myData.append(title_section + " " + future_work_section)
    
numDocs = len(myData)
print ("future-title docs = " + str(numDocs))

size of data: 20399
future-title docs = 12554


In [107]:
myX = vectorizer.fit_transform(myData)

In [51]:
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

done in 91.630911s
n_samples: 18846, n_features: 10000



In [52]:
if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    lsa = make_pipeline(svd, Normalizer(copy=False))

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

In [130]:
###############################################################################
# Do the actual clustering

if opts.minibatch:
    #km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
    #                     init_size=1000, batch_size=1000, verbose=opts.verbose)
    km = MiniBatchKMeans(n_clusters=2, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=2,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
done in 0.162s



In [82]:
#Debug
print (km.cluster_centers_[6,:])
print (km.labels_[0])
#km.score(X[0,:])
temp = X[0,:]
dist = np.linalg.norm(km.cluster_centers_[6,:]-temp)
print (dist)
print (km.score(X[0,:]))

7
[  3.33086716e-03   3.58877027e-03   9.65320965e-04 ...,   4.52109627e-05
   7.23900031e-04   2.14955888e-04]
6


In [131]:
#Clustering the title-futurework pairs
km.fit(myX)
print (len(km.cluster_centers_))
print (myX.size)

2
575653


In [135]:
# contains a dictionary of lists where the keys are the cluster centers
norm_squared_dic = {}
for i in range(numDocs):
    key = km.labels_[i]
    if key in norm_squared_dic:
        norm_squared_dic[key].append(-km.score(X[i,:]))
    else:
        norm_squared_dic[key] = [-km.score(X[i,:])]
            

In [138]:
for key in norm_squared_dic:
    dist_list = norm_squared_dic[key]
    print (key, min(dist_list), max(dist_list), np.mean(dist_list))

0 0.9899594434162521 1.0192652274719323 1.01231661807
1 0.9793912973234664 1.0192221902974883 1.01226441387


In [None]:
print (min(norm_squared), max(norm_squared))
print (np.mean(norm_squared))

In [61]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Homogeneity: 0.203
Completeness: 0.435
V-measure: 0.277
Adjusted Rand-Index: 0.072
Silhouette Coefficient: 0.003



In [None]:
if not (opts.n_components or opts.use_hashing):
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()