In [None]:
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
import sys
from time import time

import numpy as np

# Display progress logs on stdout. basicConfig() attaches a stderr handler
# unless a stream is given, so sys.stdout is passed explicitly.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    stream=sys.stdout)

In [None]:
import os, re, string
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

def read_text(path="../input/"):
    """Read every file in ``path`` and return its cleaned text as sentences.

    For each regular file the text is read, whitespace characters (newlines,
    tabs, ...) are turned into plain spaces, every other character outside
    ASCII letters, digits, space and ``,.!?`` is dropped, and the result is
    split into sentences with NLTK's ``sent_tokenize``.

    Parameters
    ----------
    path : str, optional
        Directory holding the input files. Defaults to ``"../input/"``.

    Returns
    -------
    list of str
        The sentences of all files, concatenated in sorted-filename order.
    """
    valid_chars = set(string.ascii_letters + string.digits + ' ' + ',.!?')

    sentences = []
    # os.listdir() returns entries in arbitrary order; sort so that the
    # document order (and everything derived from it) is reproducible.
    for filename in sorted(os.listdir(path)):
        full_path = os.path.join(path, filename)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as text files.
            continue
        # Non-ASCII characters are discarded by the filter below anyway, so
        # undecodable bytes are ignored instead of aborting the whole load.
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
            raw = handle.read()
        # Line breaks and tabs become spaces; simply deleting them would glue
        # the last word of one line to the first word of the next.
        clean = ''.join(
            ' ' if ch.isspace() else ch
            for ch in raw
            if ch.isspace() or ch in valid_chars
        )
        sentences.extend(sent_tokenize(clean))
    return sentences

# Load the corpus; every tokenized sentence is treated as one document.
data = read_text()
print("%d documents" % (len(data),))

In [None]:
# Preview the first five entries of the corpus.
data[0:5]

In [None]:
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
# TF-IDF settings: English stop words are removed, terms occurring in fewer
# than 2 documents or in more than half of them are ignored, and the
# vocabulary is capped at 10000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.5,
    max_features=10000,
    use_idf=True,
)
X = vectorizer.fit_transform(data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % (X.shape[0], X.shape[1]))

## Dimensionality Reduction using LSA

In [None]:
# Aim for one LSA component per ten documents, but stay inside the range
# TruncatedSVD accepts: at least 1 and at most the number of TF-IDF features.
n_components = max(1, min(int(X.shape[0] * 0.1), X.shape[1]))
print("Performing dimensionality reduction using LSA, # of components = {}".format(n_components))
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
# A fixed seed makes the randomized SVD give the same projection on every run.
svd = TruncatedSVD(n_components, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# NOTE: X is replaced by its reduced form here, so the vectorizer cell has to
# be re-run before this cell can be executed again.
X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

## Clustering

In [None]:
num_clusters = 10
# MiniBatchKMeans draws random initial centres and random mini-batches; a
# fixed seed keeps the clustering identical across "Restart & Run All".
km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000, random_state=42)

print("Clustering sparse data with %s" % km)

t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))

In [None]:
print("Top terms per cluster:")

# When LSA was applied the cluster centres live in the reduced space, so they
# are mapped back to the TF-IDF vocabulary before the terms are ranked.
if n_components:
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
else:
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# One line per cluster: the ten highest-weighted terms of its centroid.
for i in range(num_clusters):
    top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d:" % i + top_terms)