## Simple Document Clustering Demo


### Define imports

In [6]:
from sklearn.feature_extraction.text import (TfidfVectorizer, CountVectorizer)
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import csv
import os
import string
import numpy as np

### Define the document

In [2]:
documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

### Vectorize the text using Term Frequency - Inverse Document Frequency (TF-IDF)

In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

### Alternatively, Use  CountVectorizer Term-Frequency Bag of words approach to vectorize the text

In [11]:
count_vectorizer = CountVectorizer(stop_words='english')
tf = count_vectorizer.fit_transform(documents)
print("Vocabulary:\n{}".format(count_vectorizer.vocabulary_))
print("Bag of Words:\n{}".format(tf.toarray()))

Vocabulary:
{'little': 18, 'kitty': 17, 'came': 4, 'play': 24, 'eating': 8, 'restaurant': 26, 'merley': 20, 'best': 3, 'squooshy': 28, 'kitten': 16, 'belly': 2, 'google': 12, 'translate': 31, 'app': 1, 'incredible': 14, 'open': 22, '100': 0, 'tab': 29, 'smiley': 27, 'face': 10, 'cat': 5, 'photo': 23, 've': 32, 'taken': 30, 'climbing': 7, 'ninja': 21, 'impressed': 13, 'map': 19, 'feedback': 11, 'key': 15, 'promoter': 25, 'extension': 9, 'chrome': 6}
Bag of Words:
[[0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 

### Specify Clustering Model (Kmeans) and train it too cluster the document into 2 categories

In [13]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

### Let us view the result of the clustering

In [14]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

print("\n")
#print("Prediction")

Top terms per cluster:
Cluster 0:
 cat
 ninja
 climbing
 ve
 photo
 taken
 best
 came
 belly
 chrome
Cluster 1:
 google
 translate
 app
 feedback
 impressed
 map
 incredible
 chrome
 extension
 promoter




### Now let us use the model to predict clusters new data belongs to

In [15]:
Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)

[1]
[0]
