# Clustering Example using Python

Source: https://pythonprogramminglanguage.com/kmeans-text-clustering/

In [1]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Toy corpus: ten short sentences, roughly split between "cat" and "google" topics.
documents = ["This little kitty came to play when I was eating at a restaurant.",
             "The quick brown fox jumps over the lazy dog",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Google's Artificial Brain Learns to Find Cat Videos",
             "Eat, Drink and Succeed: Climb Your Way to the Top",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

# Convert text to vectors and compute tf-idf (built-in English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so convert it back to a plain list of
# str to keep the printed output and downstream list indexing unchanged.
terms = vectorizer.get_feature_names_out().tolist()
print(terms)

['100', 'app', 'artificial', 'best', 'brain', 'brown', 'came', 'cat', 'chrome', 'climb', 'climbing', 'dog', 'drink', 'eat', 'eating', 'extension', 'face', 'feedback', 'fox', 'google', 'impressed', 'incredible', 'jumps', 'key', 'kitty', 'lazy', 'learns', 'little', 'map', 'ninja', 'open', 'photo', 'play', 'promoter', 'quick', 'restaurant', 'smiley', 'succeed', 'tab', 'taken', 'translate', 've', 'videos', 'way']


In [2]:
#text pre-processing
# Extend the built-in English stop words with two noise tokens seen in the first pass.
my_stop_words = text.ENGLISH_STOP_WORDS.union(['100','ve'])
#Apply a simple stemming: drop a word-final 'ing' when it is followed by a space
import re
clean_docs = [re.sub(r'ing ', ' ', d) for d in documents]

# scikit-learn >= 1.2 validates constructor parameters and only accepts
# 'english', a list, or None for stop_words; a frozenset is rejected, so pass a
# (sorted, for determinism) list instead.
vectorizer = TfidfVectorizer(stop_words=sorted(my_stop_words))
X = vectorizer.fit_transform(clean_docs)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert to a plain list of str to keep output identical.
terms = vectorizer.get_feature_names_out().tolist()
print(terms)

['app', 'artificial', 'best', 'brain', 'brown', 'came', 'cat', 'chrome', 'climb', 'dog', 'drink', 'eat', 'extension', 'face', 'feedback', 'fox', 'google', 'impressed', 'incredible', 'jumps', 'key', 'kitty', 'lazy', 'learns', 'little', 'map', 'ninja', 'open', 'photo', 'play', 'promoter', 'quick', 'restaurant', 'smiley', 'succeed', 'tab', 'taken', 'translate', 'videos', 'way']


In [3]:
# Show how the first document is encoded: first the raw sparse row as printed by
# the matrix itself, then a readable listing of every term with a positive weight.
doc_idx = 0
print("\nFirstDocument:", documents[doc_idx],"\n")
print("How is it captured by the vectorizer?")
print(X[doc_idx])
print("\nA detailed version")
for term_id, term in enumerate(terms):
    weight = X[doc_idx, term_id]
    # Skip terms that do not contribute to this document's row.
    if weight > 0:
        print('TermId = {}, Term = {}, tf-idf={:0.3f}'.format(term_id, term, weight))


FirstDocument: This little kitty came to play when I was eating at a restaurant. 

How is it captured by the vectorizer?
  (0, 24)	0.41802398937415175
  (0, 21)	0.41802398937415175
  (0, 5)	0.41802398937415175
  (0, 29)	0.41802398937415175
  (0, 11)	0.35535858163071754
  (0, 32)	0.41802398937415175

A detailed version
TermId = 5, Term = came, tf-idf=0.418
TermId = 11, Term = eat, tf-idf=0.355
TermId = 21, Term = kitty, tf-idf=0.418
TermId = 24, Term = little, tf-idf=0.418
TermId = 29, Term = play, tf-idf=0.418
TermId = 32, Term = restaurant, tf-idf=0.418


In [4]:
#Apply K-Means clustering
# Name the tunable values once so the model fit and the report below cannot
# drift apart when one of them is edited.
seed=123
n_clusters = 3
top_n_terms = 7
# n_init=1 runs a single centroid initialisation; random_state makes it repeatable.
model = KMeans(n_clusters=n_clusters, max_iter=100, n_init=1, random_state=seed)
model.fit(X)

print("Top {} terms per cluster:".format(top_n_terms))
# argsort() is ascending, so reverse each row to put the highest-weighted
# centroid components (the most characteristic terms) first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

for i in range(n_clusters):
    print("Cluster %d:" % i , [terms[ind] for ind in order_centroids[i, :top_n_terms]])


Top 7 terms per cluster:
Cluster 0: ['google', 'app', 'feedback', 'impressed', 'map', 'incredible', 'translate']
Cluster 1: ['cat', 'ninja', 'climb', 'photo', 'taken', 'best', 'artificial']
Cluster 2: ['eat', 'way', 'drink', 'succeed', 'kitty', 'restaurant', 'came']


In [5]:
import pandas as pd
# Tabulate each document next to its assigned cluster. DocIDs are 1-based and
# derived from the corpus size, so the table stays valid if `documents` changes
# (a hard-coded range would make the DataFrame constructor fail on a length mismatch).
output = pd.DataFrame({'1.DocID':range(1, len(documents) + 1),'2.Document':documents,'3.ClusterId':model.labels_})
print(output.to_string(index=False))

1.DocID                                         2.Document  3.ClusterId
      1  This little kitty came to play when I was eati...            2
      2        The quick brown fox jumps over the lazy dog            0
      3                Google Translate app is incredible.            0
      4  If you open 100 tab in google you get a smiley...            0
      5                    Best cat photo I've ever taken.            1
      6                                Climbing ninja cat.            1
      7  Google's Artificial Brain Learns to Find Cat V...            1
      8  Eat, Drink and Succeed: Climb Your Way to the Top            2
      9                Impressed with google map feedback.            0
     10          Key promoter extension for Google Chrome.            0
