In [1]:
# Toy corpus: eight short sentences, roughly half about cats/kittens and half
# about Google products, used below to demonstrate TF-IDF + k-means clustering.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smiley face.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

In [2]:
# Sanity check: number of sentences in the corpus.
len(documents)

8

In [3]:
# Standard library
import re

# Third-party
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared NLP resources used by the tokenizer defined below.
stopwords = nltk.corpus.stopwords.words('english')  # English stop-word list from NLTK
lemmatizer = WordNetLemmatizer()

In [4]:
def tokenize_and_stem(text):
    """Split ``text`` into lower-cased, lemmatized content tokens.

    Despite the name, this function lemmatizes (via the module-level
    ``lemmatizer``) rather than stems.

    Steps:
      1. Tokenize with ``nltk.word_tokenize`` and lower-case every token.
      2. Keep only tokens that contain at least one ASCII letter, which
         drops purely numeric tokens and raw punctuation.
      3. Drop tokens found in the module-level ``stopwords`` collection.
      4. Lemmatize what is left.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    lowered = (raw.lower() for raw in nltk.word_tokenize(text))
    return [
        lemmatizer.lemmatize(tok)
        for tok in lowered
        if re.search('[a-zA-Z]', tok) and tok not in stopwords
    ]

In [5]:
# TF-IDF over unigrams and bigrams, using the custom tokenizer defined above.
# token_pattern=None: the default regex pattern is ignored whenever a custom
# `tokenizer` is supplied; setting it to None states that explicitly and avoids
# scikit-learn's "token_pattern will not be used" UserWarning.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    token_pattern=None,
    ngram_range=(1, 2),
)

In [6]:
# Learn the vocabulary/IDF weights from the corpus and build its TF-IDF matrix in one step.
X = tfidf_vectorizer.fit_transform(documents)

In [7]:
# Show the sparse matrix summary (one row per document, one column per uni-/bigram feature).
X

<8x65 sparse matrix of type '<class 'numpy.float64'>'
	with 70 stored elements in Compressed Sparse Row format>

In [8]:
# Number of clusters to look for (the corpus was written around two themes).
true_k = 2
# n_init is pinned explicitly: its default changed across scikit-learn releases
# (10 historically, 'auto' in newer versions), so relying on the default both
# emits a FutureWarning on some versions and can change the clustering result.
# random_state fixes the centroid seeding so the run is reproducible.
model = KMeans(n_clusters=true_k, init='k-means++', n_init=10, random_state=42)

In [9]:
# fit_predict() fits the model and returns the cluster label of every row of X.
# A separate model.fit(X) beforehand is redundant: fit_predict refits from
# scratch, so it only doubled the work (the result is the same because
# random_state is fixed).
clusters = model.fit_predict(X)

In [10]:
# For each centroid, feature indices sorted by descending weight
# (argsort is ascending, so each row is reversed).
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
try:
    terms = tfidf_vectorizer.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    terms = tfidf_vectorizer.get_feature_names()

# Print the 10 highest-weighted terms of every cluster centroid.
for i in range(true_k):
    # No trailing comma here: in Python 3 it does not suppress the newline,
    # it merely builds a throwaway tuple.
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 cat
 best
 ninja
 climbing ninja
 climbing
 ninja cat
 merley best
 merley
 kitten belly
 kitten
Cluster 1:
 google
 translate app
 map feedback
 app
 app incredible
 feedback
 google map
 google translate
 impressed
 impressed google


In [12]:
# Project an unseen sentence into the fitted TF-IDF space and assign it to a cluster.
Y = tfidf_vectorizer.transform(["chrome browser."])
prediction = model.predict(Y)
# predict() returns an array with one label per input row; index the single
# label explicitly instead of relying on implicit array-to-scalar conversion
# inside "%d" (deprecated by NumPy for arrays with ndim > 0).
print("Cluster %d:" % prediction[0])

Cluster 1:


In [17]:
# Project an unseen sentence into the fitted TF-IDF space and assign it to a cluster.
# NOTE(review): after stop-word removal the remaining tokens ("dog", "happy") do
# not occur anywhere in `documents`, so Y is presumably an all-zero vector and
# the predicted cluster is just whichever centroid lies closest to the origin,
# not a meaningful topic match. Confirm with `Y.nnz` before trusting the label.
Y = tfidf_vectorizer.transform(["the dog is happy"])
prediction = model.predict(Y)
# predict() returns an array with one label per input row; index the single
# label explicitly instead of relying on implicit array-to-scalar conversion
# inside "%d" (deprecated by NumPy for arrays with ndim > 0).
print("Cluster %d:" % prediction[0])

Cluster 0:
