In [1]:
# Toy corpus: eight short sentences, some about cats and some about Google
# products, used to fit and inspect the clustering below.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smiley face.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google maps feedback.",
    "Key promoter extension for Google Chrome.",
]

In [2]:
# Sanity check: show how many documents are in the corpus.
len(documents)

8

In [3]:
import re

import nltk
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop-word list from NLTK; tokenize_and_stem drops any token found here.
# NOTE: requires the NLTK 'stopwords' corpus to be available locally
# (nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')

# Stemmer used by tokenize_and_stem. It was never defined in this notebook, so
# the function only worked thanks to leftover kernel state. The recorded cluster
# terms ('googl', 'transl', 'kit', 'bel', 'tak', 'extend') match the Lancaster
# stemmer's output, so that is the one instantiated here.
stemmer = LancasterStemmer()

# WordNetLemmatizer was referenced without being imported (NameError on a fresh
# kernel). The name is kept so any cell that relies on it still finds it.
lemmatizer = WordNetLemmatizer()

In [4]:
def tokenize_and_stem(text):
    """Split ``text`` into lower-cased word stems.

    The text is tokenised with ``nltk.word_tokenize``. Tokens that contain no
    ASCII letter (pure punctuation, bare numbers) and tokens present in the
    module-level ``stopwords`` list are discarded; every remaining token is
    reduced to its stem with the module-level ``stemmer``.

    Args:
        text: The raw document string.

    Returns:
        A list of stem strings, in the order the tokens appear in ``text``.
    """
    stems = []
    for raw_token in nltk.word_tokenize(text):
        word = raw_token.lower()
        # Skip tokens without a single letter, e.g. "." or "100".
        if not re.search('[a-zA-Z]', word):
            continue
        # Stop words are matched on the lower-cased, unstemmed form.
        if word in stopwords:
            continue
        stems.append(stemmer.stem(word))
    return stems

In [5]:
# TF-IDF vectorizer that delegates tokenisation (and stemming) to
# tokenize_and_stem. token_pattern=None is passed because the default regex
# pattern is ignored whenever a custom tokenizer is supplied; leaving it set
# makes scikit-learn warn that the parameter will not be used.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, token_pattern=None)

In [30]:
# Learn the vocabulary and IDF weights from the corpus and build its TF-IDF matrix.
X = tfidf_vectorizer.fit_transform(documents)


In [7]:
# Number of clusters to fit.
true_k = 2
# n_init is pinned to 10, the default in force when this notebook was recorded.
# Newer scikit-learn releases changed the default to 'auto', which would run a
# single initialisation with k-means++ and could alter the clusters shown below.
# random_state fixes the centroid seeding so the run is repeatable.
model = KMeans(n_clusters=true_k, init='k-means++', n_init=10, random_state=4)

In [8]:
# Fit the model on the TF-IDF matrix (fit returns the estimator itself), then
# label every training document with the index of its nearest centroid.
clusters = model.fit(X).predict(X)

In [9]:
# For each cluster, vocabulary indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated and later removed from scikit-learn; use
# its replacement when present and fall back on older installations. list()
# keeps `terms` a plain list in both cases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = list(tfidf_vectorizer.get_feature_names())

# Print the ten highest-weighted terms of every cluster.
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 cat
 best
 ninj
 climb
 merley
 squooshy
 bel
 kit
 ev
 tak
Cluster 1:
 googl
 transl
 map
 ap
 feedback
 impress
 incred
 extend
 promot
 key


In [10]:
# Vectorise an unseen sentence with the fitted vocabulary and predict its cluster.
Y = tfidf_vectorizer.transform(["chrome browser."])
prediction = model.predict(Y)
# predict returns one label per input row; take the single label explicitly
# instead of relying on implicit array-to-scalar conversion in "%d".
print("Cluster %d:" % prediction[0])

Cluster 1:


In [11]:
# Vectorise an unseen sentence with the fitted vocabulary and predict its cluster.
Y = tfidf_vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
# predict returns one label per input row; take the single label explicitly
# instead of relying on implicit array-to-scalar conversion in "%d".
print("Cluster %d:" % prediction[0])

Cluster 0:


In [49]:
# Project three unseen sentences onto the already-fitted TF-IDF vocabulary.
new_X = tfidf_vectorizer.transform(
    [
        "Google chrome is my favorite browser",
        "My cat is not hungry",
        "new test for chrome",
    ]
)

In [50]:
# Display the sparse-matrix summary (shape and stored elements) of the new batch.
new_X

<3x34 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [51]:
# Assign each new sentence to the closest cluster learned from the corpus.
clusters_new = model.predict(new_X)

In [52]:
# Show the predicted cluster index for each new sentence, in input order.
clusters_new

array([1, 0, 1], dtype=int32)