In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [77]:
# Toy corpus: eight short captions, some about cats and some about Google products.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smiley face.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

In [78]:
# TF-IDF vectorizer that ignores scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words="english",
)
# Learn the vocabulary from the corpus and encode it in one pass:
# one row per document, one column per vocabulary term.
tf_idf_matrix = vectorizer.fit_transform(documents)

In [89]:
# Show the learned vocabulary (column order of the TF-IDF matrix).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert it to a plain list to keep the printed form.
print(vectorizer.get_feature_names_out().tolist())

['100', 'app', 'belly', 'best', 'came', 'cat', 'chrome', 'climbing', 'eating', 'extension', 'face', 'feedback', 'google', 'impressed', 'incredible', 'key', 'kitten', 'kitty', 'little', 'map', 'merley', 'ninja', 'open', 'photo', 'play', 'promoter', 'restaurant', 'smiley', 'squooshy', 'tab', 'taken', 'translate', 've']


In [92]:
# Cluster the TF-IDF vectors into two groups.
# n_init is pinned to 10 (the historical default): newer scikit-learn releases
# changed the default to 'auto', which emits a FutureWarning in between and can
# alter the resulting clustering. random_state fixes the centroid seeding.
kmeans = KMeans(
    n_clusters=2,
    init="k-means++",
    n_init=10,
    random_state=0,
).fit(tf_idf_matrix)
# Cluster index assigned to each document, in corpus order.
kmeans.labels_

array([1, 0, 1, 1, 0, 0, 1, 1], dtype=int32)

In [94]:
# Centroids of the fitted model: one row per cluster, one column per vocabulary term.
kmeans.cluster_centers_

array([[ 0.        ,  0.        ,  0.15371637,  0.26193384,  0.        ,
         0.30304549,  0.        ,  0.20277105,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.15371637,  0.        ,  0.        ,  0.        ,
         0.15371637,  0.20277105,  0.        ,  0.15882473,  0.        ,
         0.        ,  0.        ,  0.        ,  0.15371637,  0.        ,
         0.15882473,  0.        ,  0.15882473],
       [ 0.0860499 ,  0.10843242,  0.        ,  0.        ,  0.08164966,
         0.        ,  0.09532397,  0.        ,  0.08164966,  0.09532397,
         0.0860499 ,  0.10843242,  0.2525154 ,  0.10843242,  0.10843242,
         0.09532397,  0.        ,  0.08164966,  0.08164966,  0.10843242,
         0.        ,  0.        ,  0.0860499 ,  0.        ,  0.08164966,
         0.09532397,  0.08164966,  0.0860499 ,  0.        ,  0.0860499 ,
         0.        ,  0.10843242,  0.        ]])

In [93]:
# List the highest-weighted vocabulary terms of each cluster centroid.
print("Top terms per cluster:")
# argsort() is ascending, so reverse every row to rank columns by descending weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
terms = vectorizer.get_feature_names_out()

top_n = 10  # number of terms shown per cluster
# Iterate over the centroid rows instead of hard-coding the cluster count.
for i, ranked in enumerate(order_centroids):
    # No trailing comma after print(): in Python 3 it only builds a throwaway tuple.
    print("Cluster %d:" % i)
    for ind in ranked[:top_n]:
        print(' %s' % terms[ind])
    

Top terms per cluster:
Cluster 0:
 cat
 best
 climbing
 ninja
 ve
 photo
 taken
 belly
 merley
 kitten
Cluster 1:
 google
 translate
 app
 feedback
 impressed
 map
 incredible
 extension
 promoter
 chrome


In [95]:
# Unseen texts to assign to one of the fitted clusters.
# NOTE: no word of the first example appears in the fitted vocabulary, so it is
# encoded as an all-zero vector; its label only reflects which centroid lies
# closest to the origin, not any topical similarity.
test_examples = [
    "Are we talking about finding a better browser?",
    "cat",
]

# Encode with the already-fitted vocabulary (transform, not fit_transform).
test_features = vectorizer.transform(test_examples)
kmeans.predict(test_features)

array([1, 0], dtype=int32)