# K-Means Clustering Example with Word2Vec

In [None]:
## Imports

# importing gensim for creating embeddings
from gensim.models import Word2Vec

# importing nltk for clustering and handling data
from nltk.cluster import KMeansClusterer
import nltk

# importing sklearn for clustering and computing metrics
from sklearn import cluster
from sklearn import metrics


### *Getting Dummy Data (to be later replaced by direct embeddings from different notebooks)*

In [None]:
# Toy corpus: every entry is one pre-tokenised sentence (a list of word strings).
# Each sentence is written as plain text and split on whitespace into its tokens.
sentences = [
    line.split()
    for line in (
        "this is the good machine learning book",
        "this is another book",
        "one more book",
        "this is the new post",
        "this is about machine learning post",
        "and this is the last post",
    )
]

In [None]:
# Creating the Word2Vec model.
# min_count=1 keeps every word, even those that occur only once in the tiny corpus.
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling, so the vectors differ between runs.
# NOTE(review): some gensim versions additionally need PYTHONHASHSEED to be set
# for bit-for-bit identical vectors -- confirm for the installed version.
model = Word2Vec(sentences, min_count=1, seed=42, workers=1)

Error: IPyKernel not installed into interpreter Python 3.8.8 64-bit ('tf': conda):C:\Users\angad\anaconda3\envs\tf\python.exe

### *Testing the created Model*

Now we have a model with the words embedded. We can query the model for similar words, as shown below, or ask it to represent words as vectors.

In [None]:
# All vector queries go through the trained KeyedVectors (`model.wv`); the
# same-named shortcuts on the Word2Vec object itself were deprecated and later
# removed from gensim.

# Cosine similarity between two words.
print(model.wv.similarity('this', 'is'))
print(model.wv.similarity('post', 'book'))
#output -0.0198180344218
#output -0.079446731287

# The two words closest to 'machine'.
print(model.wv.most_similar(positive=['machine'], negative=[], topn=2))
#output: [('new', 0.24608060717582703), ('is', 0.06899910420179367)]

# The raw embedding vector of a single word.
print(model.wv['the'])
#output [-0.00217354 -0.00237131  0.00296396 ...,  0.00138597  0.00291924  0.00409528]

### *Getting Vocabulary*

In [None]:
# Words known to the model, in index order.
# The attribute holding this list was renamed between gensim releases, so try
# the current name first and fall back to the older one.
try:
    vocab = list(model.wv.index_to_key)   # gensim >= 4.0
except AttributeError:
    vocab = list(model.wv.index2word)     # gensim 3.x
print(vocab)
print(len(vocab))

In [None]:
# Embedding matrix to cluster: looking up the whole `vocab` list yields one
# vector per word, in the same order as `vocab`.
# Indexing goes through `model.wv` because `model[...]` is no longer supported
# on the Word2Vec object in current gensim.
X = model.wv[vocab]

*Now we will feed the word embeddings into a `clustering algorithm` such as `k-Means`, which is one of the most popular `unsupervised learning algorithms` for finding `interesting segments` in the data. It can be used for separating customers into groups, combining documents into topics, and for many other applications.*

### *Clustering Using NLTK*

In [None]:
# Number of clusters to look for; reused by the scikit-learn section below.
NUM_CLUSTERS = 3

# NLTK k-means using cosine distance between embeddings, with repeats=25.
kclusterer = KMeansClusterer(
    NUM_CLUSTERS,
    distance=nltk.cluster.util.cosine_distance,
    repeats=25,
)
# With assign_clusters=True the call returns a cluster id for every row of X.
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

print(assigned_clusters)
# output: [0, 2, 1, 2, 2, 1, 2, 2, 0, 1, 0, 1, 2, 1, 2]

<ins>__nltk.cluster.util.cosine_distance(u, v)__</ins>  
Returns 1 minus the cosine of the angle between vectors v and u. This is equal to `1 – (u.v / |u||v|)`.

<ins>__nltk.cluster.util.euclidean_distance(u, v)__ </ins>  
Returns the euclidean distance between vectors u and v. This is equivalent to the length of the vector `(u – v)`.

In [None]:
# Now that we have the cluster results, pair each word with the cluster it was
# assigned to.
# The words must come from `vocab`, the exact sequence used to build X, so that
# position i of `assigned_clusters` really belongs to word i. (`model.vocab`
# is not available in current gensim and its order is not tied to X's rows.)
words = list(vocab)
for word, cluster_id in zip(words, assigned_clusters):
    print(word + ":" + str(cluster_id))

### *Clustering using SKLEARN*

In [None]:
# scikit-learn k-means on the same embedding matrix.
# random_state fixes the centroid initialisation so labels and centroids are
# reproducible across runs; n_init is spelled out because its default value
# differs between scikit-learn releases.
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(X)

# One cluster id per row of X, and one centroid vector per cluster.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

print("Cluster id labels for inputted data")
print(labels)
print("Centroids data")
print(centroids)

### *Getting some useful metrics to estimate clustering performance.*

In [None]:
# K-means score of the fitted model on X: label and value, one per line.
print(
    "Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):",
    kmeans.score(X),
    sep="\n",
)

# Silhouette coefficient of the sklearn labelling, using euclidean distance.
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')

print("Silhouette_score: ", silhouette_score, sep="\n")

In [None]:
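# Example output pasted from an earlier run of the two cells above (exact values will differ from run to run):
# Cluster id labels for inputted data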
# [0 1 1 ..., 1 2 2]
# Centroids data
# [[ -3.82586889e-04   1.39791325e-03  -2.13839358e-03 ...,  -8.68172920e-04
#    -1.23599875e-03   1.80053393e-03]
#  [ -3.11774168e-04  -1.63297475e-03   1.76715955e-03 ...,  -1.43826099e-03
#     1.22940990e-03   1.06353679e-03]
#  [  1.91571176e-04   6.40696089e-04   1.38173658e-03 ...,  -3.26442620e-03
#    -1.08828480e-03  -9.43636987e-05]]
 
# Score (Opposite of the value of X on the 
# K-means objective which is Sum of distances 
# of samples to their closest cluster center):
# -0.00894730946094

# Silhouette_score: 
# 0.0427737