Cluster word embeddings
=====

Given a collection of text documents, learn word embeddings, represent each document as the mean of its word embeddings, and then cluster the documents based on those vectors.

In [26]:
from   gensim.models   import Word2Vec
from   gensim.utils    import tokenize
import numpy   as np
import pandas  as pd
from   sklearn.cluster import KMeans

Define the corpus
----

In [27]:
# Toy corpus: five short sentences that mention breakfast foods and kittens.
docs = [
    "I like an apple for breakfast",
    "My kitten is fluffy",
    "My breakfast banana and kale smoothie was delicious ",
    "My kitten loves kale",
    "Banana and apple go well together for breakfast",
]

Learn the embeddings
-----

In [28]:
# Split every document into its list of word tokens (one inner list per document).
# NOTE(review): tokenize() is called without a lower-casing option, so "Banana"
# and "banana" end up as two distinct vocabulary entries - confirm that is intended.
docs_tokenized = [[*tokenize(text)] for text in docs]

In [29]:
# Train a word embedding model on the tokenized corpus.
# Training is stochastic, so the seed is pinned and a single worker thread is used;
# otherwise the embeddings (and every downstream cluster assignment) change from
# run to run. Full reproducibility across interpreter launches may additionally
# require setting the PYTHONHASHSEED environment variable - see the gensim docs.
model = Word2Vec(docs_tokenized,
                 window=2,        # window refers to the size of our context window
                 min_count=1,     # typically you would drop hapax legomena (words that only appear once)
                 sg=1,            # sg=1 means that we are using the Skip-gram architecture
                 seed=42,         # fixed RNG seed so the learned vectors are repeatable
                 workers=1,       # one worker thread avoids ordering jitter from thread scheduling
      )

In [30]:
# Vectorize each document as the mean of its word embeddings.
# The result is a 2-D array with one row per document and model.vector_size columns.
docs_vectorized = []

for tokens in docs_tokenized:
    # Keep only tokens the model has an embedding for. The membership test already
    # guarantees the lookup succeeds, so no KeyError handling is needed.
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        docs_vectorized.append(np.asarray(vectors).mean(axis=0))
    else:
        # No known tokens in this document: fall back to a zero vector. Use the
        # embeddings' own dtype so stacking does not silently upcast every row.
        docs_vectorized.append(np.zeros(model.vector_size, dtype=model.wv.vectors.dtype))

docs_vectorized = np.asarray(docs_vectorized)

In [31]:
# Inspect the tokens of the first document
docs_tokenized[0]

['I', 'like', 'an', 'apple', 'for', 'breakfast']

In [32]:
# Inspect the averaged embedding vector of the first document
docs_vectorized[0]

array([-6.96934818e-04,  8.70561169e-04, -7.12622132e-04,  2.30771140e-03,
       -7.14178430e-04, -2.21090019e-03, -6.35348028e-04,  4.93225083e-03,
       -1.22091989e-03, -2.97745760e-03,  4.25004028e-03, -1.20351440e-03,
        3.50226625e-03,  1.67101083e-04,  2.43561692e-03, -3.18139646e-04,
        3.07628629e-03, -4.86158446e-04, -3.61174531e-03, -3.82510759e-03,
       -1.18583594e-04,  5.52570156e-04,  1.03268260e-03,  8.27280397e-04,
        7.63246731e-04, -1.82972802e-03,  1.59627118e-03,  3.10474425e-03,
       -2.51456420e-03, -8.15274718e-04,  4.85844910e-04, -2.67807674e-03,
        1.50517502e-03,  1.26628217e-03, -2.05867947e-03,  2.96539883e-03,
        3.97296576e-03,  6.21286861e-04, -1.85975258e-03,  1.71868049e-03,
       -1.01122679e-03,  2.55969004e-03, -3.58969974e-03,  1.74651609e-03,
        3.78081226e-04, -1.22238265e-03, -2.65470799e-03,  8.55162449e-04,
       -3.40040628e-04,  3.22205690e-03,  1.47453204e-04,  3.32787633e-04,
       -4.80977027e-03, -

Learn clusters
-----

In [33]:
# Cluster each vectorized document into one of two groups.
# K-means starts from randomly chosen centroids, so random_state is pinned to keep
# the cluster labels stable across Restart & Run All.
kmeans = KMeans(n_clusters=2,
                n_init='auto',
                random_state=42)
kmeans.fit(docs_vectorized);

In [34]:
# Pair each document (re-joined from its tokens) with its cluster label,
# then order the rows so members of the same cluster sit together.
df = (
    pd.DataFrame(
        {
            "text": list(map(" ".join, docs_tokenized)),
            "cluster": kmeans.labels_,
        }
    )
    .sort_values(by=["cluster"])
)
df

Unnamed: 0,text,cluster
1,My kitten is fluffy,0
3,My kitten loves kale,0
0,I like an apple for breakfast,1
2,My breakfast banana and kale smoothie was deli...,1
4,Banana and apple go well together for breakfast,1


<center><h2>Sources of Inspiration</h2></center>

- https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/
- https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/