<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Sources-of-Inspiration" data-toc-modified-id="Sources-of-Inspiration-1">Sources of Inspiration</a></span></li></ul></div>

In [137]:
reset -fs

In [138]:
from   gensim.models   import Word2Vec
import numpy   as np
import pandas  as pd
import seaborn as sns
from   sklearn.cluster import KMeans

palette = "Dark2"  # presumably a seaborn/matplotlib palette name for plots; not referenced in the cells shown here — confirm it is still needed
%matplotlib inline

In [139]:
# Four short example sentences that serve as the whole corpus.
docs = [
    "I like an apple for breakfast",
    "A kitten is fluffy",
    "I had a banana and kale smoothie for breakfast",
    "My kitten loves kale",
]

In [140]:
# Lowercase every document, then split it on whitespace: one token list per document.
docs_tokenized = list(map(str.split, map(str.lower, docs)))
docs_tokenized

[['i', 'like', 'an', 'apple', 'for', 'breakfast'],
 ['a', 'kitten', 'is', 'fluffy'],
 ['i', 'had', 'a', 'banana', 'and', 'kale', 'smoothie', 'for', 'breakfast'],
 ['my', 'kitten', 'loves', 'kale']]

In [141]:
# Train a word embedding model.
# Word2Vec training is stochastic, so pin the seed and use a single worker
# thread; otherwise every "Restart & Run All" yields different embeddings
# (and potentially different clusters downstream). For fully deterministic
# results across interpreter launches, PYTHONHASHSEED must also be fixed.
model = Word2Vec(docs_tokenized,
                 window=2,        # window refers to the size of our context window
                 min_count=1,     # typically you would drop hapax legomena (words that only appear once)
                 sg=1,            # sg=1 selects the Skip-gram architecture (0 would be CBOW)
                 seed=42,         # fixed seed for reproducible embeddings
                 workers=1        # one thread: avoids nondeterminism from OS thread scheduling
      )

In [142]:
# Vectorize each document as the mean of its word embeddings.
# Tokens missing from the model's vocabulary are skipped; a document with no
# known tokens falls back to an all-zero vector of the embedding size.
docs_vectorized = []

for tokens in docs_tokenized:
    # The membership test already guarantees the lookup succeeds, so no
    # KeyError handling is needed around model.wv[token].
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if known_vectors:
        # Average across tokens (axis 0) to get one vector per document.
        docs_vectorized.append(np.mean(known_vectors, axis=0))
    else:
        # Allocate the fallback only when it is actually needed.
        docs_vectorized.append(np.zeros(model.vector_size))

In [143]:
docs_tokenized[0]

['i', 'like', 'an', 'apple', 'for', 'breakfast']

In [144]:
docs_vectorized[0]

array([-1.5323978e-03,  1.8032625e-03,  1.6352842e-03,  3.8821131e-03,
        1.7316868e-04, -1.6719219e-04,  5.0986181e-03,  1.6076554e-03,
       -3.1115592e-03, -1.3074414e-03,  3.2352684e-03,  5.5637903e-04,
        7.7540707e-04,  9.4145659e-04,  5.1303877e-04, -9.5915730e-04,
        2.7161639e-03,  1.0044362e-03, -3.8050518e-03, -4.7873002e-03,
        1.9957188e-03,  1.3720390e-03,  7.3785167e-03,  8.6009718e-04,
       -1.5232041e-03,  1.8557607e-03, -3.5904822e-04,  1.1909354e-03,
       -3.2544329e-03, -7.7026384e-04,  1.0726246e-03,  6.5136579e-04,
        1.2211020e-03, -2.1179162e-03,  8.6180087e-05,  9.4710704e-04,
        2.2960283e-05, -1.8263500e-03, -3.2759011e-03, -1.4832253e-03,
        1.5926519e-03, -1.9603859e-04, -8.3361240e-04, -5.1416009e-04,
        4.0862579e-03, -3.8285166e-04, -2.5757567e-03, -2.1230453e-04,
        1.1956191e-03, -9.8432100e-04, -1.4761966e-03, -1.8178571e-04,
       -5.1180460e-04, -3.8083112e-03, -1.3816945e-03, -2.3055135e-03,
      

In [145]:
# Cluster each vectorized document
X = np.asarray(docs_vectorized)  # shape: (number of documents, embedding size)
k = 2  # Number of clusters; 2 or 3 are reasonable for this four-document corpus
kmeans = KMeans(
    n_clusters=k,     # driven by k, so the setting lives in exactly one place
    n_init=10,        # explicit, so the result does not depend on the installed version's default
    random_state=42,  # fixed seed: centroid initialisation is random otherwise
)
kmeans.fit(X);

In [146]:
# One row per document: its tokens re-joined with spaces, plus the cluster
# label that the fitted KMeans model assigned to it.
df = pd.DataFrame(
    data={
        "tokens": list(map(" ".join, docs_tokenized)),
        "cluster": kmeans.labels_,
    }
)
df

Unnamed: 0,tokens,cluster
0,i like an apple for breakfast,1
1,a kitten is fluffy,0
2,i had a banana and kale smoothie for breakfast,1
3,my kitten loves kale,0


<center><h2>Sources of Inspiration</h2></center>

- https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/
- https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/