Cluster word embeddings
=====

An end-to-end example: from a raw corpus, to word embeddings, to clustered documents.

Define corpus
----

In [48]:
# Toy corpus: five short sentences about breakfast foods and kittens.
docs = [
    "I like an apple for breakfast",
    "My kitten is fluffy",
    "My breakfast banana and kale smoothie was delicious ",
    "My kitten loves kale",
    "Banana and apple go well together for breakfast",
]

In [49]:
# Tokenize: lowercase every sentence, then split it on whitespace.
docs_tokenized = [sentence.lower().split() for sentence in docs]
docs_tokenized

[['i', 'like', 'an', 'apple', 'for', 'breakfast'],
 ['my', 'kitten', 'is', 'fluffy'],
 ['my', 'breakfast', 'banana', 'and', 'kale', 'smoothie', 'was', 'delicious'],
 ['my', 'kitten', 'loves', 'kale'],
 ['banana', 'and', 'apple', 'go', 'well', 'together', 'for', 'breakfast']]

Learn the embeddings
-----

In [50]:
# Train a word embedding model on the tokenized corpus.
# The seed is stated explicitly and training is limited to one worker thread so
# that re-running the notebook yields the same vectors (multi-threaded training
# introduces ordering jitter between runs).
model = Word2Vec(docs_tokenized,
                 window=2,        # max distance between the target word and a context word
                 min_count=1,     # keep every word; typically you would drop hapax legomena (words that appear only once)
                 sg=1,            # 1 selects the skip-gram architecture (otherwise CBOW is used)
                 seed=1,          # explicit RNG seed (same value as gensim's default)
                 workers=1        # single thread => deterministic training order
      )

In [51]:
# Vectorize each document as the mean of its word embeddings
docs_vectorized = []

for tokens in docs_tokenized:
    # Look up only the tokens the model has a vector for; the membership test
    # already rules out a KeyError, so no exception handling is needed.
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        # Element-wise average over the document's word vectors.
        docs_vectorized.append(np.mean(vectors, axis=0))
    else:
        # No token of this document is in the vocabulary: fall back to a fresh
        # all-zero vector with the model's embedding size.
        docs_vectorized.append(np.zeros(model.vector_size))

In [52]:
# Inspect the tokens of the first document
docs_tokenized[0]

['i', 'like', 'an', 'apple', 'for', 'breakfast']

In [53]:
# Inspect the first document's vector (the mean of its word embeddings)
docs_vectorized[0]

array([-1.3931090e-03,  1.3396534e-03, -4.5207329e-05,  1.5614756e-03,
       -3.3266556e-03, -2.0524962e-03,  1.9113864e-03,  3.9903135e-03,
       -1.3559231e-03, -3.5587132e-03,  3.5912057e-03,  6.7207095e-04,
        1.0657676e-03, -1.5503874e-04,  3.5341247e-04,  1.0875274e-03,
        3.8831418e-03, -2.8667152e-03, -1.2948898e-03, -4.3728189e-03,
        9.0993877e-04,  2.0501910e-04,  3.0177291e-03,  1.8510673e-03,
       -2.1445227e-03,  8.5773831e-04, -7.8697782e-04,  1.9520273e-03,
       -8.8437088e-04,  8.7484298e-04,  2.7740156e-04, -5.5039936e-04,
        3.3351288e-03, -3.5447898e-04, -3.3302480e-04,  3.1595232e-03,
        1.9048612e-03, -1.8327996e-03, -2.9101018e-03,  8.5532403e-04,
        4.0939543e-05,  3.0982599e-03, -2.8491246e-03,  2.1319033e-03,
        9.9953171e-04, -1.4065761e-03, -1.6792364e-03,  5.3386553e-04,
        1.0928784e-03,  1.7165443e-03, -2.0055366e-03,  1.7986655e-03,
       -3.7084010e-03, -1.4993404e-03,  2.0026532e-03, -1.8892783e-03,
      

Learn the clusters
-----

In [54]:
# Cluster the vectorized documents into two groups.
# KMeans starts from randomly chosen centroids, so random_state is fixed to
# make the cluster assignments reproducible across notebook runs.
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=42)
kmeans.fit(docs_vectorized);

In [55]:
# Show each document next to its assigned cluster, ordered by cluster label.
joined_docs = [" ".join(tokens) for tokens in docs_tokenized]
df = pd.DataFrame({"text": joined_docs, "cluster": kmeans.labels_}).sort_values(by=['cluster'])
df

Unnamed: 0,text,cluster
0,i like an apple for breakfast,0
4,banana and apple go well together for breakfast,0
1,my kitten is fluffy,1
2,my breakfast banana and kale smoothie was deli...,1
3,my kitten loves kale,1


<center><h2>Sources of Inspiration</h2></center>

- https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/
- https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/